# Imports

In [54]:
import random

import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Task 1


In [36]:
file_path = "/content/shakespeare.txt"

with open(file_path, 'r') as file:
  text = file.read()

print(len(text))
print(text[:1000])


1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for re

# Task 2

In [37]:
vocabulary = sorted(set(text))
print(f'Vocabulary size: {len(vocabulary)}')
print(''.join(vocabulary))

Vocabulary size: 65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


# Task 3

In [38]:
itos = dict(enumerate(vocabulary))


stoi = {chr: idx for idx, chr in itos.items()}

def token_to_id(token):
  return [stoi[chr] for chr in token]

def id_to_token(id):
  return ''.join([itos[idx] for idx in id])


res = token_to_id('hi therre')
print(res)
print(id_to_token(res))



[46, 47, 1, 58, 46, 43, 56, 56, 43]
hi therre


# Task 4

In [39]:
file_path = "/content/shakespeare.txt"

with open(file_path, 'r') as file:
  text = file.read()


text_tokenised = token_to_id(text)
text_tokenised_tensor = torch.tensor(text_tokenised, dtype=torch.int64)
print(f'Shape of result:{text_tokenised_tensor.shape}') #My text is 1 character smaller than yours
print(text_tokenised_tensor[:1000])

Shape of result:torch.Size([1115394])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
  

# Task 5

In [40]:
split_idx = int(0.9 * len(text_tokenised_tensor))
train_data = text_tokenised_tensor[:split_idx]
test_data = text_tokenised_tensor[split_idx:]

print(f'Shape of dataset for training : {train_data.shape}')
print(f'Shape of dataset for testing :  {test_data.shape}')

Shape of dataset for training : torch.Size([1003854])
Shape of dataset for testing :  torch.Size([111540])


# Task 6

In [41]:
from random import sample
block_size = 8


def get_sample(text_tensor, idx, block_size = 8):
  X = text_tensor[max(0, idx - block_size):idx+1]
  y = text_tensor[idx+1]
  return X,y


samples = []
for i in range(9):
  samples.append(get_sample(train_data, i, block_size))

for i in range(8):
  X, y = samples[i]
  print(f'Sample {i}: {X} , predict: {y}')


Sample 0: tensor([18]) , predict: 47
Sample 1: tensor([18, 47]) , predict: 56
Sample 2: tensor([18, 47, 56]) , predict: 57
Sample 3: tensor([18, 47, 56, 57]) , predict: 58
Sample 4: tensor([18, 47, 56, 57, 58]) , predict: 1
Sample 5: tensor([18, 47, 56, 57, 58,  1]) , predict: 15
Sample 6: tensor([18, 47, 56, 57, 58,  1, 15]) , predict: 47
Sample 7: tensor([18, 47, 56, 57, 58,  1, 15, 47]) , predict: 58


# Task 7

In [32]:
!wget raw.githubusercontent.com/SimeonHristov99/DL_24-25/refs/heads/main/DATA/shakespeare.txt

--2025-06-02 16:49:52--  http://raw.githubusercontent.com/SimeonHristov99/DL_24-25/refs/heads/main/DATA/shakespeare.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://raw.githubusercontent.com/SimeonHristov99/DL_24-25/refs/heads/main/DATA/shakespeare.txt [following]
--2025-06-02 16:49:52--  https://raw.githubusercontent.com/SimeonHristov99/DL_24-25/refs/heads/main/DATA/shakespeare.txt
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘shakespeare.txt’


2025-06-02 16:49:53 (98.3 MB/s) - ‘shakespeare.txt’ saved [1115394/1115394]



In [139]:
torch.Tensor(batch_size, block_size).type_as(torch.int64)

TypeError: type_as(): argument 'other' (position 1) must be Tensor, not torch.dtype

In [131]:
torch.manual_seed(42)
batch_size = 4

def get_batch(text_tensor, block_size = 8, batch_size = 4):
  idxs = torch.randint(0, len(text_tensor) - block_size, (batch_size,))
  Xs, Ys = torch.Tensor(batch_size, block_size).astype('int'), torch.Tensor(batch_size, block_size).astype('int')
  for i in range(batch_size):
    Xs[i] = text_tensor[idxs[i]:idxs[i]+block_size]
    Ys[i] = text_tensor[idxs[i]+1:idxs[i]+block_size+1]
  return Xs, Ys

Xs, Ys = get_batch(text_tokenised_tensor)
print(Xs)
print(Ys)

def get_sample(Xs, Ys, idx):
  X = Xs[idx // block_size][0:(idx % block_size + 1)]
  y = Ys[idx // block_size][(idx % block_size)]
  return X,y


for i in range(32):
  X, y = get_sample(Xs, Ys, i)
  print(f'Sample {i}: {X} , predict: {y}')


AttributeError: 'Tensor' object has no attribute 'astype'

# Task 8

In [126]:
Xs

tensor([[42.,  1., 58., 46., 59., 57.,  1., 21.],
        [54., 56., 47., 43., 57., 58., 11.,  0.],
        [49., 47., 52., 45., 12.,  1., 58., 46.],
        [58., 46., 53., 59., 58.,  1., 56., 43.]])

In [125]:
embedding_table = nn.Embedding(len(vocabulary), len(vocabulary))
logit = embedding_table(Xs)
print(logit.shape)


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)