# Imports

In [58]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch.nn import functional as F

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so that files stored
# on the drive can be opened through the local file system.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Task 1


In [5]:
file_path = '/content/drive/My Drive/DL_24-25/Data/shakespere.txt'

# Decode explicitly as UTF-8 so the loaded text does not depend on the
# platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
  text = file.read()

# Sanity check: corpus length in characters, then the first 1000 characters.
print(len(text))
print(text[:1000])


1115393
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for re

# Task 2

In [7]:
# Gather every distinct character of the corpus, then sort in place so that
# each character gets a stable position in the vocabulary.
vocabulary = list(set(text))
vocabulary.sort()
print(f'Vocabulary size: {len(vocabulary)}')
print(''.join(vocabulary))

Vocabulary size: 65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


# Task 3

In [40]:
# itos: integer id -> character, ids follow the order of `vocabulary`.
itos = {idx: char for idx, char in enumerate(vocabulary)}


# stoi: character -> integer id, the inverse of `itos`.
stoi = {char: idx for idx, char in enumerate(vocabulary)}

def token_to_id(token):
  """Encode a string as a list of integer ids, one per character.

  Each character of `token` is looked up in the module-level `stoi` table;
  a character that is missing from `stoi` raises KeyError.
  """
  ids = []
  for char in token:
    ids.append(stoi[char])
  return ids

def id_to_token(id):
  """Decode an iterable of integer ids back into a string.

  Each element of `id` is looked up in the module-level `itos` table and the
  resulting characters are concatenated in order.
  """
  chars = (itos[token_id] for token_id in id)
  return ''.join(chars)


# Round trip: encode a sample string, then print the ids followed by the
# text obtained from decoding them again (one item per line).
res = token_to_id('hi therre')
print(res, id_to_token(res), sep='\n')



[46, 47, 1, 58, 46, 43, 56, 56, 43]
hi therre


# Task 4

In [49]:
file_path = '/content/drive/My Drive/DL_24-25/Data/shakespere.txt'

# Decode explicitly as UTF-8 so the loaded text does not depend on the
# platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
  text = file.read()


# Encode the whole corpus character by character and store the ids as a
# 64-bit integer tensor.
text_tokenised = token_to_id(text)
text_tokenised_tensor = torch.tensor(text_tokenised, dtype=torch.int64)
print(f'Shape of result:{text_tokenised_tensor.shape}')  # Note: my text is 1 character shorter than yours
print(text_tokenised_tensor[:1000])

1115393
Shape of result:torch.Size([1115393])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56

# Task 5

In [51]:
# The first 90% of the token stream is used for training and the remaining
# tail for testing; the order of the tokens is preserved (no shuffling).
split_idx = int(0.9 * len(text_tokenised_tensor))
train_data, test_data = torch.tensor_split(text_tokenised_tensor, [split_idx])

print(f'Shape of dataset for training : {train_data.shape}')
print(f'Shape of dataset for testing :  {test_data.shape}')

Shape of dataset for training : torch.Size([1003853])
Shape of dataset for testing :  torch.Size([111540])


# Task 6

In [66]:
from random import sample
block_size = 8  # maximum number of context tokens in one sample


def get_sample(text_tensor, idx, block_size = 8):
  """Return the context ending at position `idx` and the token after it.

  Args:
    text_tensor: sliceable sequence of token ids (the notebook passes a
      1-D torch tensor).
    idx: position of the last token that belongs to the context.
    block_size: maximum number of tokens the context may contain.

  Returns:
    A pair (X, y) where X is text_tensor[start:idx + 1] holding at most
    `block_size` tokens and y is text_tensor[idx + 1], the token to predict.

  Raises:
    IndexError: if `idx` is negative or has no following token.
  """
  # Reject positions that would silently wrap around (negative idx) or that
  # have no target token after them.
  if idx < 0 or idx + 1 >= len(text_tensor):
    raise IndexError(
        f'idx {idx} is out of range for a sequence of length {len(text_tensor)}')
  # The context ends at idx inclusive, so it must start at idx + 1 - block_size
  # to hold exactly block_size tokens; starting at idx - block_size would
  # return block_size + 1 tokens.
  start = max(0, idx + 1 - block_size)
  X = text_tensor[start:idx + 1]
  y = text_tensor[idx + 1]
  return X, y


# Build one (context, target) pair for each of the first nine positions.
samples = [get_sample(train_data, pos, block_size) for pos in range(9)]

# Display the first eight pairs.
for i, (X, y) in enumerate(samples[:8]):
  print(f'Sample {i}: {X} , predict: {y}')


Sample 0: tensor([18]) , predict: 47
Sample 1: tensor([18, 47]) , predict: 56
Sample 2: tensor([18, 47, 56]) , predict: 57
Sample 3: tensor([18, 47, 56, 57]) , predict: 58
Sample 4: tensor([18, 47, 56, 57, 58]) , predict: 1
Sample 5: tensor([18, 47, 56, 57, 58,  1]) , predict: 15
Sample 6: tensor([18, 47, 56, 57, 58,  1, 15]) , predict: 47
Sample 7: tensor([18, 47, 56, 57, 58,  1, 15, 47]) , predict: 58


# Task 7

In [None]:
# Number of sequences drawn per batch.
batch_size = 4
# Seed torch's default random generator so sampled batches are reproducible.
torch.manual_seed(42)

def get_batch(text_tensor, block_size = 8):
  idx = torch.randint(0, len(text_tensor) - block_size, (batch_size,))
