<a href="https://colab.research.google.com/github/sindhu213/Pytorch/blob/main/NLP/rnn_language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the corpus
# stored on Drive can be read like a local file. force_remount=False means an
# existing mount is not forcibly re-created.
from google.colab import drive
drive.mount("/content/drive",force_remount=False)

Mounted at /content/drive


In [3]:
import re
import math
import torch
from pathlib import Path
from torch import nn, Tensor
from torchtext.vocab import vocab
from collections import Counter,OrderedDict
from torch.utils.data import DataLoader

## Text Preprocessing

In [4]:
# Relative path to the raw text corpus on the mounted Drive (resolved against
# the current working directory).
file_dir = Path('drive/My Drive/assets/AndThenThereWereNone.txt')

# Load the entire corpus into memory as a single string.
with file_dir.open('r') as file:
  TEXT = file.read()

In [5]:
def get_tokenizer(sentence):
  """Split a string into lowercase word tokens.

  Every run of characters that is neither a word character (letters, digits,
  underscore) nor whitespace is replaced by a single space before splitting,
  so "can't" becomes ["can", "t"].

  Args:
      sentence: text to tokenize.

  Returns:
      list[str]: the tokens, in order of appearance.
  """
  lowered = sentence.lower()
  # Punctuation becomes a separator rather than being deleted outright.
  without_punctuation = re.sub(r'[^\w\s]+',' ',lowered)
  return without_punctuation.split()

In [6]:
# Token -> number of occurrences in the corpus.
counter = Counter(get_tokenizer(TEXT))

# (token, count) pairs from most to least frequent; equal counts keep their
# first-seen order because the underlying sort is stable.
sorted_by_freq = counter.most_common()
ordered_dict = OrderedDict(sorted_by_freq)

# Build the vocabulary from the frequency-ordered mapping; min_freq=1 keeps
# every token that occurs at least once.
Vocab = vocab(ordered_dict,min_freq=1)
# Unknown tokens resolve to index 0. No special tokens are registered above,
# so index 0 is an ordinary corpus token rather than a dedicated <unk>.
Vocab.set_default_index(0)

In [7]:
def text_pipeline(x):
  """Tokenize the string x and map each token to its vocabulary index (list of ints)."""
  return Vocab(get_tokenizer(x))

In [8]:
def build_dataset(input:str,seq_length:int,stride:int):
  """Cut the encoded text into fixed-size, possibly overlapping windows.

  Args:
      input: raw text; it is encoded with text_pipeline.
      seq_length: number of model input tokens per window. Each window holds
          seq_length+1 ids so that inputs and shifted targets can both be
          sliced from it.
      stride: distance, in tokens, between the starts of consecutive windows.

  Returns:
      list[Tensor]: int64 tensors of length seq_length+1, created on the GPU
      when CUDA is available and on the CPU otherwise.
  """
  token_ids = text_pipeline(input)
  target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  window = seq_length + 1
  # Start offsets stop early enough that every window is full length.
  starts = range(0,len(token_ids)-seq_length,stride)
  return [
    torch.tensor(token_ids[start:start+window],device=target_device,dtype=torch.int64)
    for start in starts
  ]

In [9]:
# Seed PyTorch's global RNG before anything random is created or drawn.
torch.manual_seed(42)

# Number of tokens fed to the model per window (each stored window has SEQ_LENGTH+1 ids).
SEQ_LENGTH = 100       
# Number of windows per mini-batch.
BATCH_SIZE = 32
# Offset between consecutive window starts; being smaller than SEQ_LENGTH, neighbouring windows overlap.
STRIDE = 64

# List of int64 tensors, one per window.
DATA = build_dataset(TEXT,SEQ_LENGTH,STRIDE)
# shuffle=True randomises window order each pass; drop_last=True discards a
# final partial batch so every batch has exactly BATCH_SIZE rows.
TRAIN_DL = DataLoader(DATA,batch_size=BATCH_SIZE,drop_last=True,shuffle=True)  

In [10]:
# Quick sanity summary of the dataset: batches per epoch, corpus size in
# tokens, and vocabulary size.
for stat_name, stat_value in (
  ("Length of TRAIN_DL: ",len(TRAIN_DL)),
  ("Total no. of tokens: ",len(get_tokenizer(TEXT))),
  ("Total no. of unique tokens: ",len(Vocab)),
):
  print(stat_name,stat_value)

Length of TRAIN_DL:  26
Total no. of tokens:  55200
Total no. of unique tokens:  5304


## Model Definition and Initialization

In [25]:
class LanguageModelling(nn.Module):
  """LSTM language model: embedding -> stacked LSTM -> linear projection to vocabulary logits.

  Args:
      vocab_size: number of distinct token ids (rows of the embedding table
          and size of the output layer).
      embed_dim: width of the token embeddings.
      num_layers: number of stacked LSTM layers.
      rnn_hidden_dim: width of the LSTM hidden and cell states.
      tie_weights: if True, the output projection shares its weight matrix
          with the embedding. Requires rnn_hidden_dim == embed_dim.
  """

  def __init__(self,vocab_size,embed_dim,num_layers,rnn_hidden_dim,tie_weights=False):
    super().__init__()
    self.num_layers = num_layers
    self.rnn_hidden_dim = rnn_hidden_dim
    # Retained for backward compatibility only. The module itself is NOT moved
    # to this device here, so it must not be used to decide where tensors go.
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    self.embedding = nn.Embedding(vocab_size, embed_dim)
    self.lstm = nn.LSTM(embed_dim,rnn_hidden_dim,num_layers,batch_first=True)
    self.fc = nn.Linear(rnn_hidden_dim,vocab_size)
    if tie_weights:
      # fc.weight is [vocab_size,rnn_hidden_dim] and embedding.weight is
      # [vocab_size,embed_dim]; they can only be shared when the widths match.
      assert rnn_hidden_dim == embed_dim, "rnn_hidden_dim must be equal to embed_dim if tie_weights is set to True."
      self.fc.weight = self.embedding.weight


  def forward(self,input,hidden,cell):
    """
    INPUT:
        input: [batch_size,seq_length]
        hidden: [num_layers,batch_size,rnn_hidden_dim]
        cell: [num_layers,batch_size,rnn_hidden_dim]

    RETURNS:
        out: [batch_size,seq_length,vocab_size]
        hidden: [num_layers,batch_size,rnn_hidden_dim]
        cell: [num_layers,batch_size,rnn_hidden_dim]
    """
    # out: [batch_size,seq_length,embed_dim]
    out = self.embedding(input)

    # out: [batch_size,seq_length,rnn_hidden_dim]
    # hidden: [num_layers,batch_size,rnn_hidden_dim]
    # cell: [num_layers,batch_size,rnn_hidden_dim]
    out,(hidden,cell) = self.lstm(out,(hidden,cell))

    # out: [batch_size,seq_length,vocab_size]
    out = self.fc(out)
    return out,hidden,cell


  def init_hidden_and_cell(self,batch_size):
    """Return zero-filled initial (hidden, cell) states for a batch.

    The states are allocated on the same device and with the same dtype as the
    module's own parameters, so they stay consistent with the model wherever
    it has been moved, without relying on any variable defined outside the
    class.

    Args:
        batch_size: number of sequences in the batch.

    Returns:
        (hidden, cell): two tensors of shape [num_layers,batch_size,rnn_hidden_dim].
    """
    reference = self.fc.weight
    shape = (self.num_layers,batch_size,self.rnn_hidden_dim)
    hidden = torch.zeros(shape,device=reference.device,dtype=reference.dtype)
    cell = torch.zeros(shape,device=reference.device,dtype=reference.dtype)
    return hidden, cell

In [26]:
# Model hyperparameters.
# Number of entries in the vocabulary; sizes the embedding table and the output layer.
VOCAB_SIZE = len(Vocab)
# Width of the token embeddings.
EMBED_DIM = 128
# Width of the LSTM hidden/cell states (equal to EMBED_DIM, as weight tying would require).
RNN_HIDDEN_DIM = 128
# Number of stacked LSTM layers.
NUM_LAYERS = 2
# Whether the output projection shares its weights with the embedding.
TIE_WEIGHTS = False

In [127]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the network from the hyperparameters defined above.
model = LanguageModelling(
  vocab_size=VOCAB_SIZE,
  embed_dim=EMBED_DIM,
  num_layers=NUM_LAYERS,
  rnn_hidden_dim=RNN_HIDDEN_DIM,
  tie_weights=TIE_WEIGHTS,
)
# Move the parameters to the chosen device; as the cell's last expression this
# also displays the module summary.
model.to(device)

LanguageModelling(
  (embedding): Embedding(5304, 128)
  (lstm): LSTM(128, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=5304, bias=True)
)

In [128]:
def count_parameters(model):
  """Return the total number of scalar values held in the model's trainable parameters.

  Parameters whose requires_grad flag is False are not counted.
  """
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()
  return total

# Report the model size. The "{: ,}" spec inserts thousands separators and a
# leading space in place of a sign for non-negative numbers.
print("Total no. of trainable parameters: {: ,}".format(count_parameters(model)))

Total no. of trainable parameters:  1,627,320


## Model Training

In [129]:
# Cross-entropy between predicted logits and next-token ids, with default
# settings (the loss is averaged over all tokens it is given).
loss_fn = nn.CrossEntropyLoss()
# Adam over every model parameter, learning rate 0.005.
optimizer = torch.optim.Adam(model.parameters(),lr=0.005) 

In [130]:
def train(data_iter):
  """Run one optimisation pass over data_iter and return the average loss.

  Uses the notebook-level `model`, `optimizer` and `loss_fn`. Batch size,
  sequence length and vocabulary size are read from the tensors themselves, so
  the function works with any loader, including one whose last batch is
  smaller or that does not support len().

  Args:
      data_iter: iterable of integer tensors shaped [batch_size,seq_length+1].
          Columns [:-1] are the model inputs and columns [1:] the targets.

  Returns:
      float: mean over batches of (mean per-token loss * seq_length), i.e. the
      average loss summed over one sequence.

  Raises:
      ValueError: if data_iter yields no batches.
  """
  model.train()
  epoch_loss = 0.0
  num_batches = 0
  hidden = cell = None

  # batch: [batch_size,seq_length+1]
  for batch in data_iter:
    batch_size = batch.size(0)
    seq_length = batch.size(1) - 1

    if hidden is None or hidden.size(1) != batch_size:
      # First batch, or a batch of a different size: start from zero states.
      # hidden, cell: [num_layers,batch_size,rnn_hidden_dim]
      hidden,cell = model.init_hidden_and_cell(batch_size)
    else:
      # Carry the state values over but cut the graph, so backpropagation
      # stops at the batch boundary.
      # NOTE(review): if data_iter is shuffled, consecutive batches are not
      # continuations of each other and the carried state is unrelated
      # context; consider resetting the state for every batch instead.
      hidden = hidden.detach()
      cell = cell.detach()

    optimizer.zero_grad()
    # prediction: [batch_size,seq_length,vocab_size]
    # hidden: [num_layers,batch_size,rnn_hidden_dim]
    # cell: [num_layers,batch_size,rnn_hidden_dim]
    prediction, hidden, cell = model(batch[:,:-1],hidden,cell)

    # prediction: [batch_size*seq_length,vocab_size]
    prediction = prediction.reshape(-1,prediction.size(-1))

    # label: [batch_size*seq_length]
    label = batch[:,1:].reshape(-1)

    # loss: scalar when loss_fn averages over tokens (the nn.CrossEntropyLoss default)
    loss = loss_fn(prediction,label)
    loss.backward()
    optimizer.step()
    # Rescale the per-token mean to a per-sequence total.
    epoch_loss += loss.item()*seq_length
    num_batches += 1

  if num_batches == 0:
    raise ValueError("data_iter yielded no batches; cannot compute an average loss.")
  return epoch_loss/num_batches

In [137]:
# Number of full passes over the training loader.
NUM_EPOCHS = 100
for epoch in range(NUM_EPOCHS):
  training_loss = train(TRAIN_DL)
  # Only report every fifth epoch to keep the log short.
  if epoch%5 != 0:
    continue
  print(f"EPOCH: {epoch}")
  print(f"-------TRAIN_LOSS: {training_loss: .4f}")

EPOCH: 0
-------TRAIN_LOSS:  28.8596
EPOCH: 5
-------TRAIN_LOSS:  23.9844
EPOCH: 10
-------TRAIN_LOSS:  23.1942
EPOCH: 15
-------TRAIN_LOSS:  22.6879
EPOCH: 20
-------TRAIN_LOSS:  21.9664
EPOCH: 25
-------TRAIN_LOSS:  21.8178
EPOCH: 30
-------TRAIN_LOSS:  21.3195
EPOCH: 35
-------TRAIN_LOSS:  20.1657
EPOCH: 40
-------TRAIN_LOSS:  19.8128
EPOCH: 45
-------TRAIN_LOSS:  19.4703
EPOCH: 50
-------TRAIN_LOSS:  20.1425
EPOCH: 55
-------TRAIN_LOSS:  19.1655
EPOCH: 60
-------TRAIN_LOSS:  19.6734
EPOCH: 65
-------TRAIN_LOSS:  17.5920
EPOCH: 70
-------TRAIN_LOSS:  17.8701
EPOCH: 75
-------TRAIN_LOSS:  16.9511
EPOCH: 80
-------TRAIN_LOSS:  17.0411
EPOCH: 85
-------TRAIN_LOSS:  18.2822
EPOCH: 90
-------TRAIN_LOSS:  17.6315
EPOCH: 95
-------TRAIN_LOSS:  21.2692


## Generation

In [138]:
def generate(seeded_text,temperature,max_len=10):
  """Sample a continuation of seeded_text from the notebook-level `model`.

  The seed is run through the LSTM once; after that only the newly sampled
  token is fed back in, together with the carried hidden/cell state, so each
  token is processed exactly once.

  Args:
      seeded_text: prompt string, or None to use the placeholder "<bos>".
          NOTE(review): the tokenizer strips "<" and ">" and the vocabulary
          registers no special tokens, so unless "bos" occurs in the corpus
          this placeholder resolves to the vocabulary's default index and the
          output starts with whatever token lives there.
      temperature: positive softmax temperature; values below 1 sharpen the
          distribution and values above 1 flatten it.
      max_len: number of tokens to sample.

  Returns:
      list[str]: the seed tokens followed by the sampled tokens.

  Raises:
      ValueError: if temperature is not positive, or the seed contains no tokens.
  """
  if temperature <= 0:
    raise ValueError("temperature must be a positive number.")

  model.eval()
  if seeded_text is None:
    seeded_text = "<bos>"

  tokens = text_pipeline(seeded_text)
  if not tokens:
    raise ValueError("seeded_text must contain at least one token.")

  # Follow the model's actual placement instead of re-deriving it from CUDA
  # availability.
  device = next(model.parameters()).device
  hidden, cell = model.init_hidden_and_cell(1)
  hidden, cell = hidden.to(device), cell.to(device)

  # input: [1,len(seed)] for the first step, [1,1] afterwards
  input = torch.tensor(tokens,device=device,dtype=torch.int64).unsqueeze(0)

  with torch.no_grad():
    for _ in range(max_len):
      # prediction: [1,input_length,vocab_size]
      prediction, hidden, cell = model(input, hidden, cell)
      # Only the logits at the last position predict the next token.
      probs = torch.softmax(prediction[:, -1]/temperature, dim=-1)
      # next_token: [1,1]
      next_token = torch.multinomial(probs, num_samples=1)
      tokens.append(next_token.item())
      # The state already summarises everything seen so far, so only the new
      # token is fed on the next step.
      input = next_token

  # Fetch the index-to-token table once rather than once per token.
  itos = Vocab.get_itos()
  return [itos[i] for i in tokens]

In [140]:
' '.join(generate("they thought",temperature=0.5))

'they thought to him what sort of thing can t happen it'

In [148]:
' '.join(generate(None,temperature=1.5))

'the same boat we might light a bonfire tonight lombard said'

In [182]:
' '.join(generate("we must",temperature=2))

'we must enjoy a sensible young man where do we warn quickly'