<a href="https://colab.research.google.com/github/Sindhu213/Pytorch/blob/main/NLP/rnn_language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive",force_remount=True)

Mounted at /content/drive


In [2]:
%cd drive/My\ Drive/assets

/content/drive/My Drive/assets


In [32]:
import re
import torch
import numpy as np
from pathlib import Path
from torch import nn, Tensor
from typing import List,Tuple
from torchtext.vocab import vocab
from collections import Counter,OrderedDict
from torch.utils.data import DataLoader,Dataset

## Text Preprocessing

In [33]:
file_dir = Path('./AndThenThereWereNone.txt')
with open(file_dir, 'r') as file:
  text = file.read()

In [34]:
def get_tokenizer(sentence):
  tokenized = re.sub(r'[^\w\s]+',' ',sentence.lower())
  return tokenized.split()

In [35]:
counter = Counter(get_tokenizer(text))
sorted_by_freq = sorted(counter.items(),key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq)

Vocab = vocab(ordered_dict,min_freq=2)    
Vocab.set_default_index(0)

In [36]:
text_pipeline = lambda x: Vocab(get_tokenizer(x))  

In [37]:
class TextDataset:

  def __init__(self,input:List[str],seq_length:int):    
    self.input = input
    self.sl = seq_length

  def collate(self) -> Tuple[Tensor,Tensor]:
    container = []
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    for i in range(0,len(self.input)-self.sl,self.sl):
        data = torch.tensor(self.input[i:i+self.sl], device=device)   
        label = torch.tensor(self.input[i+1:i+self.sl+1], device=device)    
        container.append((data,label))

    return container

In [38]:
input_data = text_pipeline(text)

In [49]:
seq_length = 100       
batch_size = 32

torch.manual_seed(42)
input_dataset = TextDataset(input_data,seq_length).collate()
dataloader = DataLoader(input_dataset,batch_size=batch_size,drop_last=True,shuffle=True)  

## Model Definition

In [50]:
vocab_size = len(Vocab)
embed_dim = 128
rnn_hidden_size = 64

In [81]:
class LanguageModelling(nn.Module):

  def __init__(self,vocab_size,embed_dim,rnn_hidden_dim):
    super(LanguageModelling,self).__init__()
    self.rnn_hidden_size = rnn_hidden_dim
    self.embedding = nn.Embedding(vocab_size, embed_dim)  
    self.lstm = nn.LSTM(embed_dim,rnn_hidden_size,batch_first=True)   
    self.fc = nn.Linear(rnn_hidden_size,vocab_size)

  def forward(self,input,hidden):
    """
    INPUT:
        input: [batch_size,seq_length]
        hidden: [1,batch_size,rnn_hidden_size]

    RETURNS:
        out: [batch_size,seq_length,vocab_size]
        hidden: [1,batch_size,rnn_hidden_size]
    """
    out = self.embedding(input)   
    out,hidden = self.lstm(out,hidden)   
    out = self.fc(out)  
    return out, hidden

  def init_hidden(self,batch_size):
    hidden = torch.zeros(1,batch_size,self.rnn_hidden_size)    
    cell = torch.zeros(1,batch_size,self.rnn_hidden_size)
    return hidden, cell

  def detach_hidden(self, hidden):
    hidden, cell = hidden
    hidden = hidden.detach()
    cell = cell.detach()
    return hidden, cell

## Model Training and Evaluation

In [82]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LanguageModelling(vocab_size,embed_dim,rnn_hidden_size)
model.to(device)

LanguageModelling(
  (embedding): Embedding(2617, 128)
  (lstm): LSTM(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=2617, bias=True)
)

In [90]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.005) 

In [93]:
# x_batch: [batch_size,seq_length]
# y_batch: [batch_size,seq_length]

def train(data_iter):
  epoch_loss =  0.0
  model.train()
  hidden = model.init_hidden(batch_size)
  for x_batch,y_batch in dataloader:   
    optimizer.zero_grad()
    hidden = model.detach_hidden(hidden)
    prediction, hidden = model(x_batch,hidden)
    prediction = prediction.reshape(batch_size*seq_length,vocab_size)
    y_batch = y_batch.reshape(-1)
    loss = loss_fn(prediction,y_batch)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()*seq_length
      
  return epoch_loss/seq_length

In [94]:
num_epochs = 20
for epoch in range(num_epochs):
  training_loss = train(dataloader)
  print(f"Epoch: {epoch}, Training loss: {training_loss: .4f}")

Epoch: 0, Training loss:  82.2165
Epoch: 1, Training loss:  79.9211
Epoch: 2, Training loss:  78.4385
Epoch: 3, Training loss:  77.1364
Epoch: 4, Training loss:  75.9658
Epoch: 5, Training loss:  74.8989
Epoch: 6, Training loss:  73.9450
Epoch: 7, Training loss:  72.9598
Epoch: 8, Training loss:  72.0118
Epoch: 9, Training loss:  71.1567
Epoch: 10, Training loss:  70.2972
Epoch: 11, Training loss:  69.4304
Epoch: 12, Training loss:  68.6970
Epoch: 13, Training loss:  67.9010
Epoch: 14, Training loss:  67.0986
Epoch: 15, Training loss:  66.3437
Epoch: 16, Training loss:  65.6501
Epoch: 17, Training loss:  64.9205
Epoch: 18, Training loss:  64.2406
Epoch: 19, Training loss:  63.5496


In [95]:
torch.manual_seed(42)
def generate_text(seeded_text,text_len=10):
    model.eval()
    tokens = text_pipeline(seeded_text)
    batch_size = 1
    hidden = model.init_hidden(batch_size)
    with torch.no_grad():
        for i in range(text_len):
            input = torch.LongTensor([tokens])
            prediction, hidden = model(input, hidden)
            probs = torch.softmax(prediction[:, -1], dim=-1) 
            prediction = torch.multinomial(probs, num_samples=1).item()
            tokens.append(prediction)

    itos = Vocab.get_itos()
    tokens = [itos[i] for i in tokens]
    return tokens

In [96]:
generate_text("The man",10)

['the',
 'man',
 'who',
 'now',
 'just',
 'there',
 'were',
 'no',
 'one',
 'of',
 'them',
 'er']