<a href="https://colab.research.google.com/github/Sindhu213/Pytorch/blob/main/NLP/rnn_language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive",force_remount=True)

Mounted at /content/drive


In [2]:
%cd drive/My\ Drive/assets

/content/drive/My Drive/assets


In [71]:
import re
import torch
import numpy as np
from pathlib import Path
from torch import nn, Tensor
from typing import List,Tuple
from torchtext.vocab import vocab
from collections import Counter,OrderedDict
from torch.utils.data import DataLoader,Dataset

## Text Preprocessing

In [4]:
file_dir = Path('./AndThenThereWereNone.txt')
with open(file_dir, 'r') as file:
  text = file.read()

In [5]:
def get_tokenizer(sentence):
  tokenized = re.sub(r'[^\w\s]+',' ',sentence.lower())
  return tokenized.split()

In [6]:
counter = Counter(get_tokenizer(text))
sorted_by_freq = sorted(counter.items(),key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq)

Vocab = vocab(ordered_dict,min_freq=2,specials=["<unk>"])    
Vocab.set_default_index(0)

In [110]:
def onehotvector(list_of_ints):
  overall_list = np.zeros((len(list_of_ints),len(Vocab)))   
  for index, value in enumerate(list_of_ints):
    overall_list[index,value] = 1
  return overall_list

In [111]:
text_pipeline = lambda x: Vocab(get_tokenizer(x))  

In [112]:
class TextDataset:

  def __init__(self,input:List[str],seq_length:int):    
    self.input = input
    self.sl = seq_length

  def collate(self) -> Tuple[Tensor,Tensor]:
    container = []
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    for i in range(0,len(self.input)-self.sl,self.sl):
        data = torch.tensor(onehotvector(self.input[i:i+self.sl]), dtype=torch.float32, device=device)   
        label = torch.tensor(onehotvector(self.input[i+1:i+self.sl+1]), dtype=torch.float32, device=device)    
        container.append((data,label))

    return container

In [113]:
input_data = text_pipeline(text)

In [114]:
seq_length = 60       ## small seq_length due to long term dependency problem, might change 
batch_size = 32

torch.manual_seed(42)
input_dataset = TextDataset(input_data,seq_length).collate()
dataloader = DataLoader(input_dataset,batch_size=batch_size,drop_last=True,shuffle=True)  

In [115]:
x_batch, y_batch = next(iter(dataloader))
print(x_batch.shape,y_batch.shape)

torch.Size([32, 60, 2618]) torch.Size([32, 60, 2618])


## Model Definition

In [116]:
vocab_size = len(Vocab)
embed_dim = 128
rnn_hidden_size = 64

In [125]:
class LanguageModelling(nn.Module):

  def __init__(self,vocab_size,embed_dim,rnn_hidden_dim):
    super(LanguageModelling,self).__init__()
    self.vocab_size = vocab_size
    self.rnn_hidden_size = rnn_hidden_dim
    self.embedding = nn.Embedding(vocab_size, embed_dim)  
    self.lstm = nn.LSTM(embed_dim,rnn_hidden_size,batch_first=True)   
    self.fc = nn.Linear(rnn_hidden_size,vocab_size)
    self.softmax = nn.Softmax(dim=1)

  def forward(self,input,hidden,cell):
    input = input.to(torch.int64) 
    out = self.embedding(input)
    out,(hidden,cell) = self.lstm(out,(hidden,cell))      
    out = self.fc(out).reshape(self.vocab_size,-1) 
    out = self.softmax(out) 
    return out,hidden,cell

  def init_hidden_and_cell(self,batch_size):
    hidden = torch.zeros(1,batch_size,self.rnn_hidden_size)
    cell = torch.zeros(1,batch_size,self.rnn_hidden_size)
    return hidden, cell

In [126]:
hidden,cell = model.init_hidden_and_cell(32)
xbatch,ynatch = next(iter(dataloader))
out, hidden, cell = model(xbatch[:,0],hidden,cell)
print(out.shape)

torch.Size([32, 6853924])


## Model Training and Evaluation

In [118]:
model = LanguageModelling(vocab_size,embed_dim,rnn_hidden_size)
model

LanguageModelling(
  (embedding): Embedding(2618, 128)
  (lstm): LSTM(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=2618, bias=True)
  (softmax): Softmax(dim=1)
)

In [119]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001) 

In [120]:
num_epochs = 100
def train(data_iter):
  for epoch in range(num_epochs):
    hidden,cell = model.init_hidden_and_cell(batch_size)
    x_batch, y_batch = next(iter(data_iter))
    optimizer.zero_grad()
    loss = 0.0
    for c in range(seq_length):
      out,hidden,cell = model(x_batch[:,c],hidden,cell)
      loss += loss_fn(y_batch[:,c],out)     ##y_batch not of right shape
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 10 == 0:
      print(f'Epoch {epoch} loss: {loss:.4f}')

In [None]:
train(dataloader)

In [51]:
def generate_text(seeded_text,max_length):
  pass