<a href="https://colab.research.google.com/github/Sindhu213/Pytorch/blob/main/NLP/rnn_language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored on it.
# force_remount=True requests a fresh mount even when one is already present.
from google.colab import drive
drive.mount("/content/drive",force_remount=True)

Mounted at /content/drive


In [None]:
%cd drive/My\ Drive/assets

/content/drive/My Drive/assets


In [None]:
import re
import torch
from pathlib import Path
from torch import nn, Tensor
from typing import List,Tuple
from torchtext.vocab import vocab
from collections import Counter,OrderedDict
from torch.utils.data import DataLoader,Dataset

## Text Preprocessing

In [None]:
# Read the whole corpus into memory as one string.
file_dir = Path('./AndThenThereWereNone.txt')
with file_dir.open('r') as file:
  text = file.read()

In [None]:
def get_tokenizer(sentence):
  """Lower-case `sentence`, blank out punctuation and split it into tokens.

  Every run of characters that is neither a word character nor whitespace is
  replaced by a single space before splitting on whitespace, so underscores
  and digits stay inside tokens while apostrophes and hyphens split them.

  Returns:
    A list of lower-cased token strings (empty for blank input).
  """
  lowered = sentence.lower()
  without_punctuation = re.sub(r'[^\w\s]+',' ',lowered)
  tokens = without_punctuation.split()
  return tokens

In [None]:
# Count every token in the corpus and order the counts from most to least frequent.
counter = Counter(get_tokenizer(text))
sorted_by_freq = counter.most_common()
ordered_dict = OrderedDict(sorted_by_freq)

# Tokens seen fewer than 2 times are excluded from the vocabulary.   ## might add <eos>
Vocab = vocab(ordered_dict, min_freq=2, specials=["<unk>"])
# Out-of-vocabulary lookups return index 0, which is where "<unk>" sits when
# specials are inserted first (the torchtext default).
Vocab.set_default_index(0)

In [None]:
def text_pipeline(x):
  """Tokenize the raw string `x` and look every token up in `Vocab`."""
  tokens = get_tokenizer(x)
  return Vocab(tokens)

In [None]:
class TextDataset:
  """Cuts a flat sequence of token ids into fixed-length next-token training pairs.

  Args:
    input: Token ids of the whole corpus, in reading order.
    chunk_size: Number of tokens in every input/label window.
  """

  def __init__(self,input:List[int],chunk_size:int):
    self.input = input
    self.b_S = chunk_size

  def collate(self) -> List[Tuple[Tensor,Tensor]]:
    """Build the (input, label) windows.

    Windows start every `chunk_size` tokens, so inputs do not overlap. Each
    label is its input shifted one token to the right. Trailing tokens that
    cannot fill a complete window plus its shifted label are dropped, so a
    sequence of `chunk_size` tokens or fewer yields an empty list.

    Returns:
      A list of `(data, label)` tensor pairs, each of length `chunk_size`,
      placed on CUDA when it is available and on the CPU otherwise.
    """
    container = []
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    for i in range(0,len(self.input)-self.b_S,self.b_S):
      # Token ids must be integers: nn.Embedding only accepts integer index
      # tensors and nn.CrossEntropyLoss needs int64 class-index targets.
      data = torch.tensor(self.input[i:i+self.b_S],dtype=torch.long,device=device)
      label = torch.tensor(self.input[i+1:i+self.b_S+1],dtype=torch.long,device=device)
      container.append((data,label))

    return container

In [None]:
chunk_size = 5    # tokens per training window
batch_size = 3    # windows per mini-batch
input_data = text_pipeline(text)

torch.manual_seed(42)
# collate() returns a plain list of (input, label) pairs, which DataLoader
# batches directly; incomplete final batches are dropped.
input_dataset = TextDataset(input=input_data, chunk_size=chunk_size).collate()
dataloader = DataLoader(
  dataset=input_dataset,
  batch_size=batch_size,
  shuffle=True,
  drop_last=True,
)

In [None]:
## sanity check: print the first shuffled batch only

torch.manual_seed(42)
for batch in dataloader:
  input, label = batch
  print("Input: ",input)
  print("Label: ",label)
  break

Input:  tensor([[1.3400e+02, 6.0000e+00, 9.3000e+01, 4.0000e+00, 0.0000e+00],
        [3.8600e+02, 3.0000e+01, 1.3900e+02, 5.5900e+02, 1.2000e+01],
        [1.0000e+00, 1.6100e+02, 1.9000e+01, 1.9970e+03, 2.4900e+02]])
Label:  tensor([[   6.,   93.,    4.,    0.,   86.],
        [  30.,  139.,  559.,   12.,  174.],
        [ 161.,   19., 1997.,  249.,    2.]])


## Model Definition

In [None]:
## Model hyperparameters -- likely to change
# Number of entries in the vocabulary built above.
vocab_size = len(Vocab)
# Passed to the model as its embedding dimension.
embed_dim = 128
# Passed to the model as the LSTM hidden size.
rnn_hidden_size = 64
# Passed to the model as the width of its fully connected layer.
fc_hidden_size = 64

In [None]:
class LanguageModelling(nn.Module):
  """LSTM language model that scores every vocabulary entry at each position.

  Pipeline: embedding -> LSTM -> hidden linear layer + ReLU -> linear
  projection onto the vocabulary.

  Args:
    vocab_size: Number of entries in the vocabulary; also the size of the
      last dimension of the returned scores.
    embed_dim: Size of each token embedding.
    rnn_hidden_dim: Size of the LSTM hidden state.
    fc_hidden_dim: Width of the hidden fully connected layer.
  """

  def __init__(self,vocab_size,embed_dim,rnn_hidden_dim,fc_hidden_dim,):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embed_dim)
    self.lstm = nn.LSTM(embed_dim,rnn_hidden_dim,batch_first=True)
    self.fc = nn.Linear(rnn_hidden_dim,fc_hidden_dim)
    self.relu = nn.ReLU()
    # The last layer must emit one score per vocabulary entry; otherwise the
    # vocabulary-index targets fall outside the range of predicted classes.
    self.out = nn.Linear(fc_hidden_dim,vocab_size)

  def forward(self,input):
    """Score the next token at every position of `input`.

    Args:
      input: Token ids of shape (batch, seq_len). Non-integer tensors are
        cast to int64 because nn.Embedding only accepts integer indices.

    Returns:
      A tuple `(logits, hidden, cell)`: `logits` has shape
      (batch, seq_len, vocab_size) and holds raw, unnormalised scores;
      `hidden` and `cell` are the final LSTM states.
    """
    ## initial hidden state and cell state default to zero vector
    out = self.embedding(input.long())
    out,(hidden,cell) = self.lstm(out)
    out = self.relu(self.fc(out))
    # No softmax here: nn.CrossEntropyLoss applies log-softmax itself, and the
    # previous Softmax(dim=1) normalised over the time axis of a batch-first
    # tensor instead of over the classes.
    out = self.out(out)
    return out, hidden, cell

## Model Training and Evaluation

In [None]:
# Build the model from the hyperparameters above and display its layer summary.
model = LanguageModelling(
  vocab_size=vocab_size,
  embed_dim=embed_dim,
  rnn_hidden_dim=rnn_hidden_size,
  fc_hidden_dim=fc_hidden_size,
)
model

LanguageModelling(
  (embedding): Embedding(2618, 128)
  (lstm): LSTM(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=64, bias=True)
  (softmax): Softmax(dim=1)
)

In [None]:
# Training criterion and optimizer over all model parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
def train(data_iter):
  """Placeholder for the training loop -- TODO: not implemented yet.

  Currently ignores `data_iter` and returns None.
  """