In [1]:
from functools import partial
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch import optim

from torchtext import datasets
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Fetch the raw IMDB train and test splits from torchtext.
train_data, eval_data = (
    datasets.IMDB(split=split_name) for split_name in ("train", "test")
)

# The splits are iterable-style datapipes rather than indexable datasets.
print(type(train_data))

<class 'torch.utils.data.datapipes.iter.sharding.ShardingFilterIterDataPipe'>


In [4]:
# Sentiment analysis needs both the label and the review text, so the iterable
# datapipes are converted into map-style datasets. Each item is a
# (label, review) pair, i.e. an int together with a str.
train_map_data = to_map_style_dataset(train_data)
print(type(train_map_data))
test_map_data = to_map_style_dataset(eval_data)

<class 'torchtext.data.functional.to_map_style_dataset.<locals>._MapStyleDataset'>


In [5]:
# torchtext's built-in "basic_english" tokenizer, used to split reviews into tokens.
tokenizer = get_tokenizer("basic_english", language="en")


In [6]:
#build vocab

def build_vocab(train_map_data,tokenizer):
    """Build a torchtext vocabulary from the review texts of a labelled dataset.

    Args:
        train_map_data: iterable of (label, review) pairs; the labels are ignored.
        tokenizer: callable that splits one review string into a list of tokens.

    Returns:
        The vocabulary built from every token seen at least twice, plus the
        special tokens "<unk>", "<eos>" and "<pad>". Unknown tokens look up to
        the index of "<unk>".
    """
    # Only the text matters for the vocabulary, so the labels are dropped.
    review_texts = [text for _, text in train_map_data]
    special_tokens = ["<unk>", "<eos>", "<pad>"]

    vocabulary = build_vocab_from_iterator(
        (tokenizer(text) for text in review_texts),
        min_freq=2,
        specials=special_tokens,
    )
    # Any out-of-vocabulary token resolves to the "<unk>" entry.
    vocabulary.set_default_index(vocabulary["<unk>"])
    return vocabulary

In [7]:
# The vocabulary is fitted on the training split only.
vocab = build_vocab(train_map_data, tokenizer)

In [8]:
# Number of entries in the vocabulary; later used as the embedding table size.
vocab_size = len(vocab)
print(vocab_size)

51719


In [9]:
# Hyper-parameters shared by the data pipeline and the model.
max_seq_len = 300  # reviews are truncated to this many token ids
max_norm = 1
embed_dim = 300
batch_size = 32


def text_pipeline(x):
    """Tokenize one raw review string and map each token to its vocabulary id."""
    return vocab(tokenizer(x))

In [23]:
def collate_data(batch, text_pipeline, max_len=None, pad_id=2, eos_id=1):
    """Turn a list of (label, review) pairs into padded id and label tensors.

    Every review is converted to token ids with ``text_pipeline``, truncated to
    ``max_len`` ids, terminated with ``eos_id`` and right-padded with ``pad_id``
    so that each row holds exactly ``max_len + 1`` ids.

    Args:
        batch: iterable of (label, review) pairs, as produced by the DataLoader.
        text_pipeline: callable mapping one review to a sequence of int token ids.
        max_len: maximum number of review ids kept before the end-of-sequence
            id. ``None`` (the default) uses the module-level ``max_seq_len``.
        pad_id: id used for right-padding (default 2).
        eos_id: id appended after the (possibly truncated) review (default 1).

    Returns:
        (reviews, targets): ``reviews`` is a long tensor of shape
        ``(len(batch), max_len + 1)``; ``targets`` is a long tensor holding the
        labels unchanged, one per review.
    """
    if max_len is None:
        max_len = max_seq_len
    # The row width is derived from max_len instead of a hard-coded 301, so the
    # truncation length and the padded width can never drift apart.
    row_len = max_len + 1

    reviews, targets = [], []
    for label, review in batch:
        token_ids = list(text_pipeline(review))[:max_len]
        token_ids.append(eos_id)
        token_ids.extend([pad_id] * (row_len - len(token_ids)))
        reviews.append(token_ids)
        targets.append(label)

    reviews = torch.tensor(reviews, dtype=torch.long)
    targets = torch.tensor(targets, dtype=torch.long)
    return reviews, targets

In [24]:
# Create the train/eval DataLoaders; only the training split is shuffled.
traindl, evaldl = (
    DataLoader(
        split_data,
        batch_size=batch_size,
        shuffle=do_shuffle,
        collate_fn=partial(collate_data, text_pipeline=text_pipeline),
    )
    for split_data, do_shuffle in ((train_map_data, True), (test_map_data, False))
)

In [31]:
#architecture of GRU and Neural Network

class SNN(nn.Module):
  """GRU-based two-class sentiment classifier over right-padded token ids.

  Pipeline: embedding -> dropout -> single-layer GRU -> linear layer applied
  to the GRU's final hidden state.

  Args:
    input_size: number of rows in the embedding table (the vocabulary size).
    embed_size: dimensionality of each token embedding.
    hidden_size: number of hidden units in the GRU.
    pad_idx: token id used for right-padding (default 2). Padded positions
      are skipped by the GRU.
  """

  def __init__(self, input_size, embed_size, hidden_size, pad_idx=2):
    super().__init__()
    self.pad_idx=pad_idx
    self.embedding=nn.Embedding(input_size,embed_size)  # one embed_size vector per vocabulary id
    self.rnn=nn.GRU(embed_size,hidden_size,batch_first=True)  # single-layer, unidirectional
    self.dropout=nn.Dropout(0.2)  # regularisation on the embeddings
    self.out=nn.Linear(in_features=hidden_size,out_features=2)  # two logits: negative / positive

  def forward(self,x):
    """Return class logits of shape (batch_size, 2).

    Args:
      x: long tensor of token ids, shape (batch_size, seq_len), where
        padding (``pad_idx``) only appears at the end of each row.
    """
    # Real (non-pad) length of every row. A row made only of padding is
    # treated as length 1 because packing rejects zero-length sequences.
    # pack_padded_sequence requires the lengths to live on the CPU.
    lengths=(x!=self.pad_idx).sum(dim=1).clamp(min=1).cpu()

    x=self.embedding(x)  # (batch_size, seq_len, embed_size)
    x=self.dropout(x)

    # Packing makes the GRU stop at each row's last real token. Without it the
    # final hidden state is taken after running over all the padding, so the
    # prediction would depend on how much padding a review received.
    packed=nn.utils.rnn.pack_padded_sequence(
        x,lengths,batch_first=True,enforce_sorted=False
    )
    _, hidden=self.rnn(packed)  # hidden: (num_layers * num_directions, batch_size, hidden_size)

    # Last layer's final state, without mutating the GRU output in place.
    logits=self.out(hidden[-1])
    return logits


In [32]:
embed_size=300
hidden_size=100

sentinn=SNN(vocab_size,embed_size,hidden_size).to(device)
# No ignore_index here: that option filters *target class labels*, not token
# ids. The targets are the sentiment labels (0/1 after the shift done in the
# training loop), so passing the pad token id would only risk silently
# dropping a real class.
loss_fn=nn.CrossEntropyLoss().to(device)
lr=0.001
opt=optim.Adam(params=sentinn.parameters(), lr=lr)

In [41]:
def train_one_epoch(log_every=1):
    """Run one optimisation pass over ``traindl`` and return its metrics.

    Relies on the module-level ``sentinn``, ``traindl``, ``loss_fn``, ``opt``
    and ``device``.

    Args:
        log_every: print the running loss/accuracy every this many batches.
            The default of 1 prints after every batch; 0 or None disables
            the progress output.

    Returns:
        (epoch_loss, epoch_acc): the mean of the per-batch losses and the
        accuracy in percent over all samples seen. Both are 0.0 when
        ``traindl`` yields no batches.
    """
    sentinn.train()
    track_loss = 0.0
    num_corrects = 0
    total_samples = 0
    # Initialised up front so an empty loader returns zeros instead of
    # raising UnboundLocalError at the end of the function.
    running_loss = 0.0
    running_acc = 0.0

    for i, (reviews_ids, sentiments) in enumerate(traindl):
        reviews_ids = reviews_ids.to(device)
        # The dataset labels are 1 and 2; shift them to the 0/1 class indices
        # that the loss expects.
        sentiments = sentiments.to(device) - 1

        logits = sentinn(reviews_ids)
        loss = loss_fn(logits, sentiments)

        opt.zero_grad()
        loss.backward()
        opt.step()

        # Metrics use the logits computed before this step's weight update.
        track_loss += loss.item()
        num_corrects += (torch.argmax(logits, dim=1) == sentiments).sum().item()
        total_samples += reviews_ids.shape[0]
        running_loss = track_loss / (i + 1)  # average loss over batches
        running_acc = (num_corrects / total_samples) * 100  # accuracy in percent

        if log_every and (i + 1) % log_every == 0:
            print("Running Loss is",running_loss)
            print("Running Accuracy is",running_acc)

    epoch_loss = running_loss
    epoch_acc = running_acc
    return epoch_loss, epoch_acc

def eval_one_epoch(log_every=1):
    """Evaluate the model on ``evaldl`` without updating any weights.

    Relies on the module-level ``sentinn``, ``evaldl``, ``loss_fn`` and
    ``device``.

    Args:
        log_every: print the running loss/accuracy every this many batches.
            The default of 1 prints after every batch; 0 or None disables
            the progress output.

    Returns:
        (epoch_loss, epoch_acc): the mean of the per-batch losses and the
        accuracy in percent over all samples seen. Both are 0.0 when
        ``evaldl`` yields no batches.
    """
    sentinn.eval()
    track_loss = 0.0
    num_corrects = 0
    total_samples = 0
    # Initialised up front so an empty loader returns zeros instead of
    # raising UnboundLocalError at the end of the function.
    running_loss = 0.0
    running_acc = 0.0

    with torch.no_grad():  # no gradients are needed for evaluation
        for i, (reviews_ids, sentiments) in enumerate(evaldl):
            reviews_ids = reviews_ids.to(device)
            # Shift the dataset labels (1 and 2) to class indices 0/1.
            sentiments = sentiments.to(device) - 1

            logits = sentinn(reviews_ids)
            loss = loss_fn(logits, sentiments)

            track_loss += loss.item()
            num_corrects += (torch.argmax(logits, dim=1) == sentiments).sum().item()
            total_samples += reviews_ids.shape[0]
            running_loss = track_loss / (i + 1)  # average loss over batches
            running_acc = (num_corrects / total_samples) * 100  # accuracy in percent

            if log_every and (i + 1) % log_every == 0:
                print("Running Loss is",running_loss)
                print("Running Accuracy is",running_acc)

    epoch_loss = running_loss
    epoch_acc = running_acc
    return epoch_loss, epoch_acc


In [42]:
# Alternate one training pass and one evaluation pass per epoch, reporting
# the metrics of each pass as soon as it finishes.
n_epochs = 5
for e in range(n_epochs):
    print("Epoch=", e + 1, sep="", end=", ")

    epoch_loss, epoch_acc = train_one_epoch()
    print("Train Loss=", epoch_loss, "Train Acc", epoch_acc)

    epoch_loss, epoch_acc = eval_one_epoch()
    print("Eval Loss=", epoch_loss, "Eval Acc", epoch_acc)

Epoch=1, Running Loss is 0.6803618669509888
Running Accuracy is 56.25
Running Loss is 0.6928572058677673
Running Accuracy is 48.4375
Running Loss is 0.696658194065094
Running Accuracy is 45.83333333333333
Running Loss is 0.6981592625379562
Running Accuracy is 46.875
Running Loss is 0.6992671251296997
Running Accuracy is 46.875
Running Loss is 0.694473018248876
Running Accuracy is 49.47916666666667
Running Loss is 0.6962308543069022
Running Accuracy is 47.767857142857146
Running Loss is 0.6930588930845261
Running Accuracy is 49.21875
Running Loss is 0.6926735440889994
Running Accuracy is 49.65277777777778
Running Loss is 0.6949877262115478
Running Accuracy is 49.0625
Running Loss is 0.6935983354395087
Running Accuracy is 49.14772727272727
Running Loss is 0.6957847227652868
Running Accuracy is 47.91666666666667
Running Loss is 0.6953497804128207
Running Accuracy is 48.07692307692308


KeyboardInterrupt: 