<a href="https://colab.research.google.com/github/Sagnik-Nandi/PDFQueryBot---Chatbot-over-PDFs-using-RAG/blob/main/assnmt%202%20-%20Sentiment%20Classifier/sentiment_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading Data and Installing Dependencies

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# !ls drive/MyDrive/'Colab Notebooks'/'WiDS 2024'
# !pip uninstall torchtext torch -y
# !pip install torch==2.2.0 torchtext==0.17.0

import torch
# import torchtext
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
# from torchtext import datasets
# from torchtext.vocab import vocab
from gensim.utils import tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('stopwords')
# nltk.download('wordnet')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
import re
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [26]:
df=pd.read_csv("drive/MyDrive/Colab Notebooks/WiDS 2024/reviews.csv")
df['sentiment'] = df['sentiment'].map({'positive':1, 'negative':0})

for i in range(5):
  rev=df.iloc[i]['review'] # iloc gives the i'th row
  print(rev)
  print("No of paras:", len(rev.split('<br /><br />')))
  print("No of sentences:", len(rev.split('.')))
  print("No of words:", len(rev.split()))
  print("Label:", df.iloc[i]['sentiment'])

df['len']=df['review'].apply(lambda x: len(x.split()))
# df.describe()
# print(max(df['review'].apply(lambda x: len(x.split()))))
# print(min(df['review'].apply(lambda x: len(x.split()))))

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

## Tokenization and Preprocessing

In [5]:
# List of stopwords (overused words that could lead to overfitting)
stops=set(stopwords.words('english'))
capstops=[word.capitalize() for word in stops]
stops.update(capstops)
stops=list(stops)

# Stemmer and Lemmatizer for normalizing the words to root words
stemmer=PorterStemmer()
lemmatizer=WordNetLemmatizer()

In [6]:
def custom_tokenize(text):
  text=re.sub('<.*>', '', text) # Filter out html tags like <br/>
  tokens=list(tokenize(text))
  tokens=[token for token in tokens if token not in stops]
  # can do lower case as normalization
  # tokens=[stemmer.stem(token) for token in tokens]
  # tokens=[lemmatizer.lemmatize(token) for token in tokens]
  return tokens

for i in range(5) :
  rev=df.iloc[i]['review']
  print(custom_tokenize(rev))

['One', 'reviewers', 'mentioned', 'watching', 'Oz', 'episode', 'hooked', 'right', 'exactly', 'happened', 'would', 'say', 'main', 'appeal', 'show', 'due', 'fact', 'goes', 'shows', 'dare', 'Forget', 'pretty', 'pictures', 'painted', 'mainstream', 'audiences', 'forget', 'charm', 'forget', 'romance', 'OZ', 'mess', 'around', 'first', 'episode', 'ever', 'saw', 'struck', 'nasty', 'surreal', 'say', 'ready', 'watched', 'developed', 'taste', 'Oz', 'got', 'accustomed', 'high', 'levels', 'graphic', 'violence', 'violence', 'injustice', 'crooked', 'guards', 'sold', 'nickel', 'inmates', 'kill', 'order', 'get', 'away', 'well', 'mannered', 'middle', 'class', 'inmates', 'turned', 'prison', 'bitches', 'due', 'lack', 'street', 'skills', 'prison', 'experience', 'Watching', 'Oz', 'may', 'become', 'comfortable', 'uncomfortable', 'viewing', 'thats', 'get', 'touch', 'darker', 'side']
['wonderful', 'little', 'production', 'realism', 'really', 'comes', 'home', 'little', 'things', 'fantasy', 'guard', 'rather', 'us

## Train-Test Split

In [7]:
train1 = df.sample(frac=0.9, random_state=25)
train = train1.sample(frac=0.8889, random_state=25)
valid = train1.drop(train.index)
test = df.drop(train1.index)

## Vectorization and Mapping to a Vocabulary

In [16]:
# Min_frequency to filter rare words
min_freq = 5
specials=["<unk>", "<pad>"]
# Max len of a review for training
maxLen=100
# train_vocab = vocab(counter, min_freq=min_freq, specials=specials)
# train_vocab.set_default_index(train_vocab["<pad>"])
def build_vocab(text_iterator, min_freq=min_freq, specials=specials):
    token_counts = dict()
    for text in text_iterator:
        for token in text:
          if token in token_counts:
            token_counts[token] += 1
          else :
            token_counts[token]=1
    vocab = {token: idx for idx, (token, count) in enumerate(token_counts.items()) if count >= min_freq}
    for special in specials:
        if special not in vocab:
            vocab[special] = len(vocab)
    return vocab


def custom_transform(text, vocab):
  tokens=custom_tokenize(text)
  sequence=[vocab[token] if token in vocab else vocab['<unk>'] for token in tokens]
  sequence=sequence[:maxLen]
  # sequence=pad_sequence(sequence, batch_first=True, padding_value=vocab["<pad>"])
  return sequence

train['tokenized']=train['review'].apply(custom_tokenize)
train_vocab = build_vocab(train['tokenized'])
vocab_size = len(train_vocab)
print(list(train_vocab)[:10])

for i in range(5) :
  rev=df.iloc[i]['review']
  print(custom_transform(rev, train_vocab))

['adaptation', 'Pearl', 'Buck', 'film', 'certainly', 'classic', 'true', 'Hollywood', 'epic', 'things']
[610, 2412, 3130, 260, 9571, 1126, 2232, 319, 6936, 1224, 272, 345, 25, 2863, 265, 1078, 193, 1522, 687, 3458, 16138, 526, 1164, 1163, 2132, 890, 639, 6956, 639, 5280, 12089, 3350, 332, 87, 1126, 395, 1096, 4321, 1006, 7494, 345, 6216, 397, 353, 1442, 9571, 387, 1651, 64, 2836, 1732, 964, 964, 14358, 13026, 22274, 4336, 40810, 9518, 3129, 1317, 49, 1060, 350, 8855, 478, 3294, 9518, 712, 3126, 58420, 1078, 869, 794, 8840, 3126, 2311, 697, 9571, 1276, 1136, 1909, 8702, 501, 2645, 49, 3189, 10934, 1692]
[498, 183, 199, 15631, 209, 301, 1182, 183, 9, 467, 4786, 734, 966, 9218, 4920, 4633, 3441, 354, 8668, 42, 5951, 5363, 682, 1667, 363, 30872, 53265, 986, 682, 4730, 53265, 30872, 30872, 1027, 2124, 3960, 350, 381]
[279, 498, 297, 849, 329, 5031, 2214, 9889, 6485, 3693, 15833, 4563, 260, 1011, 8262, 415, 99, 8152, 104, 9615, 26, 1935, 44, 350, 7575, 26770, 634, 635, 1276, 916, 1166, 36133,

## Create a Dataset and Dataloader class

In [19]:
## Create a dataset and dataloader class
class ReviewDataset(Dataset):
  def __init__(self, df, vocab, transform):
    self.vocab=vocab
    self.text = [transform(review, vocab) for review in df['review']] # Text to sequences (do i need to map unk tokens separately?)
    # self.text = [t[:maxLen] for t in self.text] # Truncate
    self.text = pad_sequence([torch.tensor(t) for t in self.text], batch_first=True, padding_value=vocab["<pad>"]) # Padding (what is batch first option?)
    self.labels=df['sentiment'].astype(float)
  def __len__(self):
    # return len(self.text)
    return len(self.labels)
  def __getitem__(self, index):
    return {'input_ids' : self.text[index], 'label_id' : self.labels[index]}

train_data=ReviewDataset(train, train_vocab, custom_transform)
valid_data=ReviewDataset(valid, train_vocab, custom_transform)
test_data=ReviewDataset(test, train_vocab, custom_transform)

In [58]:
# print(train_data.text[3], train_data.labels[3])
print(next(iter(train_data)))

TypeError: unsupported operand type(s) for +: 'iterator' and 'int'

In [27]:
# Batch-size for dataloader
batch_size=64
train_loader=DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader=DataLoader(valid_data, batch_size=batch_size, shuffle=True)
test_loader=DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [54]:
for idx, (text, label) in enumerate(train_loader):
# for batch in train_loader:
#   print(batch)
  # text=batch['input_ids']
  # label=batch['label_id']
  print(text.shape, label.shape)
  break
# print(train_loader)

KeyError: 3897

# Define the LSTM model

In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print(device)

cuda


In [39]:
vocab_size=len(train_vocab)
embedding_dim=50
hidden_dim=16
output_dim=1
n_layers=1
bidirectional=True
dropout=0.2

class LSTM(torch.nn.Module):
  def __init__(self): # Try dropout also
    super().__init__()
    self.embedding = torch.nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
    self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, batch_first=True) #(what is batch first option?)
    self.linear = torch.nn.Linear(hidden_dim*2 if bidirectional else hidden_dim, output_dim)
    self.dropout = torch.nn.Dropout(dropout)
    self.sigmoid = torch.nn.Sigmoid()

  def forward(self, text):
    embedded = self.embedding(text)
    output, (hidden, cell) = self.rnn(embedded)
    dropped = self.dropout(output[:, -1, :])
    return self.sigmoid(self.linear(dropped))
model = LSTM().to(device)


In [30]:
num_epochs = 5
lr=0.005
optimizer = torch.optim.Adam(model.parameters(),lr=lr) # Try amsgrad option
criterion = torch.nn.BCELoss()

## Train the Model

In [41]:
def train_loop(dataloader, model, loss_fn, optimizer, verbose=True):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    train_loss=0

    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)
        train_loss+=loss.item()

        # Backpropagation, update parameters and reset all the gradients to zero
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Show progress report :D
        if verbose and batch % 500 == 0:
            loss, current = loss.item(), batch * batch_size + len(X)
            print(f"Training Loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
    train_loss/=num_batches
    return train_loss

def eval_loop(dataloader, model, loss_fn):
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    valid_loss, correct = 0, 0

    # torch.no_grad() ensures that no gradients are computed during eval mode
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            valid_loss += loss_fn(pred, y).item()
            correct += (pred.round().int() == y.int()).type(torch.float).sum().item()


    valid_loss /= num_batches
    correct /= size
    print(f"Validation Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {valid_loss:>8f} \n")
    return valid_loss


In [43]:
# REF: https://www.geeksforgeeks.org/training-neural-networks-with-validation-using-pytorch/

def fit(model, epochs, train_loader, eval_loader, loss_fn, optimizer, verbose=True, plot_loss=True):
  min_valid_loss = np.inf
  train_loss_data = [] # tracks loss data over all epochs
  valid_loss_data = []
  for t in range(epochs):
      print(f"Epoch {t+1}\n-------------------------------")
      train_loss = train_loop(train_loader, model, loss_fn, optimizer, verbose)
      valid_loss = eval_loop(eval_loader, model, loss_fn)
      train_loss_data.append(train_loss)
      valid_loss_data.append(valid_loss)

      # Saving parameters when validation error decreases, indicating a better model
      if min_valid_loss > valid_loss:
          if verbose:
            print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \n Saving The Model\n')
          min_valid_loss = valid_loss
          torch.save(model.state_dict(), 'saved_model.pth')
  # Plot training error and validation error
  if plot_loss:
    plt.plot(train_loss_data, label='Training Loss')
    plt.plot(valid_loss_data, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.title("Avg. Loss function during training model")
    plt.show()

  print("Done!")
  # return train_loss_data, valid_loss_data

fit(model, num_epochs, train_loader, valid_loader, criterion, optimizer, verbose=True, plot_loss=True)


Epoch 1
-------------------------------


KeyError: 6849

In [31]:
from tqdm import tqdm

def correct(output, target):
    sentiment_pred = output.round().int()          # set to 0 for <0.5 and 1 for >0.5
    correct_ones = sentiment_pred == target.int()  # 1 for correct, 0 for incorrect
    return correct_ones.sum().item()               # count number of correct ones


In [40]:
def train_model(data_loader, model, criterion, optimizer):
    model.train()

    num_batches = 0
    num_items = 0

    total_loss = 0
    total_correct = 0
    for item in tqdm(data_loader):
        # Copy data and targets to GPU
        data = item['input_ids'].to(device)
        target = item['label_id'].to(device)

        # Do a forward pass
        output = model(data)

        # Calculate the loss
        loss = criterion(output, target)
        total_loss += loss
        num_batches += 1

        # Count number of correct digits
        total_correct += correct(output, target)
        num_items += len(target)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    train_loss = total_loss/num_batches
    accuracy = total_correct/num_items
    print(f"Average loss: {train_loss:7f}, accuracy: {accuracy:.2%}")
    return train_loss.item(), accuracy


In [37]:
losses = []
accuracies = []
for epoch in range(num_epochs):
    print(f"Training epoch: {epoch+1}")
    loss, acc = train_model(train_loader, model, criterion, optimizer)
    losses.append(loss)
    accuracies.append(acc)

Training epoch: 1


  0%|          | 0/625 [00:00<?, ?it/s]


KeyError: 2842