# Twitter Sentiment Analysis

## Notebook 3: Deep learning comparison of ANN, LSTM and BERT models

In [None]:
# new library download



In [None]:
# Importing libraries for file and time operations
import os
import time

# Libraries for random number generation and warnings
import random
import warnings
warnings.filterwarnings('ignore')

# Basic libraries
import pandas as pd
import numpy as np
from itertools import product

# Library for splitting data
from sklearn.model_selection import train_test_split

# libraries for Word2Vec
import gensim
from gensim.models import Word2Vec

# libraries for nltk
from nltk import tokenize
from nltk.tokenize import WhitespaceTokenizer

# Libraries for Deep Learning with PyTorch
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

# Library for clearing Jupyter Notebook cell output
from IPython.display import clear_output

# Libraries for model evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score





In [None]:
# new library importing inbetween




# Setting seeds for reproducibility


In [None]:
def set_seed(seed_value):
  """Seed every random number generator this notebook relies on.

  Seeds Python's ``random`` module, NumPy and PyTorch (CPU and, when
  available, every CUDA device), exports ``PYTHONHASHSEED`` and switches
  cuDNN to its deterministic, non-benchmarking configuration.

  Args:
    seed_value: Integer seed shared by all generators.
  """
  # Exported for interpreters started after this point; the hash seed of the
  # already-running kernel is fixed at start-up and is not changed by this.
  os.environ['PYTHONHASHSEED'] = str(seed_value)

  # Python, NumPy and PyTorch (CPU) generators
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed_value)

  # Trade cuDNN auto-tuning speed for repeatable kernels
  cudnn = torch.backends.cudnn
  cudnn.deterministic = True
  cudnn.benchmark = False

  # Seed the current GPU and all other visible GPUs, if any
  if torch.cuda.is_available():
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

set_seed(42)

## Loading dataset

In [None]:
# Copy the processed dataset and both Word2Vec models from Google Drive into the Colab working directory (/content)

!cp '/content/drive/MyDrive/Twitter_Sentiment_Analysis/training_processed_data.csv' '/content/'
!cp '/content/drive/MyDrive/Twitter_Sentiment_Analysis/SKIP_Word2Vec.model' '/content/'
!cp '/content/drive/MyDrive/Twitter_Sentiment_Analysis/CBOW_Word2Vec.model' '/content/'

In [None]:
# Load the dataset

# Read at most the first 500,000 rows of the processed training data
dataset = pd.read_csv('training_processed_data.csv',nrows = 500000)
# NOTE(review): this is not the last expression of the cell, so the sample is
# never displayed; move it to its own cell if the preview is wanted.
dataset.sample(10)
# Report (rows, columns) of the loaded frame
print(dataset.shape)

(500000, 2)


In [None]:
# Drop every row that has a missing value in any column; this mutates
# `dataset` in place, so re-running the cell operates on the already-cleaned frame.
dataset.dropna(axis = 0, how = "any", inplace = True)
# NOTE(review): result is discarded because it is not the cell's last expression
dataset.isnull().sum()
# Report (rows, columns) remaining after the drop
print(dataset.shape)

(492950, 2)


## Splitting the data into Train and Test data

In [None]:
# import the libraries
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    dataset['text'],
    dataset['target'],
    test_size = 0.2,
    random_state = 42,
)


## Data Vectorization

In [None]:
# Library to work with Word2Vec
from gensim.models import Word2Vec

# Locations of the pre-trained CBOW and Skip-gram Word2Vec models
cbow_path, sg_path = '/content/CBOW_Word2Vec.model', '/content/SKIP_Word2Vec.model'

# Load both models into memory (CBOW first, then Skip-gram)
cbow, sg = (Word2Vec.load(model_path) for model_path in (cbow_path, sg_path))

Convert the raw text into 300-dimensional sentence vectors using the Word2Vec models loaded above.

In [None]:
# library
from nltk.tokenize import WhitespaceTokenizer

def vectorizer(text, word2vec_model, vector_size = 300):
  """Turn each text sample into the mean of its in-vocabulary word vectors.

  Args:
    text: Sequence of whitespace-separated strings (e.g. a pandas Series);
      samples are processed in positional order.
    word2vec_model: Object exposing ``.wv`` that supports ``word in wv`` and
      ``wv.get_vector(word)`` (e.g. a gensim ``Word2Vec`` model).
    vector_size: Dimensionality of the word vectors; must match the model.

  Returns:
    A ``(len(text), vector_size)`` float array. Rows for samples without any
    in-vocabulary word are left as zeros.
  """
  matrix = np.zeros((len(text), vector_size))
  keyed_vectors = word2vec_model.wv

  for row, sentence in enumerate(text):
    # str.split() with no arguments tokenises on runs of whitespace, which is
    # what NLTK's WhitespaceTokenizer does as well.
    # Collect the vectors of ALL known words of the sentence before averaging;
    # resetting the list per word would keep only the last known word's vector.
    word_vecs = [keyed_vectors.get_vector(word)
                 for word in sentence.split()
                 if word in keyed_vectors]

    if word_vecs:
      matrix[row] = np.mean(word_vecs, axis=0)

  return matrix

In [None]:
# Vectorize the train and test splits with the CBOW model, timing the whole step
start_time = time.time()

matrix_train_cbow, matrix_test_cbow = (vectorizer(split, cbow) for split in (X_train, X_test))

print(f" Data Vectorization completed using cbow. Duration {time.time()-start_time} secs")

 Data Vectorization completed using cbow. Duration 60.00106453895569 secs


In [None]:
# Sanity check: expect one 300-dimensional row per training sample
print(matrix_train_cbow.shape)

(394360, 300)


In [None]:
# Inspect the sentence vector produced for the training sample at position 1
print(matrix_train_cbow[1])

[ 1.21310890e+00 -1.59038031e+00 -1.03793286e-01 -2.09765494e-01
  7.06457794e-01  2.22103342e-01 -5.52951515e-01  4.01179976e-04
 -6.63659871e-01 -2.86419392e-01  3.09581041e-01 -3.68017524e-01
 -1.61698079e+00  8.46598744e-01  8.76871884e-01  3.49490732e-01
 -1.74440742e+00  1.36529773e-01  2.74539739e-01  3.04339767e-01
  2.77809709e-01  2.94365525e-01  5.00815809e-01 -2.77122855e-01
  8.25947523e-01 -7.64158666e-01 -7.90861726e-01 -5.97459435e-01
  2.11164296e-01  7.01822698e-01  3.26846004e-01 -1.76645517e+00
  1.03135669e+00  7.50023603e-01  7.44568288e-01  2.31459305e-01
  2.76142269e-01  7.34570742e-01 -1.49256217e+00 -7.18964636e-01
 -6.16045892e-01 -2.79662937e-01  3.34402137e-02  4.45389748e-01
  3.39251250e-01  5.00838935e-01  5.81753731e-01 -2.00212336e+00
  8.01831186e-01  2.12147146e-01  1.40123576e-01  5.77987731e-01
 -6.23167813e-01  2.76066780e-01  3.75610590e-01  8.78760040e-01
 -8.73566389e-01 -1.14821124e+00  6.81660846e-02 -2.12199837e-01
  7.23350227e-01  3.88899

In [None]:
# Vectorize the train and test splits with the Skip-gram model, timing the whole step
start_time = time.time()

matrix_train_sg, matrix_test_sg = (vectorizer(split, sg) for split in (X_train, X_test))

print(f"Data Vectorization completed using Skipgram. Duration {time.time()- start_time}")


Data Vectorization completed using Skipgram. Duration 204.78982996940613


## Setting the computational device

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Device being used is {device}")

Device being used is cuda


## First:- Basic Neural Network

The `classifier` class defines the network layers, while the `train` and `test` functions each run a single epoch of training and evaluation, respectively.

In [None]:
# Define neural network classifier
class classifier(nn.Module):
  """Feed-forward network with one ReLU hidden layer and a linear output layer.

  The output layer applies no activation, so ``forward`` returns raw scores.

  Args:
    input_size: Number of input features per sample.
    hidden_size: Number of units in the hidden layer.
    out_size: Number of output units.
  """

  def __init__(self, input_size, hidden_size, out_size):
    super().__init__()

    # Hidden block: linear projection followed by ReLU
    hidden_layer = nn.Linear(input_size, hidden_size)
    self.features = nn.Sequential(hidden_layer, nn.ReLU())

    # Output projection (no activation)
    self.out = nn.Linear(hidden_size, out_size)

  def forward(self, X):
    """Return the output-layer scores for the batch ``X``."""
    return self.out(self.features(X))

In [None]:
# Training Function for a single epoch

def train(train_loader, net, epoch, criterion, optimizer):
  """Run one training pass over ``train_loader`` and return the mean batch loss.

  Args:
    train_loader: Iterable yielding ``(data, target)`` batches.
    net: Model to optimise; batches are moved to the device of its parameters.
    epoch: Index of the current epoch (currently unused).
    criterion: Loss, called as ``criterion(predictions, target.float())``.
    optimizer: Optimizer bound to ``net``'s parameters.

  Returns:
    Mean of the per-batch loss values recorded during this pass.
  """
  # set the network to training mode
  net.train()

  # Run on whatever device the model lives on, so the function does not depend
  # on a notebook-level `device` variable being defined.
  first_param = next(net.parameters(), None)
  batch_device = first_param.device if first_param is not None else torch.device('cpu')

  # Per-batch loss values of this epoch
  epoch_loss = []

  for data, target in train_loader:
    data, target = data.to(batch_device), target.to(batch_device)

    # Forward pass. Only the trailing unit dimension is squeezed: a bare
    # squeeze() would also drop the batch dimension of a one-sample batch and
    # make the loss fail on a prediction/target shape mismatch.
    pred = net(data)
    loss = criterion(pred.squeeze(-1), target.float())
    epoch_loss.append(loss.detach().cpu().numpy())

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # Average the recorded batch losses
  return np.asarray(epoch_loss).mean()

In [None]:
# Testing Function for single epoch

def test(test_loader, net, epoch, criterion):
  # set the network for evaluation mode
  net.eval()

  # Disable gradient calculation for performance calculation
  with torch.no_grad():
    # Initiate loss list to store the loss
    epoch_loss = []

    # Iterate through each batch in the test data loader
    for batch in test_loader:
      data,target = batch

      # Move data and target to the chosen device
      data, target = data.to(device), target.to(device)

      # Forward pass: compute predictoins and loss
      pred = net(data)
      loss = criterion(pred.squeeze(), target.float())
      epoch_loss.append(loss.cpu().data)

    #Convcer the epoch loss to numpy array and compute the mean
    epoch_loss = np.asarray(epoch_loss)

    return epoch_loss.mean()


In [None]:
# Main function to train the neural network

def train_nn(matrix_train, matrix_test, args, hidden_size = 32, input_size = 300, output_size = 1):
  """Train the feed-forward classifier with early stopping and report test metrics.

  Labels are read from the notebook-level ``y_train`` / ``y_test`` and the
  model is placed on the notebook-level ``device``. The state of the epoch with
  the lowest test loss is saved to ``backup_best_model.pth`` and reloaded once
  training has finished to compute the reported metrics.

  NOTE(review): the test split drives both early stopping / checkpoint
  selection and the final metrics, so those metrics are likely optimistic;
  consider a separate validation split.

  Args:
    matrix_train: NumPy array of training features, one row per sample.
    matrix_test: NumPy array of test features, one row per sample.
    args: Dict with the keys 'batch_size', 'num_workers', 'optimizer', 'lr',
      'weight_decay', 'num_epochs', 'tolerance' and 'patience'.
    hidden_size: Number of hidden units of the classifier.
    input_size: Number of features per sample.
    output_size: Number of output units of the classifier.

  Returns:
    Tuple ``(train_losses, test_losses, best_model_epoch)``: the per-epoch
    losses and the index of the epoch with the lowest test loss (-1 if no
    epoch was run or no checkpoint was saved).

  Raises:
    ValueError: If ``args['optimizer']`` is not 'Adam'.
  """
  # Fail fast instead of hitting an unbound `optimizer` inside the loop
  if args['optimizer'] != 'Adam':
    raise ValueError(f"Unsupported optimizer: {args['optimizer']!r} (only 'Adam' is implemented)")

  # Convert the training and testing matrices to PyTorch tensors
  X_train = torch.from_numpy(matrix_train).float()
  Y_train = torch.from_numpy(y_train.values)
  train_data = TensorDataset(X_train, Y_train)

  X_test = torch.from_numpy(matrix_test).float()
  Y_test = torch.from_numpy(y_test.values)
  test_data = TensorDataset(X_test, Y_test)

  # Only the training batches need shuffling; evaluation order is irrelevant
  train_loader = DataLoader(train_data, batch_size = args['batch_size'], shuffle = True,
                            num_workers = args['num_workers'])
  test_loader = DataLoader(test_data, batch_size = args['batch_size'], shuffle = False,
                           num_workers = args['num_workers'])

  # Initialize the neural network
  net = classifier(input_size, hidden_size, output_size)
  net.to(device)

  # Loss on raw logits; the sigmoid is applied inside the loss
  criterion = nn.BCEWithLogitsLoss().to(device)

  optimizer = optim.Adam(net.parameters(), lr = args['lr'],
                         weight_decay = args['weight_decay'])

  # Bookkeeping for losses, best checkpoint and early stopping
  train_losses, test_losses = [], []
  best_test_loss = float('inf')
  best_model_epoch = -1
  no_improvement_epochs = 0
  stopped_early_at = None

  # Record the time at which training starts
  start_time = time.time()

  for epoch in range(args['num_epochs']):

    # Clear the output and print the current training status
    clear_output(wait = True)

    print("Training neural network ... Epoch " + str(epoch) + '/' +str(args['num_epochs']-1))
    if best_model_epoch != -1:
      print("Best test loss epoch: "+ str(best_model_epoch))

    # Train for one epoch and log the loss
    epoch_train_loss = train(train_loader, net, epoch, criterion, optimizer)
    train_losses.append(epoch_train_loss)

    # Evaluate and log the loss
    epoch_test_loss = test(test_loader, net, epoch, criterion)
    test_losses.append(epoch_test_loss)

    # Improvement over the best loss seen BEFORE this epoch
    diff = best_test_loss - epoch_test_loss
    if epoch_test_loss < best_test_loss:
      best_test_loss = epoch_test_loss
      # Save the state of the best model for later evaluation
      torch.save(net.state_dict(), 'backup_best_model.pth')
      best_model_epoch = epoch

    # Early stopping: count consecutive epochs whose improvement does not
    # exceed the tolerance, and stop once the patience is used up
    if diff > args['tolerance']:
      no_improvement_epochs = 0
    else:
      no_improvement_epochs += 1
    if no_improvement_epochs >= args['patience']:
      stopped_early_at = epoch
      break

  # Everything below runs ONCE, after the loop: reloading the checkpoint inside
  # the loop would reset the weights every epoch and skip the final report
  # whenever early stopping triggers.
  duration = time.time()-start_time

  # Clear the output and print the final training status
  clear_output(wait = True)

  print(f"Neural network training has finished. Elasped time {duration :.2f}")
  if stopped_early_at is not None:
    print("Early Stopping at epoch: ", stopped_early_at)

  # Without a saved checkpoint there is nothing to evaluate
  if best_model_epoch == -1:
    print("No checkpoint was saved; skipping evaluation")
    return (train_losses, test_losses, best_model_epoch)

  print("Best model saved")
  print("Best test loss epoch: "+ str(best_model_epoch))

  # Load the best model
  true_test_labels, predicted_test_labels = [], []
  net.load_state_dict(torch.load("backup_best_model.pth", map_location = device))
  net.eval()

  # Use the best model to make predictions on the test data
  with torch.no_grad():
    for data, target in test_loader:
      data = data.to(device)
      # squeeze(-1) keeps the batch dimension even for a one-sample batch
      logits = net(data).squeeze(-1)
      true_test_labels.extend(target.cpu().numpy())
      # Positive class when the predicted probability exceeds 0.5
      predicted_test_labels.extend((torch.sigmoid(logits) > 0.5).cpu().numpy())

  # Calculate performance metrics
  test_accuracy = accuracy_score(true_test_labels, predicted_test_labels)
  test_precision = precision_score(true_test_labels, predicted_test_labels)
  test_recall = recall_score(true_test_labels, predicted_test_labels)
  test_f1 = f1_score(true_test_labels, predicted_test_labels)

  # Print the performance metrics of the best model
  print("Best model metrics: ")
  print(f"Accuracy: {test_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1 {test_f1}")

  return (train_losses, test_losses, best_model_epoch)

## Parameter Grid Definition

In [None]:
# define parameter grid for hyperparameter tuning

# Candidate values per hyperparameter; single-element lists are held fixed
param_grid = dict(
    vectorization = ['cbow'],
    criterion = ['BCEWithLogitsLoss'],
    optimizer = ['Adam'],
    batch_size = [500],
    num_workers = [2],
    lr = [0.0001, 0.001],
    weight_decay = [0.0001],
    num_epochs = [500],
    tolerance = [0.003],
    patience = [25],
)

# Cartesian product of all candidate values, one dict per combination
all_combinations = [
    dict(zip(param_grid.keys(), combo))
    for combo in product(*param_grid.values())
]

In [None]:
# Number of hyperparameter combinations to train.
# NOTE(review): with only 'lr' having two candidate values the grid yields 2
# combinations; a saved output showing a different count is stale.
len(all_combinations)

6

In [None]:
# initialize an empty list to store the results
results = []

# record the start time for model training process
start_time = time.time()

# Loop through each combination of hyperparameters
for params in all_combinations:

  # Select the feature matrices matching the 'vectorization' parameter.
  # No copies are made: train_nn only reads the matrices (it converts them to
  # new float tensors), so duplicating them would only cost memory.
  if params['vectorization'] == 'cbow':
    matrix_train, matrix_test = matrix_train_cbow, matrix_test_cbow
  elif params['vectorization'] == 'sg':
    matrix_train, matrix_test = matrix_train_sg, matrix_test_sg
  else:
    # Avoid silently reusing the matrices of a previous combination
    raise ValueError(f"Unknown vectorization: {params['vectorization']!r} (expected 'cbow' or 'sg')")

  # Train the neural network with the given parameters; architecture sizes
  # fall back to the defaults when the grid does not define them
  train_losses, test_losses, best_epoch = train_nn(matrix_train, matrix_test, params,
                                                   hidden_size = params.get('hidden_size',32),
                                                   input_size = params.get('input_size',300),
                                                   output_size = params.get('output_size',1))

  # Append the training results to the results list
  results.append({
      'params':params,
      'train_losses':train_losses,
      'test_losses' : test_losses,
      'best_epoch' : best_epoch
  })

# record duration for training
duration = time.time()-start_time

print(f" Model training is completed. Elapsed time is {duration} secs")

# Print a compact summary per combination; the full loss histories stay
# available in `results` without flooding the cell output
for run in results:
  lowest_test_loss = min(run['test_losses']) if run['test_losses'] else float('nan')
  print(f"params: {run['params']} | best epoch: {run['best_epoch']} | lowest test loss: {lowest_test_loss}")


Training neural network ... Epoch 25/499
Best test loss epoch: 17
Early Stopping at epoch:  25
 Model training is completed. Elapsed time is 492.5805265903473 secs
[{'params': {'vectorization': 'cbow', 'criterion': 'BCEWithLogitsLoss', 'optimizer': 'Adam', 'batch_size': 500, 'num_workers': 2, 'lr': 0.0001, 'weight_decay': 0.0001, 'num_epochs': 500, 'tolerance': 0.003, 'patience': 25}, 'train_losses': [0.22805305, 0.047322586, 0.0153300725, 0.007410625, 0.004529834, 0.00314678, 0.0023377107, 0.0017929041, 0.0013842217, 0.0010603114, 0.0008020084, 0.0005992995, 0.00044533022, 0.0003341641, 0.00025485325, 0.00019977466, 0.0001620882, 0.00013638068, 0.000119417324, 0.00010826704, 0.000101085265, 9.6412354e-05, 9.3667775e-05, 9.259407e-05, 9.185473e-05, 9.178583e-05, 9.185104e-05, 9.1759444e-05, 9.177482e-05], 'test_losses': [0.08781215, 0.023617854, 0.009860607, 0.0055590617, 0.003668413, 0.0026701924, 0.0020026078, 0.0015546869, 0.0011923359, 0.000912208, 0.00068267004, 0.0005106336, 0.00