# BERT Sentiment Analysis

## Imports and Prerequisites

In [None]:
!pip install torchtext==0.10.0 --quiet
!pip install transformers==4.11.3 --quiet
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve,classification_report,auc
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.dataloader import default_collate
import re
import string
import warnings
import spacy
import nltk
import torch
import torchtext
import transformers
from transformers import BertTokenizer, BertModel
import numpy as np
import os
import collections
# Download the NLTK 'punkt' tokenizer package when this cell runs.
nltk.download('punkt')

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available()else 'cpu')


## Helper Functions

In [None]:
#function removeSpecialChars to remove all special characters from text
def removeSpecialChars(text):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without the disallowed characters.
    """
    # The upper-case range must be spelled A-Z. The former "A-z" range also
    # spans the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so those
    # special characters were silently kept.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

#function transform to clean all data from reviews column
def transform(data):
    """Clean a pandas Series of raw review strings.

    The steps run in this order: HTML tags are replaced by a space, the text
    is lower-cased, ASCII punctuation is dropped, and each review is finally
    passed through ``removeSpecialChars``.

    Args:
        data: pandas Series whose elements are strings.

    Returns:
        A new Series with the cleaned strings; ``data`` itself is not modified.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    # Translation table that deletes !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
    punctuation_table = str.maketrans('', '', string.punctuation)

    # Tags have to be stripped BEFORE punctuation: once '<' and '>' are gone
    # the tag pattern can never match and markup such as "<br />" would leave
    # a stray "br" word in the review. A space is substituted so the words on
    # either side of a tag are not glued together.
    data = data.apply(lambda x: tag_pattern.sub(' ', x))
    data = data.str.lower()
    data = data.apply(lambda x: x.translate(punctuation_table))
    data = data.apply(removeSpecialChars)

    return data

#make ratings binary with 0 being negative and 1 being positive
def ratingToInt(data):
    """Binarise the ``rating`` column of ``data`` in place.

    Ratings 1-4 are replaced by 0 (negative) and ratings 7-10 by 1
    (positive). Any other value in the column is left as it is.

    Args:
        data: DataFrame with a ``rating`` column.

    Returns:
        The same DataFrame object, with its ``rating`` column overwritten.
    """
    replacements = (
        (0, [1.0, 2.0, 3.0, 4.0]),
        (1, [7.0, 8.0, 9.0, 10.0]),
    )
    for label, ratings in replacements:
        data['rating'] = data['rating'].replace(ratings, label)
    return data

#make item 1-dimensional
def to1D(item):
    """Turn rows of two scores into a flat list of winning column indices.

    Only the first two entries of every row are inspected. The result for a
    row is 1 when the second entry is strictly greater than the first and 0
    otherwise, so ties resolve to 0.

    Args:
        item: Sequence of rows, each indexable at positions 0 and 1.

    Returns:
        A list with one 0/1 entry per row.
    """
    labels = []
    for row in item:
        first, second = row[0], row[1]
        labels.append(1 if second > first else 0)
    return labels

## BERT Model Class

In [None]:
class BERT(nn.Module):
    """Pre-trained BERT encoder with a dropout + linear classification head.

    Args:
        num_classes: Number of output units of the linear head.
        dropValue: Dropout probability applied to BERT's pooled output.

    NOTE(review): the checkpoint is 'bert-base-cased' while ``transform``
    lower-cases every review - confirm that the cased model is intended.
    """

    def __init__(self, num_classes,dropValue):

        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropValue)
        self.linear = nn.Linear(self.bert.config.hidden_size,num_classes)
        # Retained only so code that accesses ``model.relu`` keeps working; it
        # owns no parameters and is no longer applied in forward().
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return the raw class logits for a batch.

        Args:
            input_id: Token ids, forwarded to BERT as ``input_ids``.
            mask: Attention mask, forwarded to BERT as ``attention_mask``.

        Returns:
            The output of the linear head (last dimension ``num_classes``).
        """
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        # The logits are returned unclamped. Passing them through ReLU before
        # CrossEntropyLoss forced every negative logit to 0, which zeroes its
        # gradient and produces frequent 0-vs-0 ties between the classes.
        return self.linear(dropout_output)

## Data Filtering

In [None]:
#clean the review columns with transform, binarise the ratings with ratingToInt and tokenize the reviews with the BERT tokenizer
def filteringData(trainingData, validationData):
    """Prepare the training and validation frames and tokenize their reviews.

    Both frames are modified in place: the ``review`` column is replaced by
    its cleaned version and the ``rating`` column is binarised. Reviews are
    then encoded with the 'bert-base-cased' tokenizer, padded/truncated to
    256 tokens, returned as PyTorch tensors and moved to the module-level
    ``device``.

    Args:
        trainingData: DataFrame with ``review`` and ``rating`` columns.
        validationData: DataFrame with ``review`` and ``rating`` columns.

    Returns:
        Tuple ``(trainingData, validationData, encodingTrain, encodingVal)``.
    """
    frames = (trainingData, validationData)
    for frame in frames:
        frame['review'] = transform(frame['review'])
    trainingData, validationData = (ratingToInt(frame) for frame in frames)

    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

    def encode(frame):
        # Same tokenizer settings for both splits.
        encoded = tokenizer.batch_encode_plus(
            list(frame['review'].values),
            max_length=256,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return encoded.to(device)

    encodingTrain = encode(trainingData)
    encodingVal = encode(validationData)

    return trainingData, validationData, encodingTrain, encodingVal

## Training

In [None]:
def training(bertModel,optimizer,lossFunction,trainDataloader) :
    """Run one training epoch and print the mean batch loss and the accuracy.

    Args:
        bertModel: Module called as ``bertModel(input_ids, attention_mask)``;
            its output is reduced with ``argmax(dim=1)``, so it is expected to
            hold one row of class scores per sample.
        optimizer: Optimizer that updates ``bertModel``'s parameters.
        lossFunction: Callable ``(scores, labels) -> scalar loss``.
        trainDataloader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        None. The statistics are printed.
    """
    # Enable train-only behaviour such as dropout, whatever mode the model
    # was left in by earlier code.
    bertModel.train()

    batch_losses = []
    correct = 0
    seen = 0

    for batchID, batchMask, batchY in trainDataloader:

        optimizer.zero_grad()
        # Call the module itself (not .forward) so registered hooks run.
        yPred = bertModel(batchID, batchMask)
        trainLoss = lossFunction(yPred, batchY.long())
        trainLoss.backward()
        optimizer.step()

        batch_losses.append(trainLoss.item())
        # Vectorised class prediction; argmax reports the first index on
        # ties, i.e. class 0 wins when both scores are equal.
        predicted = yPred.detach().argmax(dim=1)
        correct += (predicted == batchY).sum().item()
        seen += len(batchY)

    if not batch_losses:
        print('     No training batches were provided.\n')
        return

    # Accuracy is counted over samples rather than averaged over batches, so
    # a smaller final batch does not get extra weight.
    print('     Calculated Loss         ----->    ',round(sum(batch_losses)/len(batch_losses),4),'\n')
    print('     Calculated Accuracy     ----->    ',np.round(correct/seen,4),'\n')


## Validation

In [None]:
def validation(bertModel,optimizer,lossFunction,valDataloader) :
    """Evaluate the model and print loss, accuracy and per-class metrics.

    Recall, precision and F1 are reported separately for class 0 (negative
    reviews) and class 1 (positive reviews).

    Args:
        bertModel: Module called as ``bertModel(input_ids, attention_mask)``;
            its output is reduced with ``argmax(dim=1)``.
        optimizer: Unused; kept so existing call sites keep working.
        lossFunction: Callable ``(scores, labels) -> scalar loss``.
        valDataloader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        None. The statistics are printed.
    """
    validationBatchLosses = []
    predictedBatches = []
    labelBatches = []

    # Evaluate with dropout disabled and without building autograd graphs,
    # then put the model back into whatever mode the caller had it in.
    wasTraining = bertModel.training
    bertModel.eval()
    try:
        with torch.no_grad():
            for batchID, batchMask, batchY in valDataloader:
                yPred = bertModel(batchID, batchMask)
                valLoss = lossFunction(yPred, batchY.long())
                validationBatchLosses.append(valLoss.item())
                # argmax reports the first index on ties (class 0).
                predictedBatches.append(yPred.argmax(dim=1).cpu())
                labelBatches.append(batchY.cpu())
    finally:
        bertModel.train(wasTraining)

    if not validationBatchLosses:
        print('     No validation batches were provided.\n')
        return

    yPredArray = torch.cat(predictedBatches).numpy()
    yTrueArray = torch.cat(labelBatches).numpy()
    accuracy = float((yPredArray == yTrueArray).mean())

    def classMetrics(label):
        # Metrics are computed once over the whole validation set. Averaging
        # per-batch precision/recall/F1 over small batches does not give the
        # dataset-level values.
        options = dict(average='micro', labels=[label], zero_division=0)
        return (
            recall_score(yTrueArray, yPredArray, **options),
            precision_score(yTrueArray, yPredArray, **options),
            f1_score(yTrueArray, yPredArray, **options),
        )

    recall0, precision0, f1score0 = classMetrics(0.0)
    recall1, precision1, f1score1 = classMetrics(1.0)

    print('     Calculated Loss        ----->    ',round(sum(validationBatchLosses)/len(validationBatchLosses),4),'\n')
    print('     Calculated Accuracy    ----->    ',np.round(accuracy,4),'\n')
    print('\n \n')
    print('|| ---------------------------- Negative Reviews ---------------------------- ||')
    print('\n')
    print('     Calculated Recall      ----->    ',np.round(recall0,4),'\n')
    print('     Calculated Precision   ----->    ',np.round(precision0,4),'\n')
    print('     Calculated F1 score    ----->    ',np.round(f1score0,4))
    print('\n \n')
    print('|| ---------------------------- Positive Reviews ---------------------------- ||','\n')
    print('\n')
    print('     Calculated Recall      ----->    ',np.round(recall1,4),'\n')
    print('     Calculated Precision   ----->    ',np.round(precision1,4),'\n')
    print('     Calculated F1 score    ----->    ',np.round(f1score1,4),'\n')
    print('\n \n')

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m92.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m831.4/831.4 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.14.1+cu116 requires torch==1.13.1, but you have torch 1.9.0 which is incompatible.
torchaudio 0.13.1+cu116 requires torch==1.13.1, but you have torch 1.9.0 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 KB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Main

In [None]:
########################################  .....Main Starts here.....  #########################################################

#pick the first CUDA GPU when available, otherwise the CPU

device = torch.device('cuda:0' if torch.cuda.is_available()else 'cpu')

#path of the tab-separated reviews file
dataPath ='drive/MyDrive/ColabNotebooks/haha/imdb-reviews.csv'
print('\nShould take about 3-6 minutes to complete running ...\n')

dataset = pd.read_csv(dataPath,sep='\t')
#split dataset (80% for training, the remaining 20% for validation); random_state fixes the split
trainingData = dataset.sample(frac = 0.8, random_state = 25)
validationData = dataset.drop(trainingData.index)
#clean the text, binarise the ratings and tokenize the reviews of both splits
filteredTraining,filteredValidation,encodingTrain,encodingVal = filteringData(trainingData, validationData)

#token ids, attention masks and labels of the training split (labels are created on `device`)
input_ids_train = encodingTrain['input_ids']
attention_masks_train = encodingTrain['attention_mask']
labels_train = torch.tensor(filteredTraining.rating.values,device=device)

#same three tensors for the validation split
input_ids_val = encodingVal['input_ids']
attention_masks_val = encodingVal['attention_mask']
labels_val = torch.tensor(filteredValidation.rating.values,device=device)

#each dataset item is an (input_ids, attention_mask, label) triple
trainDataset = TensorDataset(input_ids_train,attention_masks_train,labels_train)
valDataset = TensorDataset(input_ids_val,attention_masks_val,labels_val)

#hyper-parameters
#NOTE(review): `dim` and `layers` are not referenced anywhere in the visible code - confirm they are still needed
num_classes = 2
dim = 1
dropValue = 0.3
layers = 1
learningRate = 0.00001
batch_size = 8

#NOTE(review): the validation loader also uses RandomSampler, so validation batches are shuffled as well
trainDataloader = DataLoader(trainDataset,sampler=RandomSampler(trainDataset),batch_size=batch_size)
valDataloader= DataLoader(valDataset,sampler=RandomSampler(valDataset),batch_size=batch_size)

#build the model on `device`, with an Adam optimizer and a cross-entropy loss
bertModel = BERT(num_classes,dropValue)
bertModel.to(device)
optimizer = torch.optim.Adam(bertModel.parameters(),learningRate)
lossFunction =  torch.nn.CrossEntropyLoss().to(device)

#5 epochs: one training pass followed by one validation pass, each printing its own statistics
for epoch in range(5) :

  print('\n \n')
  print('|| #################################### Epoch Number ',epoch+1,'#################################### ||')
  print('\n \n')
  print('                      ################# Training Stats   ################')
  print('\n')
  training(bertModel,optimizer,lossFunction,trainDataloader)
  print('\n                    ################# Validation Stats ################')
  print('\n')
  validation(bertModel,optimizer,lossFunction,valDataloader)
  print('\n \n')
  print('|| #################################### End of Epoch ',epoch+1,'#################################### ||')


Should take about 3-6 minutes to complete running ...



Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]