# Sentiment Analysis

## Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 7.3 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 59.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 73.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [3]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import  BertModel, BertTokenizer
from torch.utils.data import DataLoader
import torch.optim as optim
import os
from torch.utils.data import Dataset

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Train-test split

In [5]:
path_csv = "/content/drive/MyDrive/Reviews.csv"
df=pd.read_csv(path_csv)
df=df[["Score","Text"]]
df

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...
...,...,...
568449,5,Great for sesame chicken..this is a good if no...
568450,2,I'm disappointed with the flavor. The chocolat...
568451,5,"These stars are small, so you can give 10-15 o..."
568452,5,These are the BEST treats for training and rew...


In [6]:
from sklearn.model_selection import train_test_split
train, rem = train_test_split(df, test_size=0.9)
test_val,rem = train_test_split(rem, test_size=0.9)
test,val = train_test_split(test_val, test_size=0.5)

In [7]:
train.Score.value_counts()

5    36210
4     8074
1     5235
3     4305
2     3021
Name: Score, dtype: int64

There is class imbalance, hence we do some preprocessing to remove the imbalance

In [8]:
from sklearn.utils import shuffle
train = shuffle(train)
test = shuffle(test)
val = shuffle(val)

In [9]:
len(train)

56845

## Loading tokenizer

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
tokenizer

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

## Preprocessing to deal with class imbalance

We split each review into text of upto 50 words and then perform undersampling

In [12]:
def remove_imbalance(df):
  dfnew=shuffle(df)
  num_each = dfnew.Score.value_counts().values[-1]

  df1 = dfnew[dfnew["Score"]==1].sample(num_each)
  df2 = dfnew[dfnew["Score"]==2].sample(num_each)
  df3 = dfnew[dfnew["Score"]==3].sample(num_each)
  df4 = dfnew[dfnew["Score"]==4].sample(num_each)
  df5 = dfnew[dfnew["Score"]==5].sample(num_each)
  dfs = [df1,df2,df3,df4,df5]
  df = pd.concat(dfs)
  df=shuffle(df)
  return df
  

In [13]:
train = remove_imbalance(train)
test = remove_imbalance(test)
val = remove_imbalance(val)

In [14]:
len(train)

15105

## Applying the Bert tokenizer

In [15]:
class Preprocess(Dataset):
    def __init__(self, df, maxlen=512):
        df.reset_index(drop=True, inplace=True)
        self.df = df
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        self.maxlen = maxlen

    def __len__(self):
        return(len(self.df))

    def __getitem__(self, index):
        review = self.df.loc[index, 'Text']

        # Classes start from 0.
        label = int(self.df.loc[index, 'Score']) - 1

        # Use BERT tokenizer since it needs to be able to match the tokens to the pre trained words.
        encoded = self.tokenizer.encode_plus(
        text=review,  # the sentence to be encoded
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length = self.maxlen,  # maximum length of a sentence
        truncation=True,
        pad_to_max_length=True,  # Add [PAD]s
        return_attention_mask = True,  # Generate the attention mask
        return_tensors = 'pt',  # ask the function to return PyTorch tensors
        )

        tokens_ids_tensor = encoded['input_ids']
        attn_mask = encoded['attention_mask']

        return tokens_ids_tensor, attn_mask, label


In [16]:
train_set = Preprocess(train)
val_set = Preprocess(val)
test_set = Preprocess(test)

## Data loaders

In [17]:
train_loader = DataLoader(train_set, batch_size = 64)
val_loader = DataLoader(val_set, batch_size = 64)
test_loader = DataLoader(test_set, batch_size = 64)

## Defining NN class

In [18]:
class SentimentModel(nn.Module):
    def __init__(self, device, num_classes=5, freeze_bert = True):
        super(SentimentModel, self).__init__()
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        self.device = device

        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        self.final_layer = nn.Linear(768, num_classes)

    def forward(self, seq, attn_masks):

        cont_reps = self.bert_layer(seq, attention_mask = attn_masks)
        cls_rep = cont_reps[0][:,0]
        logits = self.final_layer(cls_rep)
        return logits.to(self.device)

In [19]:
net = SentimentModel(device, freeze_bert=True)
net.to(device)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SentimentModel(
  (bert_layer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [20]:
loss_func = nn.CrossEntropyLoss()
opti = optim.Adam(net.parameters(), lr = 2e-4)

## Training

In [21]:
def get_accuracy(logits, labels):
    predictedClass = logits.max(dim = 1)[1]
    acc = (predictedClass == labels).float().mean()
    return acc

In [22]:
def trainFunc(net, loss_func, opti, train_loader, test_loader, epochs, printEvery):
    for ep in range(epochs):
        print("EPOCH: ", ep+1)
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            opti.zero_grad()
            seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)
      
            logits = net(seq[:,0,:], attn_masks[:,0,:])
            loss = loss_func(logits, labels)

            loss.backward()
            opti.step()

            if (it + 1) % printEvery == 0:
                acc = get_accuracy(logits, labels)

                print("Iteration {} of epoch {} complete. Loss : {} Accuracy : {}".format(it+1, ep+1, loss.item(), acc))

        # Perform validation at the end of an epoch.
        val_acc, val_loss = evaluate(net, loss_func, val_loader)
        print(" Validation Accuracy : {}, Validation Loss : {}".format(val_acc, val_loss))
            

In [23]:
def evaluate(net, loss_func, dataloader):
    net.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

            logits = net(seq[:,0,:], attn_masks[:,0,:])
            mean_loss += loss_func(logits, labels)
            mean_acc += get_accuracy(logits, labels)
            count += 1
    return mean_acc / count, mean_loss / count

In [24]:
trainFunc(net, loss_func, opti, train_loader, test_loader, epochs=10, printEvery=10)

EPOCH:  1




Iteration 10 of epoch 1 complete. Loss : 1.671596884727478 Accuracy : 0.1875
Iteration 20 of epoch 1 complete. Loss : 1.6141927242279053 Accuracy : 0.140625
Iteration 30 of epoch 1 complete. Loss : 1.596872329711914 Accuracy : 0.265625
Iteration 40 of epoch 1 complete. Loss : 1.594351887702942 Accuracy : 0.234375
Iteration 50 of epoch 1 complete. Loss : 1.5954415798187256 Accuracy : 0.21875
Iteration 60 of epoch 1 complete. Loss : 1.5486068725585938 Accuracy : 0.3125
Iteration 70 of epoch 1 complete. Loss : 1.5628129243850708 Accuracy : 0.390625
Iteration 80 of epoch 1 complete. Loss : 1.5889497995376587 Accuracy : 0.296875
Iteration 90 of epoch 1 complete. Loss : 1.5475162267684937 Accuracy : 0.3125
Iteration 100 of epoch 1 complete. Loss : 1.512465000152588 Accuracy : 0.40625
Iteration 110 of epoch 1 complete. Loss : 1.5212379693984985 Accuracy : 0.359375
Iteration 120 of epoch 1 complete. Loss : 1.453666090965271 Accuracy : 0.515625
Iteration 130 of epoch 1 complete. Loss : 1.546882

## Testing

In [25]:
test_acc, test_loss = evaluate(net, loss_func, test_loader)
print(" Test Accuracy : {}, Test Loss (Cross entropy) : {}".format(test_acc, test_loss))



 Test Accuracy : 0.4737866520881653, Test Loss (Cross entropy) : 1.2131178379058838


In [26]:
torch.save(net.state_dict(), "/content/drive/MyDrive/weight.pth")