In [1]:
import os

os.environ['TRANSFORMERS_CACHE'] = './hugging_face_models/cache/'

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset,DataLoader,RandomSampler,SequentialSampler
from transformers import AutoModel,AutoTokenizer
import re
from sklearn.model_selection import train_test_split



In [3]:
# Read dataset

df = pd.read_csv('./data/imdb/imdb_dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Getting data statistics

print(f"Total no of records are {df.shape[0]}")

print(f"Total no of unique labels are {df['sentiment'].nunique()} with values {df['sentiment'].unique()}")

print(f"The distribution of output labels are {df['sentiment'].value_counts(normalize=True)}")

Total no of records are 50000
Total no of unique labels are 2 with values ['positive' 'negative']
The distribution of output labels are positive    0.5
negative    0.5
Name: sentiment, dtype: float64


In [5]:
# Converting output label to numeric

label_mapper = {'positive':1,'negative':0}

df['sentiment'] = df['sentiment'].map(label_mapper)

df['sentiment'].head()

0    1
1    1
2    1
3    0
4    1
Name: sentiment, dtype: int64

In [6]:
# Preprocessing the text column

def preprocess_reviews(text):
    
    text = text.lower()
    text = re.sub(r"[^a-z0-9.!?&\'*]"," ",text)
    text = re.sub(r"(\s)+"," ",text)
    text = text.strip()
    return text

df['review_clean'] = df['review'].apply(preprocess_reviews)
    

In [7]:
df.head()

Unnamed: 0,review,sentiment,review_clean
0,One of the other reviewers has mentioned that ...,1,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,1,a wonderful little production. br br the filmi...
2,I thought this was a wonderful way to spend ti...,1,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,0,basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei's love in the time of money is a...


In [8]:
# splitting into train,val

X_train,X_val,y_train,y_val = train_test_split(df.drop('sentiment',axis=1),df['sentiment'],test_size=0.25,stratify=df['sentiment'],random_state=60)

print(X_train.shape,y_train.shape)
print(X_val.shape,y_val.shape)

(37500, 2) (37500,)
(12500, 2) (12500,)


In [9]:
# Setting up GPU support

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [10]:
model_name = 'bert-base-uncased'

bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# convert text to the format bert expects

def return_bert_inputs(df,text_column,tokenizer):
    
    input_ids = []
    attention_masks = []
    for text in df[text_column]:
        
        res_dict = tokenizer.encode_plus(text,add_special_tokens=True,max_length=512,
                                          pad_to_max_length=True,return_attention_mask=True)
        
        input_ids.append(res_dict['input_ids'])
        attention_masks.append(res_dict['attention_mask'])
    
    
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    
    return input_ids,attention_masks
        

In [12]:
tr_input_ids,tr_attention_masks = return_bert_inputs(X_train,'review_clean',bert_tokenizer)

val_input_ids,val_attention_masks = return_bert_inputs(X_val,'review_clean',bert_tokenizer)

print(tr_input_ids.shape,tr_attention_masks.shape)
print(val_input_ids.shape,val_attention_masks.shape)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


torch.Size([37500, 512]) torch.Size([37500, 512])
torch.Size([12500, 512]) torch.Size([12500, 512])


In [13]:
# Creating appropriate data loaders for training


y_train = torch.tensor(y_train.values)
y_val = torch.tensor(y_val.values)


batch_size = 32

train_data = TensorDataset(tr_input_ids,tr_attention_masks,y_train)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data,sampler=train_sampler,batch_size=batch_size)


val_data = TensorDataset(val_input_ids,val_attention_masks,y_val)
val_sampler = SequentialSampler(val_data)
val_loader = DataLoader(val_data,sampler=val_sampler,batch_size=batch_size)

In [14]:
# lets look at one batch of data loader

iter(train_loader).next()

[tensor([[  101,  1045,  2064,  ...,     0,     0,     0],
         [  101,  2023,  2003,  ...,     0,     0,     0],
         [  101,  2882,  8680,  ...,     0,     0,     0],
         ...,
         [  101,  2065,  2017,  ...,     0,     0,     0],
         [  101,  2023,  2143,  ...,     0,     0,     0],
         [  101, 27178, 18223,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
         0, 0, 1, 0, 1, 1, 1, 1])]

In [29]:
# Creating our custom model

class CustomBert(nn.Module):
    
    def __init__(self,hidden_dim,output_shape):
        super(CustomBert,self).__init__()
        
        bert_input = 768
        self.hidden_dim = hidden_dim
        self.output_shape = output_shape
        
        self.bert = bert_model
        
        self.classifier = nn.Sequential(nn.Linear(bert_input,self.hidden_dim),
                                  nn.ReLU(),
                                  nn.Linear(self.hidden_dim,self.output_shape))
        
        # Freezing bert
        for param in self.bert.parameters():
            param.requires_grad = False
        
    def forward(self,input_ids,attention_masks):
        x = self.bert(input_ids,attention_masks)[0][:,0,:] # getting [cls] token embeddings only
        x = self.classifier(x)
        return x

In [30]:
custom_model = CustomBert(64,2).to(device)
custom_model

CustomBert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 

In [31]:
## Training loop

epochs = 1
total_steps_per_epochs = len(train_loader)

optimizer = torch.optim.SGD(custom_model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch in range(1,epochs+1):
    
    custom_model.train() # put model in train mode
    
    for i,batch in enumerate(train_loader):
        
        b_ids,b_masks,b_labels = tuple(t.to(device) for t in batch)
        
        preds_b = custom_model(b_ids,b_masks)
        
        loss = criterion(preds_b,b_labels)
        
        optimizer.zero_grad() # reset any previous stored gradients
        loss.backward() # do backprop
        optimizer.step() # update weights
        
        if i % 100 == 0:
            print(f"Epoch : {epoch}, step per epoch : {i}, loss : {loss}")
        
        

Epoch : 1, step per epoch : 0, loss : 0.6820238828659058
Epoch : 1, step per epoch : 100, loss : 0.6959987878799438
Epoch : 1, step per epoch : 200, loss : 0.6901320219039917
Epoch : 1, step per epoch : 300, loss : 0.6916142106056213
Epoch : 1, step per epoch : 400, loss : 0.6882206797599792
Epoch : 1, step per epoch : 500, loss : 0.6643081903457642
Epoch : 1, step per epoch : 600, loss : 0.6945164203643799
Epoch : 1, step per epoch : 700, loss : 0.6791486144065857
Epoch : 1, step per epoch : 800, loss : 0.6945682764053345
Epoch : 1, step per epoch : 900, loss : 0.6641589999198914
Epoch : 1, step per epoch : 1000, loss : 0.671757698059082
Epoch : 1, step per epoch : 1100, loss : 0.6458482146263123


In [34]:
# evaluation

custom_model.eval() # put model on eval mode

total_correct = 0
total_samples = 0
class_wise_correct = [0 for i in range(2)]
class_wise_samples = [0 for i in range(2)]

with torch.no_grad(): # no computation of gradients needed since we are just predicting
    for batch in val_loader:
        v_ids,v_masks,v_labels = tuple(t.to(device) for t in batch)
        
        v_preds = custom_model(v_ids,v_masks)
        _,v_preds = torch.max(v_preds,axis=1)
        
        total_correct += (v_preds == v_labels).sum().item()
        total_samples += len(v_labels)
        
        for label,pred in zip(v_labels,v_preds):
            
            if label == pred:
                class_wise_correct[label] += 1
            
            class_wise_samples[label] += 1

print(f"Overall accuracy is {total_correct/total_samples:.3f}")
for i in range(2):
    print(f'Accuracy of class {i} is {class_wise_correct[i]/class_wise_samples[i]}')

Overall accuracy is 0.741
Accuracy of class 0 is 0.69504
Accuracy of class 1 is 0.78704
