In [None]:
!pip install transformers



In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import ( confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score)

from matplotlib import pyplot as plt
from tqdm import tqdm

import torch
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim
import transformers

In [None]:
# Device that the classifier and its input batches are moved to in the
# training / evaluation cells below. It is currently pinned to the CPU; the
# CUDA auto-detection is kept (commented out) for reference.
# if torch.cuda.is_available():
#   dev = torch.device('cuda:0')
# else:
#   dev = torch.device('cpu')
dev = torch.device('cpu')

## Importing the dataset

In [None]:
# Tab-separated file without a header row, so the columns are numbered 0, 1, ...
df = pd.read_csv('data.tsv', sep='\t', header=None)

In [None]:
# Work on the first 2000 rows only.
batch_1 = df.iloc[:2000]

In [None]:
batch_1

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1
...,...,...
1995,too bland and fustily tasteful to be truly pru...,0
1996,it does n't work as either,0
1997,this one aims for the toilet and scores a dire...,0
1998,in the name of an allegedly inspiring and easi...,0


In [None]:
batch_1[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

## Loading Pre-trained BERT model

In [None]:
# Checkpoint name plus the matching model / tokenizer classes.
# DistilBERT:
model_class = transformers.DistilBertModel
tokenizer_class = transformers.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

# BERT:
#model_class, tokenizer_class, pretrained_weights = (transformers.BertModel, transformers.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Turn every sentence (column 0) into a list of token ids, with the
# tokenizer's special tokens added.
tokenized = batch_1[0].apply(lambda sentence: tokenizer.encode(sentence, add_special_tokens=True))

In [None]:
# print(tokenized)
for i in tokenized[:15].values:
    print(len(i))

20
16
45
22
25
21
21
17
28
7
22
24
29
16
18


In [None]:
# Length (in tokens) of the longest tokenized sentence; 0 when there are none.
max_len = max(map(len, tokenized.values), default=0)

print(max_len)

59


In [None]:
# Right-pad every token-id list with 0 so that all rows have max_len entries
# and the result can be stacked into one rectangular array.
padded = np.array([
    token_ids + [0] * (max_len - len(token_ids))
    for token_ids in tokenized.values
])

In [None]:
padded

array([[  101,  1037, 18385, ...,     0,     0,     0],
       [  101,  4593,  2128, ...,     0,     0,     0],
       [  101,  2027,  3653, ...,     0,     0,     0],
       ...,
       [  101,  2023,  2028, ...,     0,     0,     0],
       [  101,  1999,  1996, ...,     0,     0,     0],
       [  101,  1996,  3185, ...,     0,     0,     0]])

In [None]:
np.array(padded).shape

(2000, 59)

In [None]:
# Attention mask: 1 wherever `padded` holds a non-zero token id, 0 on the
# zero padding that was appended, so BERT can ignore the padded positions.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(2000, 59)

In [None]:
## Tokenized Sentence => BERT => Sentence Embedding

# Convert the padded ids and their mask to tensors for the model.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

# Pure feature extraction: no gradients are needed.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
# Every input is laid out as: [CLS] sentence [SEP].
# The hidden state at position 0 (the [CLS] token) is used as the embedding of
# the whole sentence, so only that position is kept for each row.
features = last_hidden_states[0][:, 0].numpy()

In [None]:
features

array([[-0.21593437, -0.14028914,  0.00831094, ..., -0.13694856,
         0.5867004 ,  0.20112717],
       [-0.17262724, -0.14476171,  0.00223437, ..., -0.17442554,
         0.21386446,  0.37197468],
       [-0.05063341,  0.07203971, -0.02959722, ..., -0.07148952,
         0.7185241 ,  0.26225477],
       ...,
       [-0.27829772, -0.2480361 ,  0.13585785, ..., -0.19039167,
         0.13099585,  0.3497837 ],
       [-0.03667716,  0.10638539, -0.0111102 , ..., -0.11206637,
         0.4161945 ,  0.50338024],
       [ 0.12402633,  0.01425167,  0.01038398, ..., -0.11606557,
         0.5345913 ,  0.27495337]], dtype=float32)

In [None]:
features.shape

(2000, 768)

In [None]:
# Column 1 of the TSV holds the 0/1 sentiment label for each sentence.
labels = batch_1[1]

## Train Model


In [None]:
# Fixed seed so that the train/test split, and every score computed from it,
# is the same on each run of the notebook.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

In [None]:
pd.DataFrame(train_features)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
0,-0.107823,0.111220,0.007751,-0.186264,0.017974,0.030283,0.243291,0.338382,-0.062372,-0.382746,-0.058209,0.004015,0.079638,0.205572,-0.141916,0.373760,0.087848,0.146198,0.149237,-0.111292,-0.091705,-0.160435,0.071609,0.062883,0.062110,0.128250,0.008427,-0.196469,0.027979,-0.016148,0.124949,0.289490,-0.097533,-0.147253,-0.096288,-0.084171,0.073260,0.030870,0.191073,0.083149,...,0.022878,0.104922,-0.235396,0.387556,-0.072939,-0.340098,-0.065424,-0.024178,-0.022620,-0.217506,0.183706,0.011744,0.369475,0.005822,0.223637,0.165307,0.156381,-0.197007,-0.013248,-0.065221,-0.089332,-0.189002,-0.005612,0.145734,-7.047712,-0.410940,-0.064804,-0.173091,-0.163102,-0.013241,0.030233,-0.039298,0.070220,0.008022,0.264552,0.034919,-0.051451,0.101448,0.284157,0.335191
1,0.095286,0.077067,-0.264647,-0.044359,-0.067140,-0.112859,0.135131,0.302451,-0.067289,0.030090,0.020095,0.107327,0.021946,0.430170,0.062794,0.427362,-0.139562,0.027472,0.131419,-0.050279,-0.326841,-0.318972,0.196475,0.185676,-0.155374,0.060176,-0.012004,-0.067358,0.002503,0.127649,0.152596,-0.037731,-0.267700,-0.274499,-0.006970,-0.184752,0.054932,0.015867,0.109912,-0.000082,...,0.121164,-0.217894,-0.048373,0.191506,-0.125046,-0.204126,0.187196,-0.144931,-0.183426,-0.031838,-0.176233,0.192707,0.253268,0.159180,-0.051619,0.110963,0.467664,0.136564,-0.116687,-0.194654,-0.225855,0.060305,-0.182051,-0.094858,-5.753145,-0.430544,-0.096196,-0.037099,0.009764,-0.019659,-0.022370,-0.321931,0.071652,-0.239595,-0.114711,0.239941,0.107733,0.061932,0.629441,0.417959
2,0.089606,-0.114452,0.157323,0.015657,0.152667,-0.352516,0.098448,0.380798,-0.114708,-0.244245,-0.086395,0.037757,-0.010880,0.320740,0.162972,0.101190,-0.046626,0.213162,0.219105,-0.027633,-0.113588,-0.099889,0.089314,0.099486,-0.135969,-0.216305,-0.175764,0.225866,0.068100,-0.020560,-0.018281,0.051542,-0.388599,-0.336958,0.086325,0.039984,0.068570,-0.054057,-0.093390,-0.070891,...,0.031014,-0.044603,-0.085249,-0.020924,0.009540,-0.146324,-0.087281,-0.330616,-0.140934,-0.185638,-0.060687,0.150497,0.058385,0.254196,-0.014072,0.113893,0.395107,0.282438,0.174987,-0.053787,-0.397886,0.098765,-0.180523,0.072121,-6.962331,-0.279294,-0.121245,0.065999,-0.156699,-0.217195,-0.229150,-0.178881,0.234193,-0.139085,0.037097,0.246068,-0.012880,-0.071799,0.529332,0.380615
3,-0.125375,-0.058545,-0.129429,-0.107628,-0.304105,-0.203190,0.226137,0.158711,-0.076678,-0.044398,-0.296696,-0.127843,-0.045425,0.259921,0.199458,0.340376,-0.091153,0.130662,0.087126,-0.264176,0.094333,0.073670,-0.106589,0.040658,-0.055048,0.025232,0.191902,-0.043672,0.064993,0.178928,0.262746,0.059840,-0.006262,-0.329158,0.232546,-0.277781,-0.035381,-0.086124,0.304522,0.010976,...,0.088285,0.213585,0.108151,-0.022180,-0.364827,-0.195048,-0.051387,-0.199094,-0.047412,-0.091227,-0.004511,0.192218,-0.063572,0.051961,0.028780,0.095687,0.621607,0.192603,-0.127793,0.048106,-0.290899,-0.117908,-0.098175,0.256368,-6.155948,-0.370273,-0.142302,-0.025362,-0.047146,-0.234720,0.103592,-0.125050,-0.055821,0.000998,0.037700,0.044798,-0.067623,0.081697,0.446625,0.445961
4,0.131456,0.003370,-0.001269,-0.048962,-0.183062,-0.152779,0.217920,0.235372,0.097770,-0.090909,-0.033670,-0.107399,-0.164156,0.628630,0.195057,0.367297,-0.146439,0.111491,0.172675,-0.236713,-0.058929,-0.183212,0.299020,0.155977,-0.165793,0.111152,0.065651,-0.026628,-0.079303,0.171861,0.125053,-0.121696,-0.276433,-0.254619,0.232073,-0.268342,-0.129371,0.057542,0.080998,-0.122877,...,0.036485,-0.124530,0.070041,0.013081,-0.248389,-0.204709,0.088085,-0.245095,0.023698,-0.166536,-0.107585,0.232298,0.110611,0.285307,0.102113,0.280949,0.367623,0.122324,0.011049,-0.145418,-0.296109,-0.057227,-0.062689,0.102168,-6.480539,-0.287889,-0.188958,0.200361,0.131083,-0.089436,0.108468,-0.222599,0.006665,-0.124408,-0.028487,0.139138,0.076367,-0.151278,0.527354,0.315475
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,0.079769,0.163821,0.039639,-0.200538,-0.105984,0.060898,0.349350,0.226700,-0.142239,-0.047896,0.060450,-0.027002,-0.013967,0.204028,0.049045,0.198704,0.005393,0.047892,0.163269,-0.087299,0.031308,-0.189562,-0.055707,0.022088,-0.044758,0.103614,-0.038133,0.024916,0.048381,-0.082992,-0.015532,0.015227,-0.103904,-0.085219,-0.112678,-0.071668,0.013599,-0.061171,0.221332,-0.086875,...,0.099521,-0.071422,-0.127484,0.269602,-0.048991,-0.161537,0.075697,-0.106613,-0.017382,-0.096197,-0.100574,0.217639,0.068109,-0.114135,-0.156305,0.201475,0.221744,-0.028822,0.024932,0.030958,-0.162845,-0.133049,-0.142279,0.231040,-7.189999,-0.116854,0.120087,-0.271360,-0.094423,-0.066835,-0.016034,-0.194708,0.071258,-0.131594,0.261649,-0.031568,-0.192514,0.069214,0.291201,0.246542
1496,-0.104785,-0.029085,-0.203088,-0.148878,-0.261948,-0.071748,0.223727,0.322300,-0.041509,-0.365323,-0.010407,0.044167,-0.346317,0.722577,0.274255,0.287661,-0.160171,0.251146,0.126535,-0.052594,-0.296413,-0.315045,0.095475,0.035743,0.034613,0.044680,-0.160295,-0.017160,-0.029478,0.357034,0.042105,0.063991,-0.219876,-0.144120,-0.025838,-0.151011,0.059953,-0.042674,0.062193,0.080262,...,0.129053,-0.015363,0.109720,0.540686,-0.251908,-0.350180,0.077148,0.052013,-0.009537,-0.103565,-0.183218,0.004490,0.272250,0.142626,0.106060,0.210836,0.060382,-0.054857,0.042420,-0.153189,-0.240065,0.072518,-0.196466,0.116762,-7.326873,-0.225119,-0.369815,-0.242827,0.162992,0.114415,0.028137,-0.052976,0.071522,-0.063701,0.159390,-0.146770,0.063661,-0.195279,0.318506,0.181844
1497,-0.212149,-0.076728,0.055190,-0.233146,-0.020902,0.041325,0.141727,0.335852,0.155386,-0.159949,0.031903,-0.030581,-0.009845,0.522848,-0.006941,0.208238,0.119222,0.154942,0.169346,-0.096435,-0.066246,-0.357769,-0.153195,0.123195,0.091804,0.021859,0.178684,0.016428,0.154030,0.129672,0.099427,0.140332,-0.075095,-0.233111,-0.144989,-0.102034,0.176075,-0.128918,-0.024960,0.157065,...,0.013512,0.122073,0.189287,0.246285,-0.242550,-0.306299,0.127041,0.094134,0.120842,-0.234104,-0.164013,0.313328,0.283877,0.276368,0.026969,0.235984,0.312396,-0.002278,-0.023461,0.019588,-0.270537,-0.053531,-0.039019,0.161347,-7.209646,-0.058369,-0.134648,-0.114268,-0.014603,-0.074837,0.073483,-0.197092,0.006387,-0.078687,0.269555,-0.224290,-0.026730,-0.120650,0.325458,0.316969
1498,-0.186114,-0.078680,-0.135280,-0.368205,-0.294796,-0.282086,0.090594,0.345995,-0.082809,-0.074644,0.164646,-0.187514,-0.226415,0.309187,-0.084051,0.120422,-0.027516,0.127738,0.104486,0.000606,-0.163169,-0.125617,0.318097,-0.029062,-0.008502,-0.217723,0.147262,0.175482,-0.012480,0.126878,0.088208,0.024506,-0.386643,-0.180911,0.067882,-0.040278,0.321595,0.208379,0.074737,0.146272,...,-0.006339,-0.232416,0.128490,0.145906,0.073259,-0.289090,0.126051,-0.230831,-0.117151,-0.242391,-0.042123,0.129204,0.253589,0.076780,-0.012842,0.643095,0.449543,0.353295,0.177948,-0.083902,-0.078520,0.134436,0.026527,0.035280,-6.210321,-0.239505,-0.397245,0.087502,0.129845,-0.079933,0.041359,-0.330706,0.212895,-0.111953,0.010916,0.101088,-0.052374,-0.288308,0.417482,0.191375


In [None]:
# trainer() and accuracy() iterate over (features, labels) tensor batches, so
# the numpy feature matrices and pandas label series are wrapped in
# TensorDatasets and batched with DataLoader. (pd.concat cannot be used here:
# the features are numpy arrays, not pandas objects.)
# The batch size of 32 matches the `batch_size` setting defined further down.
train_dataset = torch.utils.data.TensorDataset(
    torch.as_tensor(train_features, dtype=torch.float32),
    torch.as_tensor(train_labels.to_numpy(), dtype=torch.long),  # class indices
)
test_dataset = torch.utils.data.TensorDataset(
    torch.as_tensor(test_features, dtype=torch.float32),
    torch.as_tensor(test_labels.to_numpy(), dtype=torch.long),
)

# Shuffle only the training data; evaluation order does not matter.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

TypeError: ignored

In [None]:
# Baseline: logistic regression fitted directly on the [CLS] sentence embeddings.
# NOTE: this rebinds `model`, which until this cell referred to the pretrained
# DistilBERT encoder loaded earlier in the notebook.
model = LogisticRegression()
model.fit(train_features, train_labels)

LogisticRegression()

In [None]:
model.score(test_features, test_labels)

0.846

In [None]:
# Human-readable names for the two label values.
sentiment = {0:"Negative", 1:"Positive"}

# Dataset Parameters
input_nodes = 768  # width of one feature vector (features.shape is (2000, 768))
classes = 2  # number of target classes
batch_size = 32  # mini-batch size setting


In [None]:
def accuracy(data_loader, model, is_test, num_classes=None):
    """Evaluate `model` on every batch of `data_loader`.

    Each batch is moved to `dev`, flattened to ``(batch, -1)`` and passed
    through the model; the predicted class is the arg-max over dimension 1 of
    the output. The model is put in eval mode for the evaluation and switched
    back to train mode before returning.

    Args:
        data_loader: iterable yielding ``(features, labels)`` tensor batches.
        model: module mapping the flattened batch to per-class scores.
        is_test: when true, a confusion matrix is accumulated over all batches.
        num_classes: side length of the confusion matrix. When ``None`` it is
            taken from the width of the model output.

    Returns:
        ``(acc, cm)``: ``acc`` is the fraction of correctly classified samples
        as a 0-d tensor (``0.0`` for an empty loader). ``cm`` is a
        ``(num_classes, num_classes)`` array with true classes on the rows and
        predicted classes on the columns; it is all zeros when `is_test` is
        false.
    """
    correct = 0
    samples = 0
    cm = None
    model.eval()

    with torch.no_grad():
        for data_features, labels in data_loader:
            data_features = data_features.to(device=dev)
            labels = labels.to(device=dev)
            data_features = data_features.reshape(data_features.shape[0], -1)

            outputs = model(data_features)
            _, predictions = outputs.max(1)
            correct += (predictions == labels).sum()
            samples += predictions.size(0)

            if num_classes is None:
                num_classes = outputs.size(1)

            if is_test:
                if cm is None:
                    cm = np.zeros((num_classes, num_classes))
                # confusion_matrix expects (y_true, y_pred). Passing `labels=`
                # pins the shape even when a batch is missing one of the classes.
                cm += confusion_matrix(
                    labels.cpu().numpy(),
                    predictions.cpu().numpy(),
                    labels=np.arange(num_classes),
                )

    model.train()

    if cm is None:
        side = num_classes or 0
        cm = np.zeros((side, side))

    # Guard against an empty loader instead of failing on an unbound `acc`.
    acc = correct / samples if samples else torch.tensor(0.0)
    return acc, cm

In [None]:
# Train Network
def trainer(model, train_dataloader, test_dataloader, epochs, loss_fn, optimizer):
    """Train `model` for `epochs` passes over `train_dataloader`, then test it.

    After every epoch the sample-weighted mean loss and the accuracy on the
    training data (computed with accuracy()) are recorded and printed. Once
    all epochs are done the model is evaluated on `test_dataloader`.

    Returns:
        ``(losses, accuracies, test_acc, cm)``: the per-epoch mean losses, the
        per-epoch training accuracies, and the accuracy and confusion matrix
        returned by accuracy() for the test data.
    """
    epoch_losses, epoch_accuracies = [], []

    # nn.NLLLoss takes log-probabilities, so the model output is passed
    # through log_softmax first when that loss is used.
    needs_log_softmax = isinstance(loss_fn, nn.NLLLoss)

    for epoch_idx in range(epochs):
        running_loss, seen = 0.0, 0

        for batch, targets in tqdm(train_dataloader):
            # Move the batch to the training device and flatten each sample.
            batch = batch.to(device=dev)
            targets = targets.to(device=dev)
            flat = batch.reshape(batch.shape[0], -1)

            # Forward pass.
            scores = model(flat)
            if needs_log_softmax:
                scores = f.log_softmax(scores, dim=1)
            batch_loss = loss_fn(scores, targets)

            # Weight the batch loss by its size so the epoch mean is per sample.
            n_in_batch = scores.size(0)
            running_loss += batch_loss.item() * n_in_batch
            seen += n_in_batch

            # Backward pass and parameter update.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        train_acc, _ = accuracy(train_dataloader, model, False)
        mean_loss = running_loss / seen

        epoch_losses.append(mean_loss)
        epoch_accuracies.append(train_acc)

        print(f"Epoch: {epoch_idx+1} | Accuracy on training set: {train_acc*100:.2f} | Loss: {mean_loss:.5f}")

    test_acc, cm = accuracy(test_dataloader, model, True)

    return epoch_losses, epoch_accuracies, test_acc, cm

In [None]:
def draw_graphs(losses, accuracies, epochs, cm, class_names=None):
    """Plot the loss curve, the accuracy curve and the confusion matrix.

    Args:
        losses: per-epoch loss values (one per epoch).
        accuracies: per-epoch accuracies as fractions; plain floats or 0-d
            tensors are both accepted.
        epochs: number of epochs, used to build the x axis ``1..epochs``.
        cm: square confusion matrix to display.
        class_names: tick labels for the confusion matrix. Defaults to the
            values of the notebook's `sentiment` mapping. If the number of
            names does not match the size of `cm`, the default numeric labels
            of ConfusionMatrixDisplay are used instead.
    """
    x = np.arange(1, epochs + 1)

    fig = plt.figure()
    plt.plot(x, np.asarray(losses, dtype=float))
    fig.suptitle('Loss Vs. Epochs')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.xticks(x)
    plt.show()

    # accuracy() returns 0-d tensors; convert explicitly before scaling to %.
    accuracy_pct = np.array([float(a) for a in accuracies]) * 100

    fig = plt.figure()
    plt.plot(x, accuracy_pct)
    fig.suptitle('Accuracy Vs. Epochs')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy (in %)')
    plt.xticks(x)
    plt.show()

    cm = np.asarray(cm)
    if class_names is None:
        class_names = list(sentiment.values())
    if len(class_names) != cm.shape[0]:
        # A label count that does not match the matrix would make plotting fail.
        class_names = None

    cmd = ConfusionMatrixDisplay(cm, display_labels=class_names)
    fig, ax = plt.subplots(figsize=(10,10))
    cmd.plot(ax=ax)

In [None]:
def metrics(cm):
    """Per-class precision, recall and F1 score from a confusion matrix.

    Args:
        cm: square array whose diagonal holds the correctly classified counts.
            Column sums are treated as the number of predictions per class
            and row sums as the number of true samples per class.

    Returns:
        ``(precision, recall, f_score)``: three 1-d float arrays with one
        entry per class. A ratio whose denominator is zero (class never
        predicted, class never present, or precision + recall == 0) is
        reported as 0.0 instead of NaN.
    """
    def _safe_divide(numerator, denominator):
        # Element-wise division that leaves 0.0 wherever the denominator is 0.
        numerator = np.asarray(numerator, dtype=float)
        denominator = np.asarray(denominator, dtype=float)
        return np.divide(
            numerator,
            denominator,
            out=np.zeros_like(numerator),
            where=denominator != 0,
        )

    cm = np.asarray(cm)
    true_positive = cm.diagonal()
    false_positive = cm.sum(0) - true_positive
    false_negative = cm.sum(1) - true_positive

    precision = _safe_divide(true_positive, true_positive + false_positive)
    recall = _safe_divide(true_positive, true_positive + false_negative)

    f_score = _safe_divide(2 * precision * recall, precision + recall)
    return precision, recall, f_score

In [None]:
# Collects the test accuracy of each training run (appended after training below).
test_accuracies = []

In [None]:
# Feed-forward classifier and its parameters
class NeuralNet(nn.Module):
    """Classifier with a single ReLU hidden layer.

    forward() returns the raw output of the last linear layer (one score per
    class). No sigmoid/softmax is applied here: the training loop applies
    log_softmax itself when nn.NLLLoss is used, and squashing the scores into
    (0, 1) beforehand would flatten the resulting class probabilities.

    Args:
        input_size: number of input features per sample.
        num_classes: number of output scores per sample.
        hidden_size: width of the hidden layer (default 16).
    """

    def __init__(self, input_size, num_classes, hidden_size=16):
        super().__init__()
        self.hidden_layer1 = nn.Linear(input_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to ``(batch, num_classes)`` scores."""
        x = f.relu(self.hidden_layer1(x))
        return self.output_layer(x)


# Seed PyTorch's global RNG so that weight initialisation (and anything else
# drawing from it) is the same on every run of this cell.
torch.manual_seed(42)

# Initialize network
model = NeuralNet(input_size=input_nodes, num_classes=classes).to(dev)

# Hyperparameters
epochs = 2
learning_rate = 0.0001
loss_fn = nn.NLLLoss()   # Loss Function; trainer() applies log_softmax for it
# loss_fn = nn.CrossEntropyLoss()    # Loss Function

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Results
losses, accuracies, test_acc, cm = trainer(model, train_dataloader, test_dataloader, epochs, loss_fn, optimizer)
p, r, fs = metrics(cm)

print("-"*10 + "Test Results" + "-"*10)
print(f"Accuracy on test set: {test_acc*100:.2f}")
print(f"Precision: {p}")
print(f"Recall: {r}")
print(f"F1 score: {fs}")

# Keep this run's test accuracy alongside those of other runs.
test_accuracies.append(test_acc)

draw_graphs(losses, accuracies, epochs, cm)

  0%|          | 0/1500 [00:00<?, ?it/s]


ValueError: ignored