In [71]:
pip install transformers



In [72]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [73]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', sep='\t', header=None)

In [74]:
batch1 = df[:2000]

In [75]:
batch1

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1
...,...,...
1995,too bland and fustily tasteful to be truly pru...,0
1996,it does n't work as either,0
1997,this one aims for the toilet and scores a dire...,0
1998,in the name of an allegedly inspiring and easi...,0


In [76]:
batch1[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

In [77]:
# importing pretrained Distilbert and tokenizer
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

In [78]:
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [79]:
# Tokenization for the comments

tokens = batch1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [80]:
tokens

0       [101, 1037, 18385, 1010, 6057, 1998, 2633, 182...
1       [101, 4593, 2128, 27241, 23931, 2013, 1996, 62...
2       [101, 2027, 3653, 23545, 2037, 4378, 24185, 10...
3       [101, 2023, 2003, 1037, 17453, 14726, 19379, 1...
4       [101, 5655, 6262, 1005, 1055, 12075, 2571, 376...
                              ...                        
1995    [101, 2205, 20857, 1998, 11865, 16643, 2135, 5...
1996    [101, 2009, 2515, 1050, 1005, 1056, 2147, 2004...
1997    [101, 2023, 2028, 8704, 2005, 1996, 11848, 199...
1998    [101, 1999, 1996, 2171, 1997, 2019, 9382, 1898...
1999    [101, 1996, 3185, 2003, 25757, 2011, 1037, 244...
Name: 0, Length: 2000, dtype: object

In [81]:
max_len = 0
for i in tokens.values:
  if len(i)> max_len:
    max_len = len(i)

In [82]:
max_len

59

In [83]:
padded = np.array([i + [0]*(max_len - len(i)) for i in tokens.values])

In [84]:
padded.shape

(2000, 59)

In [85]:
attention_mask = np.where(padded != 0,1,0 ) # bert will confuse with the padded values so we need to tell the model to ignore it so we are creating this.
attention_mask.shape

(2000, 59)

In [86]:
input_id = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

In [87]:
with torch.no_grad():
  last_hidden_states = model(input_id, attention_mask=attention_mask)

In [88]:
last_hidden_states

BaseModelOutput([('last_hidden_state',
                  tensor([[[-0.2159, -0.1403,  0.0083,  ..., -0.1369,  0.5867,  0.2011],
                           [-0.2471,  0.2468,  0.1008,  ..., -0.1631,  0.9349, -0.0715],
                           [ 0.0558,  0.3573,  0.4140,  ..., -0.2430,  0.1770, -0.5080],
                           ...,
                           [-0.0165,  0.1179,  0.3512,  ..., -0.2401,  0.2722, -0.1750],
                           [ 0.0961,  0.0667,  0.3147,  ..., -0.3277,  0.3556, -0.2135],
                           [ 0.0454,  0.0519,  0.3168,  ..., -0.2880,  0.1844, -0.1042]],
                  
                          [[-0.1726, -0.1448,  0.0022,  ..., -0.1744,  0.2139,  0.3720],
                           [ 0.0022,  0.1684,  0.1269,  ..., -0.1888, -0.0195, -0.0283],
                           [ 0.0257, -0.2458,  0.0717,  ..., -0.4339,  0.1622,  0.0133],
                           ...,
                           [ 0.0505, -0.0493,  0.0463,  ..., -0.0448, -0.054

In [89]:
features = last_hidden_states[0][:,0,:].numpy()

In [90]:
features

array([[-0.21593425, -0.14028913,  0.00831123, ..., -0.13694875,
         0.5867002 ,  0.20112707],
       [-0.1726271 , -0.14476179,  0.0022342 , ..., -0.17442557,
         0.21386456,  0.37197497],
       [-0.05063327,  0.07203942, -0.02959672, ..., -0.07148942,
         0.71852356,  0.26225507],
       ...,
       [-0.27829766, -0.24803594,  0.13585812, ..., -0.19039148,
         0.13099591,  0.3497837 ],
       [-0.03667733,  0.10638569, -0.01111017, ..., -0.1120664 ,
         0.41619503,  0.50338006],
       [ 0.12402605,  0.01425171,  0.01038414, ..., -0.11606554,
         0.5345917 ,  0.2749535 ]], dtype=float32)

In [91]:
labels = batch1[1]

In [92]:
labels

0       1
1       0
2       0
3       1
4       1
       ..
1995    0
1996    0
1997    0
1998    0
1999    0
Name: 1, Length: 2000, dtype: int64

In [93]:
X_train,X_test,y_train,y_test = train_test_split(features, labels, test_size = 0.2, random_state = 0)

In [94]:
log_reg = LogisticRegression()

In [95]:
log_reg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [96]:
y_pred = log_reg.predict(X_test)

In [97]:
y_pred

array([1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,

In [98]:
log_reg.score(X_test, y_test)

0.82

In [99]:
from sklearn.metrics import accuracy_score

acc_score = accuracy_score(y_pred, y_test)

In [100]:
acc_score

0.82

In [101]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_pred, y_test)

In [102]:
cm

array([[158,  40],
       [ 32, 170]])