In [42]:
!pip install transformers



In [43]:
!pip install torch===1.7.1 torchvision===0.8.2 torchaudio===0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html


In [44]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load dataset


In [45]:
df = pd.read_csv('Corona_NLP_train.csv', delimiter=',', encoding='latin-1', header=None)

In [46]:
df

Unnamed: 0,0,1,2,3,4,5
0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
1,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
2,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
3,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
4,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
...,...,...,...,...,...,...
41153,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral
41154,44952,89904,,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative
41155,44953,89905,,14-04-2020,You know itÂs getting tough when @KameronWild...,Positive
41156,44954,89906,,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral


In [47]:
df = df[df.columns[4:6]]
df.columns = [0,1]
df = df.iloc[1:].sample(frac=1).reset_index(drop=True)
df.head(10)

Unnamed: 0,0,1
0,You know shit is real when the supermarket is ...,Negative
1,New England-based seafood manufacturer has the...,Neutral
2,YouÂre not working this weekend but corporate...,Neutral
3,"Friends! It's March 25, 2020 at 03:00PM- time ...",Extremely Positive
4,Mexico's #crudeoil basket price closed at $18....,Negative
5,Consumers in Labasa are rushing to supermarket...,Neutral
6,As COVID 19 continues to spread our Chief Econ...,Neutral
7,One wonders if when there finally is a Covid 1...,Negative
8,@EmmMacfarlane @britjpncdn We are not dumping ...,Positive
9,This pandemic had really exposed a lot outside...,Extremely Positive


In [48]:
df[1].value_counts()

Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: 1, dtype: int64

In [49]:
df = df.loc[df[1] != 'Neutral'].reset_index(drop=True)
df[1].value_counts()

Positive              11422
Negative               9917
Extremely Positive     6624
Extremely Negative     5481
Name: 1, dtype: int64

In [50]:
df = df[:2000]
df.head(10)

Unnamed: 0,0,1
0,You know shit is real when the supermarket is ...,Negative
1,"Friends! It's March 25, 2020 at 03:00PM- time ...",Extremely Positive
2,Mexico's #crudeoil basket price closed at $18....,Negative
3,One wonders if when there finally is a Covid 1...,Negative
4,@EmmMacfarlane @britjpncdn We are not dumping ...,Positive
5,This pandemic had really exposed a lot outside...,Extremely Positive
6,#WhenThisIsAllOver\r\r\nI'm going to join The ...,Negative
7,A Cumberland County student spent spring break...,Extremely Positive
8,"NHS workers, delivery drivers, supermarket sta...",Positive
9,Thanks Gran..\r\r\n\r\r\n#StopHoarding #Corona...,Positive


In [51]:
import re

def preprocess1(text):
        """Minimal text preprocessing
        """
        text = re.sub("^\s+|(@[A-Za-z0-9\_\-]+)|\n|\r|\s+$", ' ', text)
        return text.strip()

def preprocess2(text):
    if text == 'Negative' or text == 'Extremely Negative':
        return 0
    else:
        return 1
    

In [52]:
df[0] = df.apply(lambda x: preprocess1(x[0]), axis=1)
df[1] = df.apply(lambda y: preprocess2(y[1]), axis=1)

In [53]:
df[1].value_counts()

1    1094
0     906
Name: 1, dtype: int64

# Loading model

In [54]:
model_name = 'bert-base-cased'

tokenizer = transformers.BertTokenizer.from_pretrained(model_name)
model = transformers.BertModel.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




# Preparing the Dataset

In [55]:
tokenized = df[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [56]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [57]:
np.array(padded).shape

(2000, 152)

In [58]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(2000, 152)

## Train

In [59]:
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [60]:
last_hidden_states[0]

tensor([[[ 3.9186e-01,  3.2999e-01, -8.8564e-02,  ..., -1.2987e-01,
           4.9263e-01,  6.1227e-02],
         [ 3.5300e-01,  2.8438e-01,  2.1234e-01,  ...,  5.3138e-01,
           1.5911e-01,  6.3306e-01],
         [ 4.6347e-01,  4.8181e-01,  1.3075e-01,  ...,  6.3777e-01,
          -3.5892e-01, -6.9674e-02],
         ...,
         [-2.8272e-02,  1.1382e-01, -3.1168e-01,  ..., -8.6519e-02,
           1.5277e-01, -2.7097e-01],
         [-1.8611e-02,  1.1986e-01, -2.7890e-01,  ..., -6.2159e-02,
           1.3307e-01, -2.5080e-01],
         [ 2.8356e-02,  1.8315e-01, -2.9012e-01,  ..., -7.1138e-02,
           1.5453e-01, -2.1126e-01]],

        [[ 5.7915e-01,  1.8837e-01, -9.5376e-02,  ..., -4.5307e-01,
           4.7874e-01,  4.0141e-02],
         [ 3.7257e-01, -2.3702e-01,  4.2405e-01,  ..., -4.2023e-01,
           2.6787e-01, -1.3222e-01],
         [ 3.6418e-01,  2.8576e-01,  3.4458e-01,  ...,  2.9952e-01,
           1.6636e-01,  3.5639e-01],
         ...,
         [ 2.3137e-01, -3

In [61]:
features = last_hidden_states[0][:,0,:].numpy()

In [62]:
labels = df[1]

## Model #2 Log regression

In [63]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [64]:
lr_clf = LogisticRegression(C=0.0001)
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=0.0001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [65]:
lr_clf.score(test_features, test_labels)

0.554