In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import itertools
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.metrics import roc_curve 
import numpy as np
import pandas as pd

import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

from sklearn.model_selection import train_test_split
import tqdm

# 1. Preprocessing

In [2]:
# We define utility functions for text processing.

def removePunctuation(text):
    """
    Remove punctuation symbols and convert text to lowercase
    """
    return text.lower().translate(str.maketrans('', '', string.punctuation))

def removeStopWords(text):
    """
    Filter words that are not found in stop words
    """
    return " ".join([word for word in text.split() if word not in stopwords.words("english")])

def buildDictionary(texts):
    """
    Build the dictionary of words where key is the word and value is the index.
    """
    hashmap = {}
    for text in texts:
        for word in text.split():
            hashmap[word] = hashmap.get(word, 0) + 1
    return {w:i+1 for i, w in enumerate(list(hashmap.keys()))}

def calculateMaxSeqLen(texts):
    """
    Calculates the maximum sequence length found in the corpus
    """
    max_len = float('-inf')
    for text in texts:
        if len(text.split()) > max_len:
            max_len = len(text.split())
            
    return max_len

def lemmatize(text):
    """
    Filter words that are not found in stop words
    """
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [3]:
# Load the *.csv* file and filter the features that are useful for the problem:
data = pd.read_csv("C:/datasets/coronavirus/Corona_NLP_train.csv")

In [4]:
data

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,menyrbie philgahan chrisitv httpstcoifz9fan2pa...,Neutral
1,3800,48752,UK,16-03-2020,advice talk neighbours family exchange phone n...,Positive
2,3801,48753,Vagabonds,16-03-2020,coronavirus australia woolworths give elderly ...,Positive
3,3802,48754,,16-03-2020,food stock one empty please dont panic enough ...,Positive
4,3803,48755,,16-03-2020,ready go supermarket covid19 outbreak im paran...,Extremely Negative
...,...,...,...,...,...,...
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,airline pilots offering stock supermarket shel...,Neutral
41153,44952,89904,,14-04-2020,response complaint provided citing covid19 rel...,Extremely Negative
41154,44953,89905,,14-04-2020,know itâs getting tough kameronwilds rationin...,Positive
41155,44954,89906,,14-04-2020,wrong smell hand sanitizer starting turn coron...,Neutral


In [5]:
# Text preprocessing. Remove punctuation symbols and stop words:
# data["OriginalTweet"] = data["OriginalTweet"].apply(lambda x: removePunctuation(x))
# data["OriginalTweet"] = data["OriginalTweet"].apply(lambda x: removeStopWords(x))
data["OriginalTweet"] = data["OriginalTweet"].apply(lambda x: lemmatize(x))

In [6]:
data["Sentiment"].value_counts()

Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: Sentiment, dtype: int64

In [7]:
data["Sentiment"] = data["Sentiment"].replace("Extremely Positive", 2)
data["Sentiment"] = data["Sentiment"].replace("Positive", 2)
data["Sentiment"] = data["Sentiment"].replace("Neutral", 1)
data["Sentiment"] = data["Sentiment"].replace("Negative", 0)
data["Sentiment"] = data["Sentiment"].replace("Extremely Negative", 0)

In [8]:
classes = ["Negative", "Neutral", "Positive"]

In [9]:
data["Sentiment"].value_counts()

2    18046
0    15398
1     7713
Name: Sentiment, dtype: int64

In [10]:
data = data[["OriginalTweet", "Sentiment"]]

In [11]:
dictionary = buildDictionary(data["OriginalTweet"])
max_seq_len = calculateMaxSeqLen(data["OriginalTweet"])

print(f"Number of words in dictionary: {len(dictionary)}")
print(f"Maximum sequence lenght: {max_seq_len}")

Number of words in dictionary: 83193
Maximum sequence lenght: 46


In [12]:
data["Sentiment"].value_counts() / len(data)

2    0.438467
0    0.374128
1    0.187404
Name: Sentiment, dtype: float64

In [13]:
data

Unnamed: 0,OriginalTweet,Sentiment
0,menyrbie philgahan chrisitv httpstcoifz9fan2pa...,1
1,advice talk neighbour family exchange phone nu...,2
2,coronavirus australia woolworth give elderly d...,2
3,food stock one empty please dont panic enough ...,2
4,ready go supermarket covid19 outbreak im paran...,0
...,...,...
41152,airline pilot offering stock supermarket shelf...,1
41153,response complaint provided citing covid19 rel...,0
41154,know itâs getting tough kameronwilds rationin...,2
41155,wrong smell hand sanitizer starting turn coron...,1


In [14]:
# Split data into training and testing
x_train, x_valid, y_train, y_valid = train_test_split(data["OriginalTweet"], data["Sentiment"], test_size=0.2, stratify=data["Sentiment"])

# 2. Data loaders

In [15]:
y_valid.value_counts()

2    3609
0    3080
1    1543
Name: Sentiment, dtype: int64

In [16]:
y_train.value_counts()

2    14437
0    12318
1     6170
Name: Sentiment, dtype: int64

In [17]:
BATCH_SIZE = 64
LEARNING_RATE = 0.001
DROPOUT = 0.5
INPUT_SIZE = len(dictionary) + 1
EPOCHS = 30

In [18]:
class DataHandler(Dataset):
    """
    Iterator generator for data loader construction
    """
    
    @staticmethod
    def tokenizer(X, max_seq_len, dicitionary):
        """
        Given a sequence of words, tokenize each word with 
        use of `dictionary` and apply padding considering `max_seq_len`
        """
        sequences = []
        for x in X:
            sequence = [0] * max_seq_len
            for idx, word in enumerate(x.split()):
                sequence[idx] = dictionary[word]
            sequences.append(sequence)
            
        return np.array(sequences)
    
    def __init__(self, x, y, max_seq_len, dictionary):
        self.x = self.tokenizer(x, max_seq_len, dictionary)
        self.y = y.to_numpy()
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [19]:
# Initialize iterator objects for the data loader
train = DataHandler(x_train, y_train, max_seq_len, dictionary)
valid = DataHandler(x_valid, y_valid, max_seq_len, dictionary)

In [20]:
# Initialize data loaders
loader_training = DataLoader(train, batch_size=BATCH_SIZE, shuffle=False)
loader_valid = DataLoader(valid, batch_size=BATCH_SIZE, shuffle=False)

In [21]:
txt1 = iter(loader_valid)

In [22]:
x, y = next(txt1)

In [23]:
dictionary

{'menyrbie': 1,
 'philgahan': 2,
 'chrisitv': 3,
 'httpstcoifz9fan2pa': 4,
 'httpstcoxx6ghgfzcc': 5,
 'httpstcoi2nlzdxno8': 6,
 'advice': 7,
 'talk': 8,
 'neighbour': 9,
 'family': 10,
 'exchange': 11,
 'phone': 12,
 'number': 13,
 'create': 14,
 'contact': 15,
 'list': 16,
 'school': 17,
 'employer': 18,
 'chemist': 19,
 'gp': 20,
 'set': 21,
 'online': 22,
 'shopping': 23,
 'account': 24,
 'po': 25,
 'adequate': 26,
 'supply': 27,
 'regular': 28,
 'med': 29,
 'order': 30,
 'coronavirus': 31,
 'australia': 32,
 'woolworth': 33,
 'give': 34,
 'elderly': 35,
 'disabled': 36,
 'dedicated': 37,
 'hour': 38,
 'amid': 39,
 'covid19': 40,
 'outbreak': 41,
 'httpstcobinca9vp8p': 42,
 'food': 43,
 'stock': 44,
 'one': 45,
 'empty': 46,
 'please': 47,
 'dont': 48,
 'panic': 49,
 'enough': 50,
 'everyone': 51,
 'take': 52,
 'need': 53,
 'stay': 54,
 'calm': 55,
 'safe': 56,
 'covid19france': 57,
 'confinement': 58,
 'confinementotal': 59,
 'confinementgeneral': 60,
 'httpstcozrlg0z520j': 61,
 'r

In [24]:
x_valid.reset_index()["OriginalTweet"].loc[0].split() 

['question',
 'medical',
 'expert',
 'could',
 'covid19',
 'passed',
 'standing',
 'behind',
 'smoker',
 'amp',
 'vapers',
 'supermarket',
 'queue',
 'inhaling',
 'exhaled',
 'smokevapour',
 'pas',
 'open',
 'place',
 'output',
 'easily',
 'extends',
 'beyond',
 '2m',
 'banned',
 'outside',
 'temporarily',
 'coronavirus']

In [25]:
for word in x_valid.reset_index()["OriginalTweet"].loc[0].split() :
    print(dictionary[word])

2050
725
3850
433
40
4083
2910
3069
8201
412
47337
64
1410
8202
56883
56884
2961
122
2115
9220
850
12746
2256
27801
3482
570
1250
31


In [26]:
x[0]

tensor([ 2050,   725,  3850,   433,    40,  4083,  2910,  3069,  8201,   412,
        47337,    64,  1410,  8202, 56883, 56884,  2961,   122,  2115,  9220,
          850, 12746,  2256, 27801,  3482,   570,  1250,    31,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0], dtype=torch.int32)

In [27]:
y_valid.reset_index()["Sentiment"].loc[0]

1

In [28]:
y[0].item()

1

# 3. Model

In [29]:
x.shape, len(y)

(torch.Size([64, 46]), 64)

In [30]:
EMBEDDING_DIM = 512
HIDDEN_DIM = 512
LSTM_LAYERS = 2

class TextClassifier(nn.ModuleList):
    """
    LSTM Network definition
    """
    def __init__(self):
        super(TextClassifier, self).__init__()

        self.batch_size = BATCH_SIZE
        self.embedding_dim = EMBEDDING_DIM
        self.hidden_dim = HIDDEN_DIM
        self.LSTM_layers = LSTM_LAYERS
        self.input_size = INPUT_SIZE
        self.max_seq_len = max_seq_len

        self.dropout = nn.Dropout(DROPOUT)
        self.embedding = nn.Embedding(num_embeddings=self.input_size, embedding_dim=self.embedding_dim, padding_idx=0)
        
        self.lstm = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_dim, dropout=DROPOUT,
                            num_layers=self.LSTM_layers, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(in_features=self.hidden_dim*4, out_features=512)
        self.fc2 = nn.Linear(512, 3)
    

    def forward(self, x):        
        embedded = self.embedding(x)

        out, (hidden, cell) = self.lstm(embedded)

        concat_hidden = torch.cat((hidden[0], hidden[1]), dim=1)
        concat_cell = torch.cat((cell[0], cell[1]), dim=1)
        all_states = torch.cat((concat_hidden, concat_cell), dim=1)
        out = torch.relu_(self.fc1(all_states))
        out = self.dropout(out)
        out = self.fc2(out)
        return out.squeeze()

# Identify device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model initialization
model = TextClassifier().to(device)
print(model)

TextClassifier(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(83194, 512, padding_idx=0)
  (lstm): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc1): Linear(in_features=2048, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=3, bias=True)
)


In [31]:
x.shape

torch.Size([64, 46])

In [32]:
y.shape

torch.Size([64])

In [33]:
model(x.cuda()).shape

torch.Size([64, 3])

In [34]:
# Optimizer initialization
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

In [35]:
# def calculate_accuracy_with_selected_threshold(grand_truth, predictions):
#     """
#     Accuracy calcuation: (tp + tn) / N
#     """
#     true_positives, true_negatives = 0, 0
#     fpr, tpr, thr = roc_curve(grand_truth, predictions)
#     roc = pd.DataFrame({'fpr':fpr, 'tpr':tpr, 'thr':thr});
#     roc['random'] = roc['fpr'].copy()
#     roc['diff'] = roc['tpr'] - roc['fpr']
#     roc = roc.sort_values('diff', ascending=False).reset_index().drop('index', axis=1)
    
#     for true, pred in zip(grand_truth, predictions):
#         if (pred > roc.loc[0,'thr']) and (true == 1):
#             true_positives += 1
#         elif (pred < roc.loc[0,'thr']) and (true == 0):
#             true_negatives += 1
#     threshold = roc.loc[0,'thr']
#     return ((true_positives+true_negatives) / len(grand_truth)), threshold

# def calculate_accuray_with_point_five(grand_truth, predictions):
#     """
#     Accuracy calcuation: (tp + tn) / N
#     """
#     true_positives, true_negatives = 0, 0    
#     for true, pred in zip(grand_truth, predictions):
#         if (pred > 0.5) and (true == 1):
#             true_positives += 1
#         elif (pred < 0.5) and (true == 0):
#             true_negatives += 1
#     return (true_positives+true_negatives) / len(grand_truth)

In [36]:
# Training
EPOCHS = 3
valid_loss_min = np.Inf
clip = 0.2
for epoch in range(EPOCHS):
    model.train()
    train_predictions = []
    train_loss = 0.0
    valid_loss = 0.0
    train_counter = 0
    valid_counter = 0
    
    for x_batch, y_batch in tqdm.tqdm(loader_training, desc=f"training epoch {epoch+1}"):
        
        train_counter += 1
        
        x = x_batch.type(torch.LongTensor).to(device)
        y = y_batch.type(torch.int64).to(device)
        
        y_pred = model(x)
        
        loss = criterion(y_pred, y)
        
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        train_loss += loss.item()
    
    # Evaluation
    with torch.no_grad():
        model.eval()
        valid_predictions = []
        for x_batch, y_batch in tqdm.tqdm(loader_valid, desc=f"validation epoch {epoch+1}"):
            valid_counter += 1
            x = x_batch.type(torch.LongTensor).to(device)
            y = y_batch.type(torch.int64).to(device)
            
            y_pred = model(x)
            
            loss = criterion(y_pred, y)
            valid_loss += loss.item()
            
    train_loss = train_loss / train_counter
    valid_loss = valid_loss / valid_counter


    print(f"epoch: {epoch+1}, train_loss: {train_loss:.5f}, valid_loss: {valid_loss:.5f}")
    if valid_loss <= valid_loss_min:
        print('Valid loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
        # change the name, for saving multiple files
        torch.save(model.state_dict(), 'text_clf.pt')
        valid_loss_min = valid_loss
        print("=========================================")

training epoch 1: 100%|██████████████████████████████████████████████████████████████| 515/515 [00:49<00:00, 10.31it/s]
validation epoch 1: 100%|████████████████████████████████████████████████████████████| 129/129 [00:04<00:00, 31.00it/s]


epoch: 1, train_loss: 0.80977, valid_loss: 0.66062
Valid loss decreased (inf --> 0.660620).  Saving model ...


training epoch 2: 100%|██████████████████████████████████████████████████████████████| 515/515 [00:49<00:00, 10.36it/s]
validation epoch 2: 100%|████████████████████████████████████████████████████████████| 129/129 [00:04<00:00, 31.22it/s]


epoch: 2, train_loss: 0.51025, valid_loss: 0.63703
Valid loss decreased (0.660620 --> 0.637033).  Saving model ...


training epoch 3: 100%|██████████████████████████████████████████████████████████████| 515/515 [00:49<00:00, 10.40it/s]
validation epoch 3: 100%|████████████████████████████████████████████████████████████| 129/129 [00:04<00:00, 31.34it/s]


epoch: 3, train_loss: 0.29983, valid_loss: 0.75533


training epoch 4:  98%|████████████████████████████████████████████████████████████▌ | 503/515 [00:48<00:01, 10.36it/s]


KeyboardInterrupt: 

In [37]:
model.load_state_dict(torch.load('text_clf.pt'))

<All keys matched successfully>

In [38]:
# track valid loss
valid_loss = 0.0
class_correct = list(0. for i in range(3))
class_total = list(0. for i in range(3))
valid_counter = 0

with torch.no_grad():
    model.eval()
    # iterate over valid data
    for data, target in tqdm.tqdm(loader_valid, desc="accuracy calculation"):
        valid_counter += 1
        # move tensors to GPU if CUDA is available
        data, target = data.cuda(), target.cuda()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the batch loss
        loss = criterion(output, target)
        # update valid loss 
        valid_loss += loss.item()
        # convert output probabilities to predicted class
        _, pred = torch.max(output, 1)    
        # compare predictions to true label
        correct_tensor = pred.eq(target.data.view_as(pred))
        correct = np.squeeze(correct_tensor.cpu().numpy())
        # calculate valid accuracy for each object class
        for i in range(target.shape[0]):
            label = target.data[i]
            class_correct[label] += correct[i].item()
            class_total[label] += 1

# average valid loss
valid_loss = valid_loss / valid_counter
print('valid Loss: {:.6f}\n'.format(valid_loss))

for i in range(3):
    if class_total[i] > 0:
        print('valid Accuracy of %5s: %2d%% (%2d/%2d)' % (
            classes[i], 100 * class_correct[i] / class_total[i],
            np.sum(class_correct[i]), np.sum(class_total[i])))
    else:
        print('valid Accuracy of %5s: N/A (no training examples)' % (classes[i]))

print('\nvalid Accuracy (Overall): %2d%% (%2d/%2d)' % (
    100. * np.sum(class_correct) / np.sum(class_total),
    np.sum(class_correct), np.sum(class_total)))

accuracy calculation: 100%|██████████████████████████████████████████████████████████| 129/129 [00:05<00:00, 22.92it/s]

valid Loss: 0.637033

valid Accuracy of Negative: 78% (2431/3080)
valid Accuracy of Neutral: 65% (1005/1543)
valid Accuracy of Positive: 79% (2876/3609)

valid Accuracy (Overall): 76% (6312/8232)



