# Sentiment Analysis with twitter API

In this work, we want to find if the tag in twitter (e.g. "iphone) is positive or negative by using
sentiment analysis on the post related to tag. Then use our model to determine if it contains positive or negative sentiment.

## Data preparation

Just loading and do some indexing. Nothing much.

In [1]:
# Import necessary libraries.
import torch, torchdata, torchtext
from torch import nn
import time

# Check if we can use CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Use for reproducability
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [2]:
# Download dataset for sentiment analysis.
import pytreebank
dataset = pytreebank.load_sst()
example = dataset["train"][0]

In [3]:
# There are 5 labels for each sentence.
# ["very negative", "negative", "neutral", "positive", "very positive"]
# [0, 1, 2, 3, 4]
for label, sentence in example.to_labeled_lines():
    print(label)
    print(sentence)
    break

3
The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .


In [4]:
def unpack(data):
    temp_data  = list()
    temp_label = list()
    lenght = len(list(iter(data)))
    for i in range(lenght):
        for label, sentence in data[i].to_labeled_lines():
            temp_data.append(sentence)
            temp_label.append(label)
    
    return (temp_data, temp_label)

In [5]:
# We do not want to use the treebank structure.
# Seperated into train, dev and test set
train_data, train_label = unpack(dataset["train"])
valid_data, valid_label = unpack(dataset["dev"])
test_data,  test_label  = unpack(dataset["test"])

# Check the lenght of each data set
train_size = len(train_data)
valid_size = len(valid_data)
test_size =  len(test_data)

# This is the same as train_data above, it is just in the format of tree.
# I do this for the sake of convenience when we trying to build dataloader.
# But IT IS NOT A GOOD PRACTICE!
train = dataset["train"]
valid = dataset["dev"]
test  = dataset["test"]

t_train_size = len(list(iter(train)))
t_valid_size = len(list(iter(valid)))
t_test_size = len(list(iter(test)))

print(t_train_size, t_valid_size, t_test_size)
print(train_size, valid_size, test_size)
print(len(train_label), len(valid_label), len(test_label))

8544 1101 2210
318582 41447 82600
318582 41447 82600


In [6]:
# Let's take a look
train_data[0], train_label[0]

("The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .",
 3)

In [39]:
# Tokenize
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('spacy', language='en_core_web_md')

In [8]:
#Numericalization

from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(data_iter):  #data_iter = train, test, validation
    for data in data_iter:  # Look for the tree
        for _, text in data.to_labeled_lines(): # Get the data inside tree
            yield tokenizer(text)
        
vocab = build_vocab_from_iterator(yield_tokens(train), specials=['<unk>', '<pad>',
                                                                 '<bos>', '<eos>'])

In [9]:
# If it's don't know the word
vocab.set_default_index(vocab["<unk>"])

In [10]:
# Check if our vocab is working.
print(vocab(['Chaky', 'wants', 'his', 'student', 'to', 'be', 'number', '1', '.']))
print(vocab(['<pad>','<bos>','<eos>']))

[0, 919, 36, 2733, 9, 28, 908, 3233, 10]
[1, 2, 3]


In [11]:
# Create index2word index.
id2word = vocab.get_itos()
id2word[0]

'<unk>'

In [12]:
# Check the lenght of vocab
len(vocab)

17136

## Prepare Embedding

In [27]:
# We use Fasttext embedding
from torchtext.vocab import FastText

# Create fast_vectors
#fast_vectors = FastText(language='en')

# This is the code for saving the vector that we download as pickle.

# import pickle 
# object = fast_vectors
# filehandler = open('/root/projects/NLP/Assignment/8_Feb_Sentiment_Analysis/data/Fast_vector_en.pkl', 'wb') 
# pickle.dump(object, filehandler)

# If we already have the Fast vector in your device, you can use that.
# Otherwise you need to download for a long time.
import pickle
filehandler = open('/root/projects/NLP/Assignment/8_Feb_Sentiment_Analysis/data/Fast_vector_en.pkl', 'rb') 
fast_vectors = pickle.load(filehandler)

In [39]:
# Now that we get the vectors, it's time to create embedding.
fast_embedding = fast_vectors.get_vecs_by_tokens(vocab.get_itos()).to(device)

In [40]:
# Let check the shape
fast_embedding.shape

torch.Size([17136, 300])

## Prepare Dataloader

In [37]:
text_pipeline  = lambda x: vocab(tokenizer(x))
#label_pipeline = lambda x: int(x) - 1  #1, 2, 3, 4 ---> 0, 1, 2, 3 #

In [None]:
# Testing text_pipeline
text_pipeline("I love to play football")

In [43]:
t = torch.empty(3, 4, 5)
t.size()
torch.Size([3, 4, 5])
t.size(0)

3

In [None]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence #making each batch same length

pad_ix = vocab['<pad>']

#this function gonna be called by DataLoader
def collate_batch(batch):
    label_list, text_list, length_list = [], [], []
    for (_label, _text) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        length_list.append(processed_text.size(0)) #for padding, this keep the lenght of sequence.
        
    return torch.tensor(label_list, dtype=torch.int64), \
        pad_sequence(text_list, padding_value=pad_ix, batch_first=True), \
        torch.tensor(length_list, dtype=torch.int64)  # The pad_seq functions automatically do the work.

In [45]:
# We need the data in the format of tuples .. e.g. (label, text)

def merge(list1, list2):
    merged_list = [(list1[i], list2[i]) for i in range(0, len(list1))]
    return merged_list


training_data   = merge(train_label, train_data)
validation_data = merge(valid_label, valid_data)
testing_data    = merge(test_label,  test_data)

In [46]:
# Test the one that we created.
training_data[0]

(3,
 "The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .")

In [47]:
# The one that we already have.
# Exactly the same!
train_label[0], train_data[0]

(3,
 "The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .")

In [48]:
batch_size = 64

train_loader = DataLoader(training_data, batch_size = batch_size,
                          shuffle=True, collate_fn=collate_batch)

val_loader   = DataLoader(validation_data, batch_size = batch_size,
                          shuffle=True, collate_fn=collate_batch)

test_loader  = DataLoader(testing_data, batch_size = batch_size,
                          shuffle=True, collate_fn=collate_batch)

In [49]:
for label, text, length in train_loader:
    break
print("Label shape: ", label.shape) # (batch_size, )
print("Text shape: ", text.shape)   # (batch_size, seq len)

Label shape:  torch.Size([64])
Text shape:  torch.Size([64, 27])


## Prepare Model
Basically in this part, we will just define LSTM neural network and function for training.

In [84]:
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, output_dim, num_layers, bidirectional, dropout):
        super().__init__()
        #put padding_idx so asking the embedding layer to ignore padding
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_ix)
        self.lstm = nn.LSTM(emb_dim, 
                           hid_dim, 
                           num_layers=num_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        self.fc = nn.Linear(hid_dim * 2, output_dim)
        
    def forward(self, text, text_lengths):
        #text = [batch size, seq len]
        embedded = self.embedding(text)
        
        #++ pack sequence ++
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'), enforce_sorted=False, batch_first=True)
        
        #embedded = [batch size, seq len, embed dim]
        packed_output, (hn, cn) = self.lstm(packed_embedded)  #if no h0, all zeroes
        
        #++ unpack in case we need to use it ++
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        
        #output = [batch size, seq len, hidden dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        hn = torch.cat((hn[-2,:,:], hn[-1,:,:]), dim = 1)
        #hn = [batch size, hidden dim * num directions]
        
        return self.fc(hn)

In [88]:
#explicitly initialize weights for better learning
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.zeros_(param)
            elif 'weight' in name:
                nn.init.orthogonal_(param)

In [52]:
input_dim  = len(vocab)
hid_dim    = 256
emb_dim    = 300         # Why 300, we do not know depend on you.
output_dim = 5 # [0, 1, 2, 3, 4] # We have 5 class

#for biLSTM
num_layers = 2
bidirectional = True
dropout = 0.5

model = LSTM(input_dim, emb_dim, hid_dim, output_dim, num_layers, bidirectional, dropout).to(device)
model.apply(initialize_weights)
model.embedding.weight.data = fast_embedding #**<------applied the fast text embedding as the initial weights

In [53]:
#we can print the complexity by the number of parameters
def count_parameters(model):
    params = [p.numel() for p in model.parameters() if p.requires_grad]
    for item in params:
        print(f'{item:>6}')
    print(f'______\n{sum(params):>6}')
    
count_parameters(model)

5140800
307200
262144
  1024
  1024
307200
262144
  1024
  1024
524288
262144
  1024
  1024
524288
262144
  1024
  1024
  2560
     5
______
7863109


In [27]:
import torch.optim as optim

lr=1e-3

#training hyperparameters
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss() #combine softmax with cross entropy

In [55]:
def accuracy(preds, y):
    
    predicted = torch.max(preds.data, 1)[1]
    batch_corr = (predicted == y).sum()
    acc = batch_corr / len(y)
    
    return acc

In [56]:
def train(model, loader, optimizer, criterion, loader_length):
    epoch_loss = 0
    epoch_acc = 0
    model.train() #useful for batchnorm and dropout
    
    for i, (label, text, text_length) in enumerate(loader): 
        label = label.to(device) #(batch_size, )
        text = text.to(device) #(batch_size, seq len)
                
        #predict
        predictions = model(text, text_length).squeeze(1) #output by the fc is (batch_size, 1), thus need to remove this 1
        
        #calculate loss
        loss = criterion(predictions, label)
        acc  = accuracy(predictions, label)
        
        #backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
                        
    return epoch_loss / loader_length, epoch_acc / loader_length

In [57]:
def evaluate(model, loader, criterion, loader_length):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for i, (label, text, text_length) in enumerate(loader): 
            label = label.to(device) #(batch_size, )
            text  = text.to(device)  #(seq len, batch_size)

            predictions = model(text, text_length).squeeze(1) 
            
            loss = criterion(predictions, label)
            acc  = accuracy(predictions, label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / loader_length, epoch_acc / loader_length

In [58]:
# Function to calculate time.
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## Training!

In [59]:
train_loader_length = len(list(iter(train_loader)))
val_loader_length   = len(list(iter(val_loader)))
test_loader_length  = len(list(iter(test_loader)))

In [60]:
best_valid_loss = float('inf')
num_epochs      = 8
tolerance_counter = 0

save_path = f'/root/projects/NLP/Assignment/8_Feb_Sentiment_Analysis/weights/{model.__class__.__name__}.pt'

train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(num_epochs):
    
    start_time = time.time()

    train_loss, train_acc = train(model, train_loader, optimizer, criterion, train_loader_length)
    valid_loss, valid_acc = evaluate(model, val_loader, criterion, val_loader_length)
    
    #for plotting
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    valid_losses.append(valid_loss)
    valid_accs.append(valid_acc)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        tolerance_counter = 0
        torch.save(model.state_dict(), save_path)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')   
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    # Tolerance techniques, stop the model if it start to overfit.
    if tolerance_counter >= 3:
        break

    tolerance_counter = tolerance_counter + 1

Epoch: 01 | Time: 1m 46s
	Train Loss: 0.468 | Train Acc: 81.03%
	 Val. Loss: 0.449 |  Val. Acc: 82.25%
Epoch: 02 | Time: 1m 48s
	Train Loss: 0.351 | Train Acc: 85.33%
	 Val. Loss: 0.452 |  Val. Acc: 82.07%
Epoch: 03 | Time: 1m 53s
	Train Loss: 0.311 | Train Acc: 86.96%
	 Val. Loss: 0.472 |  Val. Acc: 81.69%
Epoch: 04 | Time: 1m 56s
	Train Loss: 0.283 | Train Acc: 88.08%
	 Val. Loss: 0.469 |  Val. Acc: 82.19%


In [61]:
def predict(text, text_length):
    with torch.no_grad():
        output = model(text, text_length).squeeze(1)
        predicted = torch.max(output.data, 1)[1]
        return predicted

In [66]:
def sentence_checking(test_list):
    predict_list = list()
    for sent in test_list:
        text = torch.tensor(text_pipeline(sent)).to(device)
        text_list = [x.item() for x in text]
        text = text.reshape(1, -1)
        text_length = torch.tensor([text.size(1)]).to(dtype=torch.int64)
        predict_list.append(predict(text, text_length))
    return predict_list

In [73]:
#["very negative", "negative", "neutral", "positive", "very positive"]
test_case = ['The movie should have been good', # Negative
    'What is not to like about this product.', # Negative
    "The price is not so bad", # Positive
    'This software is not buggy'] # Positive

print(sentence_checking(test_case))

# Our model is not so smart, but I this is the best that I can do for now.
# I think it is mostly because the datasets is not that diverse.

[tensor([2], device='cuda:0'), tensor([1], device='cuda:0'), tensor([2], device='cuda:0'), tensor([2], device='cuda:0')]


# Below are in construction
I tried to train with other datasets with the same model but I have a problem which I suspect occuring due to the batch size.

## Improving model Accruacy
As we can see from the results above our model is not so smart, I think the problem lie in the 
data diversity. Let's try to improve our model accruacy by feeding it's with more data.

### Data preparation

We want to make it learn imdb dataset with the same weight.

In [1]:
import pandas as pd

# Let's import imdb dataset.
df = pd.read_csv('/root/projects/NLP/Assignment/8_Feb_Sentiment_Analysis/data/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [2]:
# Check the null values
df.isnull().values.any()

False

In [None]:
# Maybe we should clean it's a little bit and encoding the sentiment to positive and negative

from spacy.lang.en.stop_words import STOP_WORDS
import spacy
import re

nlp = spacy.load('en_core_web_md')

def preprocessing(sentence):
    
    # Clear the html tag by using regular expression.
    sentence = re.sub("<[^>]*>", "", sentence)
    
    stopwords = list(STOP_WORDS)
    doc = nlp(sentence)
    cleaned_tokens = []
    
    for token in doc:
        if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != 'SPACE' and \
            token.pos_ != 'SYM' and token.pos_!= 'X':
                cleaned_tokens.append(token.lemma_.lower().strip())
                
    return " ".join(cleaned_tokens)

In [40]:
# The original.
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [41]:
# The preprocessing one, look much better.
preprocessing(df['review'][1])

"a wonderful little production the film technique unassuming- old time bbc fashion give comforting discomforting sense realism entire piece the actor extremely chosen- michael sheen get polari voice pat you truly seamless editing guide reference williams ' diary entry worth watching terrificly write perform piece a masterful production great master comedy life the realism come home little thing fantasy guard use traditional dream technique remain solid disappear it play knowledge sense particularly scene concern orton halliwell set particularly flat halliwell mural decorate surface terribly"

In [42]:
# Apply the cleaning to entire dataframe
for i, row in df.iterrows():
    clean_text = preprocessing(row.review)
    df.at[i, 'Clean_review'] = clean_text

In [47]:
# Let check the sentiment values
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [52]:
# Encoding the sentiment
for i, row in df.iterrows():
    cur_sentiment = row.sentiment
    if cur_sentiment == 'positive':
        df.at[i, 'Encoded_sentiment'] = 1
    else:
        df.at[i, 'Encoded_sentiment'] = 0

In [54]:
# Save the data cleaning csv for later use.
#df.to_csv('/root/projects/NLP/Assignment/8_Feb_Sentiment_Analysis/data/Cleaned_IMDB.csv')

### Prepare train loader

In [2]:
# Code to use the csv that we just save.
import pandas as pd
device = 'cpu' # We have some problems with the cuda so cpu will do for now.
df = pd.read_csv('/root/projects/NLP/Assignment/8_Feb_Sentiment_Analysis/data/Cleaned_IMDB.csv')
df = df[["Clean_review", "Encoded_sentiment"]] # Positive equal 1 and Negative equal 0
df.head()

Unnamed: 0,Clean_review,Encoded_sentiment
0,one reviewer mention watch 1 oz episode hook t...,1.0
1,a wonderful little production the film techniq...,1.0
2,i think wonderful way spend time hot summer we...,1.0
3,basically family little boy jake think zombie ...,0.0
4,petter mattei love time money visually stunnin...,1.0


In [3]:
# Seperate data into train, valid and test sets

from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

train, valid = train_test_split(df, test_size=0.1)

print(len(train), len(valid), len(test))

45000 5000 10000


In [4]:
# Function to convert the df to the format that we want. e.g.. (label, text)
def convert_format(df):
    temp = list()
    for i, row in df.iterrows():
        text = row[0]
        label = row[1]
        temp.append((label, text)) 
    return temp

In [5]:
train = convert_format(train)
test  = convert_format(test)
valid = convert_format(valid)

In [6]:
train_text = [text[1].split() for text in train]

# Get word sequences and unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(train_text))) # Unique set of word
print(vocab[0:5])
print(len(vocab))

['spaz', 'definate', 'isenberg', 'age-', 'howie']
113860


In [7]:
#from collections import defaultdict
# Function to build the w2idx and id2word.
def build_vocab(docs, max_vocab=1000000):

  stoi = {'<unk>':0, '<pad>':1, '<bos>':2, '<eos>':3}
  itos = {0:'<unk>', 1:'<pad>', 2:'<bos>', 3:'<eos>'}
  idx = 4

  for word in docs:
    if word not in stoi:
      if len(stoi) < max_vocab:
        stoi[word] = idx
        itos[idx] = word
        idx += 1
        
  return stoi, itos

In [8]:
# Create word to index and index to word with the function we just define.
# It is in the dictionary form
word2id, id2word = build_vocab(vocab)

In [9]:
def text_pipeline_2(sentence):
    temp_list = list()
    tokenizer_sentence = tokenizer(sentence)

    for word in tokenizer_sentence:
        try: 
            temp_list.append(word2id[word])
        except:
            temp_list.append(0)
    return temp_list


In [10]:
# Same one from the above

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence #making each batch same length

pad_ix = word2id['<pad>']

#this function gonna be called by DataLoader
def collate_batch(batch):
    label_list, text_list, length_list = [], [], []
    for (_label, _text) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(text_pipeline_2(_text), dtype=torch.int64)
        text_list.append(processed_text)
        length_list.append(processed_text.size(0)) #for padding, this keep the lenght of sequence.
        
    return torch.tensor(label_list, dtype=torch.int64), \
        pad_sequence(text_list, padding_value=pad_ix, batch_first=True), \
        torch.tensor(length_list, dtype=torch.int64)  # The pad_seq functions automatically do the work.

In [11]:
batch_size = 32

train_loader = DataLoader(train, batch_size = batch_size,
                          shuffle=True, collate_fn=collate_batch)

val_loader   = DataLoader(valid, batch_size = batch_size,
                          shuffle=True, collate_fn=collate_batch)

test_loader  = DataLoader(test, batch_size = batch_size,
                          shuffle=True, collate_fn=collate_batch)

In [26]:
# Just checking and import get_tokenizer here for the sake of convenience.

from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('spacy', language='en_core_web_md')

for label, text, length in train_loader:
    break
print("Label shape: ", label.shape) # (batch_size, )
print("Text shape: ", text.shape)   # (batch_size, seq len)

Label shape:  torch.Size([32])
Text shape:  torch.Size([32, 383])


In [13]:
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, output_dim, num_layers, bidirectional, dropout):
        super().__init__()
        #put padding_idx so asking the embedding layer to ignore padding
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_ix)
        self.lstm = nn.LSTM(emb_dim, 
                           hid_dim, 
                           num_layers=num_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        self.fc = nn.Linear(hid_dim * 2, output_dim)
        
    def forward(self, text, text_lengths):
        #text = [batch size, seq len]
        embedded = self.embedding(text)
        
        #++ pack sequence ++
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'), enforce_sorted=False, batch_first=True)
        
        #embedded = [batch size, seq len, embed dim]
        packed_output, (hn, cn) = self.lstm(packed_embedded)  #if no h0, all zeroes
        
        #++ unpack in case we need to use it ++
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        
        #output = [batch size, seq len, hidden dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        hn = torch.cat((hn[-2,:,:], hn[-1,:,:]), dim = 1)
        #hn = [batch size, hidden dim * num directions]
        
        return self.fc(hn)

In [14]:
# The first weight on the first layer and output is differents we need to filter first.
# Before loading the model

weight_path = '/root/projects/NLP/Assignment/8_Feb_Sentiment_Analysis/weights/LSTM.pt'
weight = torch.load(weight_path)
print(len(weight))
del weight['embedding.weight']
del weight['fc.bias']
del weight['fc.weight']
print(len(weight))

19
16


In [15]:
input_dim  = len(vocab)
hid_dim    = 256
emb_dim    = 300  # Why 300, I do not know as it's mentioned above
output_dim = 2 # [0, 1, 2, 3, 4] # Now we have 2 classes

#for biLSTM
num_layers = 2
bidirectional = True
dropout = 0.5

model = LSTM(input_dim, emb_dim, hid_dim, output_dim, num_layers, bidirectional, dropout).to(device)
model.load_state_dict(weight, strict=False)

fast_embedding = fast_vectors.get_vecs_by_tokens(vocab.get_itos()).to(device)
model.embedding.weight.data = fast_embedding #**<------applied the fast text embedding as the initial weights

_IncompatibleKeys(missing_keys=['embedding.weight', 'fc.weight', 'fc.bias'], unexpected_keys=[])

In [16]:
# Testing the model
for i, (label, text, text_length) in enumerate(train_loader):
    print(label)
    test_label = label
    test_text = text   
    test_text_lenght = text_length
    break

tensor([1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
        0, 0, 0, 0, 1, 1, 1, 0])


In [17]:
# Just put it inside CUDA and tried to make the model predict.
test_text = test_text.to(device)
test_text_lenght = test_text_lenght.to(device)
test_output = model(test_text, test_text_lenght)

In [18]:
# It's seem working
test_output

tensor([[ 0.1024,  0.0394],
        [ 0.0548,  0.1474],
        [ 0.0612,  0.0926],
        [ 0.1381,  0.0950],
        [ 0.0538,  0.0623],
        [ 0.0376,  0.0274],
        [ 0.0808,  0.1127],
        [ 0.0468,  0.1822],
        [ 0.0519,  0.0928],
        [ 0.0794,  0.0245],
        [ 0.0770,  0.0867],
        [ 0.0977,  0.0987],
        [ 0.0656, -0.0213],
        [ 0.0667,  0.0659],
        [ 0.0707,  0.1548],
        [ 0.0036,  0.1207],
        [ 0.0814,  0.1427],
        [ 0.0874,  0.1499],
        [ 0.1174,  0.0801],
        [ 0.0412,  0.0502],
        [ 0.0514,  0.0947],
        [ 0.1143,  0.0802],
        [ 0.0899,  0.1118],
        [-0.0225,  0.1646],
        [ 0.0383, -0.0289],
        [ 0.0582,  0.0272],
        [ 0.0627,  0.1125],
        [-0.0056,  0.0761],
        [ 0.0535,  0.0509],
        [ 0.1868,  0.0745],
        [ 0.0803,  0.0533],
        [ 0.0266, -0.0562]], grad_fn=<AddmmBackward0>)

In [19]:
import torch.optim as optim

test_label = test_label.to(device)

lr=1e-3

#training hyperparameters
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss() #combine softmax with cross entropy

loss = criterion(test_output, test_label)

In [20]:
import torch.optim as optim

lr=1e-3

#training hyperparameters
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss() #combine softmax with cross entropy

def accuracy(preds, y):
    
    predicted = torch.max(preds.data, 1)[1]
    batch_corr = (predicted == y).sum()
    acc = batch_corr / len(y)
    
    return acc

def train(model, loader, optimizer, criterion, loader_length):
    epoch_loss = 0
    epoch_acc = 0
    model.train() #useful for batchnorm and dropout
    
    for i, (label, text, text_length) in enumerate(loader): 
        label = label.to(device) #(batch_size, )
        text = text.to(device) #(batch_size, seq len)
                
        #predict
        predictions = model(text, text_length).squeeze(1) #output by the fc is (batch_size, 1), thus need to remove this 1
        
        #calculate loss
        loss = criterion(predictions, label)
        acc  = accuracy(predictions, label)
        
        #backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
                        
    return epoch_loss / loader_length, epoch_acc / loader_length

def evaluate(model, loader, criterion, loader_length):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for i, (label, text, text_length) in enumerate(loader): 
            label = label.to(device) #(batch_size, )
            text  = text.to(device)  #(seq len, batch_size)

            predictions = model(text, text_length).squeeze(1) 
            
            loss = criterion(predictions, label)
            acc  = accuracy(predictions, label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / loader_length, epoch_acc / loader_length

# Function to calculate time.
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [21]:
train_loader_length = len(list(iter(train_loader)))
val_loader_length   = len(list(iter(val_loader)))
test_loader_length  = len(list(iter(test_loader)))

In [23]:
import numpy as np
best_valid_loss = float('inf')
num_epochs      = 8
tolerance_counter = 0 # Initialize this counter
save_path = f'/root/projects/NLP/Assignment/8_Feb_Sentiment_Analysis/weights/LSTM2.pt'

train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(num_epochs):
    
    start_time = time.time()

    ### TRAINING ###

    epoch_loss = 0
    epoch_acc = 0
    model.train() #useful for batchnorm and dropout
    
    for i, (label, text, text_length) in enumerate(train_loader): 
        label = label.to(device) #(batch_size, )
        text = text.to(device) #(batch_size, seq len)
                
        #predict
        predictions = model(text, text_length).squeeze(1)
        #calculate loss
        loss = criterion(predictions, label)
        acc  = accuracy(predictions, label)
        
        #backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    train_loss, train_acc =  epoch_loss / train_loader_length, epoch_acc / train_loader_length

    ### END OF TRAINING ###

    ### START OF VALIDATION ###

    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for i, (label, text, text_length) in enumerate(val_loader): 
            label = label.to(device) #(batch_size, )
            text  = text.to(device)  #(seq len, batch_size)

            predictions = model(text, text_length).squeeze(1)
            
            loss = criterion(predictions, label)
            acc  = accuracy(predictions, label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    valid_loss, valid_acc =  epoch_loss / val_loader_length, epoch_acc / val_loader_length
    

    #train_loss, train_acc = train(model, train_loader, optimizer, criterion, train_loader_length)
    #valid_loss, valid_acc = evaluate(model, val_loader, criterion, val_loader_length)
    
    #for plotting
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    valid_losses.append(valid_loss)
    valid_accs.append(valid_acc)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        tolerance_counter = 0
        torch.save(model.state_dict(), save_path)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')   
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    # Tolerance techniques, stop the model if it start to overfit.
    if tolerance_counter >= 3:
        break

    tolerance_counter = tolerance_counter + 1

IndexError: index out of range in self