# Note: This is a computationally heavy task and might crash your browser
# Importing the necessary Libraries

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
import re
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

import warnings
warnings.filterwarnings('ignore')

torch.cuda.empty_cache()

# Reading the datasets

In [2]:
def read_file(file_list):
    '''
    Reads tab-separated tweet files into a dictionary keyed by file path.

    Every non-empty line is expected to look like ``id<TAB>sentiment<TAB>tweet``.
    For each path the result holds three parallel lists:
      'tweet'     - the tweet text (third column)
      'sentiment' - a one-hot list: [0, 1, 0] for 'positive', [0, 0, 1] for
                    'negative' and [1, 0, 0] for any other label
      'ids'       - the tweet id (first column) converted to int

    Raises ValueError if an id is not an integer and IndexError if a
    non-empty line has fewer than three columns.
    '''
    # Any label that is not listed here falls back to the [1, 0, 0] vector.
    one_hot = {'positive': [0, 1, 0], 'negative': [0, 0, 1]}
    dataset = {}
    for path in file_list:
        tweet = []
        tweetgts = []
        tweetid = []
        with open(path, encoding='utf8') as file:
            for line in file:
                # Strip only the newline: blindly dropping the last character
                # would eat real text when the final line has no trailing '\n'.
                line = line.rstrip('\n')
                if not line:
                    # Blank lines carry no record; skip them instead of
                    # failing inside int('').
                    continue
                contents = line.split('\t')
                tweetid.append(int(contents[0]))
                # Copy the vector so rows never share one mutable list.
                tweetgts.append(list(one_hot.get(contents[1], [1, 0, 0])))
                tweet.append(contents[2])
        dataset[path] = {'tweet': tweet, 'sentiment': tweetgts, 'ids': tweetid}
    return dataset
# Load the training, dev and three test files; the result is keyed by file path.
dataset = read_file(['twitter-training-data.txt', 'twitter-dev-data.txt','twitter-test1.txt','twitter-test2.txt','twitter-test3.txt'])

## Training an LSTM is slow on a CPU.
## This cell checks whether a CUDA-capable GPU is available
## and selects it for faster processing; otherwise it falls back to the CPU.

In [3]:
# Use the GPU when CUDA is usable, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Bare expression so the notebook cell displays the chosen device.
device

device(type='cuda')

## TEXT Pre-Processing

In [4]:
# Patterns are compiled once here instead of being looked up for every tweet.
_QUOT_RE = re.compile(r"&quot;(.*?)&quot;")
_AMP_RE = re.compile(r"&amp;")
_EMOTICON_RE = re.compile(
    r"(^| )(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)"
)
_URL_RE = re.compile(
    r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
)
_MENTION_RE = re.compile(r"@[\w]+")
_HASHTAG_RE = re.compile(r"#[\w]+")
_POUND_RE = re.compile(r"\£\d+")
_NON_LETTER_RE = re.compile(r"[^a-zA-Z]+")


def cleanup_text(texts):
    '''
    Pre-processes the tweets and returns a list of cleaned tweets, one per
    input string and in the same order.

    Steps applied to every tweet:
      1. unwrap ``&quot;...&quot;`` pairs and delete ``&amp;``
      2. replace emoticons with TOKEMOTICON
      3. lowercase the text (the TOKEMOTICON placeholder is restored afterwards)
      4. replace URLs, @mentions, #hashtags and pound amounts with
         TOKURL, TOKMENTION, TOKTAG and TOKPOUND
      5. turn every run of characters that are not ASCII letters
         (punctuation, digits, newlines, repeated spaces) into ONE space

    Leading and trailing spaces are not stripped.
    '''
    cleaned_text = []
    for text in texts:
        # remove ugly &quot and &amp (raw replacement string: "\g" is not a
        # valid escape in a normal string literal)
        text = _QUOT_RE.sub(r"\g<1>", text)
        text = _AMP_RE.sub("", text)

        # replace emoticon; group 1 puts back the leading space, if any
        text = _EMOTICON_RE.sub(r"\g<1>TOKEMOTICON", text)

        # lowercasing also hits the placeholder inserted above, so restore it
        text = text.lower()
        text = text.replace("tokemoticon", "TOKEMOTICON")

        # replace url
        text = _URL_RE.sub("TOKURL", text)

        # replace mention
        text = _MENTION_RE.sub("TOKMENTION", text)

        # replace hashtag
        text = _HASHTAG_RE.sub("TOKTAG", text)

        # replace pound amounts such as £20
        text = _POUND_RE.sub("TOKPOUND", text)

        # Remove punctuation, digits and newlines and collapse whitespace in
        # a single pass. Collapsing has to happen last: blanking digits after
        # the spaces were collapsed used to leave runs of spaces behind.
        text = _NON_LETTER_RE.sub(" ", text)

        cleaned_text.append(text)
    return cleaned_text

## Use other test files for generating different validation losses

In [5]:
# Clean the raw tweets of the training and dev splits.
cleaned_tweets = cleanup_text(dataset['twitter-training-data.txt']['tweet'])
v_clean_tweets = cleanup_text(dataset['twitter-dev-data.txt']['tweet'])
# Words outside the tokenizer vocabulary map to the '<oov>' token.
# NOTE(review): num_words=5000 presumably caps the ids emitted by
# texts_to_sequences while word_index still lists every word seen - confirm
# against the Keras Tokenizer documentation.
tokenizer = Tokenizer(num_words = 5000,oov_token='<oov>')
# The vocabulary is fitted on the training tweets only.
tokenizer.fit_on_texts(cleaned_tweets)
word_index= tokenizer.word_index
# Number of entries in the fitted word -> id mapping.
print(len(word_index))
# Convert both splits to lists of integer ids with the same fitted tokenizer.
train_tokenized_sentence = tokenizer.texts_to_sequences(cleaned_tweets)
valid_tokenized_sentence = tokenizer.texts_to_sequences(v_clean_tweets)

35402


# Padding the list

In [6]:
def padding(seq, max_len = 45):
    '''
    Makes every tweet exactly ``max_len`` ids long.

    Shorter sequences are filled at the end with 0; longer sequences are
    cut to their first ``max_len`` ids so the result is always rectangular
    and can be turned into a tensor.

    seq     - iterable of id sequences (one per tweet)
    max_len - target length of every returned row
    Returns a new list of lists; the input is not modified.
    '''
    pad_value = 0
    ls = []
    for i in seq:
        # Truncate first: a negative pad size would otherwise silently leave
        # an over-long row in the output.
        row = list(i)[:max_len]
        row.extend([pad_value] * (max_len - len(row)))
        ls.append(row)
    return ls
# Pad the tokenized training and validation tweets using the default max_len.
train_padded_seq = padding(train_tokenized_sentence)
valid_padded_seq = padding(valid_tokenized_sentence)

## Converting to tensor

In [7]:
# Build the input and label tensors for the training split and move them to `device`.
train_tweet_tensor = torch.tensor(train_padded_seq).to(device=device)
train_sentiment_tensor = torch.tensor(dataset['twitter-training-data.txt']['sentiment']).to(device=device)

# Same for the dev split, which is used as validation data.
valid_tweet_tensor = torch.tensor(valid_padded_seq).to(device=device)
valid_sentiment_tensor = torch.tensor(dataset['twitter-dev-data.txt']['sentiment']).to(device=device)

# Pair each padded tweet with its one-hot sentiment label.
train_tensor_dataset = TensorDataset(train_tweet_tensor, train_sentiment_tensor)
valid_tensor_dataset = TensorDataset(valid_tweet_tensor, valid_sentiment_tensor)

# Both loaders yield shuffled batches of up to 400 samples.
# NOTE(review): shuffling the validation loader looks unnecessary - confirm it is intended.
train_tensor_dataloader = DataLoader(train_tensor_dataset, batch_size=400, shuffle=True)
valid_tensor_dataloader = DataLoader(valid_tensor_dataset, batch_size=400, shuffle=True)

## Reading the GloVe embedding file and converting the resulting weight matrix to a tensor

In [8]:
# Maps each GloVe token to its vector (a list of floats).
glove = {}

# Read the file as UTF-8 text. Slicing the repr() of raw bytes breaks on
# escaped characters and on a last line without a trailing newline.
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for l in f:
        line = l.rstrip('\n').split(' ')
        if len(line) < 2:
            # Blank or malformed line: there is no vector to store.
            continue
        glove[line[0]] = [float(value) for value in line[1:]]


# One row per tokenizer id, plus row 0, which no word in word_index maps to.
matrix_len = len(word_index) + 1
weights_matrix = np.zeros((matrix_len, 100)) # 100 because this version of glove has 100 dims for a word
words_found = 0

# Row `idx` must hold the vector of the word whose tokenizer id is `idx`.
# Tokenizer ids start at 1, so enumerating word_index from 0 shifted every
# vector onto the row of the previous word.
for word, idx in word_index.items():
    try:
        weights_matrix[idx] = glove[word]
        words_found += 1
    except KeyError:
        # Words missing from GloVe get a random vector.
        weights_matrix[idx] = np.random.normal(scale=0.6, size=(100, ))
weights_matrix = torch.tensor(weights_matrix).to(device=device)

def create_emb_layer(weights_matrix, non_trainable=False):
    '''
    Builds an nn.Embedding whose weights are copied from ``weights_matrix``.

    weights_matrix - 2-D tensor; its two sizes become the number of
                     embeddings and the embedding dimension
    non_trainable  - when truthy, the layer's weight is excluded from
                     gradient updates
    Returns the tuple (layer, number of embeddings, embedding dimension).
    '''
    vocab_rows, vector_width = weights_matrix.size()
    layer = nn.Embedding(vocab_rows, vector_width)
    layer.load_state_dict({'weight': weights_matrix})
    # A fresh embedding weight requires grad; turn that off only on request.
    layer.weight.requires_grad = not non_trainable
    return layer, vocab_rows, vector_width

# Building the Model

In [9]:
class LSTMClassifier(nn.Module):
    '''
    Tweet classifier: pretrained embedding -> stacked LSTM -> linear layer -> softmax.

    weights_matrix - pretrained embedding weights handed to create_emb_layer
    hidden_size    - size of the LSTM hidden state
    num_layers     - number of stacked LSTM layers
    num_classes    - number of output classes
    dropout        - dropout value passed to nn.LSTM
    '''
    def __init__(self, weights_matrix, hidden_size, num_layers, num_classes, dropout = 0.5):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        # The embedding is created frozen (non_trainable=True).
        emb_layer, _, embedding_dim = create_emb_layer(weights_matrix, non_trainable=True)
        self.embedding = emb_layer
        self.LSTM = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input):
        '''
        Returns softmax class probabilities, one row per sample in ``input``.
        '''
        # A fresh all-zero state is used for every batch.
        state = self.init_hidden(input.size(0))
        sequence_out, _ = self.LSTM(self.embedding(input), state)
        # Only the output at the last time step feeds the classifier.
        # NOTE(review): if inputs are padded at the end, this position may be
        # padding - confirm this is intended.
        last_step = sequence_out[:, -1, :]
        return F.softmax(self.fc(last_step), dim=1)

    def init_hidden(self, batch_size):
        '''
        Returns a pair of all-zero (num_layers, batch_size, hidden_size)
        tensors with the dtype and device of the model's first parameter.
        '''
        reference = next(self.parameters())
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        h0 = torch.zeros(state_shape, dtype=reference.dtype, device=reference.device)
        c0 = torch.zeros(state_shape, dtype=reference.dtype, device=reference.device)
        return (h0, c0)

# Training the Model

In [10]:
# Hyper-parameters for the model and the training run.
# NOTE(review): vocab_size and embedding_dim are not passed to LSTMClassifier
# below; the embedding shape comes from weights_matrix instead.
vocab_size = 5001
embedding_dim = 100 
hidden_size = 500
num_classes = 3 
num_layers = 2
EPOCHS = 1
lr = 0.01
# Binary cross-entropy between the model's softmax outputs and the one-hot labels.
criterion = nn.BCELoss()
# Maximum gradient norm used for clipping during training.
clip = 5
# NOTE(review): the DataLoaders are created with batch_size=400, so this
# value does not control the actual batch size - confirm which is intended.
BATCH_SIZE = 500
model = LSTMClassifier(weights_matrix, hidden_size, num_layers, num_classes).to(device=device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [11]:
# Train for EPOCHS passes over the training loader. Every 10th step the model
# is scored on the whole validation loader and both losses are printed.
for i in range(EPOCHS):
    counter = 0
    print('Epoch - '+str(i + 1))
    for inputs, labels in train_tensor_dataloader:
        counter += 1
        model.zero_grad()
        output = model(inputs)
        # The model already returns a 2-D (batch, classes) tensor. squeeze()
        # would collapse a batch of one sample and mismatch the label shape.
        loss = criterion(output, labels.float())
        loss.backward()
        # Clip the gradient norm to `clip` before the optimizer step.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        if counter%10 == 0:
            val_losses = []
            # eval() switches off dropout while measuring validation loss.
            model.eval()
            # No gradients are needed for validation; skipping them avoids
            # building autograd graphs for every validation batch.
            with torch.no_grad():
                for inp, lab in valid_tensor_dataloader:
                    out = model(inp)
                    val_loss = criterion(out, lab.float())
                    val_losses.append(val_loss.item())

            model.train()
            print("Epoch: {}/{}...".format(i+1, EPOCHS),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
    print('')

Epoch - 1
Epoch: 1/1... Step: 10... Loss: 0.621390... Val Loss: 0.629132
Epoch: 1/1... Step: 20... Loss: 0.621878... Val Loss: 0.637801
Epoch: 1/1... Step: 30... Loss: 0.614338... Val Loss: 0.611651
Epoch: 1/1... Step: 40... Loss: 0.614044... Val Loss: 0.614745
Epoch: 1/1... Step: 50... Loss: 0.624078... Val Loss: 0.608102
Epoch: 1/1... Step: 60... Loss: 0.609772... Val Loss: 0.609923
Epoch: 1/1... Step: 70... Loss: 0.605004... Val Loss: 0.611138
Epoch: 1/1... Step: 80... Loss: 0.610643... Val Loss: 0.608772
Epoch: 1/1... Step: 90... Loss: 0.623958... Val Loss: 0.609026
Epoch: 1/1... Step: 100... Loss: 0.596491... Val Loss: 0.608315
Epoch: 1/1... Step: 110... Loss: 0.620382... Val Loss: 0.611771



# Saves the Trained Model for testing Purposes

In [12]:
# Serializes the whole model object (not only its state_dict) to 'tushar.pth'.
# NOTE(review): loading this file presumably needs the LSTMClassifier class to
# be importable in the loading script - confirm.
torch.save(model, 'tushar.pth')