# Note: This is a computationally heavy task and might crash your browser.
# Importing the necessary Libraries

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
import re
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
torch.cuda.empty_cache()

# Reading the datasets

In [2]:
def read_file(file_list):
    '''
    Reads tab-separated tweet files into a dictionary keyed by file path.

    Every non-blank line is expected to look like `id<TAB>sentiment<TAB>tweet`.
    The sentiment is one-hot encoded as [neutral, positive, negative]:
    'positive' -> [0, 1, 0], 'negative' -> [0, 0, 1], anything else -> [1, 0, 0].

    Args:
        file_list: iterable of paths to UTF-8 encoded text files.

    Returns:
        dict mapping each path to a dict with three parallel lists:
        'tweet' (str), 'sentiment' (one-hot list of int) and 'ids' (int).

    Raises:
        OSError: if a file cannot be opened.
        ValueError: if the first field of a line is not an integer.
        IndexError: if a non-blank line has fewer than three fields.
    '''
    dataset = {}
    for path in file_list:
        tweet = []
        tweetgts = []
        tweetid = []
        with open(path, encoding='utf8') as file:
            for line in file:
                # Strip only the line terminator. Slicing off the last
                # character unconditionally would eat a real character when
                # the final line of the file has no trailing newline.
                line = line.rstrip('\n')
                if not line.strip():
                    # Tolerate blank lines (e.g. at the end of the file).
                    continue
                contents = line.split('\t')
                tweetid.append(int(contents[0]))
                if contents[1] == 'positive':
                    tweetgts.append([0, 1, 0])
                elif contents[1] == 'negative':
                    tweetgts.append([0, 0, 1])
                else:
                    tweetgts.append([1, 0, 0])
                tweet.append(contents[2])
        dataset[path] = {
            'tweet': tweet,
            'sentiment': tweetgts,
            'ids': tweetid,
        }
    return dataset
# Load the training split together with every evaluation split in one pass.
dataset = read_file(
    [
        'twitter-training-data.txt',
        'twitter-dev-data.txt',
        'twitter-test1.txt',
        'twitter-test2.txt',
        'twitter-test3.txt',
    ]
)

## Running the LSTM on the CPU takes a long time.
## The cell below checks whether CUDA is available on this machine
## and, if so, selects the GPU for faster processing.

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

## TEXT Pre-Processing

In [4]:
def cleanup_text(texts):
    '''
    Pre-processes tweets and returns the cleaned tweets.

    Each tweet goes through the following steps, in order: HTML entity
    clean-up, emoticon replacement, lower-casing, replacement of URLs,
    mentions, hashtags and pound amounts with placeholder tokens, and
    finally replacement of every non-alphanumeric character and every
    digit with a space.

    Args:
        texts: iterable of tweet strings.

    Returns:
        list of cleaned strings, one per input tweet, in input order.
    '''
    cleaned_text = []
    for text in texts:
        # unwrap &quot;...&quot; and drop &amp;
        # (raw replacement strings: a plain "\g<1>" literal is an invalid
        # escape sequence and triggers a SyntaxWarning on recent Pythons)
        text = re.sub(r"&quot;(.*?)&quot;", r"\g<1>", text)
        text = re.sub(r"&amp;", "", text)

        # replace emoticon
        # NOTE(review): in the third alternative, `\D` inside the character
        # class matches ANY non-digit, so e.g. " a: " is also replaced.
        # Possibly a literal `D` was intended; left unchanged so the
        # pre-processing stays identical to what the saved model saw.
        text = re.sub(
            r"(^| )(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)",
            r"\g<1>TOKEMOTICON",
            text,
        )

        # lower-case everything, then restore the placeholder's casing
        text = text.lower()
        text = text.replace("tokemoticon", "TOKEMOTICON")

        # replace url
        text = re.sub(
            r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?",
            "TOKURL",
            text,
        )

        # replace mention
        text = re.sub(r"@[\w]+", "TOKMENTION", text)

        # replace hashtag
        text = re.sub(r"#[\w]+", "TOKTAG", text)

        # replace pound amounts (e.g. a pound sign followed by digits)
        text = re.sub(r"\£\d+", "TOKPOUND", text)

        # replace punctuation and every other non-alphanumeric character
        # (newlines included) with a space
        text = re.sub(r"[^a-zA-Z0-9]", " ", text)

        # collapse runs of spaces
        text = re.sub(r" +", " ", text)

        # replace digits with spaces; this runs after the collapse above,
        # so it can leave consecutive spaces behind (kept as-is so the
        # output is unchanged)
        text = re.sub(r"[0-9]", " ", text)

        cleaned_text.append(text)
    return cleaned_text

In [5]:
# Clean the training tweets; the tokenizer is fitted on the training split only.
cleaned_tweets = cleanup_text(
    dataset['twitter-training-data.txt']['tweet']
)

tokenizer = Tokenizer(oov_token='<oov>', num_words=5000)
tokenizer.fit_on_texts(cleaned_tweets)

# Report how many entries the fitted word index holds.
word_index = tokenizer.word_index
print(len(word_index))

# Integer-encode the training tweets with the fitted tokenizer.
train_tokenized_sentence = tokenizer.texts_to_sequences(cleaned_tweets)

35402


# Padding the list

In [6]:
def padding(seq, max_len = 45):
    '''
    Makes every token sequence exactly `max_len` items long.

    Shorter sequences are right-padded with 0. Longer sequences are
    truncated to their first `max_len` items, so the result is always
    rectangular and can be turned into a tensor directly.

    Args:
        seq: iterable of token-id sequences.
        max_len: target length of every returned row.

    Returns:
        list of lists, each of length `max_len`.
    '''
    pad_value = 0
    padded = []
    for tokens in seq:
        # Truncate first: without this an over-long sequence would keep its
        # full length (a negative pad size adds nothing) and yield ragged rows.
        row = list(tokens)[:max_len]
        row.extend([pad_value] * (max_len - len(row)))
        padded.append(row)
    return padded

# Using the saved model and Testing with our test sets

In [7]:
# LSTMClassifier is not referenced by name below; presumably it must be
# importable so torch.load can unpickle the saved model object.
from aspyfile import LSTMClassifier

# map_location lets a checkpoint that was saved on a GPU load on a CPU-only
# machine and places the weights on the same device as the input tensors.
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which rejects fully pickled models; confirm against the
# installed version and pass weights_only=False if this file is trusted.
saved_model = torch.load('tushar.pth', map_location=device)
saved_model.eval()
print(saved_model)

test_set = ['twitter-dev-data.txt','twitter-test1.txt', 'twitter-test2.txt', 'twitter-test3.txt']
test_data = read_file(test_set)

# Number of tweets sent through the model per forward pass.
batch = 400

for path in test_set:
    print('Accuracy on ',path)
    clean = cleanup_text(test_data[path]['tweet'])
    clean_token = tokenizer.texts_to_sequences(clean)
    padded_clean_token = torch.tensor(padding(clean_token)).to(device=device)
    sent = torch.tensor(test_data[path]['sentiment']).to(device=device)
    print('Length - ',len(padded_clean_token))

    acc = 0
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        # range() yields the start offset of every batch; slicing clamps the
        # final (possibly shorter) batch, and an empty file yields no batch.
        for batch_counter, counter in enumerate(range(0, len(sent), batch), start=1):
            batch_text = padded_clean_token[counter:counter + batch]
            batch_sent = sent[counter:counter + batch]
            if batch_counter % 25 == 0:
                print('Calculating for batch ', batch_counter)
            output = saved_model(batch_text)
            # Per-sample argmax over the flattened scores / one-hot labels,
            # compared for the whole batch at once instead of row by row.
            pred = output.reshape(len(output), -1).argmax(dim=1)
            actual = batch_sent.reshape(len(batch_sent), -1).argmax(dim=1)
            acc += int((pred == actual).sum().item())

    # Guard against an empty test file to avoid a division by zero.
    accuracy = acc / len(sent) if len(sent) else 0.0
    print('Accuracy on ',path,' is ', accuracy)

LSTMClassifier(
  (embedding): Embedding(35403, 100)
  (LSTM): LSTM(100, 500, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=500, out_features=3, bias=True)
)
Accuracy on  twitter-dev-data.txt
Length -  2000
Accuracy on  twitter-dev-data.txt  is  0.4595
Accuracy on  twitter-test1.txt
Length -  3531
Accuracy on  twitter-test1.txt  is  0.4259416595865194
Accuracy on  twitter-test2.txt
Length -  1853
Accuracy on  twitter-test2.txt  is  0.36103615758229896
Accuracy on  twitter-test3.txt
Length -  2379
Accuracy on  twitter-test3.txt  is  0.4131988230348886
