In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset


import numpy as np
import pandas as pd

import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import nltk
from nltk.tokenize import word_tokenize

import matplotlib.pyplot as plt

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Define test sets
testsets = ['twitter-test1.txt', 'twitter-test2.txt', 'twitter-test3.txt']

In [4]:
# Skeleton: Evaluation code for the test sets
def read_test(testset):
    '''
    readin the testset and return a dictionary
    :param testset: str, the file name of the testset to compare
    '''
    id_gts = {}
    with open(testset, 'r', encoding='utf8') as fh:
        for line in fh:
            fields = line.split('\t')
            tweetid = fields[0]
            gt = fields[1]

            id_gts[tweetid] = gt

    return id_gts


def confusion(id_preds, testset, classifier):
    '''
    print the confusion matrix of {'positive', 'netative'} between preds and testset
    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier
    '''
    id_gts = read_test(testset)

    gts = []
    for m, c1 in id_gts.items():
        if c1 not in gts:
            gts.append(c1)

    gts = ['positive', 'negative', 'neutral']

    conf = {}
    for c1 in gts:
        conf[c1] = {}
        for c2 in gts:
            conf[c1][c2] = 0

    for tweetid, gt in id_gts.items():
        if tweetid in id_preds:
            pred = id_preds[tweetid]
        else:
            pred = 'neutral'
        conf[pred][gt] += 1

    print(''.ljust(12) + '  '.join(gts))

    for c1 in gts:
        print(c1.ljust(12), end='')
        for c2 in gts:
            if sum(conf[c1].values()) > 0:
                print('%.3f     ' % (conf[c1][c2] / float(sum(conf[c1].values()))), end='')
            else:
                print('0.000     ', end='')
        print('')

    print('')


def evaluate(id_preds, testset, classifier):
    '''
    print the macro-F1 score of {'positive', 'netative'} between preds and testset
    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier
    '''
    id_gts = read_test(testset)

    acc_by_class = {}
    for gt in ['positive', 'negative', 'neutral']:
        acc_by_class[gt] = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}

    catf1s = {}

    ok = 0
    for tweetid, gt in id_gts.items():
        if tweetid in id_preds:
            pred = id_preds[tweetid]
        else:
            pred = 'neutral'

        if gt == pred:
            ok += 1
            acc_by_class[gt]['tp'] += 1
        else:
            acc_by_class[gt]['fn'] += 1
            acc_by_class[pred]['fp'] += 1

    catcount = 0
    itemcount = 0
    macro = {'p': 0, 'r': 0, 'f1': 0}
    micro = {'p': 0, 'r': 0, 'f1': 0}
    semevalmacro = {'p': 0, 'r': 0, 'f1': 0}

    microtp = 0
    microfp = 0
    microtn = 0
    microfn = 0
    for cat, acc in acc_by_class.items():
        catcount += 1

        microtp += acc['tp']
        microfp += acc['fp']
        microtn += acc['tn']
        microfn += acc['fn']

        p = 0
        if (acc['tp'] + acc['fp']) > 0:
            p = float(acc['tp']) / (acc['tp'] + acc['fp'])

        r = 0
        if (acc['tp'] + acc['fn']) > 0:
            r = float(acc['tp']) / (acc['tp'] + acc['fn'])

        f1 = 0
        if (p + r) > 0:
            f1 = 2 * p * r / (p + r)

        catf1s[cat] = f1

        n = acc['tp'] + acc['fn']

        macro['p'] += p
        macro['r'] += r
        macro['f1'] += f1

        if cat in ['positive', 'negative']:
            semevalmacro['p'] += p
            semevalmacro['r'] += r
            semevalmacro['f1'] += f1

        itemcount += n

    micro['p'] = float(microtp) / float(microtp + microfp)
    micro['r'] = float(microtp) / float(microtp + microfn)
    micro['f1'] = 2 * float(micro['p']) * micro['r'] / float(micro['p'] + micro['r'])

    semevalmacrof1 = semevalmacro['f1'] / 2

    print(testset + ' (' + classifier + '): %.3f' % semevalmacrof1)

In [5]:
# Load training set, dev set and testing set
data = {}
tweetids = {}
tweetgts = {}
tweets = {}

for dataset in ['twitter-training-data.txt'] + testsets:
    data[dataset] = []
    tweets[dataset] = []
    tweetids[dataset] = []
    tweetgts[dataset] = []
    


    # write code to read in the datasets here
    with open(dataset, encoding='utf8') as file:
        for check in file:
            senti=check.split('\t')
            tweetids[dataset].append(senti[0]) 
            tweetgts[dataset].append(senti[1]) 
            tweets[dataset].append(senti[2])
# print(len(tweetids)) 

#     for i in range(len(tweets['twitter-training-data.txt'])):
#         tweets[i]=re.sub(r"https*://[^\s]+","",tweets[i])
    
    

In [6]:
import re


def cleanup_text(texts):
    cleaned_text = []
    for text in texts:
        # remove ugly &quot and &amp
        text = re.sub(r"&quot;(.*?)&quot;", "\g<1>", text)
        text = re.sub(r"&amp;", "", text)

        # replace emoticon
        text = re.sub(
            r"(^| )(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)",
            "\g<1>TOKEMOTICON",
            text,
        )

        text = text.lower()
        text = text.replace("tokemoticon", "TOKEMOTICON")

        # replace url
        text = re.sub(
            r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?",
            "TOKURL",
            text,
        )

        # replace mention
        text = re.sub(r"@[\w]+", "TOKMENTION", text)

        # replace hashtag
        text = re.sub(r"#[\w]+", "TOKHASHTAG", text)

        # replace dollar
        text = re.sub(r"\$\d+", "TOKDOLLAR", text)

        # remove punctuation
        text = re.sub("[^a-zA-Z0-9]", " ", text)

        # remove multiple spaces
        text = re.sub(r" +", " ", text)

        # remove newline
        text = re.sub(r"\n", " ", text)
        
        #Remove Digits
        text= re.sub('[0-9\n]',' ',text)

        cleaned_text.append(text)
    return cleaned_text

In [7]:
import gensim
from gensim.parsing.preprocessing import remove_stopwords
encoded=[]
for t in cleanup_text(tweets['twitter-training-data.txt']):
    filtered=remove_stopwords(t)
    encoded.append(filtered.lower())
#     t=t.split(' ')
#     t=[w for w in t if (w not in stop)]
#     encoded.append(tokenizer.texts_to_sequences(t))
print(encoded[:1])



['felt privileged play foo fighters songs guitar today plectrums gig saturday']


In [27]:
tokenizer = Tokenizer(num_words=5000,oov_token='<oov>')
tokenizer.fit_on_texts(encoded)

word_index= tokenizer.word_index
print(len(word_index))

seq=tokenizer.texts_to_sequences(encoded)
print(seq[:5])

35091
[[1973, 1, 64, 213, 192, 624, 2594, 23, 1, 1606, 20], [2, 1461, 2377, 366, 1, 295, 260, 148, 1074, 34, 366, 1], [47, 73, 4374, 4375, 1, 2, 27, 1, 1, 1699, 387, 1, 164, 996, 1438], [2, 1, 11, 4376, 7, 5, 33, 27], [1, 3, 3271, 14, 1133, 495, 3733, 4]]


In [9]:
sentiment={'positive':1,
           'neutral':0,
           'negative':2}

In [10]:
senti=[]
for i in tweetgts['twitter-training-data.txt']:
    # print(i)
    senti.append(sentiment[i])
print(senti[:10])

[1, 1, 1, 2, 2, 1, 1, 1, 2, 1]


In [11]:
senti=[]
for i in tweetgts['twitter-training-data.txt']:
    senti.append(sentiment[i])

In [13]:
test_seq=tokenizer.texts_to_sequences(tweets["twitter-test1.txt"])
print(test_seq[:5])

[[1, 1132, 1, 1, 1, 1, 1, 136, 1693, 122, 1, 4452, 8, 1, 1], [491, 1, 1, 626, 1, 1, 1, 656, 1, 1, 121, 1, 1, 1, 1, 1, 49, 1, 1304], [60, 1624, 667, 1, 537, 1, 1, 41], [1, 1, 1, 2859, 1, 1, 1, 1, 1, 2679, 1211, 1, 1, 1, 1, 1, 1, 858, 1, 46, 1, 2494], [1, 33, 1, 1945, 482, 3555, 1, 1, 1, 1031, 1, 18, 45, 1, 1, 232, 1, 4401, 1, 4515, 4452, 8, 1, 1]]


In [14]:
# Padding

N = 40

pad_value = 0
ls=[]
for i in seq:
    pad_size = N - len(i)

    final_list = [*i, *[pad_value] * pad_size]
    ls.append(final_list)


In [15]:
for i in ls:
    if(len(i)>40 or len(i)<40):
        print(i,' +', len(i))

In [17]:
print(ls[1])

[2, 1461, 2377, 366, 1, 295, 260, 148, 1074, 34, 366, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [24]:
x=torch.tensor(ls)
y=torch.tensor(senti)

abc=TensorDataset(x,y)
batch_size=500
load=DataLoader(abc,batch_size=batch_size)

In [43]:
# Building Glove

glove = {}

with open('glove.6B.100d.txt', 'rb') as f:
    for l in f:
        line = str(l)
        line = line[2:len(line) - 3].split(' ')
        val = [float(line[i]) for i in range(1, len(line))]
        glove[line[0]] = val


matrix_len = len(word_index) + 1
weights_matrix = np.zeros((matrix_len, 100))
words_found = 0

for i, word in enumerate(word_index):
    try: 
        weights_matrix[i] = glove[word]
        words_found += 1
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(100, ))
weights_matrix = torch.tensor(weights_matrix)

print(len(weights_matrix))

35092


In [44]:
# BUILDING

def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

class LSTMClassifier(nn.Module):
    def __init__(self, weights_matrix, hidden_size, num_layers, num_classes, dropout = 0.25):
        super(LSTMClassifier, self).__init__()
        self.num_layers = num_layers
        self.hidden_sweights_matrixidden_size
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, non_trainable=True)
        self.LSTM = nn.LSTM(embedding_dim, self.hidden_size, self.num_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.dense = nn.Linear(hidden_size, 256)
        self.fc = nn.Linear(256, num_classes)

    def forward(self, x):
        embed = self.embedding(x)
        hidden = self.init_hidden(x.size(0))
        out, _ = self.LSTM(embed, hidden)
        out = out[:, -1, :]
        out = self.dropout(out)
        out = F.relu(self.dense(out))
        out = self.fc(out)
        return torch.exp(F.log_softmax(out, dim=1))
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
                      weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
        return hidden

In [45]:
## TRAINING THE LSTM

vocab_size = len(word_index) + 1
embedding_dim = 100 # Since we have choosen glove embedding 100 dimenstion version
hidden_size = 500
num_classes = 3
num_epochs = 2
num_layers = 2
EPOCHS = 10
lr=0.001
criterion = nn.BCELoss()
clip = 5

net = LSTMClassifier(weights_matrix, hidden_size, num_layers, num_classes)
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
input = torch.randint(1, 4, (5, 3))
output = net(input)
net.train()
for i in range(EPOCHS):
    counter = 0
    print('Epoch - '+str(i + 1))
    for inputs, labels in enumerate(word_index):
        counter += 1
        net.zero_grad()
        # print(inputs.size())
        output = net(inputs)
        # print(labels.float())
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()
        
        if counter%5 == 0:
            val_h = net.init_hidden(BATCH_SIZE)
            val_losses = []
            net.eval()
            for inp, lab in valid_dataloader:
                # val_h = tuple([each.data for each in val_h])
                out = net(inp)
                val_loss = criterion(out.squeeze(), lab.float())
                val_losses.append(val_loss.item())
                
            net.train()
            print("Epoch: {}/{}...".format(i+1, EPOCHS),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
    print('')

AttributeError: 'LSTMClassifier' object has no attribute 'hidden_sweights_matrixidden_size'