In [1]:
from datasets import load_dataset
import numpy as np

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
import gensim
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

In [3]:
lstm_dataset = load_dataset('liar')

In [4]:
train = lstm_dataset['train']
val = lstm_dataset['validation']
test = lstm_dataset['test']

In [5]:
def load_embedding_model(model):
    wv_from_bin = api.load(model)
    return wv_from_bin

In [6]:
glove = "glove-wiki-gigaword-200"
wv_from_bin = load_embedding_model(glove)

In [7]:
train_sentences = train['statement']
train_labels = train['label']

In [8]:
def preprocess(data, wv_from_bin):

    # build vocab
    corpus = []
    vocab = {"<PAD>":0, "UNKA":1}
    count_v = 2
    X = []
    vocab_count = {}
    words = list(wv_from_bin.index_to_key)
    
    for l in data:
        line = l.split()
        sentence = []
        for i in range(len(line)):
            if line[i] not in words:
                sentence.append("UNKA")
            else:
                sentence.append(line[i])
        corpus.append(sentence)

    for l in range(len(corpus)):
        line = corpus[l]
        sent_x = []
        tag_y = []
        for i in range(len(line)):
            if line[i] == "UNKA":
                sent_x.append(line[i])
            else:
                sent_x.append(line[i].lower())
                if line[i].lower() not in vocab:
                    vocab[line[i].lower()] = count_v
                    count_v += 1

        X.append(sent_x)
    
    # map the sentence using the vocab word to index dictionary
    X =  [[vocab[word] for word in sentence] for sentence in X]

    return vocab, X

In [9]:
vocab_train, X = preprocess(train_sentences, wv_from_bin)

In [10]:
matrix_len = len(vocab_train)
weights_matrix = np.zeros((matrix_len, 200), dtype = np.float32)

for i, (key, value) in enumerate(vocab_train.items()):
    try: 
        if key == "<PAD>" or key == "UNKA":
            weights_matrix[i] = np.random.normal(scale=0.6, size=(200, ))
        else:
            weights_matrix[i] = wv_from_bin.get_vector(key)
    except KeyError:
        print('-------------ERROR------------')

weights_vectors = torch.from_numpy(weights_matrix)

In [11]:
class RNNTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, weights_vectors, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding.from_pretrained(weights_vectors, padding_idx = 0)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.dropout = nn.Dropout(0.3)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        x = self.dropout(embeds)
        lstm_out, (ht, ct) = self.lstm(x)
        tag_scores = self.hidden2tag(ht[-1])
        return tag_scores

In [12]:
model = RNNTagger(200, 45, weights_vectors, 6) 
loss_function = nn.CrossEntropyLoss(ignore_index = 0)
optimizer = optim.Adam(model.parameters(), lr=0.0003)

In [13]:
for epochs in range(10):
    for sentence, y_true in zip(X, train_labels):

        optimizer.zero_grad()

        sentence = torch.tensor(sentence, dtype = torch.long)
        y_true = torch.tensor(y_true, dtype = torch.long)

        y_pred = model(sentence)

        loss = loss_function(y_pred, y_true)
        loss.backward()
        optimizer.step()

In [50]:
#Testing

test_sentences = test['statement']
test_labels = test['label']
X_test = []

for l in test_sentences:
    line = l.split()
    sentence = []
    tag = []
    for i in range(len(line)):
        if line[i] == "UNKA":
            sentence.append(line[i])
        elif line[i].lower() not in vocab_train:
            sentence.append("UNKA")
        else:
            sentence.append(line[i].lower())
    X_test.append(sentence)

X_test =  [[vocab_train[word] for word in sentence] for sentence in X_test]

In [51]:
prediction = []

model.eval()

with torch.no_grad():
    for sentence, yt_true in zip(X_test, test_labels):
        
        sentence = torch.tensor(sentence, dtype = torch.long)
        yt_true = torch.tensor(yt_true, dtype = torch.long)

        yt_pred = model(sentence)

        pred = torch.argmax(yt_pred, -1).cpu().numpy()

        prediction.append(pred.tolist())

In [52]:
from sklearn.metrics import f1_score

score = accuracy_score(prediction, test_labels)
print(f"The accuracy of the model is {100*score:6.2f}%")
f1score = f1_score(test_labels, prediction, average='micro')
print(f"The F1 score of the model is {f1score:6.2f}")

The accuracy of the model is  23.46%
The F1 score of the model is   0.23
