# PyTorch Model Exploration

#### **Model 1: Neural Network with Word2Vec Embeddings**

Initial attempt at using the stemmed, TFIDF weighted unigrams alongside the Word2Vec embeddings in a simple NN. Can consider POS tags, n-grams, etc. later.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
import time
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataset import random_split
from sklearn.metrics import average_precision_score
from gensim.models import KeyedVectors, Word2Vec

torch.manual_seed(123)

<torch._C.Generator at 0x7f2062ce17d0>

In [2]:
# Load Word2Vec embeddings and read into dictionary
wv = KeyedVectors.load_word2vec_format('dataset_features/word_embedding_output')
wv_dict = dict({})
for index, key in enumerate(wv.vocab):
    wv_dict[key] = wv[key]

print(len(wv_dict), 'words in dict')

1258 words in dict


In [3]:
# Load stemmed, TFIDF weighted terms & label/topic for each tweet
data = pd.read_csv('tfidf_with_stemming.csv', index_col=0)
labels = pd.read_csv('translated_tweets_t1.csv', usecols=['label', 'topicID'])
data

Unnamed: 0,aa,aaaaaa,aaaaaaagel,aaaier,aaar,aaarsat,aabernamha,aabyuhzha,aada,aajabi,...,zlv,zlzlhm,zmrolha,zndqoh,zone,zouk,zti,ztoot,zugheib,zuo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Create matrix of embedding weights

embed_dim = len(wv_dict['zone']) # 200, same for all terms
matrix_len = len(data.columns) # Number of terms in tweets vocab
embed_weights = np.zeros((matrix_len, embed_dim)) # Initialise weights
words_found = 0

for i, word in enumerate(data.columns):
    try:
        embed_weights[i] = wv_dict[word]
        words_found += 1
    except KeyError:
        embed_weights[i] = np.random.normal(scale=0.6, size=(embed_dim, ))
        
# Convert embeddings and data to tensor format
embed_weights = torch.from_numpy(embed_weights)
data = torch.from_numpy(data.to_numpy())

print(words_found, 'words found out of', len(embed_weights))

660 words found out of 6115


In [5]:
# Code to create embeddings layer
def create_embedding_layer(embed_weights):
    
    num_embeddings, embed_dim = embed_weights.shape
    embedding_layer = nn.Embedding(num_embeddings, embed_dim)
    embedding_layer.load_state_dict({'weight': embed_weights})

    return embedding_layer, num_embeddings, embed_dim

# Define simple NN model w/ embeddings
class SimpleNN(nn.Module):
    
    def __init__(self, embed_weights, num_classes):
        super().__init__()
        self.embedding, num_embeddings, embed_dims = create_embedding_layer(embed_weights)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(embed_weights.shape[0] * embed_weights.shape[1], num_classes)
        self.sigmoid = nn.Sigmoid()
        self.init_weights()
        
    def init_weights(self):
        init_limit = 0.5
        self.embedding.weight.data.uniform_(- init_limit, init_limit)
        self.fc.weight.data.uniform_(- init_limit, init_limit)
        self.fc.bias.data.zero_()
        
    def forward(self, text):
        x = self.embedding(text.long())
        x = self.flatten(x)
        x = self.fc(x)
        x = self.sigmoid(x)
        return x
    
# Instantiate the model
model = SimpleNN(embed_weights, num_classes=2)

In [6]:
# Format data for insertion into Dataset object

labels_tensor = torch.tensor(labels['label'].to_numpy(), dtype=torch.long)
entries = []

for i, tweet in enumerate(data):
    label = labels_tensor[i].item()
    weights = tweet
    entry = (label, weights)
    entries.append(entry)
    
# Create Dataset object

class Tweets(Dataset):
    def __init__(self):
        self.samples = entries

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]
    
tweets_dataset = Tweets()

In [7]:
# Define training and testing behaviour

def train(sub_train_):
    train_loss, train_acc = 0, 0
    data = DataLoader(sub_train_, batch_size=1, shuffle=True)
    for cls, text in data:
        optimiser.zero_grad()
        output = model(text)
        loss = criterion(output, cls.long())
        train_loss += loss.item()
        loss.backward()
        optimiser.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    scheduler.step() # Adjust learning rate

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test(data_):
    test_loss, test_acc = 0, 0
    preds, true = [], []
    data = DataLoader(data_, batch_size=1)
    for cls, text in data:
        with torch.no_grad():
            output = model(text)
            loss = criterion(output, cls.long())
            test_loss += loss.item()
            test_acc += (output.argmax(1) == cls).sum().item()
            preds.append(max(output))
            true.append(cls)

    return test_loss / len(data_), test_acc / len(data_), preds, true

In [8]:
# Train model

EPOCHS = 2
min_valid_loss = float('inf')

criterion = nn.CrossEntropyLoss()
optimiser = optim.SGD(model.parameters(), lr=3.0)
scheduler = optim.lr_scheduler.StepLR(optimiser, 1, gamma=0.9)

train_len = int(len(tweets_dataset) * 0.90)
sub_train_, sub_valid_ = random_split(tweets_dataset, [train_len, len(tweets_dataset) - train_len])

for epoch in range(EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train(sub_train_)
    valid_loss, valid_acc, preds, true = test(sub_valid_)
    
    #map_val = average_precision_score(y_true=true, y_score=preds, average='macro')

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')
    #print('Mean Average Precision:', round(map_val, 4))

Epoch: 1  | time in 0 minutes, 46 seconds
	Loss: 0.6931(train)	|	Acc: 30.8%(train)
	Loss: 0.6931(valid)	|	Acc: 28.0%(valid)
Epoch: 2  | time in 1 minutes, 2 seconds
	Loss: 0.6931(train)	|	Acc: 30.8%(train)
	Loss: 0.6931(valid)	|	Acc: 28.0%(valid)


**Poor performance, need to get figure out issues**