<a href="https://colab.research.google.com/github/WillN202/NLU_CW/blob/main/NLU_Task2_DemoCode_Experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Demo Code

This code is used to demonstrate the model inferenece performance using the best saved model.

In [None]:
!pip install torchmetrics
!pip install gensim
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchmetrics
import os.path
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
import random

TEST_DATASET_LOCATION = "/content/drive/MyDrive/AV_trial.csv"
TEST_MODEL_LOCATION = "/content/drive/MyDrive/twolinear_onernnlayer_gru_model_74_accuracy.model"
WORD2VEC_EMBEDDINGS = "/content/drive/MyDrive/word2vec_256embeddings_bestyet.model"
TRAINING_DATASET_LOCATION = "/content/drive/MyDrive/train.csv"

DEVICE = (
    "cuda:0"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(DEVICE)
torch.set_default_device(DEVICE)

cuda:0


## Model Definition

In [None]:
class BaseGruRNN(torch.nn.Module):
  def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
    super(BaseGruRNN, self).__init__()
    self.vocab = vocab
    self.get_embedding = torch.nn.Embedding.from_pretrained(embeddings)
    self.GRU_Layer = torch.nn.GRU(embedding_size, hidden_size, batch_first=True, num_layers=rnn_layers, dropout=0.1, bidirectional=is_bidirectional)

  def forward(self, x, linear_layer):
    unk_embedding = self.vocab["<UNK>"]
    #x = [re.sub(f"[{re.escape(string.punctuation)}]", "", sentence) for sentence in x]
    x = [sentence.split() for sentence in x]
    x = [[self.vocab.get(word, unk_embedding) for word in sentence ] for sentence in x]
    # TODO -> instead of padding, use pack sequence instead. Note this may break the output from the lstm (woo)
    max_len = max([len(words) for words in x])
    x = [([self.vocab["<pad>"]] * (max_len -  len(words))) + words for words in x]
    input = torch.tensor(x)

    embeddings = self.get_embedding(input)
    GRU_int_results = self.GRU_Layer(embeddings)[1]
    GRU_values = GRU_int_results[0] #Get last hidden state(s)
    result = linear_layer(GRU_values)
    return result

class OneLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      self.base = super(OneLinearLayerGruRNN, self).__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, output_size),
      )

    def forward(self, x):
      return super().forward(x, self.linear_layer)

class TwoLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      super().__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, output_size)
      )

    def forward(self, x):
      return super().forward(x, self.linear_layer)

class ThreeLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      super(ThreeLinearLayerGruRNN, self).__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, output_size)
      )

    def forward(self, x):
      return super().forward(x, self.linear_layer)

class FourLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      super(FourLinearLayerGruRNN, self).__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, output_size)
      )

## Data and Model Processing

In [None]:
def generic_preprocessor(sentence):
  sentence = sentence.lower()

  return sentence

class AVDataset(Dataset):
  def __init__(self, csv_file, pre_processor=None):
    self.samples = pd.read_csv(csv_file)
    self.pre_processor = pre_processor

  def __len__(self):
      return len(self.samples)

  def __getitem__(self, index):
    sample = self.samples.iloc[index]
    sample_text = f"{sample[0]} <sep> {sample[1]}"
    return (self.pre_processor(sample_text), sample[2])

test_samples = AVDataset(TEST_DATASET_LOCATION, pre_processor=generic_preprocessor)

In [23]:
phrases = pd.read_csv(TRAINING_DATASET_LOCATION)
phrases = phrases.loc[:, "text_1":"text_2"].to_numpy().flatten().tolist()
#phrases = [re.sub(f"[{re.escape(string.punctuation)}]", "", str(phrase)).split() for phrase in phrases]
# phrases = [str(phrase).split() for phrase in phrases]

# phrases.append("<UNK>")
# phrases.append("<sep>")
# phrases.append("<pad>")

def add_unk(phrase1, phrase2):
  phrase1 = ["<UNK>" if random.random() < 0.05 else word for word in phrase1.split()]
  phrase2 = ["<UNK>" if random.random() < 0.05 else word for word in phrase2.split()]

  return ['<pad>'] * random.randint(1,15) + phrase1 + ["<sep>"] + phrase2

phrases = [add_unk(str(phrases[i]), str(phrases[i+1])) for i in range(0, len(phrases), 2)]

def generate_word2vec_embeddings(size):
  embeddings = Word2Vec(sentences=phrases, workers=300, min_count=1, vector_size=size)
  # embeddings.wv["<UNK>"] = np.random.rand(size)
  # embeddings.wv["<sep>"] = np.random.rand(size)
  # embeddings.wv["<pad>"] = np.random.rand(size)

  return embeddings


embeddings = generate_word2vec_embeddings(256)
coded_embeddings = torch.FloatTensor(embeddings.wv.vectors).to(DEVICE)
vocab = embeddings.wv.key_to_index

['<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 'Nick', '(', 'Kevin', 'Anderson', ')', 'goes', 'back', 'to', 'his', 'hometown', 'to', 'take', 'care', 'of', 'his', 'dying', 'mother', '(', 'Kim', 'Novak', ')', '.', 'There', 'he', 'encounters', 'an', 'old', 'college', 'buddy', '(', 'Bill', 'Pullman', ')', 'and', 'his', 'beautiful', 'wife', '(', 'Pamela', 'Gidley', ')', '.', '<UNK>', 'also', 'gets', 'involved', 'with', 'a', '40', 'year', 'old', 'sex', 'murder', 'that', 'may', 'have', '<UNK>', 'to', 'do', 'with', 'him', '.', '.', '.', 'I', 'caught', 'this', 'in', '<UNK>', 'theatre', 'back', 'in', '1991', '.', 'It', 'was', 'part', 'of', 'the', 'Boston', 'Film', 'Festival', 'and', 'I', 'had', 'heard', 'it', 'had', 'some', 'incredibly', 'beautiful', 'cinematography', '.', 'Well', '-', '<UNK>', 'it', 'does', '.', 'It', 'just', "doesn't", 'have', 'much', 'of', 'a', 'story', 'to', 'go', 'with', '<UNK>', '.', '<UNK>', 'the', 'film', 'moves', 'so', 'SLOWLY', '-', '

In [24]:
model = TwoLinearLayerGruRNN(256, 1, 256, coded_embeddings, vocab)
model.load_state_dict(torch.load(TEST_MODEL_LOCATION))



RuntimeError: Error(s) in loading state_dict for TwoLinearLayerGruRNN:
	size mismatch for get_embedding.weight: copying a param with shape torch.Size([234231, 256]) from checkpoint, the shape in current model is torch.Size([229074, 256]).

## Inference Loop and Result Reporting

In [None]:
model.eval()
torch.set_grad_enabled(False)
num_correct = 0

loader = DataLoader(test_samples, batch_size=64, shuffle=True, generator=torch.Generator(device=DEVICE))

for index, value in enumerate(loader):
    data, labels = value
    labels = labels.reshape(-1,1)
    labels = labels.type(torch.FloatTensor)
    labels = labels.to(DEVICE)

    outputs = model(data)

    for answer, standard in zip(outputs, labels):
        normalised_answer = torch.round(torch.sigmoid(answer[0]))
        num_correct += 1 if normalised_answer == standard[0] else 0

torch.set_grad_enabled(True)

print(f"Accuracy {num_correct / len(loader.dataset)}")

In [None]:
word_vectors = embeddings
word_vectors.save("word2vectest.model")

In [None]:
wv = Word2Vec.load("word2vectest.model", mmap='r')

## Metrics Reporting

In [None]:
## TODO IMPLEMENT NE