<a href="https://colab.research.google.com/github/WillN202/NLU_CW/blob/main/NLU_Task_2_TrainedWord2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLU Task 2


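For this task I've implemented a GRU-based recurrent classifier over Word2Vec word embeddings. Each sample is a pair of texts joined by a `<sep>` token, and the model predicts a single binary label for the pair. Hyperparameters are selected with Ray Tune. TODO: expand this summary with the final results.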

## Imports and Setup

In [None]:
# Install torchmetrics and Ray Tune, both of which are imported by the cells below.
!pip install torchmetrics
!pip install ray[tune]
# Mount Google Drive: the dataset CSVs and saved embeddings are read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Collecting torchmetrics
  Downloading torchmetrics-1.3.2-py3-none-any.whl (841 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/841.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.4/841.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m839.7/841.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m841.5/841.5 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cuda_runtime_

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchmetrics
import os.path
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch
import string
import re
import gensim.downloader
import tempfile
# Locations of the data splits and the saved embeddings on the mounted Drive.
TRAINING_DATASET_LOCATION = "/content/drive/MyDrive/train.csv"
DEV_DATASET_LOCATION = "/content/drive/MyDrive/dev.csv"
TEST_DATASET_LOCATION = "/content/drive/MyDrive/AV_trial.csv"
WORD2VEC_EMBEDDINGS = "/content/drive/MyDrive/word2vec_embeddings_tessty.model"
EPOCHS = 4

# Pick the best available accelerator: CUDA first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

## Data Loading and Pre Processing

In [None]:
def generic_preprocessor(sentence):
  """Return `sentence` converted to lower case; no other normalisation is applied."""
  return sentence.lower()

class AVDataset(Dataset):
  """Dataset of text pairs read from a CSV file.

  The first two columns of the CSV are the two texts and the third column is
  the label. Each item is returned as `(text, label)` where `text` is the two
  texts joined by the literal token ` <sep> `.

  Args:
    csv_file: path (or buffer) passed straight to `pandas.read_csv`.
    pre_processor: optional callable applied to the joined text. When it is
      None the joined text is returned unchanged.
  """

  def __init__(self, csv_file, pre_processor=None):
    self.samples = pd.read_csv(csv_file)
    self.pre_processor = pre_processor

  def __len__(self):
    """Number of rows in the CSV."""
    return len(self.samples)

  def __getitem__(self, index):
    """Return `(joined_text, label)` for the row at position `index`."""
    sample = self.samples.iloc[index]
    # Columns are addressed by position; `.iloc` avoids the deprecated
    # integer-key fallback of `Series.__getitem__` on a string-labelled row.
    sample_text = f"{sample.iloc[0]} <sep> {sample.iloc[1]}"
    # The pre-processor is optional (it defaults to None), so only call it when given.
    if self.pre_processor is not None:
      sample_text = self.pre_processor(sample_text)
    return (sample_text, sample.iloc[2])


In [None]:
# One dataset per split (train, dev, test), all sharing the lower-casing pre-processor.
training_samples, dev_samples, test_samples = (
    AVDataset(csv_path, pre_processor=generic_preprocessor)
    for csv_path in (TRAINING_DATASET_LOCATION, DEV_DATASET_LOCATION, TEST_DATASET_LOCATION)
)

## Word Embedding Generation

In [None]:
# Build the Word2Vec training corpus from both text columns of the training split.
phrases_frame = pd.read_csv(TRAINING_DATASET_LOCATION)
raw_phrases = phrases_frame.loc[:, "text_1":"text_2"].to_numpy().flatten().tolist()
# Tokenise exactly as the model does at lookup time: the datasets run every text
# through generic_preprocessor before the model splits it on whitespace, so the
# corpus must be pre-processed the same way or differently-cased words would
# never match the learned vocabulary.
phrases = [generic_preprocessor(str(phrase)).split() for phrase in raw_phrases]

def generate_word2vec_embeddings(size, workers=None):
  """Train a Word2Vec model on the module-level `phrases` corpus.

  Args:
    size: dimensionality of the learned word vectors.
    workers: number of training threads. Defaults to the number of CPUs
      reported by the OS (at least 1) rather than a fixed, oversubscribed count.

  Returns:
    The trained `Word2Vec` model, whose `wv` additionally contains randomly
    initialised vectors for the special tokens `<UNK>`, `<sep>` and `<pad>`.
  """
  if workers is None:
    workers = os.cpu_count() or 1

  embeddings = Word2Vec(sentences=phrases, workers=workers, min_count=1, vector_size=size)
  # Special tokens never occur in the raw corpus, so give each a random vector.
  for special_token in ("<UNK>", "<sep>", "<pad>"):
    embeddings.wv[special_token] = np.random.rand(size)

  return embeddings


# Train one embedding model per candidate vector size, largest first.
embeddings_512, embeddings_256, embeddings_128, embeddings_64 = (
    generate_word2vec_embeddings(vector_size) for vector_size in (512, 256, 128, 64)
)

## Model Creation

To reduce code duplication, the GRU models all derive from a shared base class. Each subclass supplies its own stack of linear layers on top of the GRU, since the number of linear layers could not be varied using constructor parameters alone.

In [None]:
class BaseGruRNN(torch.nn.Module):
  """Shared embedding + GRU encoder used by every classifier variant.

  Subclasses own the linear head and pass it to `forward`.

  Args:
    embedding_size: width of each embedding vector (GRU input size).
    output_size: width of the final prediction; unused here, accepted so that
      all variants share one constructor signature.
    hidden_size: GRU hidden state width.
    embeddings: 2-D float tensor of pretrained vectors for `Embedding.from_pretrained`.
    vocab: mapping from token to row index in `embeddings`; must contain the
      keys `<UNK>` and `<pad>`.
    rnn_layers: number of stacked GRU layers.
    is_bidirectional: whether the GRU runs in both directions.
  """

  def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
    super().__init__()
    self.vocab = vocab
    self.get_embedding = torch.nn.Embedding.from_pretrained(embeddings)
    self.GRU_Layer = torch.nn.GRU(
        embedding_size,
        hidden_size,
        batch_first=True,
        num_layers=rnn_layers,
        # Inter-layer dropout is meaningless (and warns) with a single layer.
        dropout=0.1 if rnn_layers > 1 else 0.0,
        bidirectional=is_bidirectional,
    )

  def forward(self, x, linear_layer):
    """Encode a batch of whitespace-tokenised sentences and apply `linear_layer`.

    Args:
      x: iterable of sentence strings.
      linear_layer: module applied to the final GRU state of width `hidden_size`.

    Returns:
      The output of `linear_layer`, one row per sentence.
    """
    unk_index = self.vocab["<UNK>"]
    pad_index = self.vocab["<pad>"]
    token_ids = [[self.vocab.get(word, unk_index) for word in sentence.split()] for sentence in x]

    # Left-pad to the longest sentence so the last time step is always a real
    # token; keep at least one step so an all-empty batch still has a sequence.
    # TODO -> instead of padding, use pack sequence instead.
    max_len = max(max((len(ids) for ids in token_ids), default=0), 1)
    padded_ids = [[pad_index] * (max_len - len(ids)) + ids for ids in token_ids]

    # Build the indices on the same device as the embedding table instead of
    # relying on the process-wide default device.
    indices = torch.tensor(padded_ids, dtype=torch.long, device=self.get_embedding.weight.device)

    embedded = self.get_embedding(indices)
    # h_n is laid out as (num_layers * num_directions, batch, hidden_size).
    _, hidden_states = self.GRU_Layer(embedded)
    if self.GRU_Layer.bidirectional:
      # The last two entries are the top layer's forward and backward states.
      # Summing them keeps the width at hidden_size, which the linear heads expect.
      final_state = hidden_states[-2] + hidden_states[-1]
    else:
      # Top layer's final state (index 0 would be the *first* layer).
      final_state = hidden_states[-1]
    return linear_layer(final_state)

class OneLinearLayerGruRNN(BaseGruRNN):
    """GRU classifier whose head is a single linear projection to `output_size`."""

    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      # `__init__` returns None, so its result is not stored on the instance.
      super().__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, output_size),
      )

    def forward(self, x):
      """Encode the batch of sentences `x` and project it through the linear head."""
      return super().forward(x, self.linear_layer)

class TwoLinearLayerGruRNN(BaseGruRNN):
    """GRU classifier with one hidden linear layer before the output projection."""

    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      super(TwoLinearLayerGruRNN, self).__init__(
          embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional
      )
      # Hidden block (Linear -> LeakyReLU -> Dropout) followed by the output layer.
      head_modules = [
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, output_size),
      ]
      self.linear_layer = torch.nn.Sequential(*head_modules)

    def forward(self, x):
      """Run the shared encoder on `x` and feed its state through this head."""
      return BaseGruRNN.forward(self, x, self.linear_layer)

class ThreeLinearLayerGruRNN(BaseGruRNN):
    """GRU classifier with two hidden linear layers before the output projection."""

    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      super().__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      head_modules = []
      # Two identical hidden blocks, created in order so module indices stay 0..5.
      for _ in range(2):
        head_modules.append(torch.nn.Linear(hidden_size, hidden_size))
        head_modules.append(torch.nn.LeakyReLU())
        head_modules.append(torch.nn.Dropout(p=0.1))
      head_modules.append(torch.nn.Linear(hidden_size, output_size))
      self.linear_layer = torch.nn.Sequential(*head_modules)

    def forward(self, x):
      """Run the shared encoder on `x` and feed its state through this head."""
      return BaseGruRNN.forward(self, x, self.linear_layer)

class FourLinearLayerGruRNN(BaseGruRNN):
    """GRU classifier with three hidden linear layers before the output projection."""

    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      super().__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      head_modules = []
      # Three identical hidden blocks, created in order so module indices stay 0..8.
      for _ in range(3):
        head_modules.append(torch.nn.Linear(hidden_size, hidden_size))
        head_modules.append(torch.nn.LeakyReLU())
        head_modules.append(torch.nn.Dropout(p=0.1))
      head_modules.append(torch.nn.Linear(hidden_size, output_size))
      self.linear_layer = torch.nn.Sequential(*head_modules)

    def forward(self, x):
      """Run the shared encoder on `x` and feed its state through this head."""
      return BaseGruRNN.forward(self, x, self.linear_layer)


## Model Training

In [None]:
def train_and_validate_loop(model, batch_size, learning_rate, epochs, tuning=False, checkpoint_every=3):
  """Train `model` on `training_samples` and validate it on `dev_samples` every epoch.

  Args:
    model: classifier that takes a batch of sentence strings and returns one
      logit per sample; expected to already live on the training device.
    batch_size: batch size for both the training and the dev loader.
    learning_rate: Adam learning rate.
    epochs: number of passes over the training data.
    tuning: when True, report metrics (and periodic checkpoints) to Ray Tune
      instead of printing them.
    checkpoint_every: while tuning, attach a checkpoint to every N-th epoch's
      report. A falsy value disables checkpointing.

  Side effects:
    Sets the process-wide default torch device and enables gradient tracking.
  """
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
  torch.set_default_device(device)
  # Make sure gradients are on even if an earlier, interrupted cell disabled them.
  torch.set_grad_enabled(True)

  loss_function = torch.nn.BCEWithLogitsLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
  # NOTE(review): training batches are served in file order (no shuffle=True) - confirm this is intended.
  training_loader = DataLoader(training_samples, batch_size=batch_size, generator=torch.Generator(device=device))
  dev_loader = DataLoader(dev_samples, batch_size=batch_size, generator=torch.Generator(device=device))
  accuracy = torchmetrics.Accuracy(task="binary").to(device)
  f1_score = torchmetrics.F1Score(task="binary").to(device)

  def prepare_labels(labels):
    # BCEWithLogitsLoss needs float targets shaped like the (batch, 1) logits.
    return labels.reshape(-1, 1).to(device=device, dtype=torch.float32)

  if not tuning:
    print(f"Epochs: {epochs}")

  for epoch in range(epochs):
    # Training
    model.train()
    training_loss = 0.0
    for data, labels in training_loader:
      optimizer.zero_grad()
      outputs = model(data)
      loss = loss_function(outputs, prepare_labels(labels))
      loss.backward()
      optimizer.step()
      training_loss += loss.item()

    # Validation to ensure the model is learning
    model.eval()
    validation_loss = 0.0
    batch_predictions = []
    batch_targets = []
    # no_grad restores the previous grad mode even if validation raises.
    with torch.no_grad():
      for data, labels in dev_loader:
        labels = prepare_labels(labels)
        outputs = model(data)
        validation_loss += loss_function(outputs, labels).item()
        batch_predictions.append(torch.sigmoid(outputs))
        batch_targets.append(labels)

    # Concatenate once instead of growing a tensor batch by batch.
    pred = torch.cat(batch_predictions)
    gold_standard = torch.cat(batch_targets)
    batch_loss = validation_loss / len(dev_loader)
    score = f1_score(pred, gold_standard).item()

    if tuning:
      with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
        checkpoint = None
        if checkpoint_every and (epoch + 1) % checkpoint_every == 0:
          # This saves the model to the trial directory
          torch.save(
              model.state_dict(),
              os.path.join(temp_checkpoint_dir, "model.pth")
          )
          checkpoint = train.Checkpoint.from_directory(temp_checkpoint_dir)
        # Report the dev loss alongside the F1 score so either can rank trials.
        train.report({"score": score, "loss": batch_loss}, checkpoint=checkpoint)
    else:
      print(f"---------------------EPOCH {epoch+1} / {epochs}---------------------")
      print(f"Training Loss {training_loss / max(len(training_loader), 1)}")
      print(f"Batch Loss {batch_loss}")
      print(f"Accuracy {accuracy(pred, gold_standard).item()}")


In [None]:
# test_embeddings = gensim.downloader.load("word2vec-google-news-300")
# test_embeddings["<UNK>"] = np.random.rand(300)
# test_embeddings["<sep>"] = np.random.rand(300)
# test_embeddings["<pad>"] = np.random.rand(300)
# coded_embeddings = torch.FloatTensor(test_embeddings.vectors).to(DEVICE)
# vocab = test_embeddings.key_to_index
# model = FourLinearLayerGruRNN(300, 1, 300, coded_embeddings, vocab, 1, is_bidirectional=True).to(DEVICE)
# train_and_validate_loop(model,64, 0.000019238, 3)

## Hyperparameter Selection

In [None]:
def tune_training_step(config, embedding, network):
  """Ray Tune trainable: build the model described by `config` and train it.

  Args:
    config: sampled hyperparameters. Reads `linear_layers`, `rnn_layers`,
      `is_bidirectional`, `batch_size`, `lr` and `epochs`; an optional
      `hidden_size` overrides the GRU hidden width.
    embedding: object exposing `vectors` (2-D array of word vectors) and
      `key_to_index` (token -> row index).
    network: mapping from number of linear layers to the model class to build.
  """
  device = "cuda:0" if torch.cuda.is_available() else "cpu"

  chosen_network = network[config["linear_layers"]]
  coded_embeddings = torch.FloatTensor(np.array(embedding.vectors)).to(device)
  vocab = embedding.key_to_index

  # Take the sizes from the embedding matrix itself so embeddings of any width
  # work; the hidden width defaults to the embedding width.
  embedding_size = coded_embeddings.shape[1]
  hidden_size = config.get("hidden_size", embedding_size)

  generated_model = chosen_network(
      embedding_size, 1, hidden_size, coded_embeddings, vocab,
      config["rnn_layers"], config["is_bidirectional"]
  ).to(device)
  train_and_validate_loop(generated_model, config["batch_size"], config["lr"], config["epochs"], True)


# Search space sampled by Ray Tune for each trial.
config = {
  "lr": tune.loguniform(1e-6, 1e-2),
  "batch_size": tune.choice([8, 16, 32, 64, 128]),
  "is_bidirectional": tune.choice([True, False]),
  "epochs": tune.choice(list(range(3,15))),
  "linear_layers": tune.choice(list(range(1,5))),
  "rnn_layers": tune.choice(list(range(1,5))),
  # "embedding_size": tune.choice([64,128,256,512]),
  # "hidden_size": tune.choice(list(range(32, 513, 32)))
}


# Locally trained embeddings keyed by vector size (not referenced by the tuner below).
embedding = {
    64: embeddings_64,
    128: embeddings_128,
    256: embeddings_256,
    512: embeddings_512
}

# Model class to instantiate for each sampled number of linear layers.
network = {
    1: OneLinearLayerGruRNN,
    2: TwoLinearLayerGruRNN,
    3: ThreeLinearLayerGruRNN,
    4: FourLinearLayerGruRNN
}

# max_t matches the largest value "epochs" can take in the search space.
asha_scheduler = ASHAScheduler(
        metric="score",
        mode="max",
        max_t=14,
        grace_period=2,
        reduction_factor=2
    )

chosen_search_alg = HyperOptSearch(metric="score", mode="max")

chosen_embedding = gensim.downloader.load("word2vec-google-news-300")
chosen_embedding["<UNK>"] = np.random.rand(300)
chosen_embedding["<sep>"] = np.random.rand(300)
chosen_embedding["<pad>"] = np.random.rand(300)
# Save only after the special tokens are added, so the file on disk has the
# same vocabulary and vectors that the models are trained (and checkpointed) with.
chosen_embedding.save(WORD2VEC_EMBEDDINGS)

training_wrapper = tune.with_resources(tune.with_parameters(tune_training_step, embedding=chosen_embedding, network=network), {"CPU": 1.6, "GPU": 1/3})
tuner = tune.Tuner(
    training_wrapper,
    tune_config=tune.TuneConfig(
        num_samples=35,
        search_alg=chosen_search_alg,
        scheduler=asha_scheduler,
    ),
    param_space=config
)

result = tuner.fit()


KeyboardInterrupt: 

## Model Evaluation

In [None]:
# train_and_validate_loop reports the dev-set F1 under the key "score", and both
# the scheduler and the search algorithm maximise it, so rank trials the same way.
best_result = result.get_best_result("score", mode="max")
# Trials stopped before their first checkpointed epoch have no checkpoint.
if best_result.checkpoint is None:
    raise RuntimeError("The best trial finished without saving a checkpoint.")
with best_result.checkpoint.as_directory() as checkpoint_dir:
    # map_location lets weights saved on a GPU load on whatever DEVICE is available now.
    state_dict = torch.load(os.path.join(checkpoint_dir, "model.pth"), map_location=DEVICE)

print(best_result)

Result(
  metrics={'loss': 0.6928823590278625},
  path='/root/ray_results/tune_training_step_2024-04-23_17-59-11/tune_training_step_9b7af96a_5_batch_size=128,epochs=4,is_bidirectional=False,linear_layers=1,lr=0.0000,rnn_layers=4_2024-04-23_18-05-05',
  filesystem='local',
  checkpoint=Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_17-59-11/tune_training_step_9b7af96a_5_batch_size=128,epochs=4,is_bidirectional=False,linear_layers=1,lr=0.0000,rnn_layers=4_2024-04-23_18-05-05/checkpoint_000000)
)


In [None]:
# Rebuild the architecture from the winning trial's sampled hyperparameters
# rather than hard-coding them, so the model always matches the loaded weights.
best_config = best_result.config
coded_embeddings = torch.FloatTensor(chosen_embedding.vectors).to(DEVICE)
vocab = chosen_embedding.key_to_index
embedding_size = coded_embeddings.shape[1]
model = network[best_config["linear_layers"]](
    embedding_size,
    1,
    best_config.get("hidden_size", embedding_size),
    coded_embeddings,
    vocab,
    best_config["rnn_layers"],
    is_bidirectional=best_config["is_bidirectional"],
).to(DEVICE)
model.load_state_dict(state_dict)


<All keys matched successfully>

In [None]:
# Score the restored model on the held-out test split.
model.eval()
torch.set_default_device(DEVICE)
loader = DataLoader(test_samples, batch_size=16, generator=torch.Generator(device=DEVICE))

batch_answers = []
batch_labels = []
# no_grad restores the previous grad mode even if a batch raises.
with torch.no_grad():
    for data, labels in loader:
        batch_labels.append(labels.reshape(-1, 1).to(device=DEVICE, dtype=torch.float32))
        # Convert logits to probabilities explicitly, as the validation loop does.
        # torchmetrics only treats float predictions as logits when some value
        # lies outside [0, 1], so raw logits could be thresholded at 0.5 by mistake.
        batch_answers.append(torch.sigmoid(model(data)))

# Concatenate once instead of growing a tensor batch by batch.
answers = torch.cat(batch_answers)
gold_standard = torch.cat(batch_labels)

accuracy = torchmetrics.Accuracy(task="binary").to(DEVICE)
mcc = torchmetrics.MatthewsCorrCoef(task="binary").to(DEVICE)
print(f"Accuracy {accuracy(answers, gold_standard)}")
print(f"MCC {mcc(answers, gold_standard)}")

Accuracy 0.4399999976158142
MCC -0.13093073666095734
