## Initialization

In [None]:
import torch
from IPython.core.display import display
import data_loader
import traineval
import model as model
import importlib
import matplotlib.pyplot as plt
import random
import numpy as np
import pandas as pd


def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy's legacy global
    generator, and PyTorch (the default generator and all CUDA devices),
    in that order.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Global seed passed to set_seed() before each model is built.
seed = 42

# Use the fully qualified option name: the bare 'max_columns' pattern matches
# more than one option on recent pandas releases and raises OptionError.
pd.set_option('display.max_columns', 100)

# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"device used is {device}")

## Loading the Data

In [None]:
# Word-level datasets. The dev split is loaded with the vocabularies returned
# by the train loader.
train_dataset, tokens_vocab, y_vocab = data_loader.load_train_dataset()
dev_dataset = data_loader.load_dev_dataset(tokens_vocab, y_vocab)
for ds in (train_dataset, dev_dataset):
    print(ds)

# Sentence-level datasets derived from the word-level ones (used from Part 3 on).
to_sentences = data_loader.WSDSentencesDataset.from_word_dataset
sa_train_dataset = to_sentences(train_dataset)
sa_dev_dataset = to_sentences(dev_dataset)
for ds in (sa_train_dataset, sa_dev_dataset):
    print(ds)

## Part 1: Query-Based Attention

In [None]:
# Pick up any edits made to the model module since it was last imported.
importlib.reload(model)

# Hyperparameters for the Part 1 run.
D = 300
dropout = 0.25
lr = 8e-5
num_epochs = 5
batch_size = 100

# Re-seed all RNGs immediately before the model is constructed.
set_seed(seed)

m1 = model.WSDModel(tokens_vocab.size(), y_vocab.size(), D=D, dropout_prob=dropout)
m1 = m1.to(device)

optimizer = torch.optim.Adam(m1.parameters(), lr=lr)

# Train on the word-level datasets; keeps the loss and accuracy histories.
losses1, train_acc1, val_acc1 = traineval.train(
    m1,
    optimizer,
    train_dataset,
    dev_dataset,
    num_epochs=num_epochs,
    batch_size=batch_size,
)

In [None]:
# Report the final accuracies, then gate on the validation accuracy rounded
# to three decimals (the same precision that is printed).
print(f"Validation accuracy: {val_acc1[-1]:.3f}, Training accuracy:{train_acc1[-1]:.3f}")
assert round(val_acc1[-1], 3) >= 0.514, (
    "The last validation accuracy should be at least 0.514. "
    "Please check your implementation before you continue"
)

In [None]:
# Two stacked panels: training loss on top, train/validation accuracy below.
_, axs = plt.subplots(figsize=(15, 6), nrows=2)

axs[0].plot(losses1, '-', label='Train Loss')
axs[1].plot(train_acc1, '-o', label='Train Acc')
axs[1].plot(val_acc1, '-o', label='Val Acc')
for ax in axs:
    ax.legend()

plt.tight_layout()

In [None]:
# Presumably renders attention highlights for 5 sampled dev examples using m1;
# the helper lives in traineval — TODO confirm its exact behavior there.
# NOTE(review): "higlight" (sic) is the spelling used at every call site in this notebook.
traineval.higlight_samples(m1, dev_dataset, sample_size=5)

In [None]:
eval_df, attention_df = traineval.evaluate_verbose(m1, dev_dataset, iter_lim=100)

# Positions of the first five rows where the prediction differs from the label.
wrong_mask = eval_df['y_true'] != eval_df['y_pred']
idxs = list(np.where(wrong_mask)[0][:5])
display(traineval.highlight(eval_df, attention_df, idxs))

# Rows whose query token is 'left'.
# NOTE(review): np.where's 1-tuple is passed through unchanged here (not a
# list as above) — confirm traineval.highlight accepts both forms.
left_mask = eval_df['query_token'] == 'left'
idxs = np.where(left_mask)
display(traineval.highlight(eval_df, attention_df, idxs))

## Part 2: Padding

In [None]:
# Pick up any edits made to the model module since it was last imported.
importlib.reload(model)

# Hyperparameters for the Part 2 run.
D = 300
dropout = 0.25
lr = 8e-5
num_epochs = 5
batch_size = 100

# Re-seed all RNGs immediately before the model is constructed.
set_seed(seed)

m2 = model.WSDModel(
    tokens_vocab.size(), y_vocab.size(), D=D, dropout_prob=dropout, use_padding=True
)
m2 = m2.to(device)

optimizer = torch.optim.Adam(m2.parameters(), lr=lr)

# Train on the word-level datasets; keeps the loss and accuracy histories.
losses2, train_acc2, val_acc2 = traineval.train(
    m2,
    optimizer,
    train_dataset,
    dev_dataset,
    num_epochs=num_epochs,
    batch_size=batch_size,
)

In [None]:
# Report the final accuracies, then gate on the validation accuracy rounded
# to three decimals (the same precision that is printed).
print(f"Validation accuracy: {val_acc2[-1]:.3f}, Training accuracy:{train_acc2[-1]:.3f}")
assert round(val_acc2[-1], 3) >= 0.527, (
    "The last validation accuracy should be at least 0.527. "
    "Please check your implementation before you continue"
)

In [None]:
# Two stacked panels: training loss on top, train/validation accuracy below.
_, axs = plt.subplots(figsize=(15, 6), nrows=2)

axs[0].plot(losses2, '-', label='Train Loss')
axs[1].plot(train_acc2, '-o', label='Train Acc')
axs[1].plot(val_acc2, '-o', label='Val Acc')
for ax in axs:
    ax.legend()

plt.tight_layout()

In [None]:
# Presumably renders attention highlights for 5 sampled dev examples using m2;
# the helper lives in traineval — TODO confirm its exact behavior there.
# NOTE(review): "higlight" (sic) is the spelling used at every call site in this notebook.
traineval.higlight_samples(m2, dev_dataset, sample_size=5)

In [None]:
eval_df, attention_df = traineval.evaluate_verbose(m2, dev_dataset, iter_lim=100)

# Positions of the first five rows where the prediction differs from the label.
wrong_mask = eval_df['y_true'] != eval_df['y_pred']
idxs = list(np.where(wrong_mask)[0][:5])
display(traineval.highlight(eval_df, attention_df, idxs))

# Rows whose query token is 'left'.
# NOTE(review): np.where's 1-tuple is passed through unchanged here (not a
# list as above) — confirm traineval.highlight accepts both forms.
left_mask = eval_df['query_token'] == 'left'
idxs = np.where(left_mask)
display(traineval.highlight(eval_df, attention_df, idxs))

## Part 3: Self-Attention

In [None]:
# Pick up any edits made to the model module since it was last imported.
importlib.reload(model)

# Hyperparameters for the Part 3 run.
D = 300
dropout = 0.2
lr = 2e-4
num_epochs = 5
batch_size = 20

# Re-seed all RNGs immediately before the model is constructed.
set_seed(seed)

m3 = model.WSDModel(
    tokens_vocab.size(), y_vocab.size(), D=D, dropout_prob=dropout, use_padding=True
)
m3 = m3.to(device)

optimizer = torch.optim.Adam(m3.parameters(), lr=lr)

# Train on the sentence-level datasets; keeps the loss and accuracy histories.
losses3, train_acc3, val_acc3 = traineval.train(
    m3,
    optimizer,
    sa_train_dataset,
    sa_dev_dataset,
    num_epochs=num_epochs,
    batch_size=batch_size,
)

In [None]:
# Report the final accuracies, then gate on the validation accuracy.
# Unlike the Part 1 and Part 2 checks, the raw (unrounded) value is compared.
print(f"Validation accuracy: {val_acc3[-1]:.3f}, Training accuracy:{train_acc3[-1]:.3f}")
assert val_acc3[-1] >= 0.543, (
    "The last validation accuracy should be at least 0.543. "
    "Please check your implementation before you continue"
)

In [None]:
# Two stacked panels: training loss on top, train/validation accuracy below.
_, axs = plt.subplots(figsize=(15, 6), nrows=2)

axs[0].plot(losses3, '-', label='Train Loss')
axs[1].plot(train_acc3, '-o', label='Train Acc')
axs[1].plot(val_acc3, '-o', label='Val Acc')
for ax in axs:
    ax.legend()

plt.tight_layout()

In [None]:
# m3 is trained and validated on the sentence-level (sa_*) datasets, so sample
# from sa_dev_dataset here, matching the equivalent cells in Parts 4 and 5.
# Presumably renders attention highlights for 5 sampled examples — TODO confirm
# in traineval. "higlight" (sic) is the spelling used at every call site.
traineval.higlight_samples(m3, sa_dev_dataset, sample_size=5)

In [None]:
eval_df, attention_df = traineval.evaluate_verbose(m3, dev_dataset, iter_lim=100)

# Positions of the first five rows where the prediction differs from the label.
wrong_mask = eval_df['y_true'] != eval_df['y_pred']
idxs = list(np.where(wrong_mask)[0][:5])
display(traineval.highlight(eval_df, attention_df, idxs))

# Rows whose query token is 'left'.
# NOTE(review): np.where's 1-tuple is passed through unchanged here (not a
# list as above) — confirm traineval.highlight accepts both forms.
left_mask = eval_df['query_token'] == 'left'
idxs = np.where(left_mask)
display(traineval.highlight(eval_df, attention_df, idxs))

## Part 4: Positional Embeddings

In [None]:
# Pick up any edits made to the model module since it was last imported.
importlib.reload(model)

# Hyperparameters for the Part 4 run.
D = 300
dropout = 0.2
lr = 2e-4
num_epochs = 5
batch_size = 20

# Re-seed all RNGs immediately before the model is constructed.
set_seed(seed)

m4 = model.WSDModel(
    tokens_vocab.size(),
    y_vocab.size(),
    D=D,
    dropout_prob=dropout,
    use_padding=True,
    use_positional=True,
)
m4 = m4.to(device)

optimizer = torch.optim.Adam(m4.parameters(), lr=lr)

# Train on the sentence-level datasets; keeps the loss and accuracy histories.
losses4, train_acc4, val_acc4 = traineval.train(
    m4,
    optimizer,
    sa_train_dataset,
    sa_dev_dataset,
    num_epochs=num_epochs,
    batch_size=batch_size,
)

In [None]:
# Report the final accuracies, then gate on the validation accuracy.
# Unlike the Part 1 and Part 2 checks, the raw (unrounded) value is compared.
print(f"Validation accuracy: {val_acc4[-1]:.3f}, Training accuracy:{train_acc4[-1]:.3f}")
assert val_acc4[-1] >= 0.543, (
    "The last validation accuracy should be at least 0.543. "
    "Please check your implementation before you continue"
)

In [None]:
# Two stacked panels: training loss on top, train/validation accuracy below.
_, axs = plt.subplots(figsize=(15, 6), nrows=2)

axs[0].plot(losses4, '-', label='Train Loss')
axs[1].plot(train_acc4, '-o', label='Train Acc')
axs[1].plot(val_acc4, '-o', label='Val Acc')
for ax in axs:
    ax.legend()

plt.tight_layout()

In [None]:
# Presumably renders attention highlights for 5 sampled sentence-level dev
# examples using m4; the helper lives in traineval — TODO confirm behavior there.
# NOTE(review): "higlight" (sic) is the spelling used at every call site in this notebook.
traineval.higlight_samples(m4, sa_dev_dataset, sample_size=5)

In [None]:
eval_df, attention_df = traineval.evaluate_verbose(m4, dev_dataset, iter_lim=100)

# Positions of the first five rows where the prediction differs from the label.
wrong_mask = eval_df['y_true'] != eval_df['y_pred']
idxs = list(np.where(wrong_mask)[0][:5])
display(traineval.highlight(eval_df, attention_df, idxs))

# Rows whose query token is 'left'.
# NOTE(review): np.where's 1-tuple is passed through unchanged here (not a
# list as above) — confirm traineval.highlight accepts both forms.
left_mask = eval_df['query_token'] == 'left'
idxs = np.where(left_mask)
display(traineval.highlight(eval_df, attention_df, idxs))

## Part 5: Causal Attention

In [None]:
# Pick up any edits made to the model module since it was last imported.
importlib.reload(model)

# Hyperparameters are restated here (same values as the Part 4 cell) so this
# cell does not silently inherit them from whichever earlier cell ran last.
lr = 2e-4
dropout = 0.2
D = 300
batch_size = 20
num_epochs = 5

# Re-seed all RNGs immediately before the model is constructed.
set_seed(seed)

m5 = model.WSDModel(
    tokens_vocab.size(),
    y_vocab.size(),
    D=D,
    dropout_prob=dropout,
    use_padding=True,
    use_positional=True,
    use_causal=True
).to(device)

optimizer = torch.optim.Adam(m5.parameters(), lr=lr)

# Train on the sentence-level datasets; keeps the loss and accuracy histories.
losses5, train_acc5, val_acc5 = traineval.train(
    m5, optimizer, sa_train_dataset, sa_dev_dataset, num_epochs=num_epochs, batch_size=batch_size)

In [None]:
# Report the final accuracies, then gate on the validation accuracy.
# Unlike the Part 1 and Part 2 checks, the raw (unrounded) value is compared.
print(f"Validation accuracy: {val_acc5[-1]:.3f}, Training accuracy:{train_acc5[-1]:.3f}")
assert val_acc5[-1] >= 0.543, (
    "The last validation accuracy should be at least 0.543. "
    "Please check your implementation before you continue"
)

In [None]:
# Two stacked panels: training loss on top, train/validation accuracy below.
_, axs = plt.subplots(figsize=(15, 6), nrows=2)

axs[0].plot(losses5, '-', label='Train Loss')
axs[1].plot(train_acc5, '-o', label='Train Acc')
axs[1].plot(val_acc5, '-o', label='Val Acc')
for ax in axs:
    ax.legend()

plt.tight_layout()

In [None]:
# Presumably renders attention highlights for 5 sampled sentence-level dev
# examples using m5; the helper lives in traineval — TODO confirm behavior there.
# NOTE(review): "higlight" (sic) is the spelling used at every call site in this notebook.
traineval.higlight_samples(m5, sa_dev_dataset, sample_size=5)

In [None]:
eval_df, attention_df = traineval.evaluate_verbose(m5, dev_dataset, iter_lim=100)

# Positions of the first five rows where the prediction differs from the label.
wrong_mask = eval_df['y_true'] != eval_df['y_pred']
idxs = list(np.where(wrong_mask)[0][:5])
display(traineval.highlight(eval_df, attention_df, idxs))

# Rows whose query token is 'left'.
# NOTE(review): np.where's 1-tuple is passed through unchanged here (not a
# list as above) — confirm traineval.highlight accepts both forms.
left_mask = eval_df['query_token'] == 'left'
idxs = np.where(left_mask)
display(traineval.highlight(eval_df, attention_df, idxs))