# Import Packages

In [None]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchtext
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm

import wandb
from model_training_utils import training_loop, plot_metrics
from models import LSTM
from text_preprocessing_utils import preprocess_tweet
from tweet_data_set import TweetsDataset

# Print the installed library versions alongside the notebook output.
print(f"PyTorch version: {torch.__version__}")
print(f"torchtext version: {torchtext.__version__}")

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), then puts cuDNN into deterministic mode with benchmarking off.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: ``random`` is not among the notebook-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Ask cuDNN for deterministic kernels and disable its autotuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(42)

In [None]:
# Tell wandb which notebook this run belongs to. The path is resolved against
# the current working directory rather than hard-coded to one user's machine,
# so the notebook stays runnable elsewhere.
# NOTE(review): assumes the kernel is started in the notebook's directory, as
# the relative data paths used in this notebook also do — confirm.
os.environ["WANDB_NOTEBOOK_NAME"] = os.path.abspath("03-LSTM.ipynb")

In [None]:
# Log in to Weights & Biases before any run is created.
wandb.login()

# Load GloVe embedding

In [None]:
def load_embedding(embedding_file):
    """Read a GloVe-style text embedding file into a dictionary.

    Each non-blank line is expected to hold a word followed by its vector
    components, separated by whitespace. Blank lines are skipped.

    Args:
        embedding_file: Path of the UTF-8 encoded embedding text file.

    Returns:
        A dict mapping each word to a 1-D ``np.float64`` array.
    """
    glove_embedding_dict = {}

    # A distinct handle name keeps the ``embedding_file`` path argument from
    # being shadowed by the open file object.
    with open(embedding_file, encoding="utf8") as handle:
        for line in handle:
            tokens = line.split()
            if not tokens:
                # Blank line: there is no word to index.
                continue
            word, *components = tokens
            glove_embedding_dict[word] = np.array(components, dtype=np.float64)

    return glove_embedding_dict

In [None]:
# Load the pretrained vectors from the glove.6B.100d text file.
glove_embedding_dict = load_embedding("data/glove.6B/glove.6B.100d.txt")

In [None]:
# Report how many words the loaded embedding dictionary contains.
print(f"Number of words in GloVe embedding: {len(glove_embedding_dict)}")

# Load data

In [None]:
def read_data(file_name: str, data_dir: str = "data/semeval-tweets"):
    """Load a tab-separated tweets file into a DataFrame.

    Every non-blank line of ``{data_dir}/{file_name}.txt`` is split into
    tweet id, sentiment and text. Only the first two tabs act as separators,
    so tab characters inside the tweet text stay in the text instead of
    producing extra fields.

    Args:
        file_name: File name without the ``.txt`` extension.
        data_dir: Directory that contains the file.

    Returns:
        A DataFrame with the columns ``tweet_id``, ``tweet_sentiment`` and
        ``tweet_text``, one row per non-blank line.
    """
    columns = ["tweet_id", "tweet_sentiment", "tweet_text"]
    data_list = []

    with open(f"{data_dir}/{file_name}.txt", encoding="utf8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no tweet; skip them instead of adding a row.
                continue
            data_list.append(stripped.split("\t", maxsplit=len(columns) - 1))

    return pd.DataFrame(data=data_list, columns=columns)

In [None]:
# Training split.
training_data = read_data("twitter-training-data")

In [None]:
# Development split.
development_data = read_data("twitter-dev-data")

In [None]:
# First test file.
testing_1_data = read_data("twitter-test1")

In [None]:
# Second test file.
testing_2_data = read_data("twitter-test2")

In [None]:
# Third test file.
testing_3_data = read_data("twitter-test3")

In [None]:
# Preview the first rows of the training DataFrame.
training_data.head()

# Build vocabulary

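We build the vocabulary **only** on the training data, so that no information from the development or test sets leaks into the model. Words that are not in the vocabulary are mapped to `<unk>`.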

In [None]:
# torchtext's "basic_english" tokenizer; it is handed to preprocess_tweet below.
tokenizer = get_tokenizer("basic_english")

In [None]:
# Preprocess every training tweet with the project's preprocess_tweet helper.
training_tweets_preprocessed = [preprocess_tweet(tweet, tokenizer) for tweet in training_data["tweet_text"]]

In [None]:
# Apply the same preprocessing to the development tweets.
development_tweets_preprocessed = [preprocess_tweet(tweet, tokenizer) for tweet in development_data["tweet_text"]]

In [None]:
# Build the vocabulary from the preprocessed training tweets only, registering
# <unk> and <pad> as special tokens.
special_tokens = ["<unk>", "<pad>"]
vocab = build_vocab_from_iterator(training_tweets_preprocessed, specials=special_tokens)

In [None]:
# Look up the special-token indices once, then route every
# out-of-vocabulary lookup to the <unk> index.
pad_index = vocab["<pad>"]
unk_index = vocab["<unk>"]
vocab.set_default_index(unk_index)

In [None]:
# Vocabulary size, including the two special tokens.
print(f"Vocabulary size: {len(vocab)}")

# Build embedding matrix

In [None]:
# Start from an all-zero matrix with one row per vocabulary entry; the rows
# are filled in by the loop further down.
vocab_size = len(vocab)
embedding_dim = 100
embedding_matrix = torch.zeros((vocab_size, embedding_dim))

In [None]:
# Expected shape: (vocab_size, embedding_dim).
print(embedding_matrix.shape)

In [None]:
# Collects vocabulary words that have no entry in the GloVe dictionary.
unknown_words = []

In [None]:
# Fill one row per vocabulary entry: the GloVe vector when the word is known,
# a random normal vector otherwise.
for word, idx in tqdm(vocab.get_stoi().items()):
    if idx == pad_index:
        # Leave the padding row at zero. nn.Embedding zero-initialises the
        # padding_idx row, and this matrix later replaces the layer's weights,
        # so a random vector here would make padding positions carry signal.
        continue
    vector = glove_embedding_dict.get(word)
    if vector is not None:
        embedding_matrix[idx] = torch.tensor(vector)
    else:
        unknown_words.append(word)
        embedding_matrix[idx] = torch.randn(embedding_dim)

In [None]:
# Filling the rows must not change the matrix shape.
embedding_matrix.shape

In [None]:
# The ``:.2%`` format multiplies the ratio by 100 and appends the percent
# sign; formatting the raw fraction with ``:.2f`` printed e.g. "0.25%" for 25%.
print(
    f"There are {len(unknown_words)} ({len(unknown_words) / len(vocab):.2%}) words in the vocabulary that are not in the GloVe embedding.")

In [None]:
# Show a sample of the words that were missing from GloVe.
print(unknown_words[:100])

# Define Dataset

In [None]:
# Fit the label encoder on the training sentiments; the same fitted encoder is
# passed to both datasets below.
encoder = LabelEncoder()

encoder.fit(training_data['tweet_sentiment'])

In [None]:
# The sentiment classes the encoder learned from the training data.
print(encoder.classes_)

In [None]:
# Training dataset built from ids, preprocessed tweets and sentiment labels,
# together with the shared vocabulary and label encoder.
train_dataset = TweetsDataset(training_data['tweet_id'], training_tweets_preprocessed, training_data["tweet_sentiment"],
                              vocab, encoder)

In [None]:
# Development dataset; it reuses the vocabulary and label encoder that were
# built and fitted on the training data.
development_dataset = TweetsDataset(development_data['tweet_id'], development_tweets_preprocessed,
                                    development_data["tweet_sentiment"], vocab, encoder)

In [None]:
# Report the number of items in each dataset.
print(f"Training dataset size: {len(train_dataset)}")
print(f"Development dataset size: {len(development_dataset)}")

In [None]:
def collate_batch(batch):
    """Assemble a list of dataset items into one padded batch.

    Each item is read by position: index 0 is the tweet id, index 1 the
    token tensor and index 2 the label.

    Args:
        batch: Sequence of items as yielded by the dataset.

    Returns:
        A tuple ``(tweet_ids, padded_tweets, labels)``: the ids as a NumPy
        array, the tweets padded batch-first with the ``<pad>`` index, and
        the labels as a ``torch.long`` tensor.
    """
    ids, sequences, targets = [], [], []
    for item in batch:
        ids.append(item[0])
        sequences.append(item[1])
        targets.append(item[2])

    tweet_ids = np.array(ids)
    label_array = np.array(targets)

    # Pad every sequence to the longest one in this batch.
    pad_value = vocab["<pad>"]
    padded_tweets = pad_sequence(sequences, batch_first=True, padding_value=pad_value)

    label_tensor = torch.from_numpy(label_array).to(dtype=torch.long)
    return tweet_ids, padded_tweets, label_tensor

In [None]:
# Training batches of 256 items, shuffled, collated with collate_batch.
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True, collate_fn=collate_batch)

In [None]:
# Development batches of 256 items, kept in dataset order (no shuffling).
development_dataloader = DataLoader(development_dataset, batch_size=256, shuffle=False, collate_fn=collate_batch)

# Define Model

In [None]:
# Hyperparameters for the project's LSTM class (imported from models).
# embedding_dim must match the width of embedding_matrix, which is copied into
# the embedding layer further down.
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 300
output_dim = 3
n_layers = 2
bidirectional = True
dropout_rate = 0.5

model = LSTM(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
    n_layers,
    bidirectional,
    dropout_rate,
    pad_index,
)

In [None]:
# Show the module structure.
print(model)

In [None]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Parameters with ``requires_grad`` set to False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# Report the trainable parameter count with thousands separators.
print(f"The model has {count_parameters(model):,} trainable parameters")

In [None]:
def initialize_weights(m):
    """Initialise ``nn.Linear`` and ``nn.LSTM`` modules in place.

    Meant to be passed to ``model.apply``. Linear weights get Xavier-normal
    values, LSTM weight matrices get orthogonal values, and every bias is
    set to zero. Modules of any other type are left untouched.

    Args:
        m: The module to initialise.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        # ``bias`` is None for layers created with ``bias=False``.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if "bias" in name:
                nn.init.zeros_(param)
            elif "weight" in name:
                nn.init.orthogonal_(param)

In [None]:
# Run initialize_weights on every submodule of the model.
model.apply(initialize_weights)

In [None]:
# Copy the pretrained vectors into the embedding layer. An in-place copy under
# no_grad keeps the existing Parameter instead of rebinding ``.data``, fails
# loudly on a shape mismatch, and does not alias ``embedding_matrix``, so
# training cannot modify the matrix.
with torch.no_grad():
    model.embedding.weight.copy_(embedding_matrix)

In [None]:
# Active configuration: plain SGD with a learning rate of 0.1. The
# commented-out lines keep an alternative Adam setup (lr = 5e-4) for reference.
# lr = 5e-4
lr = 0.1
# optimizer = torch.optim.Adam(model.parameters(), lr=lr)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

In [None]:
# Cross-entropy loss for the classifier.
criterion = nn.CrossEntropyLoss()

In [None]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Epoch count passed to training_loop.
n_epochs = 30

In [None]:
# Start the wandb run. Epoch count and batch size are read from the objects
# actually used for training, so the logged config cannot drift from the real
# settings (the config previously recorded 10 epochs while n_epochs was 30).
wandb_run = wandb.init(
    project="sentiment-analysis",
    name="glove-lstm-5",
    config={
        "learning_rate": lr,
        "architecture": "Bi-LSTM",
        "features": "GloVe embedding",
        "batch_size": train_dataloader.batch_size,
        "epochs": n_epochs,
        "optimizer": optimizer.__class__.__name__,
        "activation": "ReLU",
        "loss_function": "CrossEntropyLoss",
        "seed": 42,
    },
)

In [None]:
# Run training with the project's training_loop helper (imported from
# model_training_utils); its return value is passed to plot_metrics below.
metrics = training_loop(
    n_epochs,
    train_dataloader,
    development_dataloader,
    model,
    criterion,
    optimizer,
    device,
    wandb_run,
)

In [None]:
# Plot what training_loop returned.
plot_metrics(metrics)