# Install needed packages


In [None]:

! pip install ekphrasis wordcloud wordsnake nltk contractions transformers tqdm emoji umap-learn pillow wandb

# Load packages


In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchtext
import transformers
import umap
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm

import wandb
from scripts.data_loading_utils import load_embedding, read_tweet_data
from scripts.model_training_utils import plot_classification_results
from scripts.model_training_utils import plot_metrics, training_loop
from scripts.models import BERTClassifier, LSTM, count_parameters
from scripts.models import LSTMWithAttention
from scripts.plotting_utilities import generate_ngram_frequencies, generate_wordcloud_with_ngrams, \
    plot_top_common_ngrams
from scripts.text_preprocessing_utils import advanced_preprocessing
from scripts.tweet_data_set import BERTTweetsDataset
from scripts.tweet_data_set import TweetsDataset

print(f"PyTorch version: {torch.__version__}")
print(f"torchtext version: {torchtext.__version__}")

In [None]:
nltk.download('stopwords')

In [None]:
pd.set_option("display.max_colwidth", None)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
print(f"Using device: {device}")

# Setup Data Path


In [None]:
# Datasets and embedding files are expected in "data" under the current working directory.
data_dir_path = Path.cwd() / "data"

In [ ]:
# Directory for model checkpoints (the per-model paths below are built from it).
# Create it up front: the checkpoint paths are handed to training_loop, which
# presumably saves weights there and would fail on a missing directory.
models_weights_dir_path = Path(os.path.join(os.getcwd(), "models_weights"))
models_weights_dir_path.mkdir(parents=True, exist_ok=True)

# Read data


In [None]:
training_data = read_tweet_data("twitter-training-data")

In [None]:
development_data = read_tweet_data("twitter-dev-data")

In [None]:
test1_data = read_tweet_data("twitter-test1-data")

In [None]:
test2_data = read_tweet_data("twitter-test2-data")

In [None]:
test3_data = read_tweet_data("twitter-test3-data")

In [None]:
training_data.head()

In [None]:
# Absolute number of tweets per sentiment label in the training and development splits.
for split_name, split_frame in (
    ("Training data", training_data),
    ("Development data", development_data),
):
    label_counts = split_frame["tweet_sentiment"].value_counts().to_dict()
    print(f"{split_name}: {label_counts}")

In [None]:
# Relative share of each sentiment label in the training and development splits.
for split_name, split_frame in (
    ("Training data", training_data),
    ("Development data", development_data),
):
    label_shares = split_frame["tweet_sentiment"].value_counts(normalize=True).to_dict()
    print(f"{split_name}: {label_shares}")

# Data Cleaning & Exploratory Data Analysis


In [None]:
tokenizer = TweetTokenizer()

In [None]:
# Clean every split, not only the training data: the dataset cells further
# down read "tweet_text_cleaned" from the development and test frames as well,
# and would fail with a KeyError if the column only existed on training_data.
for split_frame in (
    training_data,
    development_data,
    test1_data,
    test2_data,
    test3_data,
):
    split_frame["tweet_text_cleaned"] = split_frame["tweet_text"].apply(
        lambda tweet: advanced_preprocessing(tweet, tokenizer)
    )

In [None]:
# One frame per sentiment label, selected from the training data.
sentiment_column = training_data["tweet_sentiment"]
positive_tweets = training_data.loc[sentiment_column == "positive"]
negative_tweets = training_data.loc[sentiment_column == "negative"]
neutral_tweets = training_data.loc[sentiment_column == "neutral"]

## Generate n-grams frequencies


In [None]:
# Uni-, bi- and tri-gram frequencies of the cleaned positive tweets
# (max_features=1000 is passed for every n-gram size).
positive_unigram_freq, positive_bigram_freq, positive_trigram_freq = (
    generate_ngram_frequencies(
        corpus=positive_tweets["tweet_text_cleaned"],
        n_grams=ngram_size,
        max_features=1000,
    )
    for ngram_size in (1, 2, 3)
)

In [None]:
# Uni-, bi- and tri-gram frequencies of the cleaned negative tweets
# (max_features=1000 is passed for every n-gram size).
negative_unigram_freq, negative_bigram_freq, negative_trigram_freq = (
    generate_ngram_frequencies(
        corpus=negative_tweets["tweet_text_cleaned"],
        n_grams=ngram_size,
        max_features=1000,
    )
    for ngram_size in (1, 2, 3)
)

In [None]:
# Uni-, bi- and tri-gram frequencies of the cleaned neutral tweets
# (max_features=1000 is passed for every n-gram size).
neutral_unigram_freq, neutral_bigram_freq, neutral_trigram_freq = (
    generate_ngram_frequencies(
        corpus=neutral_tweets["tweet_text_cleaned"],
        n_grams=ngram_size,
        max_features=1000,
    )
    for ngram_size in (1, 2, 3)
)

In [None]:
plot_top_common_ngrams(
    [positive_unigram_freq, positive_bigram_freq, positive_trigram_freq]
)

In [None]:
plot_top_common_ngrams(
    [negative_unigram_freq, negative_bigram_freq, negative_trigram_freq]
)

In [None]:
plot_top_common_ngrams(
    [neutral_unigram_freq, neutral_bigram_freq, neutral_trigram_freq]
)

## Generate wordclouds


In [None]:
# One word cloud per n-gram size; 1, 2, 3 is passed alongside the
# uni-, bi- and tri-gram frequencies respectively.
positive_ngram_freqs = (
    positive_unigram_freq,
    positive_bigram_freq,
    positive_trigram_freq,
)
for ngram_size, ngram_freq in enumerate(positive_ngram_freqs, start=1):
    generate_wordcloud_with_ngrams(ngram_freq, ngram_size, "Positive tweets")

In [None]:
# One word cloud per n-gram size; 1, 2, 3 is passed alongside the
# uni-, bi- and tri-gram frequencies respectively.
negative_ngram_freqs = (
    negative_unigram_freq,
    negative_bigram_freq,
    negative_trigram_freq,
)
for ngram_size, ngram_freq in enumerate(negative_ngram_freqs, start=1):
    generate_wordcloud_with_ngrams(ngram_freq, ngram_size, "Negative tweets")

In [None]:
# One word cloud per n-gram size; 1, 2, 3 is passed alongside the
# uni-, bi- and tri-gram frequencies respectively.
neutral_ngram_freqs = (
    neutral_unigram_freq,
    neutral_bigram_freq,
    neutral_trigram_freq,
)
for ngram_size, ngram_freq in enumerate(neutral_ngram_freqs, start=1):
    generate_wordcloud_with_ngrams(ngram_freq, ngram_size, "Neutral tweets")

# Traditional classification


## Naive Bayes


In [None]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    training_data["tweet_text_cleaned"],
    training_data["tweet_sentiment"],
    test_size=0.2,
    random_state=42,
    stratify=training_data["tweet_sentiment"],
)

In [None]:
print(f"Training data: {y_train.value_counts(normalize=True).to_dict()}")
print(f"Test data: {y_test.value_counts(normalize=True).to_dict()}")

In [None]:
# Define a pipeline combining a text feature extractor with a Naive Bayes classifier
pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),  # Placeholder, will be tuned by GridSearchCV
        ("clf", MultinomialNB()),  # Naive Bayes classifier
    ]
)

In [None]:
# Search space for GridSearchCV. Keys use the "<step>__<parameter>" form to
# address parameters of the pipeline steps named "vect" and "clf".
parameters = {
    "vect": [TfidfVectorizer(), CountVectorizer()],  # Try both vectorizer types
    "vect__stop_words": ["english"],
    "vect__max_df": (0.5, 0.75, 1.0),
    "vect__min_df": [5, 10, 15],
    "vect__max_features": (None, 5000, 10000, 50000),
    "vect__ngram_range": [(1, 1), (1, 2), (1, 3)],  # Unigrams only, up to bigrams, or up to trigrams
    "clf__alpha": (0.01, 0.1, 1),  # Additive (Laplace/Lidstone) smoothing parameter
}

In [None]:
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and the selected value of every tuned parameter.
print(f"Best score: {grid_search.best_score_:0.3f}")
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print(f"\t{param_name}: {best_parameters[param_name]!r}")

In [None]:
# Evaluate the best grid search pipeline on the test dataset
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Row-normalised confusion matrix of the tuned pipeline on the held-out split.
import seaborn as sns

# Take the class labels from the fitted search object instead of hard-coding
# them, and pass them to confusion_matrix explicitly so that the matrix rows
# and columns are guaranteed to match the tick labels.
class_labels = list(grid_search.classes_)
conf_matrix = confusion_matrix(
    y_test, y_pred, labels=class_labels, normalize="true"
)
sns.heatmap(
    conf_matrix,
    annot=True,
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.title("Confusion matrix")
plt.show()

In [None]:
best_vectorizer = grid_search.best_estimator_.named_steps["vect"]
best_classifier = grid_search.best_estimator_.named_steps["clf"]

In [None]:
best_vectorizer.get_feature_names_out().shape

In [None]:
best_classifier.feature_log_prob_.shape

In [None]:
best_classifier.classes_

In [None]:
# Get the most important features (words) for each class
def get_most_important_features(vectorizer, classifier, n=10):
    """Print the ``n`` highest-scoring features of every class.

    Works for any number of classes: one section is printed per entry of
    ``classifier.classes_``, using the matching row of
    ``classifier.feature_log_prob_``.

    Args:
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        classifier: Fitted classifier exposing ``classes_`` and
            ``feature_log_prob_`` (one row of per-feature scores per class).
        n: Number of features to list per class.

    Returns:
        None. The ranking is written to standard output.
    """
    feature_names = vectorizer.get_feature_names_out()

    print(f"Top {n} most important features for each class:")
    for class_label, class_log_probs in zip(
        classifier.classes_, classifier.feature_log_prob_
    ):
        # Highest score first; ties fall back to the feature name (descending).
        top_features = sorted(zip(class_log_probs, feature_names), reverse=True)[:n]
        print(f"\n{class_label}:")
        for coef, feat in top_features:
            print(f"{feat} ({coef:.2f})")

In [None]:
get_most_important_features(best_vectorizer, best_classifier, n=10)

## Logistic Regression


In [None]:
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),  # Placeholder, will be tuned by GridSearchCV
        ("clf", LogisticRegression(max_iter=500)),  # Logistic Regression classifier
    ]
)

In [None]:
# Search space for GridSearchCV. Keys use the "<step>__<parameter>" form to
# address parameters of the pipeline steps named "vect" and "clf".
parameters = {
    "vect": [TfidfVectorizer(), CountVectorizer()],  # Try both vectorizer types
    "vect__stop_words": ["english"],
    "vect__min_df": [10, 15, 25],
    "vect__max_features": (500, 1000),
    "vect__ngram_range": [(1, 1), (1, 2), (1, 3)],  # Unigrams only, up to bigrams, or up to trigrams
    "clf__C": [0.001, 0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
}

In [None]:
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and the selected value of every tuned parameter.
print(f"Best score: {grid_search.best_score_:0.3f}")
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print(f"\t{param_name}: {best_parameters[param_name]!r}")

In [None]:
# Evaluate the best grid search pipeline on the test dataset
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Row-normalised confusion matrix of the tuned pipeline on the held-out split.
import seaborn as sns

# Take the class labels from the fitted search object instead of hard-coding
# them, and pass them to confusion_matrix explicitly so that the matrix rows
# and columns are guaranteed to match the tick labels.
class_labels = list(grid_search.classes_)
conf_matrix = confusion_matrix(
    y_test, y_pred, labels=class_labels, normalize="true"
)
sns.heatmap(
    conf_matrix,
    annot=True,
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.title("Confusion matrix")
plt.show()

## UMAP


In [None]:
tfidf_vectorizer = TfidfVectorizer(
    min_df=5, stop_words="english", ngram_range=(1, 3), max_features=5000
)
tfidf_word_doc_matrix = tfidf_vectorizer.fit_transform(
    training_data["tweet_text_cleaned"]
)

In [None]:
tfidf_embedding = umap.UMAP(metric="hellinger").fit(tfidf_word_doc_matrix)

In [None]:
sns.scatterplot(
    x=tfidf_embedding.embedding_[:, 0],
    y=tfidf_embedding.embedding_[:, 1],
    hue=training_data["tweet_sentiment"],
)

# Deep Learning Models


## Load GloVe embedding:


In [None]:
embedding_file_name = "glove.6B.100d.txt"
glove_embedding_dict = load_embedding(data_dir_path / embedding_file_name)

In [None]:
print(f"Number of words in GloVe embedding: {len(glove_embedding_dict)}")

In [None]:
# set max tokens to 5000
special_tokens = ["<unk>", "<pad>"]
min_freq = 5
max_tokens = 5000
# build_vocab_from_iterator expects every item to be a sequence of tokens; a
# raw string would be counted character by character. The cleaned tweets are
# also fed to the scikit-learn text vectorizers, which suggests they are plain
# strings, so split them on whitespace here. Items that are already token
# sequences are passed through unchanged.
# NOTE(review): confirm that whitespace splitting matches the tokenisation
# used by TweetsDataset.
vocab = build_vocab_from_iterator(
    iterator=(
        tweet.split() if isinstance(tweet, str) else tweet
        for tweet in training_data["tweet_text_cleaned"]
    ),
    min_freq=min_freq,
    specials=special_tokens,
    max_tokens=max_tokens,
)

In [None]:
# Look up the two special tokens and use the <unk> index as the vocabulary's default index.
unk_index, pad_index = vocab["<unk>"], vocab["<pad>"]
vocab.set_default_index(unk_index)

In [None]:
print(f"Vocabulary size: {len(vocab)}")

## Build embedding matrix


In [None]:
vocab_size = len(vocab)
embedding_dim = 100
embedding_matrix = torch.zeros((vocab_size, embedding_dim))

In [None]:
print(embedding_matrix.shape)

In [None]:
unknown_words = []

In [None]:
# Fill the embedding matrix row by row: the GloVe vector where one exists,
# a random normal vector otherwise.
for word, idx in tqdm(vocab.get_stoi().items()):
    if word == "<pad>":
        # Keep the padding row at its zero initialisation (the default that
        # nn.Embedding uses for a padding index) rather than a random vector,
        # and do not report the padding token as an unknown word.
        continue
    if word in glove_embedding_dict:
        embedding_matrix[idx] = torch.tensor(glove_embedding_dict[word])
    else:
        unknown_words.append(word)
        embedding_matrix[idx] = torch.randn(embedding_dim)

In [None]:
embedding_matrix.shape

In [None]:
# The ".2%" format multiplies the ratio by 100 and appends the percent sign;
# the previous ".2f" followed by "%" printed the raw fraction as if it were a percentage.
print(
    f"There are {len(unknown_words)} ({len(unknown_words) / len(vocab):.2%}) words in the vocabulary that are not in the GloVe embedding."
)

In [None]:
print(unknown_words)

## Define Datasets and Dataloaders


In [None]:
encoder = LabelEncoder()

encoder.fit(training_data["tweet_sentiment"])

In [None]:
print(encoder.classes_)

In [None]:
# Training split for the vocabulary-based (GloVe/LSTM) models, built from the
# cleaned tweets with `vocab` and `encoder`.
# NOTE(review): despite its name this is a TweetsDataset, not a BERT dataset.
# The same name is re-bound to a BERTTweetsDataset in the BERT section, so
# re-running the LSTM dataloader cell after that one would wrap the wrong
# dataset. Consider renaming it to `train_dataset` here and in the dataloader cell.
bert_train_dataset = TweetsDataset(
    tweet_ids=training_data['tweet_id'], tweets=training_data['tweet_text_cleaned'],
    labels=training_data['tweet_sentiment'], vocab=vocab, label_encoder=encoder
)

In [None]:
development_dataset = TweetsDataset(
    tweet_ids=development_data['tweet_id'], tweets=development_data['tweet_text_cleaned'],
    labels=development_data['tweet_sentiment'], vocab=vocab, label_encoder=encoder
)

In [None]:
test1_dataset = TweetsDataset(
    tweet_ids=test1_data['tweet_id'], tweets=test1_data['tweet_text_cleaned'],
    labels=test1_data['tweet_sentiment'], vocab=vocab, label_encoder=encoder
)

In [None]:
test2_dataset = TweetsDataset(
    tweet_ids=test2_data['tweet_id'], tweets=test2_data['tweet_text_cleaned'],
    labels=test2_data['tweet_sentiment'], vocab=vocab, label_encoder=encoder
)

In [None]:
test3_dataset = TweetsDataset(
    tweet_ids=test3_data['tweet_id'], tweets=test3_data['tweet_text_cleaned'],
    labels=test3_data['tweet_sentiment'], vocab=vocab, label_encoder=encoder
)

In [None]:
def collate_batch(batch):
    """Collate dataset samples into one padded mini-batch.

    Every sample is read positionally: ``[0]`` is the tweet id, ``[1]`` the
    token tensor and ``[2]`` the label.

    Args:
        batch: List of samples as yielded by the dataset.

    Returns:
        A tuple ``(tweet_ids, padded_tweets, labels)``: the ids as a NumPy
        array, the token tensors padded (batch first) to the longest one with
        the index of ``"<pad>"`` in ``vocab``, and the labels as a
        ``torch.long`` tensor.
    """
    ids, sequences, targets = [], [], []
    for sample in batch:
        ids.append(sample[0])
        sequences.append(sample[1])
        targets.append(sample[2])

    id_array = np.array(ids)
    target_array = np.array(targets)

    padded = pad_sequence(sequences, batch_first=True, padding_value=vocab["<pad>"])
    label_tensor = torch.from_numpy(target_array).long()

    return id_array, padded, label_tensor

In [None]:
batch_size = 256

In [None]:
train_dataloader = DataLoader(
    bert_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch
)
development_dataloader = DataLoader(
    development_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch
)
test1_dataloader = DataLoader(
    test1_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch
)
test2_dataloader = DataLoader(
    test2_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch
)
test3_dataloader = DataLoader(
    test3_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch
)

## LSTM


In [None]:
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 300
output_dim = 3
n_layers = 2
bidirectional = True
dropout_rate = 0.5

In [None]:
lstm_model = LSTM(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
    n_layers,
    bidirectional,
    dropout_rate,
    pad_index,
)

In [None]:
print(f"The LSTM model has {count_parameters(lstm_model):,} trainable parameters")

In [None]:
# Initialise the embedding layer with a *copy* of the pre-trained matrix.
# Assigning the tensor itself makes the layer share storage with
# `embedding_matrix`; when training on CPU (where `.to(device)` does not copy)
# the optimizer's in-place updates would then modify the matrix that is
# reused to initialise the next model.
lstm_model.embedding.weight.data = embedding_matrix.clone()

In [None]:
n_epochs = 10
lr = 5e-4

# TODO:
# add class weights to the loss function: https://stackoverflow.com/questions/61414065/pytorch-weight-in-cross-entropy-loss

optimizer = torch.optim.Adam(lstm_model.parameters(), lr=lr)

In [None]:
criterion = nn.CrossEntropyLoss()

In [None]:
lstm_model = lstm_model.to(device)
criterion = criterion.to(device)

In [None]:
# Track the plain Bi-LSTM run. The run name and architecture previously said
# "with attention" (copied from the attention model's cell), which made the two
# runs indistinguishable in the dashboard.
wandb_run = wandb.init(
    project="sentiment-analysis",
    name="lstm",
    config={
        "learning_rate": lr,
        "architecture": "Bi-LSTM",
        "features": "GloVe embedding",
        # Read from the dataloader actually used for training, not a literal.
        "batch_size": train_dataloader.batch_size,
        "epochs": n_epochs,
        "optimizer": optimizer.__class__.__name__,
        "activation": "ReLU",
        "loss_function": "CrossEntropyLoss",
        "seed": 42,
    },
)

In [ ]:
lstm_model_path = models_weights_dir_path / "lstm_model.pt"

In [None]:
metrics = training_loop(
    n_epochs,
    train_dataloader,
    development_dataloader,
    lstm_model,
    criterion,
    optimizer,
    device,
    False,
    wandb_run,
    lstm_model_path
)

In [None]:
wandb_run.finish()

In [None]:
plot_metrics(metrics)

In [None]:
plot_classification_results(lstm_model, development_dataloader, encoder, device)

## LSTM with Attention


In [None]:
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 300
output_dim = 3
n_layers = 2
bidirectional = True
dropout_rate = 0.5

In [None]:
lstm_with_attention_model = LSTMWithAttention(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
    n_layers,
    bidirectional,
    dropout_rate,
    pad_index,
)

In [None]:
print(
    f"The LSTM with attention model has {count_parameters(lstm_with_attention_model):,} trainable parameters"
)

In [None]:
# Initialise the embedding layer with a *copy* of the pre-trained matrix.
# Assigning the tensor itself makes the layer share storage with
# `embedding_matrix` (and with any other model initialised the same way), so
# in-place weight updates on CPU would leak between models.
lstm_with_attention_model.embedding.weight.data = embedding_matrix.clone()

In [None]:
n_epochs = 10
lr = 5e-4

# TODO:
# add class weights to the loss function: https://stackoverflow.com/questions/61414065/pytorch-weight-in-cross-entropy-loss

optimizer = torch.optim.Adam(lstm_with_attention_model.parameters(), lr=lr)

In [None]:
criterion = nn.CrossEntropyLoss()

In [None]:
lstm_with_attention_model = lstm_with_attention_model.to(device)
criterion = criterion.to(device)

In [None]:
# Track the Bi-LSTM-with-attention run.
wandb_run = wandb.init(
    project="sentiment-analysis",
    name="lstm-with-attention",
    config={
        "learning_rate": lr,
        "architecture": "Bi-LSTM with attention",
        "features": "GloVe embedding",
        # Read from the dataloader actually used for training instead of a
        # literal, so the logged value cannot drift from the real batch size.
        "batch_size": train_dataloader.batch_size,
        "epochs": n_epochs,
        "optimizer": optimizer.__class__.__name__,
        "activation": "ReLU",
        "loss_function": "CrossEntropyLoss",
        "seed": 42,
    },
)

In [ ]:
lstm_with_attention_model_path = models_weights_dir_path / "lstm_with_attention_model.pt"

In [None]:
metrics = training_loop(
    n_epochs,
    train_dataloader,
    development_dataloader,
    lstm_with_attention_model,
    criterion,
    optimizer,
    device,
    False,
    wandb_run,
    lstm_with_attention_model_path
)

In [None]:
wandb_run.finish()

In [None]:
plot_metrics(metrics)

In [None]:
plot_classification_results(lstm_with_attention_model, development_dataloader, encoder, device)

## BERT


In [None]:
transformer_name = "bert-base-uncased"
bert_tokenizer = transformers.AutoTokenizer.from_pretrained(transformer_name)
bert_transformer = transformers.AutoModel.from_pretrained(transformer_name)

In [None]:
print(bert_transformer.config.hidden_size)

In [None]:
bert_train_dataset = BERTTweetsDataset(
    tweet_ids=training_data["tweet_id"],
    tweets=training_data["tweet_text"],
    labels=training_data["tweet_sentiment"],
    tokenizer=bert_tokenizer,
    label_encoder=encoder,
)

In [None]:
bert_development_dataset = BERTTweetsDataset(
    tweet_ids=development_data["tweet_id"],
    tweets=development_data["tweet_text"],
    labels=development_data["tweet_sentiment"],
    tokenizer=bert_tokenizer,
    label_encoder=encoder,
)

In [None]:
bert_test1_dataset = BERTTweetsDataset(
    tweet_ids=test1_data["tweet_id"],
    tweets=test1_data["tweet_text"],
    labels=test1_data["tweet_sentiment"],
    tokenizer=bert_tokenizer,
    label_encoder=encoder,
)

In [None]:
bert_test2_dataset = BERTTweetsDataset(
    tweet_ids=test2_data["tweet_id"],
    tweets=test2_data["tweet_text"],
    labels=test2_data["tweet_sentiment"],
    tokenizer=bert_tokenizer,
    label_encoder=encoder,
)

In [None]:
bert_test3_dataset = BERTTweetsDataset(
    tweet_ids=test3_data["tweet_id"],
    tweets=test3_data["tweet_text"],
    labels=test3_data["tweet_sentiment"],
    tokenizer=bert_tokenizer,
    label_encoder=encoder,
)

In [None]:
pad_index = bert_tokenizer.pad_token_id

In [None]:
batch_size = 8

In [None]:
bert_train_dataloader = DataLoader(
    bert_train_dataset, batch_size=batch_size, shuffle=True
)
bert_development_dataloader = DataLoader(
    bert_development_dataset, batch_size=batch_size, shuffle=False
)
bert_test1_dataloader = DataLoader(
    bert_test1_dataset, batch_size=batch_size, shuffle=False
)
bert_test2_dataloader = DataLoader(
    bert_test2_dataset, batch_size=batch_size, shuffle=False
)
bert_test3_dataloader = DataLoader(
    bert_test3_dataset, batch_size=batch_size, shuffle=False
)

In [None]:
print(bert_train_dataset[0])

In [None]:
bert_sentiment_model = BERTClassifier(
    transformer=bert_transformer, output_dim=len(encoder.classes_), freeze=False
)

In [None]:
print(
    f"The BERT sentiment model has {count_parameters(bert_sentiment_model):,} trainable parameters"
)

In [None]:
lr = 1e-5

optimizer = torch.optim.Adam(bert_sentiment_model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [None]:
bert_sentiment_model = bert_sentiment_model.to(device)
criterion = criterion.to(device)

In [None]:
wandb_run = wandb.init(
    project="sentiment-analysis",
    name="bert",
    config={
        "learning_rate": lr,
        "architecture": "BERT",
        "features": "BERT",
        "batch_size": batch_size,
        "epochs": 5,
        "optimizer": optimizer.__class__.__name__,
        "activation": "ReLU",
        "loss_function": "CrossEntropyLoss",
        "seed": 42,
    },
)

In [ ]:
bert_sentiment_model_path = models_weights_dir_path / "bert_sentiment_model.pt"

In [None]:
# Fine-tune BERT for 5 epochs (matching the "epochs" value logged to wandb).
# The result is stored in `metrics`, as in the LSTM training cells, so that
# `metrics` does not keep pointing at the previous model's results.
metrics = training_loop(
    5,
    bert_train_dataloader,
    bert_development_dataloader,
    bert_sentiment_model,
    criterion,
    optimizer,
    device,
    True,  # NOTE(review): the LSTM cells pass False here; presumably a BERT-mode flag, confirm
    wandb_run,
    bert_sentiment_model_path
)