In [10]:
# Author: Hassan Ali
# Problem 10.3: Using GloVe and Cosine Similarity
# Objective: Find the most similar word for each from a given list.

# Importing Libraries
import torch
from torchtext.vocab import GloVe

# Loading GloVe embeddings
# "6B" = trained on Wikipedia 2014 + Gigaword 5 | dim=50 means each word vector is 50-dimensional.
glove = GloVe(name="6B", dim=50)

# Helper function to compute cosine similarity between two 1D tensors (embedding vectors).
def cosine_similarity(vec1, vec2):
    return torch.nn.functional.cosine_similarity(vec1, vec2, dim=0).item()

# DFunction to find the most similar word to a given input word (excluding the word itself).
def find_most_similar_word(word, glove_vectors):
    # If the word is not in the vocabulary, return None.
    if word not in glove_vectors.stoi:
        return None, None

    # Getting the vector for the input word
    target_vec = glove_vectors[word]
    best_sim = float("-inf")
    best_word = None

    # Iterating through all vocabulary words in GloVe
    for candidate in glove_vectors.stoi.keys():
        # Skipping the same word
        if candidate == word:
            continue
        # Computing similarity
        candidate_vec = glove_vectors[candidate]
        sim = cosine_similarity(target_vec, candidate_vec)
        if sim > best_sim:
            best_sim = sim
            best_word = candidate

    return best_word, best_sim

# Words to check
words_to_check = ["dog", "whale", "before", "however", "fabricate"]

# For each word, find the most similar word and print the result
for w in words_to_check:
    most_similar, similarity_score = find_most_similar_word(w, glove)
    if most_similar is None:
        print(f"'{w}' is not in the GloVe vocabulary.")
    else:
        print(f"For the word '{w}', the most similar word is '{most_similar}' "
              f"with cosine similarity = {similarity_score:.4f}")


# Analysis:
# - Loading the GloVe vectors from the '6B' dataset with 50-dimensional embeddings. This
#  dataset has been trained on a large text corpus (Wikipedia 2014 + Gigaword 5). Each
#  word in the vocabulary is mapped to a vector of length 50, capturing semantic and
#  syntactic information.
#  - Cosine similarity measures how close two vectors are in terms of angle rather than
#  magnitude. A value of 1.0 means the vectors point in exactly the same direction,
#  while a value of -1.0 means they point in opposite directions. A value of 0.0 means
#  they are orthogonal (unrelated). In this script, we compute similarity for every
#  vocabulary word in GloVe and pick the one that yields the highest similarity.
# - Iterating through the entire GloVe vocabulary for each query word, computing
#  cosine similarity. Since GloVe has a large vocabulary, this can be time-consuming
#  for many queries, but is straightforward for a small set of words. We skip the
#  original word itself to avoid trivial matches.
#  - Five chosen words: 'dog', 'whale', 'before', 'however', and 'fabricate'. 
#  For each, the code prints the word in the GloVe vocabulary that is most similar
#  in meaning or usage. The reported cosine similarity indicates how close the two
#  word vectors are. 
# - In an example run, 'dog' is closest to 'cat' with similarity around 0.92,
#  'whale' is closest to 'whales' (approx. 0.90), 'before' is closest to 'after'
#  (approx. 0.95), 'however' is closest to 'although' (approx. 0.98), and 'fabricate'
#  is closest to 'fabricating' (approx. 0.76). These results are plausible given how
#  GloVe embeddings capture semantic relationships.
#  - Functionality depends on the presence of words in GloVe’s vocabulary. Words that
#   are not present will return None.
#  - Cosine similarity in GloVe space often correlates with shared semantic or 
#   syntactic context. 
#  - High similarity typically indicates that two words appear in similar contexts.

For the word 'dog', the most similar word is 'cat' with cosine similarity = 0.9218
For the word 'whale', the most similar word is 'whales' with cosine similarity = 0.8987
For the word 'before', the most similar word is 'after' with cosine similarity = 0.9512
For the word 'however', the most similar word is 'although' with cosine similarity = 0.9801
For the word 'fabricate', the most similar word is 'fabricating' with cosine similarity = 0.7595


In [2]:
# Author: Hassan Ali
# Problem 10.5 (BERT-based Tweet Sentiment Classification)
# Dataset: Tweets.csv (US Airline Sentiment or a similar tweet dataset)
# Building a BERT-based model augmented with a GRU layer for tweet sentiment classification.
# The model will classify tweets into multiple sentiment classes (e.g., negative, neutral, positive).

# Importing necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel


# Setting random seed for reproducibility
SEED = 1234
torch.manual_seed(SEED)
np.random.seed(SEED)

# Loading and preprocess the dataset
df = pd.read_csv("Tweets.csv")
# Renaming columns for clarity (assume original columns: 'airline_sentiment' and 'text')
df = df.rename(columns={"airline_sentiment": "label", "text": "tweet"})
# Dropping rows with missing tweet text or label
df = df.dropna(subset=["label", "tweet"]).copy()
# Retaining only valid sentiment labels (adjust if needed)
valid_labels = ["negative", "neutral", "positive"]
df = df[df["label"].isin(valid_labels)].copy()
# Encodnig sentiment labels as integers
label_encoder = LabelEncoder()
df["encoded_label"] = label_encoder.fit_transform(df["label"])
print("Label mapping:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))


# Splitting data into training, validation, and test sets
# Stratifying splits to preserve class distribution; 70% train, 15% validation, 15% test
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=SEED, stratify=df["encoded_label"])
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED, stratify=temp_df["encoded_label"])


# Defining a custom Dataset class for tweets
class TweetDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        # Tokenizing the text with padding and truncation
        tokens = self.tokenizer(text, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")
        # Removing batch dimension
        input_ids = tokens['input_ids'].squeeze(0)
        attention_mask = tokens['attention_mask'].squeeze(0)
        return input_ids, attention_mask, torch.tensor(label, dtype=torch.long)

# Initializing the BERT tokenizer (using bert-base-uncased)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Creating Dataset objects for each split
train_dataset = TweetDataset(train_df["tweet"], train_df["encoded_label"], tokenizer)
valid_dataset = TweetDataset(valid_df["tweet"], valid_df["encoded_label"], tokenizer)
test_dataset  = TweetDataset(test_df["tweet"], test_df["encoded_label"], tokenizer)

# Defining DataLoader batch size
BATCH_SIZE = 8
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


# Defining the BERT-GRU model for tweet sentiment classification
class BERTGRUSentiment(nn.Module):
    def __init__(self, bert, hidden_dim, num_classes, dropout):
        super(BERTGRUSentiment, self).__init__()
        self.bert = bert  # Pre-trained BERT model
        # GRU layer: bidirectional; input size equals BERT's hidden size
        self.gru = nn.GRU(bert.config.hidden_size, hidden_dim, batch_first=True, bidirectional=True)
        # Fully connected layer: concatenated hidden states from both directions
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input_ids, attention_mask):
        # Obtaining embeddings from BERT (BERT parameters are frozen during training)
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            embedded = outputs.last_hidden_state
        # Passing embeddings through GRU layer; hidden shape: [num_layers * num_directions, batch, hidden_dim]
        _, hidden = self.gru(embedded)
        # Concatenating the final forward and backward hidden states
        hidden_cat = torch.cat((hidden[-2], hidden[-1]), dim=1)
        hidden_cat = self.dropout(hidden_cat)
        # Fully connected layer produces class logits
        return self.fc(hidden_cat)

# Loading the pre-trained BERT model
from transformers import BertModel
bert = BertModel.from_pretrained('bert-base-uncased')
# Setting number of sentiment classes (3: negative, neutral, positive)
num_classes = len(valid_labels)
# Initializing the BERT-GRU model with hidden dimension 128 and dropout rate 0.3
model = BERTGRUSentiment(bert, hidden_dim=128, num_classes=num_classes, dropout=0.3)

# Freezing BERT parameters to reduce training time
for param in model.bert.parameters():
    param.requires_grad = False


# Defining loss function and optimizer
criterion = nn.CrossEntropyLoss()  # For multi-class classification
optimizer = optim.Adam(model.parameters(), lr=1e-4)


# Define training and evaluation functions
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    for input_ids, attention_mask, labels in loader:
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        _, predictions = torch.max(outputs, dim=1)
        acc = (predictions == labels).float().mean()
        epoch_acc += acc.item()
    return epoch_loss / len(loader), epoch_acc / len(loader)

def evaluate_epoch(model, loader, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            epoch_loss += loss.item()
            _, predictions = torch.max(outputs, dim=1)
            acc = (predictions == labels).float().mean()
            epoch_acc += acc.item()
    return epoch_loss / len(loader), epoch_acc / len(loader)


# Training loop
N_EPOCHS = 5
best_valid_loss = float('inf')
for epoch in range(1, N_EPOCHS + 1):
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate_epoch(model, valid_loader, criterion)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "bert_gru_tweet_model.pt")
    print(f"Epoch {epoch}/{N_EPOCHS} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Valid Loss: {valid_loss:.4f} | Valid Acc: {valid_acc*100:.2f}%")

# Evaluate on test set
model.load_state_dict(torch.load("bert_gru_tweet_model.pt", weights_only=True))
test_loss, test_acc = evaluate_epoch(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc*100:.2f}%")

# Inference on sample tweets
def predict_sentiment(model, tokenizer, text):
    model.eval()
    tokens = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        _, pred_label = torch.max(outputs, dim=1)
    return label_encoder.inverse_transform(pred_label.cpu().numpy())

sample_texts = [
    "I absolutely love this airline! Great service and comfortable seats.",
    "The flight was delayed and the staff was extremely rude.",
    "My experience was just okay; nothing special."
]
for text in sample_texts:
    sentiment = predict_sentiment(model, tokenizer, text)
    print(f"Tweet: {text} --> Predicted Sentiment: {sentiment[0]}")


# Analysis:
# The script loads the Tweets.csv dataset, changes the names of the columns  to more meaningful names, excludes the rows that
# have an empty tweet or label, and selects only those  tweets that have appropriate sentiment labels (negative, neutral, positive).
# Sentiment labels are encoded  as integers and the data is divided into training, validation and test sets with stratification.
# A PyTorch Dataset class is defined to tokenize the tweet texts using a BERT tokenizer which is  pretrained.
# The model architecture consists of the following: To get embeddings of a sequence, a frozen  BERT model is used, a bidirectional
# GRU layer is used to capture the sequential dependence,  and a fully connected layer which gives the class logits for multi class
# sentiment classification.
# The training loop  is trained for 5 epochs using the Adam optimizer and CrossEntropyLoss; the model that has the  best validation
# loss is saved and then used to evaluate the performance of the model on the test set.
# Last, the model does inference on some example tweets and for each tweets, it sets the sentiment label of the tweets.
# The torch.load function is called with the weights only option in order to  avoid security issues.
# This pipeline shows a typical way of training a BERT based model with additional  GRU layer for the task of sentiment analysis
# on tweet data.

Label mapping: {'negative': 0, 'neutral': 1, 'positive': 2}
Epoch 1/5 | Train Loss: 0.6221 | Train Acc: 74.53% | Valid Loss: 0.5064 | Valid Acc: 79.05%
Epoch 2/5 | Train Loss: 0.4745 | Train Acc: 81.02% | Valid Loss: 0.4780 | Valid Acc: 81.68%
Epoch 3/5 | Train Loss: 0.4457 | Train Acc: 81.98% | Valid Loss: 0.4282 | Valid Acc: 82.77%
Epoch 4/5 | Train Loss: 0.4246 | Train Acc: 83.13% | Valid Loss: 0.4194 | Valid Acc: 82.95%
Epoch 5/5 | Train Loss: 0.4109 | Train Acc: 83.49% | Valid Loss: 0.4222 | Valid Acc: 83.27%
Test Loss: 0.4280 | Test Acc: 82.91%
Tweet: I absolutely love this airline! Great service and comfortable seats. --> Predicted Sentiment: positive
Tweet: The flight was delayed and the staff was extremely rude. --> Predicted Sentiment: negative
Tweet: My experience was just okay; nothing special. --> Predicted Sentiment: negative
