In [None]:
!pip install wandb
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets evaluate

In [78]:
# Importing PyTorch library for tensor computations and neural network modules
import torch
import torch.nn as nn
import pandas as pd

# For working with textual data vocabularies and for displaying model summaries
from torchtext.vocab import vocab

# General-purpose Python libraries for random number generation and numerical operations
import random
import numpy as np

# Utilities for efficient serialization/deserialization of Python objects and for element tallying
import joblib
from collections import Counter

# For creating lightweight attribute classes and for partial function application
from functools import partial

# For filesystem path handling, generating and displaying confusion matrices, and date-time manipulations
from pathlib import Path
from sklearn.metrics import confusion_matrix
from datetime import datetime

# For plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

### NEW ##########################
# imports from Huggingface ecosystem
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import PreTrainedModel, PretrainedConfig
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import evaluate

# wandb library
import wandb

In [79]:
import CustomPreprocessorSpacy as cp
from sklearn.model_selection import train_test_split

### Data loading and Preprocessing

In [80]:
model_folder = Path('./models/nlp_spring_2024/twitter/nn')
model_folder.mkdir(exist_ok=True, parents = True)

In [None]:
!unzip /content/emotion-detection-spring2014.zip

Archive:  /content/emotion-detection-spring2014.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [50]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2017-21441,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1
1,2017-31535,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
2,2017-21068,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0
3,2017-31436,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
4,2017-22195,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0


In [51]:
X, y = train['Tweet'].values, train.drop(columns = ['ID', 'Tweet']).values

In [52]:
X.shape, y.shape

((7724,), (7724, 11))

#### Train test split

In [300]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=7)

#### Cleaning Data using spaCy

In [301]:
X_train_cleaned = cp.SpacyPreprocessor(model='en_core_web_sm',
                                        lemmatize=True, lower=True,
                                        remove_stop=False, remove_punct=False,
                                        remove_email=True, remove_url=True,
                                        add_user_mention_prefix=False, remove_hashtag_prefix=True,
                                        basic_clean_only=False).transform(X_train)

X_val_cleaned = cp.SpacyPreprocessor(model='en_core_web_sm',
                                        lemmatize=True, lower=True,
                                        remove_stop=False, remove_punct=False,
                                        remove_email=True, remove_url=True,
                                        add_user_mention_prefix=False, remove_hashtag_prefix=True,
                                        basic_clean_only=False).transform(X_val)

  soup = BeautifulSoup(text, "html.parser")


In [302]:
test = pd.read_csv('test.csv')

X_test, y_test = test['Tweet'].values, test.drop(columns = ['ID', 'Tweet'])
y_test = y_test.replace('NONE', 0).values

X_test_cleaned = cp.SpacyPreprocessor(model='en_core_web_sm',
                                        lemmatize=True, lower=True,
                                        remove_stop=False, remove_punct=False,
                                        remove_email=True, remove_url=True,
                                        add_user_mention_prefix=False, remove_hashtag_prefix=True,
                                        basic_clean_only=False).transform(X_test)

  soup = BeautifulSoup(text, "html.parser")


#### Storing data as a HuggingFace dataset

In [303]:
trainset = Dataset.from_dict({
    'texts': X_train_cleaned,
    'labels': y_train
})

validset = Dataset.from_dict({
    'texts': X_val_cleaned,
    'labels': y_val
})

testset = Dataset.from_dict({
    'texts': X_test_cleaned,
    'labels': y_test
})

### Creating model

In [319]:
import torch
import torch.nn as nn
from transformers import PreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput

class CustomConfig(PretrainedConfig):
  def __init__(self, vocab_size=0, embedding_dim=0, hidden_dim=0, lstm_layers=3,
               hidden_dim1=0, hidden_dim2=0, hidden_dim3=0, num_labels=11, **kwargs):
      super().__init__()
      self.vocab_size = vocab_size
      self.embedding_dim = embedding_dim
      self.hidden_dim = hidden_dim
      self.lstm_layers = lstm_layers

      self.hidden_dim1 = hidden_dim1
      self.hidden_dim2 = hidden_dim2
      self.hidden_dim3 = hidden_dim3

      self.num_labels = num_labels

class CustomLSTM(PreTrainedModel):
    config_class = CustomConfig

    def __init__(self, config):
        super().__init__(config)
        self.embedding = nn.Embedding(config.vocab_size, config.embedding_dim)
        # Define an LSTM layer
        self.lstm = nn.LSTM(input_size=config.embedding_dim,
                            hidden_size=config.hidden_dim,  # LSTM hidden dimension
                            num_layers=config.lstm_layers,  # Number of LSTM layers
                            bidirectional=True,
                            dropout=0.5,
                            batch_first=True)  # Input & output will have batch size as 1st dimension

        # Define a classifier layer
        self.classifier_1 = nn.Linear(config.hidden_dim*2, 96)
        self.classifier_2 = nn.Linear(config.hidden_dim, 128)
        self.classifier_3 = nn.Linear(224, 3)
        self.classifier_4 = nn.Linear(config.num_labels+3, config.num_labels)
        self.dropout_layer = nn.Dropout(p=0.5)

        # Neural Network
        self.embedding_bag = nn.EmbeddingBag(config.vocab_size, config.embedding_dim)
        self.layers = nn.Sequential(
            nn.Linear(config.embedding_dim, config.hidden_dim1),
            nn.BatchNorm1d(num_features=config.hidden_dim1),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(config.hidden_dim1, config.hidden_dim2),
            nn.BatchNorm1d(num_features=config.hidden_dim2),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(config.hidden_dim2, config.num_labels)
        )


    def forward(self, input_ids_lstm, input_ids_nn, offsets, labels=None):
        embedded = self.embedding(input_ids_lstm)
        # LSTM layer
        lstm_output, (hidden, cell) = self.lstm(embedded)
        hidden_permuted = hidden.permute(1, 0, 2)
        final_output = torch.mean(lstm_output, dim=1)
        final_output_2 = torch.mean(hidden_permuted, dim=1)
        final_output = self.classifier_1(final_output)
        final_output_2 = self.classifier_2(final_output_2)

        cat = torch.cat((final_output, final_output_2), dim=1)
        logits_lstm = self.classifier_3(cat)

        # NN layer
        embed_out = self.embedding_bag(input_ids_nn, offsets)
        logits_nn = self.layers(embed_out)

        # Final layer
        logits_cat = torch.cat((logits_lstm, logits_nn), dim=1)
        logits = self.classifier_4(logits_cat)

        logits = logits_nn

        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels.float())

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits
        )

In [320]:
def get_vocab(dataset, min_freq=1):
    """
    Generate a vocabulary from a dataset.

    Args:
        dataset (Dataset): A Hugging Face Dataset object. The dataset should
                           have a key 'texts' that contains the text data.
        min_freq (int): The minimum frequency for a token to be included in
                        the vocabulary.

    Returns:
        torchtext.vocab.Vocab: Vocabulary object containing tokens from the
                               dataset that meet or exceed the specified
                               minimum frequency. It also includes a special
                               '<unk>' token for unknown words.
    """
    # Initialize a counter object to hold token frequencies
    counter = Counter()

    # Update the counter with tokens from each text in the dataset
    # Iterating through texts in the dataset
    for text in dataset['texts']:  ###### Change from previous function ####
        counter.update(str(text).split())

    # Create a vocabulary using the counter object
    # Tokens that appear fewer times than `min_freq` are excluded
    my_vocab = vocab(counter, min_freq=min_freq)

    # Insert a '<unk>' token at index 0 to represent unknown words
    my_vocab.insert_token('<unk>', 0)

    # Set the default index to 0
    # This ensures that any unknown word will be mapped to '<unk>'
    my_vocab.set_default_index(0)

    return my_vocab

In [321]:
# Creating a function that will be used to get the indices of words from vocab
def tokenizer(text, vocab):
    """Converts text to a list of indices using a vocabulary dictionary"""
    return [vocab[token] for token in str(text).split()]

In [322]:
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch, my_vocab):
    labels = [sample['labels'] for sample in batch]
    texts = [sample['texts'] for sample in batch]

    labels = torch.tensor(labels, dtype=torch.long)
    list_of_indices = [tokenizer(text, my_vocab) for text in texts]

    input_ids_lstm = pad_sequence([torch.tensor(i, dtype=torch.int64) for i in list_of_indices],
                             batch_first=True, padding_value=my_vocab['<pad>'])

    #####################################################################
    list_of_list_of_indices = [tokenizer(text, my_vocab) for text in texts]
    input_ids_nn = torch.cat([torch.tensor(i, dtype=torch.int64) for i in list_of_list_of_indices])

    offsets = [0] + [len(i) for i in list_of_list_of_indices]
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)

    return {
        'input_ids_lstm': input_ids_lstm,
        'input_ids_nn': input_ids_nn,
        'offsets': offsets,
        'labels': labels
    }

In [323]:
twitter_vocab = get_vocab(trainset, min_freq=2)
collate_fn = partial(collate_batch, my_vocab=twitter_vocab)

### Instantiate model

In [326]:
my_config = CustomConfig(vocab_size=len(twitter_vocab),
                         embedding_dim=300,
                         hidden_dim1=200,
                         hidden_dim2=64,
                         hidden_dim=64,
                         lstm_layers=2,
                         num_labels=11)

model = CustomLSTM(config=my_config)

### Metrics

In [327]:
import evaluate
import numpy as np

clf_metrics = evaluate.combine([evaluate.load("f1", average="macro")])

def sigmoid(x):
   return 1/(1 + np.exp(-x))

def compute_metrics(eval_pred):

   predictions, labels = eval_pred
   predictions = sigmoid(predictions)
   predictions = (predictions > 0.5).astype(int).reshape(-1)

   return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))

### Training Arguments

In [328]:
# Configure training parameters
training_args = TrainingArguments(

    # Training-specific configurations
    num_train_epochs=5,
    per_device_train_batch_size=128, # Number of samples per training batch
    per_device_eval_batch_size=128, # Number of samples per validation batch
    weight_decay=0.1, # weight decay (L2 regularization)
    learning_rate=0.01, # learning arte
    optim='adamw_torch', # optimizer
    remove_unused_columns=False, # flag to retain unused columns

    # Checkpoint saving and model evaluation settings
    output_dir=str(model_folder),  # Directory to save model checkpoints
    evaluation_strategy='steps',  # Evaluate model at specified step intervals
    eval_steps=50,  # Perform evaluation every 50 training steps
    save_strategy="steps",  # Save model checkpoint at specified step intervals
    save_steps=50,  # Save a model checkpoint every 50 training steps
    load_best_model_at_end=True,  # Reload the best model at the end of training
    save_total_limit=2,  # Retain only the best and the most recent model checkpoints
    # Use 'accuracy' as the metric to determine the best model
    metric_for_best_model="f1",
    greater_is_better=True,  # A model is 'better' if its accuracy is higher


    # Experiment logging configurations
    logging_strategy='steps',
    logging_steps=50,
    report_to='wandb',  # Log metrics and results to Weights & Biases platform
    run_name='twitter_hf_trainer',  # Experiment name for Weights & Biases
)

In [329]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset = validset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [330]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33msarthak-vajpayee[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [331]:
%env WANDB_PROJECT = nlp_course_spring_2024-sentiment-analysis-hf-trainerm

env: WANDB_PROJECT=nlp_course_spring_2024-sentiment-analysis-hf-trainerm


In [333]:
trainer.train()

Step,Training Loss,Validation Loss,F1
50,0.3327,0.386647,0.549225
100,0.3126,0.38191,0.566525
150,0.2903,0.376367,0.573802
200,0.2701,0.379351,0.580101
250,0.257,0.384053,0.582553


There were missing keys in the checkpoint model loaded: ['lstm.weight_hh_l0', 'lstm.bias_ih_l0', 'lstm.bias_hh_l0', 'lstm.weight_ih_l0_reverse', 'lstm.weight_hh_l0_reverse', 'lstm.bias_ih_l0_reverse', 'lstm.bias_hh_l0_reverse', 'lstm.weight_ih_l1', 'lstm.weight_hh_l1', 'lstm.bias_ih_l1', 'lstm.bias_hh_l1', 'lstm.weight_ih_l1_reverse', 'lstm.weight_hh_l1_reverse', 'lstm.bias_ih_l1_reverse', 'lstm.bias_hh_l1_reverse'].


TrainOutput(global_step=260, training_loss=0.29132640728583703, metrics={'train_runtime': 22.9185, 'train_samples_per_second': 1432.25, 'train_steps_per_second': 11.345, 'total_flos': 0.0, 'train_loss': 0.29132640728583703, 'epoch': 5.0})

In [289]:
trainer.evaluate()

{'eval_loss': 0.3853737711906433,
 'eval_f1': 0.580140734949179,
 'eval_runtime': 3.4145,
 'eval_samples_per_second': 452.484,
 'eval_steps_per_second': 3.807,
 'epoch': 10.0}

In [290]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/f1,▁▆▆▇▇███████████▁▆▆▇▇▇▇███
eval/loss,▇▃▂▁▁▁▁▂▂▄▄▅▅▆▇▇█▃▂▁▁▁▁▂▂▂
eval/runtime,▄▇▁▁▁▂▁▁▁▁▂▁▁▁▂▁█▆▆█▆▆▇▆█▆
eval/samples_per_second,▃▁██▇▆▇▇▇█▅▇██▆▇▆██▆██▇█▆█
eval/steps_per_second,▃▁███▆████▅███▆█▆▇▇▆▇▇▆▇▅▇
train/epoch,▁▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▆▆▇▇▇██▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆
train/global_step,▁▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▆▆▇▇▇██▁▁▁▂▂▂▃▃▃▄▄▄▅▅▅
train/grad_norm,▄▂▂▂▂▁▂▁▂▂█▃▃▂▃▂▂▂▂▃▂▃▂▂▃
train/learning_rate,██▇▇▆▆▅▅▄▄▃▃▂▂▁▁█▇▆▅▅▄▃▂▁
train/loss,█▆▅▄▄▃▃▃▃▂▂▂▂▁▁▁█▆▅▄▄▃▃▃▃

0,1
eval/f1,0.58014
eval/loss,0.38537
eval/runtime,3.4145
eval/samples_per_second,452.484
eval/steps_per_second,3.807
train/epoch,10.0
train/global_step,490.0
train/grad_norm,0.09436
train/learning_rate,0.00082
train/loss,0.2637


### Getting predictions for the test data and saving the results into test_pred.csv for kaggle submission.

In [291]:
test_predictions = trainer.predict(testset)

In [292]:
y_pred = test_predictions.predictions
y_pred = (sigmoid(y_pred)>0.5).astype(int)
df_test = pd.read_csv('test.csv')
df_test[['anger', 'anticipation', 'disgust', 'fear', 'joy',
       'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']] = y_pred

df_test = df_test.drop(columns=['Tweet'])
df_test.to_csv('test_pred.csv', index=False)