In [None]:
!pip install wandb
!pip install accelerate -U
!pip install datasets evaluate
!pip install transformers==3.0.2

In [None]:
# Importing PyTorch library for tensor computations and neural network modules
import torch
import torch.nn as nn
import pandas as pd

# For working with textual data vocabularies
from torchtext.vocab import vocab

# General-purpose Python libraries for random number generation and numerical operations
import random
import numpy as np

# Utilities for efficient serialization/deserialization of Python objects and for element tallying
import joblib
from collections import Counter

# For partial function application
from functools import partial

# For filesystem path handling, generating and displaying confusion matrices, and date-time manipulations
from pathlib import Path
from sklearn.metrics import confusion_matrix
from datetime import datetime

# For plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

### NEW ##########################
# imports from Huggingface ecosystem
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import PreTrainedModel, PretrainedConfig
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import evaluate

# wandb library
import wandb

In [None]:
import CustomPreprocessorSpacy as cp
from sklearn.model_selection import train_test_split

In [None]:
from torch import cuda

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Directory that receives the Trainer checkpoints; created (with parents) if it is missing.
model_folder = Path('.') / 'models' / 'nlp_spring_2024' / 'twitter' / 'nn'
model_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# !unzip /content/emotion-detection-spring2014.zip

In [None]:
# Read the labelled training tweets and preview the first five rows.
train = pd.read_csv('train.csv')
train.head(n=5)

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2017-21441,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1
1,2017-31535,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
2,2017-21068,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0
3,2017-31436,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
4,2017-22195,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0


In [None]:
# Features are the raw tweet texts; targets are all remaining columns once the
# identifier and text columns are dropped.
X = train['Tweet'].values
y = train.drop(columns=['ID', 'Tweet']).values

In [None]:
# Sanity check: number of tweets and the shape of the label matrix.
X.shape, y.shape

((7724,), (7724, 11))

### Train test split

In [None]:
# Hold out 10% of the labelled tweets for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

### Cleaning Data

In [None]:
# Cleaning options shared by the training and validation splits. A fresh preprocessor
# is still built for each split, exactly as before.
spacy_clean_options = dict(
    model='en_core_web_sm',
    lemmatize=True,
    lower=True,
    remove_stop=False,
    remove_punct=False,
    remove_email=True,
    remove_url=True,
    add_user_mention_prefix=True,
    remove_hashtag_prefix=False,
    basic_clean_only=False,
)

X_train_cleaned = cp.SpacyPreprocessor(**spacy_clean_options).transform(X_train)

X_val_cleaned = cp.SpacyPreprocessor(**spacy_clean_options).transform(X_val)

  soup = BeautifulSoup(text, "html.parser")


In [None]:
# Load the unlabelled test tweets.
test = pd.read_csv('test.csv')

X_test = test['Tweet'].values

# The label columns of the test file hold the placeholder string 'NONE'. Swap it for 0 and
# cast explicitly to int: relying on `replace` to downcast the object columns is deprecated
# in recent pandas, and object-dtype labels cannot be turned into a float tensor later on.
y_test = (
    test.drop(columns=['ID', 'Tweet'])
    .replace('NONE', 0)
    .astype(int)
    .values
)

# Apply the same cleaning configuration that is used for the training/validation tweets.
X_test_cleaned = cp.SpacyPreprocessor(model='en_core_web_sm',
                                      lemmatize=True, lower=True,
                                      remove_stop=False, remove_punct=False,
                                      remove_email=True, remove_url=True,
                                      add_user_mention_prefix=True, remove_hashtag_prefix=False,
                                      basic_clean_only=False).transform(X_test)

  soup = BeautifulSoup(text, "html.parser")


In [None]:
# Create a Dataset object for the training set
# Wrap each split in a Hugging Face Dataset with a 'texts' column (cleaned tweets)
# and a 'labels' column (label matrix rows).

# Training split
trainset = Dataset.from_dict(dict(texts=X_train_cleaned, labels=y_train))

# Validation split
validset = Dataset.from_dict(dict(texts=X_val_cleaned, labels=y_val))

# Test split
testset = Dataset.from_dict(dict(texts=X_test_cleaned, labels=y_test))

#### Metrics

In [None]:
import evaluate
import numpy as np

# Load the F1 metric in its "multilabel" configuration so that it accepts one 0/1
# indicator vector per sample. The averaging mode is an argument of `compute`, not of
# `evaluate.load` (where it is silently ignored), so it is supplied in `compute_metrics`.
clf_metrics = evaluate.load("f1", "multilabel")

def sigmoid(x):
    """Apply the sigmoid function to convert logits to probabilities."""
    return 1 / (1 + np.exp(-x))

def compute_metrics(eval_pred):
    """
    Compute the macro-averaged F1 score for multi-label predictions.

    Args:
    eval_pred (tuple): A tuple containing the model's raw output logits and the true labels,
                       both as arrays with one row per sample and one column per label.

    Returns:
    dict: A dictionary with a single key, 'f1', holding the F1 score averaged over the labels
          (each label's F1 is computed separately, then the unweighted mean is taken).
    """
    logits, labels = eval_pred

    # Apply sigmoid to convert logits to independent per-label probabilities
    probabilities = sigmoid(logits)

    # Convert probabilities to binary predictions using a 0.5 threshold
    predictions = (probabilities > 0.5).astype(int)

    # Labels arrive as floats (see the collate function); the metric expects integers.
    # The (samples, labels) layout is kept so F1 can be averaged per label.
    labels = labels.astype(int)

    # Compute and return the macro-averaged F1 score
    return clf_metrics.compute(predictions=predictions, references=labels, average="macro")


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

### Experiment 1

#### Model initialization

In [None]:
class DistilBERTClass(torch.nn.Module):
    """
    A custom PyTorch module that puts a small classification head on top of a pre-trained
    DistilBert encoder for a multi-label classification task (by default 11 labels, each
    predicted independently).

    Attributes:
    bert (DistilBertModel): The pre-trained DistilBert encoder.
    pre_classifier (torch.nn.Linear): A linear layer applied to the first-token embedding.
    dropout (torch.nn.Dropout): Dropout layer applied after the pre-classifier activation.
    classifier (torch.nn.Linear): The final linear layer that outputs one logit per label.

    Methods:
    forward(input_ids, attention_mask, labels=None): Defines the forward pass of the model.
    """

    def __init__(self, num_labels=11, model_name="distilbert-base-uncased", dropout=0.1):
        """
        Initializes the encoder, a pre-classification linear layer, a dropout layer and the
        final classification layer.

        Args:
        num_labels (int): Number of output logits (one per label). Defaults to 11.
        model_name (str): Checkpoint passed to `DistilBertModel.from_pretrained`.
                          Defaults to 'distilbert-base-uncased'.
        dropout (float): Dropout probability used in the classification head. Defaults to 0.1.
        """
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)

        # Take the head's width from the encoder configuration instead of hard-coding it
        # (768 for 'distilbert-base-uncased').
        hidden_size = self.bert.config.dim

        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)  # Processes the first-token embedding
        self.dropout = torch.nn.Dropout(dropout)                         # Dropout for regularization
        self.classifier = torch.nn.Linear(hidden_size, num_labels)       # One logit per label

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Compute the logits and, when labels are given, the loss.

        Args:
        input_ids (torch.Tensor): Tensor of token ids to be fed to the DistilBert model.
        attention_mask (torch.Tensor): Tensor representing attention masking, indicating to the model which
                                       tokens should be attended to, and which should not.
        labels (torch.Tensor, optional): Tensor of 0/1 targets with the same shape as the logits;
                                         cast to float before the loss is computed.

        Returns:
        SequenceClassifierOutput: An object containing the loss (None when no labels are provided)
                                  and the raw logits, i.e. the scores before any sigmoid is applied.
        """
        # Processing input through the DistilBERT base model
        encoder_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = encoder_output[0]  # The first element contains the last hidden state
        pooled = hidden_state[:, 0]       # Embedding of the first token of every sequence

        # Classification head: linear -> tanh -> dropout -> linear
        pooled = self.pre_classifier(pooled)
        pooled = torch.tanh(pooled)       # Functional form; no module is rebuilt on every call
        pooled = self.dropout(pooled)

        # Generating one logit per label
        logits = self.classifier(pooled)

        # Calculate loss if labels are provided
        loss = None
        if labels is not None:
            # Binary cross-entropy on the logits: every label is an independent yes/no decision
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels.float())

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits
        )


#### Collate function

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel

In [None]:
from transformers import DistilBertTokenizer
import torch

# Tokenizer for the 'distilbert-base-uncased' checkpoint, configured to lower-case its input
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
    do_lower_case=True,
)

def collate_batch(batch, max_length):
    """
    Turn a list of dataset samples into the tensors the model consumes.

    Every sample's text is tokenized with the module-level `tokenizer`; sequences are
    truncated to `max_length` and padded up to exactly `max_length`, so every batch has
    the same width. The labels are stacked into a single float tensor.

    Args:
    batch (list of dicts): Samples, each providing a 'texts' entry and a 'labels' entry.
    max_length (int): Length to which every tokenized text is truncated or padded.

    Returns:
    dict: A dictionary with three entries:
        'input_ids' (torch.Tensor): Token ids produced by the tokenizer.
        'attention_mask' (torch.Tensor): Mask produced by the tokenizer alongside the ids.
        'labels' (torch.Tensor): Float tensor built from the samples' labels.
    """
    # Pull the two columns out of the list of samples
    label_rows = [example['labels'] for example in batch]
    tweet_texts = [example['texts'] for example in batch]

    # Tokenize the whole batch in one call
    encoded = tokenizer.batch_encode_plus(
        tweet_texts,
        add_special_tokens=True,        # Include the tokenizer's special tokens
        truncation=True,                # Cut sequences longer than `max_length`
        max_length=max_length,
        padding='max_length',           # Always pad to `max_length`, not to the longest sample
        return_attention_mask=True,
        return_token_type_ids=False,    # Segment ids are not requested
        return_tensors='pt'             # PyTorch tensors
    )

    # Assemble the model inputs; labels are float because the loss expects float targets
    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'labels': torch.tensor(label_rows, dtype=torch.float),
    }


In [None]:
# Bind the sequence length so the Trainer can call the collator with the batch alone.
max_seq_length = 256
collate_fn = partial(collate_batch, max_length=max_seq_length)

In [None]:
# Build the classifier and move its parameters to the selected device (`.to` returns the module).
model = DistilBERTClass().to(device)
model

DistilBERTClass(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(

#### Training Argument

In [None]:
# Configure training parameters
training_args = TrainingArguments(
    # --- Where checkpoints go -------------------------------------------------
    output_dir=str(model_folder),

    # --- Optimisation ---------------------------------------------------------
    num_train_epochs=1,
    per_device_train_batch_size=4,   # samples per training batch (per device)
    per_device_eval_batch_size=4,    # samples per evaluation batch (per device)
    learning_rate=1e-5,              # learning rate
    weight_decay=0.1,                # weight decay
    optim='adamw_torch',             # optimizer
    remove_unused_columns=False,     # keep the raw 'texts' column for the custom collate function

    # --- Evaluation and checkpointing (both every 50 steps) -------------------
    evaluation_strategy='steps',
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,              # retain only the best and the most recent checkpoints
    load_best_model_at_end=True,     # reload the best checkpoint once training finishes
    metric_for_best_model="f1",      # rank checkpoints by the F1 score from compute_metrics
    greater_is_better=True,          # a higher F1 means a better checkpoint

    # --- Logging --------------------------------------------------------------
    logging_strategy='steps',
    logging_steps=50,
    report_to='wandb',               # send metrics to Weights & Biases
    run_name='twitter_hf_trainer',   # run name shown in Weights & Biases
)

In [None]:
# Tie together the model, its training configuration, the two data splits,
# the batching function and the evaluation metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=validset,
    data_collator=collate_fn,          # tokenizes and pads each batch
    compute_metrics=compute_metrics,   # supplies the 'f1' entry at every evaluation
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Authenticate with Weights & Biases (shell command).
!wandb login
# Set the WANDB_PROJECT environment variable for this kernel.
%env WANDB_PROJECT = nlp_course_spring_2024-sentiment-analysis-hf-trainerm

[34m[1mwandb[0m: Currently logged in as: [33msarthak-vajpayee[0m. Use [1m`wandb login --relogin`[0m to force relogin
env: WANDB_PROJECT=nlp_course_spring_2024-sentiment-analysis-hf-trainerm


In [None]:
# Fine-tune the model; evaluation and checkpointing follow `training_args`.
trainer.train()
# Evaluate on the validation set; as the last expression, the metrics dict is displayed.
trainer.evaluate()

[34m[1mwandb[0m: Currently logged in as: [33msarthak-vajpayee[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,F1
50,0.2418,0.321892,0.658713
100,0.2303,0.319658,0.664706
150,0.232,0.322655,0.662558
200,0.2204,0.323864,0.662604
250,0.2404,0.324798,0.657976
300,0.2425,0.325108,0.670807
350,0.2213,0.327977,0.669163
400,0.2171,0.329468,0.668484
450,0.2155,0.330943,0.667674
500,0.2227,0.329078,0.665239


{'eval_loss': 0.3303358256816864,
 'eval_f1': 0.6713537771129394,
 'eval_runtime': 13.1048,
 'eval_samples_per_second': 117.896,
 'eval_steps_per_second': 29.531,
 'epoch': 1.0}

In [None]:
# torch.cuda.empty_cache()

In [None]:
# Re-run evaluation on the validation set and display the metrics.
trainer.evaluate()

{'eval_loss': 0.3270490765571594,
 'eval_f1': 0.6675667566756677,
 'eval_runtime': 13.5026,
 'eval_samples_per_second': 114.423,
 'eval_steps_per_second': 28.661,
 'epoch': 2.0}

## Observations:

1. **`eval_loss` (0.3270490765571594)**:
   - This is the average loss recorded during the evaluation. The loss function (binary cross-entropy here) quantifies the difference between the predicted probabilities and the actual target values; it is not an error rate. A loss of approximately 0.327 indicates that the model has learned something useful, but it is noticeably higher than the training loss logged during the run (roughly 0.22–0.26), so there is room for improvement.

2. **`eval_f1` (0.6675667566756677)**:
   - The F1 score is the harmonic mean of precision and recall, and is particularly useful in situations where classes are imbalanced. An F1 score of about 0.668 is moderate. F1 on its own does not show whether precision and recall are balanced; the two would have to be inspected separately to know which one is holding the score back. This score can be considered decent depending on the complexity of the task and the distribution of classes, but it suggests there is significant room for improvement in either precision, recall, or both.

3. **`eval_runtime` (13.5026 seconds)**:
   - This represents the total time taken to complete the evaluation phase, clocking in at roughly 13.5 seconds. This metric is useful for assessing the computational efficiency and speed of the model during inference.

4. **`eval_samples_per_second` (114.423)**:
   - This metric shows how many samples the model is processing per second, with a rate of over 114 samples per second. This indicates a high throughput during evaluation, suggesting that the model is relatively efficient at processing data.

5. **`eval_steps_per_second` (28.661)**:
   - Similar to the above, this measures the number of batches (or steps) the model processes per second. A rate of about 28.661 steps per second is quite high, further emphasizing the model's efficiency in handling batches of data during evaluation.

6. **`epoch` (2.0)**:
   - This indicates that the evaluation metrics were recorded at the end of the second epoch of training. An epoch is a full pass through the entire training dataset. This context helps in understanding at what stage of the training these evaluation results were obtained.

In [None]:
# Close the active Weights & Biases run.
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/f1,▁▅▆▅▇▇▇▇▇▇▇▇█▇▇▇▇█▇▇██▇█████████████████
eval/loss,██▄▄▄▄▇▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▂▆▁▂▃▂▃▂▄▃▃▃█▄▄▃▃▃▄▃▃▃▄▃▄▃▄▄▄▄▃▄▃▃▄▄▄▃▃▆
eval/samples_per_second,▇▃█▇▆▇▆▇▅▆▆▆▁▅▅▆▆▅▅▅▆▆▅▆▅▆▅▅▅▅▆▅▆▆▅▅▅▆▆▃
eval/steps_per_second,▇▃█▇▆▇▆▇▅▆▆▆▁▅▅▆▆▅▅▅▆▆▅▆▅▆▅▅▅▅▆▅▆▆▅▅▅▆▆▃
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▂▁▂▃▂▁▃█▂▄▃▂▂▂▁█▄▂▁▂▂▅▂▂▂▂▆▄▃▂▇▁▃▃▃▂▂▃▂▂
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,██▇▆▆▅▄▅▆▄▅▅▄▄▃▅▅▅▄▅▄▃▃▃▂▃▃▂▃▂▃▃▁▂▃▂▂▂▂▂

0,1
eval/f1,0.66757
eval/loss,0.32705
eval/runtime,13.5026
eval/samples_per_second,114.423
eval/steps_per_second,28.661
train/epoch,2.0
train/global_step,3090.0
train/grad_norm,1.72877
train/learning_rate,0.0
train/loss,0.2552


#### Get test predictions

In [None]:
# Label columns to fill in, listed in the column order shown for train.csv
label_columns = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
                 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

# Logits for the test tweets -> probabilities -> 0/1 decisions at the 0.5 threshold
test_predictions = trainer.predict(testset)
y_pred = (sigmoid(test_predictions.predictions) > 0.5).astype(int)

# Re-read the test file, overwrite its label columns with the predictions,
# drop the tweet text and write the result to disk.
df_test = pd.read_csv('test.csv')
df_test[label_columns] = y_pred
df_test = df_test.drop(columns=['Tweet'])
df_test.to_csv('test_pred.csv', index=False)