In [1]:
!pip install wandb
!pip install accelerate -U
!pip install datasets evaluate
!pip install transformers==3.0.2

Successfully installed accelerate-0.29.2 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.127 nvidia-nvtx-cu12-12.1.105
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [3

In [2]:
# Importing PyTorch library for tensor computations and neural network modules
import torch
import torch.nn as nn
import pandas as pd

# For working with textual data vocabularies and for displaying model summaries
from torchtext.vocab import vocab

# General-purpose Python libraries for random number generation and numerical operations
import random
import numpy as np

# Utilities for efficient serialization/deserialization of Python objects and for element tallying
import joblib
from collections import Counter

# For creating lightweight attribute classes and for partial function application
from functools import partial

# For filesystem path handling, generating and displaying confusion matrices, and date-time manipulations
from pathlib import Path
from sklearn.metrics import confusion_matrix
from datetime import datetime

# For plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

### NEW ##########################
# imports from Huggingface ecosystem
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import PreTrainedModel, PretrainedConfig
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import evaluate

# wandb library
import wandb

In [3]:
import CustomPreprocessorSpacy as cp
from sklearn.model_selection import train_test_split

In [4]:
from torch import cuda

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [5]:
# Directory handed to the Trainer as `output_dir`; create it, together with any
# missing parent directories, and tolerate it already existing.
model_folder = Path('./models/nlp_spring_2024/twitter/nn')
model_folder.mkdir(parents=True, exist_ok=True)

In [6]:
# !unzip /content/emotion-detection-spring2014.zip

In [12]:
# Load the labelled training tweets; `head()` previews the first rows.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2017-21441,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1
1,2017-31535,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
2,2017-21068,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0
3,2017-31436,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
4,2017-22195,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0


In [13]:
# Features are the raw tweet strings; targets are every column that remains
# once the identifier and text columns are dropped.
X = train['Tweet'].values
y = train.drop(columns=['ID', 'Tweet']).values

In [14]:
# Sanity check: X and y should have the same number of rows.
X.shape, y.shape

((7724,), (7724, 11))

### Train/validation split

In [15]:
# Hold out 20% of the labelled tweets for validation; the fixed seed makes
# the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

### Cleaning Data

In [16]:
# Both splits are cleaned with the same preprocessing options; a fresh
# preprocessor is constructed for each split.
spacy_options = dict(
    model='en_core_web_sm',
    lemmatize=True, lower=True,
    remove_stop=False, remove_punct=False,
    remove_email=True, remove_url=True,
    add_user_mention_prefix=True, remove_hashtag_prefix=False,
    basic_clean_only=False,
)

X_train_cleaned = cp.SpacyPreprocessor(**spacy_options).transform(X_train)

X_val_cleaned = cp.SpacyPreprocessor(**spacy_options).transform(X_val)

  soup = BeautifulSoup(text, "html.parser")


In [17]:
test = pd.read_csv('test.csv')

# Tweets to predict on.
X_test = test['Tweet'].values

# Label columns of the test file: any 'NONE' entry is replaced by 0
# (presumably placeholder labels — verify against the data file).
y_test = test.drop(columns=['ID', 'Tweet'])
y_test = y_test.replace('NONE', 0).values

# Same cleaning options as used for the training and validation tweets.
test_preprocessor = cp.SpacyPreprocessor(
    model='en_core_web_sm',
    lemmatize=True, lower=True,
    remove_stop=False, remove_punct=False,
    remove_email=True, remove_url=True,
    add_user_mention_prefix=True, remove_hashtag_prefix=False,
    basic_clean_only=False,
)
X_test_cleaned = test_preprocessor.transform(X_test)

  soup = BeautifulSoup(text, "html.parser")


In [18]:
def _as_dataset(texts, labels):
    """Wrap parallel texts and labels in a Dataset with 'texts' and 'labels' columns."""
    return Dataset.from_dict({
        'texts': texts,
        'labels': labels
    })


# One Dataset per split: training, validation and test.
trainset = _as_dataset(X_train_cleaned, y_train)
validset = _as_dataset(X_val_cleaned, y_val)
testset = _as_dataset(X_test_cleaned, y_test)

#### Metrics

In [19]:
import evaluate
import numpy as np

# Load the F1 metric from the 'evaluate' library.
# No averaging mode is configured here: keyword arguments given to
# `evaluate.load` do not reach `compute()`, so the averaging used is whatever
# `compute()` is called with (its default is "binary").
clf_metrics = evaluate.load("f1")

def sigmoid(x):
    """
    Apply the sigmoid function to convert logits to probabilities.

    Uses a numerically stable formulation: only exp(-|x|) is ever evaluated, so
    large-magnitude logits cannot overflow (the naive 1 / (1 + exp(-x)) emits a
    RuntimeWarning for very negative inputs).

    Args:
    x (array-like or scalar): Logits.

    Returns:
    numpy.ndarray or numpy scalar: Probabilities in [0, 1] with the same shape
    as `x`; floating-point input dtypes are preserved.
    """
    x = np.asarray(x)
    # exp of a non-positive number is always in (0, 1], so it never overflows.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) — the same function, rewritten.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Return a scalar for scalar input rather than a 0-d array.
    return result[()] if result.ndim == 0 else result

def compute_metrics(eval_pred):
    """
    Score thresholded multi-label predictions with a single F1 value.

    The logits are squashed with `sigmoid` and thresholded at 0.5. Predictions
    and labels are then flattened to 1-D, so every (sample, label) pair is
    scored as one binary decision by `clf_metrics`.

    Args:
    eval_pred (tuple): A (logits, labels) pair of arrays.

    Returns:
    dict: Whatever `clf_metrics.compute` returns for the flattened arrays.
    """
    logits, references = eval_pred

    # Logits -> probabilities -> hard 0/1 decisions
    hard_predictions = (sigmoid(logits) > 0.5).astype(int)

    # Pool all label decisions into one flat vector each before scoring
    flat_predictions = hard_predictions.reshape(-1)
    flat_references = references.astype(int).reshape(-1)

    return clf_metrics.compute(predictions=flat_predictions, references=flat_references)


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

### Experiment 2

In [20]:
from transformers import RobertaModel, RobertaTokenizer
from transformers import AlbertModel, AlbertTokenizer

In [21]:
# Load the pretrained tokenizers for the two encoders used in this experiment.
# NOTE(review): `do_lower_case` is a BERT-style option — confirm it has any effect
# on the RoBERTa tokenizer.
tokenizer_robert = RobertaTokenizer.from_pretrained('distilroberta-base', do_lower_case=True)
tokenizer_albert = AlbertTokenizer.from_pretrained('albert-base-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

In [22]:
def collate_batch(batch, max_length, padding='max_length'):
    """
    Prepare a batch for the dual-encoder model by tokenizing the same texts twice.

    Each text is encoded once with the module-level `tokenizer_robert` and once
    with `tokenizer_albert`, using identical options (special tokens added,
    truncation to `max_length`, padding, PyTorch tensors). The labels are
    stacked into a float tensor.

    Args:
    batch (list): A list of dictionaries, one per sample, each with a 'texts'
                  entry (the string to encode) and a 'labels' entry.
    max_length (int): The maximum length of the tokenized output. Texts longer than
                      `max_length` are truncated to this length.
    padding (str or bool): Padding strategy forwarded to both tokenizers. The
                      default 'max_length' pads every sequence to `max_length`
                      (the original behaviour); 'longest' pads only to the longest
                      sequence in the batch, which is cheaper for short texts.

    Returns:
    dict: A dictionary containing the following fields:
        'input_ids_robert' (torch.Tensor): The tensor of input IDs for RoBERTa.
        'attention_mask_robert' (torch.Tensor): The attention mask for RoBERTa.
        'input_ids_albert' (torch.Tensor): The tensor of input IDs for ALBERT.
        'attention_mask_albert' (torch.Tensor): The attention mask for ALBERT.
        'labels' (torch.Tensor): The tensor of labels for the batch, converted to floats.
    """
    # Extract labels and texts from the batch data
    labels = [sample['labels'] for sample in batch]
    texts = [sample['texts'] for sample in batch]

    # Encoding options shared by both tokenizers
    encode_kwargs = dict(
        add_special_tokens=True,        # Add each model's own start/end markers
        truncation=True,                # Truncate to max_length
        max_length=max_length,          # Max length cut-off
        padding=padding,                # Padding strategy (default: pad to max_length)
        return_attention_mask=True,     # Return attention mask
        return_token_type_ids=False,    # Do not return token type ids
        return_tensors='pt',            # Return PyTorch tensors
    )

    # Calling the tokenizer directly is the documented replacement for the
    # deprecated `batch_encode_plus`; for a list of strings it encodes the batch.
    inputs_robert = tokenizer_robert(texts, **encode_kwargs)
    inputs_albert = tokenizer_albert(texts, **encode_kwargs)

    # Return a dictionary whose keys match the model's forward() parameters
    return {
        'input_ids_robert': inputs_robert['input_ids'],
        'attention_mask_robert': inputs_robert['attention_mask'],
        'input_ids_albert': inputs_albert['input_ids'],
        'attention_mask_albert': inputs_albert['attention_mask'],
        # Float labels, as required by the BCE-with-logits loss
        'labels': torch.tensor(labels, dtype=torch.float),
    }


In [23]:
import torch
import torch.nn as nn
from transformers import RobertaModel, AlbertModel
from transformers.modeling_outputs import SequenceClassifierOutput

class AlbertRobertClass(nn.Module):
    """
    A neural network module combining the first-token representations of a
    RoBERTa and an ALBERT encoder to perform multi-label sequence classification.

    Attributes:
    robert (RobertaModel): The RoBERTa model to extract features.
    albert (AlbertModel): The ALBERT model to extract features.
    pre_classifier_ro (nn.Linear): Linear projection of the RoBERTa representation.
    pre_classifier_al (nn.Linear): Linear projection of the ALBERT representation.
    dropout_1 (nn.Dropout): Dropout applied to the concatenated, tanh-activated features.
    dropout_2 (nn.Dropout): Dropout applied to the output of `classifier_1`.
    classifier_1 (nn.Linear): First linear classifier layer.
    classifier_2 (nn.Linear): Final linear layer producing one logit per label.
    """
    def __init__(self, num_labels=11, roberta_name="distilroberta-base",
                 albert_name="albert-base-v2", proj_dim=256):
        """
        Build both encoders and the classification head.

        Parameters:
        num_labels (int): Number of output logits (default 11).
        roberta_name (str): Checkpoint passed to `RobertaModel.from_pretrained`.
        albert_name (str): Checkpoint passed to `AlbertModel.from_pretrained`.
        proj_dim (int): Width each encoder's representation is projected to
                        (default 256, giving a 512-wide concatenation).
        """
        super().__init__()
        self.robert = RobertaModel.from_pretrained(roberta_name)
        self.albert = AlbertModel.from_pretrained(albert_name)

        # Projections from each encoder's hidden size (read from its config
        # rather than hard-coded) down to proj_dim
        self.pre_classifier_ro = nn.Linear(self.robert.config.hidden_size, proj_dim)
        self.pre_classifier_al = nn.Linear(self.albert.config.hidden_size, proj_dim)

        # Dropout layers for regularization
        self.dropout_1 = nn.Dropout(0.1)  # Light dropout after feature combination
        self.dropout_2 = nn.Dropout(0.3)  # Heavier dropout after first classification layer

        # Classifier layers operating on the concatenated projections
        self.classifier_1 = nn.Linear(2 * proj_dim, proj_dim)
        self.classifier_2 = nn.Linear(proj_dim, num_labels)

    def forward(self, input_ids_robert, attention_mask_robert, input_ids_albert, attention_mask_albert, labels=None):
        """
        Defines the forward pass of the model.

        Parameters:
        input_ids_robert (Tensor): Input IDs for RoBERTa.
        attention_mask_robert (Tensor): Attention mask for RoBERTa.
        input_ids_albert (Tensor): Input IDs for ALBERT.
        attention_mask_albert (Tensor): Attention mask for ALBERT.
        labels (Tensor, optional): Actual labels for the input data; used to compute loss if provided.

        Returns:
        SequenceClassifierOutput: Contains loss (if labels are provided) and logits.
        """
        # Process input through both RoBERTa and ALBERT models
        output_robert = self.robert(input_ids=input_ids_robert, attention_mask=attention_mask_robert)
        output_albert = self.albert(input_ids=input_ids_albert, attention_mask=attention_mask_albert)

        # Use the first token's final hidden state of each encoder as its "pooled" output
        pooler_robert = output_robert.last_hidden_state[:, 0]
        pooler_albert = output_albert.last_hidden_state[:, 0]

        # Project each pooled output, then concatenate along the feature axis
        pooler_robert = self.pre_classifier_ro(pooler_robert)
        pooler_albert = self.pre_classifier_al(pooler_albert)
        pooler = torch.cat((pooler_robert, pooler_albert), dim=1)

        # tanh non-linearity (functional form; no module is allocated per call)
        pooler = torch.tanh(pooler)
        pooler = self.dropout_1(pooler)

        # First classifier layer
        # NOTE(review): there is no activation between classifier_1 and classifier_2,
        # so apart from dropout the two layers compose into one affine map —
        # confirm this is intended.
        output = self.classifier_1(pooler)
        output = self.dropout_2(output)

        # Final classifier layer producing one logit per label
        logits = self.classifier_2(output)

        # Compute loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()  # Independent sigmoid per label (multi-label)
            loss = loss_fct(logits, labels.float())

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits
        )

In [24]:
# Bind the maximum sequence length so the collator can be called with just a batch.
collate_fn = partial(collate_batch, max_length=384)

In [25]:
# Build the dual-encoder classifier, move it to the selected device and
# display its structure as the cell output.
model = AlbertRobertClass()
model = model.to(device)
model

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

AlbertRobertClass(
  (robert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

In [33]:
# Hyper-parameters, checkpointing and logging settings for the Trainer
training_args = TrainingArguments(
    # Where checkpoints are written
    output_dir=str(model_folder),

    # Optimisation
    num_train_epochs=2,
    per_device_train_batch_size=8,  # samples per training batch
    per_device_eval_batch_size=8,   # samples per evaluation batch
    optim='adamw_torch',            # optimizer
    weight_decay=0.1,               # weight decay
    # learning_rate=0.0001,         # learning rate (currently left at the library default)
    remove_unused_columns=False,    # keep the 'texts' column that the collator reads

    # Evaluate and checkpoint on the same 50-step schedule
    evaluation_strategy='steps',
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,             # cap on the number of checkpoints kept on disk
    load_best_model_at_end=True,    # reload the best checkpoint when training ends

    # The best checkpoint is the one with the highest 'f1' from compute_metrics
    metric_for_best_model="f1",
    greater_is_better=True,

    # Experiment logging to Weights & Biases every 50 steps
    logging_strategy='steps',
    logging_steps=50,
    report_to='wandb',
    run_name='twitter_hf_trainer_roberta_albert',
)

In [34]:
# Wire the model, training configuration, datasets, collator and metric
# function into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=trainset,
    eval_dataset=validset,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [35]:
# Authenticate with Weights & Biases, then set the WANDB_PROJECT environment
# variable for this session.
!wandb login
%env WANDB_PROJECT = nlp_course_spring_2024-sentiment-analysis-hf-trainerm

[34m[1mwandb[0m: Currently logged in as: [33msarthak-vajpayee[0m. Use [1m`wandb login --relogin`[0m to force relogin
env: WANDB_PROJECT=nlp_course_spring_2024-sentiment-analysis-hf-trainerm


In [36]:
# Run training with the Trainer configured for this experiment.
trainer.train()

Step,Training Loss,Validation Loss,F1
50,0.4414,0.415588,0.234605
100,0.4152,0.412254,0.493627
150,0.4289,0.388589,0.539262
200,0.4114,0.394934,0.552597
250,0.4019,0.381087,0.573474
300,0.409,0.383715,0.548195
350,0.3787,0.361772,0.603276
400,0.3863,0.359178,0.595
450,0.3735,0.350073,0.595222
500,0.3639,0.347689,0.611157


TrainOutput(global_step=1546, training_loss=0.34500834180034917, metrics={'train_runtime': 3346.6973, 'train_samples_per_second': 3.693, 'train_steps_per_second': 0.462, 'total_flos': 0.0, 'train_loss': 0.34500834180034917, 'epoch': 2.0})

I mistakenly deleted the code cell and cannot recover the full evaluation output; however, you can follow the link below to view a copy of the report generated during training.

link to W&B Project: https://api.wandb.ai/links/sarthak-vajpayee/y8v5lbyg

In [37]:
# Evaluate on the Trainer's eval_dataset (the validation split).
trainer.evaluate()

{'eval_loss': 0.31154319643974304,
 'eval_f1': 0.661290322580645,
 'eval_runtime': 61.6682,
 'eval_samples_per_second': 25.053,
 'eval_steps_per_second': 3.146,
 'epoch': 2.0}

In [38]:
# Close the Weights & Biases run.
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/f1,▁▅▆▆▇▆▇▇▇▇▇▇███████████████████
eval/loss,██▆▇▆▆▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
eval/runtime,▂▆▇▆▅▇▃▃▃▃▃▂▃▂▁▃▂▂▃▃▃▄▃▃▁▂▃▃█▃▆
eval/samples_per_second,▇▃▂▃▄▂▆▆▆▆▆▇▆▇█▆▇▇▆▆▆▅▆▆█▆▆▆▁▆▂
eval/steps_per_second,▇▃▂▃▄▂▆▆▆▆▆▇▆▇█▆▇▇▆▆▆▅▆▆█▆▆▆▁▆▃
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▂▁▄█▂▃▂▂▁▂▁▁▃▂▃▁▃▂▂▂▂▂▃▁▂▁▂▂▃▄▁
train/learning_rate,████▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▆▅▆▅▅▅▄▄▄▃▃▃▄▃▃▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁

0,1
eval/f1,0.66129
eval/loss,0.31154
eval/runtime,61.6682
eval/samples_per_second,25.053
eval/steps_per_second,3.146
train/epoch,2.0
train/global_step,1546.0
train/grad_norm,0.97702
train/learning_rate,0.0
train/loss,0.3035


#### Observations

1. **`eval_loss`: 0.31154319643974304**
   - **Description**: This value represents the average loss on the evaluation dataset. Loss is a quantification of how much the model’s predictions deviate from the actual labels. Lower values indicate better model performance, with zero being a perfect score.
   - **Interpretation**: A loss of 0.3115 suggests that the model has a reasonably good fit to the data. However, whether this is an acceptable loss depends on the complexity of the task and the baseline (or typical) losses for similar models on similar tasks.

2. **`eval_f1`: 0.661290322580645**
   - **Description**: The F1 score is a harmonic mean of precision and recall, providing a single metric that balances both the false positives and false negatives. It is particularly useful when the class distribution is imbalanced.
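   - **Interpretation**: An F1 score of 0.6613 indicates good but not excellent performance. Note that `compute_metrics` flattens all 11 emotion labels into a single list of binary decisions before scoring, so this value is pooled (micro-averaged) over every tweet–label pair rather than averaged per emotion. It reflects how well the model balances precision (the accuracy of positive predictions) and recall (the ability to find all positive instances) across all labels combined.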

3. **`eval_runtime`: 61.6682 seconds**
   - **Description**: This is the total time taken to evaluate the model on the validation dataset.
   - **Interpretation**: The runtime provides insight into the computational efficiency of the model during inference. Longer runtimes might be acceptable during training, but for real-time applications, shorter evaluation runtimes are preferred.

4. **`eval_samples_per_second`: 25.053**
   - **Description**: This metric indicates the number of samples the model can process per second during evaluation.
   - **Interpretation**: Processing approximately 25 samples per second, the model shows a moderate speed. This metric is crucial for understanding throughput in production environments, especially for applications requiring real-time or near-real-time predictions.

5. **`eval_steps_per_second`: 3.146**
   - **Description**: This metric shows how many batches (or "steps") of data the model can process per second.
   - **Interpretation**: A rate of about 3.146 steps per second is relatively moderate, depending on the batch size and the computational resources available (e.g., GPU). This rate impacts how quickly the model can be applied to new data.

6. **`epoch`: 2.0**
   - **Description**: This indicates the number of complete passes the model has made over the entire training dataset.
   - **Interpretation**: By the end of the second epoch, the given evaluation metrics were recorded. This provides context for the metrics, suggesting how much training the model had undergone before evaluation.


### Predicting the test data labels

In [None]:
# Emotion columns of the submission file
# (assumed to be in the same order as the model's output logits — verify against the training label columns).
label_columns = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
       'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

# Logits for the test tweets -> probabilities -> hard 0/1 decisions at 0.5
test_predictions = trainer.predict(testset)
y_pred = (sigmoid(test_predictions.predictions) > 0.5).astype(int)

# Write the predicted labels back into the test frame and export it without the tweet text
df_test = pd.read_csv('test.csv')
df_test[label_columns] = y_pred
df_test = df_test.drop(columns=['Tweet'])
df_test.to_csv('test_pred_Robert_Albert.csv', index=False)