In [None]:
!pip install wandb
!pip install accelerate -U
!pip install datasets evaluate
!pip install transformers==3.0.2

In [None]:
import re
import sys
import random
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import wandb
from datetime import datetime
from torch.utils.data import DataLoader
from collections import Counter
from functools import partial
from pathlib import Path
from pprint import pprint
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from torchtext.vocab import vocab
from datasets import load_dataset, DatasetDict, Dataset, ClassLabel
from transformers import (
    Pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig,
    pipeline, TrainingArguments, Trainer, DistilBertTokenizer, DistilBertModel,
    DistilBertForSequenceClassification, PreTrainedModel, PretrainedConfig
)
import evaluate

# Optional: if you use matplotlib in a Jupyter notebook, uncomment the next line
# %matplotlib inline

In [None]:
import CustomPreprocessorSpacy as cp
from sklearn.model_selection import train_test_split

In [None]:
from torch import cuda

# Default to the CPU and switch to the GPU only when PyTorch reports one as available.
device = 'cpu'
if cuda.is_available():
    device = 'cuda'

### Experiment 3

In [None]:
# Read the training and test splits from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Pack the eleven per-emotion columns into one list of floats per row, stored in a 'labels' column.
train['labels'] = [
    [float(flag) for flag in row]
    for row in train[['anger','anticipation','disgust','fear','joy','love','optimism','pessimism','sadness','surprise','trust']].values.tolist()
]

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_val = train_test_split(
    train,
    test_size=0.2,
    shuffle=True,
    random_state=4,
)

In [None]:
# Wrap both pandas splits as Hugging Face Dataset objects.
train_dataset, valid_dataset = (Dataset.from_pandas(frame) for frame in (X_train, X_val))

In [None]:
# Load the tokenizer published with the google/flan-t5-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

#### Metric

In [None]:
def compute_metrics(eval_pred):
    """
    Compute element-wise accuracy and macro-averaged F1 for multi-label predictions.

    Intended as the `compute_metrics` callback of the Hugging Face `Trainer`.

    Args:
    eval_pred (tuple): A pair `(logits, labels)`:
        - logits (numpy.ndarray or tuple): Raw model outputs before activation. If a tuple is given
          (the model returned several outputs), only its first element is used.
        - labels (numpy.ndarray): Ground-truth 0/1 indicators with the same number of elements as the logits.

    Returns:
    dict: A dictionary with two key-value pairs:
        - "accuracy": Fraction of individual (sample, label) decisions that match the reference.
        - "f1": Macro-averaged F1, i.e. the unweighted mean of the F1 score computed separately for
          each label column. A label with no positive references and no positive predictions
          contributes 0.

    A label is predicted as present when its logit is greater than 0 (equivalent to a sigmoid
    probability above 0.5).

    Note: the previous implementation passed `average="macro"` to `evaluate.load`, where it was
    ignored, so the reported F1 was the binary F1 over the flattened label matrix. The metrics are
    now computed directly with NumPy, which also avoids reloading the metric scripts on every
    evaluation step.
    """
    logits, labels = eval_pred  # Unpack the tuple containing logits and labels

    # Models that return several outputs hand the Trainer a tuple; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    logits = np.asarray(logits)

    # Threshold the logits at 0 and keep one column per label.
    predictions = (logits > 0).astype(int)
    predictions = predictions.reshape(-1, predictions.shape[-1])
    references = np.asarray(labels).astype(int).reshape(predictions.shape)

    # Element-wise accuracy over every (sample, label) decision.
    accuracy = float((predictions == references).mean())

    # Per-label confusion counts (summed over samples).
    true_pos = ((predictions == 1) & (references == 1)).sum(axis=0)
    false_pos = ((predictions == 1) & (references == 0)).sum(axis=0)
    false_neg = ((predictions == 0) & (references == 1)).sum(axis=0)

    # F1 = 2TP / (2TP + FP + FN); labels with an empty denominator score 0 instead of dividing by zero.
    denominator = 2 * true_pos + false_pos + false_neg
    per_label_f1 = np.divide(
        2 * true_pos,
        denominator,
        out=np.zeros(denominator.shape, dtype=float),
        where=denominator > 0,
    )
    f1 = float(per_label_f1.mean())

    # Return the computed metrics as a dictionary
    return {"accuracy": accuracy, "f1": f1}


In [None]:
def tokenize_fn(x):
    """Tokenize the "Tweet" field of `x`, padding/truncating every sequence to 128 tokens."""
    tweets = x["Tweet"]
    return tokenizer(
        text=tweets,
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

In [None]:
tokenized_train = train_dataset.map(tokenize_fn, batched=True)
tokenized_valid = valid_dataset.map(tokenize_fn, batched=True)

Map:   0%|          | 0/6179 [00:00<?, ? examples/s]

Map:   0%|          | 0/1545 [00:00<?, ? examples/s]

In [None]:
# Dataset.remove_columns returns a NEW dataset instead of modifying the receiver, so the result
# has to be re-bound; previously it was discarded and the columns were never actually dropped.
unused_columns = ['Tweet','ID', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust','__index_level_0__']

# Only drop columns that are still present so the cell can be re-run safely.
tokenized_train = tokenized_train.remove_columns(
    [name for name in unused_columns if name in tokenized_train.column_names]
)
tokenized_valid = tokenized_valid.remove_columns(
    [name for name in unused_columns if name in tokenized_valid.column_names]
)

# Show the resulting schema of the validation split.
tokenized_valid

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 1545
})

In [None]:
# Switch both tokenized splits to the 'torch' output format.
tokenized_train.set_format(type='torch')
tokenized_valid.set_format(type='torch')

In [None]:
# Training configuration for the FLAN-T5 run
training_args = TrainingArguments(
    # Output location and experiment tracking
    output_dir='./model_t5',
    run_name='FLAN-T5',
    report_to='wandb',
    # Length of training and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate, log and checkpoint on the same 200-step cadence
    evaluation_strategy='steps',
    eval_steps=200,
    logging_strategy='steps',
    logging_steps=200,
    save_strategy='steps',
    save_steps=200,
    save_total_limit=2,
    # At the end of training, reload the checkpoint with the highest "accuracy"
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
)

In [None]:
# FLAN-T5 base with an 11-output head configured for multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    'google/flan-t5-base',
    problem_type="multi_label_classification",
    num_labels=11,
)

# Wire the model, data splits, metrics callback and tokenizer into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
!wandb login 59bb7096ee29a58139e6dbf2d2a080a62d477743
%env WANDB_PROJECT = nlp_course_spring_2024_tweet_analysis_flant5_ver3_trainer_assignment_6

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
env: WANDB_PROJECT=nlp_course_spring_2024_tweet_analysis_flant5_ver3_trainer_assignment_6


In [None]:
# Fine-tune the model with the Trainer configured above.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
200,0.443,0.368655,0.851133,0.567964
400,0.3761,0.344676,0.859194,0.61828
600,0.3512,0.33696,0.860194,0.630597
800,0.3387,0.325465,0.868197,0.653036
1000,0.3101,0.315264,0.870903,0.666363
1200,0.3073,0.309603,0.870962,0.66948
1400,0.2984,0.304198,0.872904,0.666667
1600,0.2963,0.307972,0.874081,0.677711
1800,0.2835,0.309255,0.874492,0.67891
2000,0.2872,0.303746,0.876728,0.683296


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

There were missing keys in the checkpoint model loaded: ['transformer.encoder.embed_tokens.weight', 'transformer.decoder.embed_tokens.weight'].


TrainOutput(global_step=2319, training_loss=0.32258092889296386, metrics={'train_runtime': 1806.7377, 'train_samples_per_second': 10.26, 'train_steps_per_second': 1.284, 'total_flos': 2856244966928640.0, 'train_loss': 0.32258092889296386, 'epoch': 3.0})

In [None]:
# Score the trainer's current model on the eval_dataset given to the Trainer (the validation split).
trainer.evaluate()

{'eval_loss': 0.3037463426589966,
 'eval_accuracy': 0.8767284495439835,
 'eval_f1': 0.6832955404383975,
 'eval_runtime': 34.2232,
 'eval_samples_per_second': 45.145,
 'eval_steps_per_second': 5.669,
 'epoch': 3.0}

In [None]:
# Mark the current Weights & Biases run as finished.
wandb.finish()

## Training Run History

### Metrics over Time
- **Accuracy:** ▁▃▃▆▆▆▇▇▇████
- **F1 Score:** ▁▄▅▆▇▇▇██████
- **Loss:** █▅▅▃▂▂▁▁▂▁▁▁▁
- **Runtime:** ▃▅▄█▁▁▁▁▁▁▂▂▂
- **Samples per Second:** ▆▄▅▁██████▇▇▆
- **Steps per Second:** ▆▄▅▁██████▇▇▆
- **Epochs:** ▁▁▂▂▂▂▃▃▄▄▄▄▅▅▆▆▆▆▇▇█████
- **Global Step:** ▁▁▂▂▂▂▃▃▄▄▄▄▅▅▆▆▆▆▇▇█████
- **Gradient Norm:** ▅▆▃█▄▂▆▁▅▂▂
- **Learning Rate:** █▇▇▆▅▄▄▃▂▂▁
- **Loss (Training):** █▅▄▄▂▂▂▂▁▁▁

### Run Summary
- **Evaluation Accuracy:** 0.87673
- **Evaluation F1 Score:** 0.6833
- **Evaluation Loss:** 0.30375
- **Evaluation Runtime:** 34.2232s
- **Evaluation Samples per Second:** 45.145
- **Evaluation Steps per Second:** 5.669
- **Training Epochs:** 3.0
- **Training Global Step:** 2319
- **Training Gradient Norm:** 1.01809
- **Training Learning Rate:** 0.0
- **Training Loss:** 0.2807
- **Training Total FLOPS:** 2,856,244,966,928,640.0
- **Training Loss (Reported):** 0.32258
- **Training Runtime:** 1806.7377s
- **Training Samples per Second:** 10.26
- **Training Steps per Second:** 1.284

### Useful Links
https://api.wandb.ai/links/sarthak-vajpayee/2cfy47kj


#### Observations:

1. **`eval_loss` (0.3037463426589966)**:
   - This represents the model's average loss during the evaluation phase. The loss value quantifies the model's error, where a lower loss indicates better performance. Here, the loss is about 0.304, suggesting that while the model is learning to predict the target variable, there could be room for improvement in reducing the prediction error.

2. **`eval_f1` (0.6832955404383975)**:
   - The F1 score is a harmonic mean of precision and recall, providing a balance between these metrics especially in cases of class imbalance. An F1 score of approximately 0.683 indicates moderate classification performance. It suggests that the balance between precision and recall is reasonable but could benefit from improvement. This is particularly important in scenarios where false positives and false negatives carry significant consequences.

3. **`eval_runtime` (34.2232 seconds)**:
   - This metric indicates the total time taken to complete the evaluation phase, which in this case is roughly 34.22 seconds. It helps gauge the computational efficiency of the model during inference.

4. **`eval_samples_per_second` (45.145)**:
   - This measures the throughput of the model, indicating how many samples are processed per second during the evaluation. A value of 45.145 samples per second suggests a reasonable speed, though this metric could also be influenced by factors such as the complexity of the model, the hardware used for evaluation, and the size of the input data.

5. **`eval_steps_per_second` (5.669)**:
   - Similar to the previous metric but focused on the number of batches (or steps) processed per second. The value of 5.669 steps per second reflects the model's batch processing speed. This provides an insight into how quickly the model handles batches of data, which is crucial for understanding performance in batched operations.

6. **`epoch` (3.0)**:
   - This key indicates that the metrics were recorded at the end of the third (final) training epoch. The number of epochs is a basic measure of how much training the model has undergone (one epoch is one full pass through the training dataset).

#### Predicting on test data

In [None]:
import pandas as pd

# Emotion columns that make up the multi-label target, in the order used for the label vector.
label_columns = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
                 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

# Load the test dataset from a CSV file located at './test.csv'.
test_df = pd.read_csv('./test.csv')

# Binarise every label column in one vectorised comparison:
# the string 'NONE' becomes 0 and any other value becomes 1.
test_df[label_columns] = (test_df[label_columns] != 'NONE').astype(int)

# Gather the binary indicators of each row into a single list of floats, stored in 'labels'.
test_df['labels'] = test_df[label_columns].astype(float).values.tolist()


In [None]:
# Wrap the test frame as a Hugging Face Dataset and tokenize it in batches.
test_dataset = Dataset.from_pandas(test_df)
tokenized_test = test_dataset.map(tokenize_fn, batched=True)

# Dataset.remove_columns returns a NEW dataset rather than modifying in place, so re-bind the
# result; previously it was discarded and the raw text/ID/label columns stayed in every batch.
tokenized_test = tokenized_test.remove_columns(['Tweet','ID', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'])
tokenized_test.set_format(type='torch')

# NOTE: `test_dataset` is re-bound to the DataLoader here because the inference cell iterates over that name.
test_dataset = DataLoader(tokenized_test, batch_size=16)

Map:   0%|          | 0/3259 [00:00<?, ? examples/s]

In [None]:
import torch
from torch.nn import functional as F

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
computation_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Inference only: evaluation mode disables training-time behaviour such as dropout.
model.eval()
model.to(computation_device)

# Probabilities at or above this value count as a positive label.
prediction_threshold = 0.5

# Only these batch entries are forwarded to the model.
model_input_names = ('input_ids', 'attention_mask')

# One array of per-label probabilities per batch.
batch_probabilities = []

# Gradients are not needed for prediction, so skip tracking them.
with torch.no_grad():
    for data_batch in test_dataset:
        input_tensors = {
            name: data_batch[name].to(computation_device)
            for name in model_input_names
            if name in data_batch
        }
        logits = model(**input_tensors).logits

        # Sigmoid turns each label's logit into an independent probability;
        # move the result to the CPU as a NumPy array.
        batch_probabilities.append(torch.sigmoid(logits).cpu().numpy())

# Stack the per-batch arrays along the sample axis and threshold them into 0/1 predictions.
all_probabilities = np.concatenate(batch_probabilities, axis=0)
predicted_classes = (all_probabilities >= prediction_threshold).astype(int)

In [None]:
# Re-read the raw test file without the tweet text, overwrite the label columns with the
# model's 0/1 predictions, and write the result to disk.
df_test = pd.read_csv('test.csv').drop(columns=['Tweet'])
df_test[label_columns] = predicted_classes
df_test.to_csv('test_pred_T5.csv', index=False)