In [1]:
!pip install --quiet datasets

In [2]:
import torch
import pandas as pd
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import os
from accelerate import Accelerator
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import datasets

In [3]:
# Single source of truth for the checkpoint so tokenizer and model cannot drift apart.
MODEL_NAME = "cardiffnlp/twitter-roberta-large-emotion-latest"
# Fall back to CPU when no GPU is visible instead of failing on a hard-coded "cuda:0".
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# A tokenizer holds no weights, so no device placement argument is passed to it.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, device_map=DEVICE)
model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [4]:
# Switch off gradients for every parameter of the pretrained network ...
model.requires_grad_(False)
# ... then switch them back on for the classification head only.
model.classifier.requires_grad_(True)

In [5]:
# Load the labelled data; the path is an absolute Colab location, so the file must exist under /content.
df = pd.read_csv('/content/eng.csv')

In [6]:
def get_emotion_counts(df):
    """Count the rows flagged with each emotion.

    Args:
        df: DataFrame with 'Anger', 'Fear', 'Joy', 'Sadness' and 'Surprise' columns.

    Returns:
        List of five counts (rows whose value equals 1), ordered
        Anger, Fear, Joy, Sadness, Surprise.
    """
    ordered_columns = ('Anger', 'Fear', 'Joy', 'Sadness', 'Surprise')
    return [(df[column] == 1).sum() for column in ordered_columns]

In [28]:
# Oversample the least frequent emotion to move the label distribution toward balance.
emotion_columns = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']  # same order as get_emotion_counts
emotion_counts = get_emotion_counts(df)
print(emotion_counts)

# Derive both targets from the counts instead of hard-coding them.
min_emotion_index = emotion_counts.index(min(emotion_counts))
max_emotion_index = emotion_counts.index(max(emotion_counts))
emotion = emotion_columns[min_emotion_index]
majority_emotion = emotion_columns[max_emotion_index]

# Duplicate rows that carry the minority label but not the majority label, so the
# most frequent emotion does not grow as a side effect of the augmentation.
matching_rows = df[(df[emotion] == 1) & (df[majority_emotion] == 0)]
if not matching_rows.empty:
    # Sampling with replacement from an empty pool would raise, hence the guard.
    augmented_data = matching_rows.sample(n=100, random_state=42, replace=True)
    # NOTE: `df` is rebound to the extended frame; re-running this cell adds another 100 rows.
    df = pd.concat([df, augmented_data])
emotion_counts = get_emotion_counts(df)
print(emotion_counts)

[1483, 1611, 1515, 1495, 1501]
[1583, 1611, 1529, 1522, 1528]


In [29]:
type(df.iloc[0]['Anger'])

numpy.int64

In [30]:
# Pack the five binary emotion columns into one multi-hot float vector per row.
# The column order below fixes what each position of `labels` means.
label_columns = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
# Vectorised conversion: one list of five floats per row, assigned positionally.
df['labels'] = df[label_columns].astype(float).values.tolist()
# Only the text and the packed labels are needed from here on.
df = df.drop(columns=['id','Joy','Fear','Anger','Sadness','Surprise'])
df

Unnamed: 0,text,labels
0,But not very happy.,"[0.0, 0.0, 1.0, 1.0, 0.0]"
1,Well she's not gon na last the whole song like...,"[0.0, 0.0, 1.0, 0.0, 0.0]"
2,She sat at her Papa's recliner sofa only to mo...,"[0.0, 0.0, 0.0, 0.0, 0.0]"
3,"Yes, the Oklahoma city bombing.","[1.0, 1.0, 0.0, 1.0, 1.0]"
4,They were dancing to Bolero.,"[0.0, 0.0, 1.0, 0.0, 0.0]"
...,...,...
1310,Two words came out of my mouth `` Piss off '' ...,"[1.0, 0.0, 0.0, 0.0, 0.0]"
1660,"but, within seconds i regained myself and pull...","[1.0, 0.0, 0.0, 0.0, 0.0]"
813,"When I was 10, we had to paint birds for art, ...","[1.0, 0.0, 0.0, 0.0, 1.0]"
1310,Two words came out of my mouth `` Piss off '' ...,"[1.0, 0.0, 0.0, 0.0, 0.0]"


In [31]:
# Shuffle all rows with a fixed seed and renumber the index from 0.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [32]:
from datasets import Dataset

# Convert the whole DataFrame into a Hugging Face Dataset (no column selection happens here).
dataset = Dataset.from_pandas(df)

# Print the dataset summary to check the features and the row count.
print(dataset)

Dataset({
    features: ['text', 'labels'],
    num_rows: 4968
})


In [33]:
def preprocess_function(examples):
    """Tokenize a batch of examples and attach float label tensors.

    Args:
        examples: batch mapping with a "text" entry and a "labels" entry.

    Returns:
        The tokenizer output (truncated / padded to 256 tokens) with an extra
        "labels" entry holding one float tensor per example.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )

    # Float targets are what a BCE-with-logits loss expects.
    float_labels = []
    for label in examples["labels"]:
        float_labels.append(torch.tensor(label, dtype=torch.float))
    encoded["labels"] = float_labels

    return encoded


encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset

Map:   0%|          | 0/4968 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 4968
})

In [34]:
print(len(encoded_dataset[0]["labels"]))

5


In [35]:
# Have the listed columns returned as torch tensors when rows are accessed.
encoded_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [36]:
print(encoded_dataset[0]["labels"])  # Should output a tensor of floats: e.g., tensor([1., 0., 0., ..., 0., 1.])


tensor([0., 1., 0., 0., 0.])


In [37]:
# Fix the seed so the 80/20 partition, and every metric computed on it, is reproducible.
dataset = encoded_dataset.train_test_split(test_size=0.2, seed=42)

In [38]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 3974
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 994
    })
})

In [39]:
def compute_metrics(eval_pred):
    """Score multi-label predictions for the Trainer's evaluation loop.

    Args:
        eval_pred: pair ``(logits, labels)``.

    Returns:
        Dict with "accuracy", "f1", "precision" and "recall"; the last three
        are macro-averaged and computed with ``zero_division=1``.
    """
    raw_logits, targets = eval_pred
    # Per-label probabilities, thresholded at 0.5 into a 0/1 matrix.
    probabilities = torch.sigmoid(torch.tensor(raw_logits)).numpy()
    binary_preds = (probabilities > 0.5).astype(int)

    # Shared settings for the three macro-averaged scores.
    macro = dict(average='macro', zero_division=1)
    return {
        # Accuracy on multi-label input (presumably exact match per row; confirm in the scikit-learn docs).
        "accuracy": accuracy_score(targets, binary_preds),
        "f1": f1_score(targets, binary_preds, **macro),
        "precision": precision_score(targets, binary_preds, **macro),
        "recall": recall_score(targets, binary_preds, **macro),
    }

In [40]:
class ModifiedModel(torch.nn.Module):
    """Wrap a sequence classifier and project its logits onto a new label space.

    Args:
        original_model: module called as ``original_model(input_ids, attention_mask=...)``
            whose output exposes a ``.logits`` attribute.
        num_labels: size of the new label space (default 5).
    """

    def __init__(self, original_model, num_labels=5):
        super().__init__()
        self.original_model = original_model
        # Take the wrapped head's width from its config when it has one; otherwise
        # fall back to 11, the width this wrapper was originally written for.
        config = getattr(original_model, "config", None)
        in_features = getattr(config, "num_labels", 11)
        self.fc = torch.nn.Linear(in_features, num_labels)  # maps old logits -> new labels

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the wrapped model and the projection layer.

        Returns:
            ``{"logits": ...}``, plus a ``"loss"`` entry (binary cross-entropy
            with logits against ``labels.float()``) when ``labels`` is given.
        """
        outputs = self.original_model(input_ids, attention_mask=attention_mask)
        logits = self.fc(outputs.logits)  # Shape: [batch_size, num_labels]
        if labels is not None:
            loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, labels.float())
            return {"loss": loss, "logits": logits}

        return {"logits": logits}

# Wrap the loaded classifier so its outputs are projected onto the 5 emotion labels.
modified_model = ModifiedModel(model)

In [41]:
from transformers import Trainer, TrainingArguments

# First fine-tuning run: 5 epochs, logging and evaluating once per epoch, no checkpoints.
training_args = TrainingArguments(
    # optimisation
    num_train_epochs=5,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # bookkeeping
    output_dir="./results",
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="no",
    save_steps=0,
    report_to="none",  # keep external loggers such as wandb switched off
)

# Fit on the "train" split and score the "test" split with compute_metrics.
trainer = Trainer(
    model=modified_model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

In [42]:
# Run the training loop with the Trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3517,0.285543,0.565392,0.803385,0.81406,0.797724
2,0.2776,0.261782,0.622736,0.827961,0.830237,0.827308
3,0.2542,0.243884,0.635815,0.833743,0.83362,0.83433
4,0.24,0.233659,0.644869,0.837618,0.865564,0.811831
5,0.2307,0.232176,0.649899,0.844321,0.849307,0.839648


TrainOutput(global_step=2485, training_loss=0.2708171483736403, metrics={'train_runtime': 1155.0315, 'train_samples_per_second': 17.203, 'train_steps_per_second': 2.151, 'total_flos': 0.0, 'train_loss': 0.2708171483736403, 'epoch': 5.0})

In [54]:
from transformers import Trainer, TrainingArguments

# Second run: the same `modified_model` instance is trained for 3 more epochs
# with otherwise identical settings.
training_args = TrainingArguments(
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # bookkeeping
    output_dir="./results",
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="no",
    save_steps=0,
    report_to="none",  # keep external loggers such as wandb switched off
)

# A new Trainer is built around the already-trained weights.
trainer = Trainer(
    model=modified_model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)
# Run the additional epochs.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2201,0.232079,0.65996,0.851419,0.856283,0.848855
2,0.2056,0.228201,0.669014,0.854625,0.854538,0.855317
3,0.1968,0.218892,0.687123,0.859193,0.868943,0.85002


TrainOutput(global_step=1491, training_loss=0.20750129438741827, metrics={'train_runtime': 693.5528, 'train_samples_per_second': 17.19, 'train_steps_per_second': 2.15, 'total_flos': 0.0, 'train_loss': 0.20750129438741827, 'epoch': 3.0})

## Using the trained model to get predictions on the train and dev sets

In [67]:
# Load the dev file whose emotion labels are to be predicted (absolute Colab path).
df_val = pd.read_csv('/content/eng_a.csv')
df_val

Unnamed: 0,id,text,Anger,Fear,Joy,Sadness,Surprise
0,eng_dev_track_a_00001,"My mouth fell open `` No, no, no... I..",,,,,
1,eng_dev_track_a_00002,You can barely make out your daughter's pale f...,,,,,
2,eng_dev_track_a_00003,But after blinking my eyes for a few times lep...,,,,,
3,eng_dev_track_a_00004,Slowly rising to my feet I came to the conclus...,,,,,
4,eng_dev_track_a_00005,I noticed this months after moving in and doin...,,,,,
...,...,...,...,...,...,...,...
111,eng_dev_track_a_00112,"""ARcH stop your progression.",,,,,
112,eng_dev_track_a_00113,"This 'star', starts to move across the sky.",,,,,
113,eng_dev_track_a_00114,and my feet hurt.,,,,,
114,eng_dev_track_a_00115,so i cried my eyes out and did the drawing.,,,,,


In [68]:
# Keep only the text column: the id and the emotion columns are not fed to the model.
df_val = df_val.drop(columns=['id','Joy','Fear','Anger','Sadness','Surprise'])
df_val

Unnamed: 0,text
0,"My mouth fell open `` No, no, no... I.."
1,You can barely make out your daughter's pale f...
2,But after blinking my eyes for a few times lep...
3,Slowly rising to my feet I came to the conclus...
4,I noticed this months after moving in and doin...
...,...
111,"""ARcH stop your progression."
112,"This 'star', starts to move across the sky."
113,and my feet hurt.
114,so i cried my eyes out and did the drawing.


In [69]:
# Convert the text-only DataFrame into a Hugging Face Dataset.
val_dataset = Dataset.from_pandas(df_val)

# Print the dataset summary to check the features and the row count.
print(val_dataset)

Dataset({
    features: ['text'],
    num_rows: 116
})


In [70]:
def preprocess_function_val(examples):
    """Tokenize a batch of unlabeled examples.

    Args:
        examples: batch mapping with a "text" entry.

    Returns:
        The tokenizer output, truncated / padded to 256 tokens. No labels are
        attached.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )


encoded_dataset_val = val_dataset.map(preprocess_function_val, batched=True)
encoded_dataset_val

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 116
})

In [71]:
# Have the model input columns returned as torch tensors when rows are accessed.
encoded_dataset_val.set_format("torch", columns=["input_ids", "attention_mask"])
encoded_dataset_val

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 116
})

In [72]:
# Inference-only setup: this Trainer is used solely to run the dev set through the model.
val_training_args = TrainingArguments(
    output_dir="./results",
    report_to="none"   # Disable wandb logging if not needed
)
val_trainer = Trainer(
    model=modified_model,  # the fine-tuned wrapper
    args=val_training_args,
    # `tokenizer=` is deprecated and triggers a warning; `processing_class` is its replacement.
    processing_class=tokenizer
)

# Run predictions; `.predictions` holds the logits returned by the model.
predictions = val_trainer.predict(encoded_dataset_val)
pred_logits = predictions.predictions

  val_trainer = Trainer(


In [73]:
# Sigmoid each logit and threshold at 0.5 (the same cut-off compute_metrics uses) to get 0/1 labels.
pred_labels = (torch.sigmoid(torch.tensor(pred_logits)) > 0.5).int()

In [74]:
pred_labels

tensor([[0, 1, 0, 0, 1],
        [0, 1, 0, 0, 1],
        [0, 1, 0, 0, 1],
        [0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [0, 1, 0, 1, 0],
        [0, 0, 1, 0, 0],
        [0, 1, 0, 1, 0],
        [0, 0, 1, 0, 0],
        [0, 1, 0, 1, 0],
        [1, 0, 0, 1, 0],
        [0, 1, 0, 0, 1],
        [0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0],
        [1, 1, 0, 0, 0],
        [0, 1, 0, 0, 1],
        [0, 0, 0, 0, 1],
        [0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 1, 0, 1, 0],
        [1, 0, 0, 0, 0],
        [1, 1, 0, 0, 1],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 0, 1],
        [0, 1, 0, 0, 1],
        [0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 1, 0, 1, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0],
        [0, 1, 0, 1, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0],


In [75]:
# Name the prediction columns in the same order the training label vectors were built in.
# Note: this rebinds `df_val`, replacing the text-only frame loaded earlier.
df_val = pd.DataFrame(pred_labels, columns=['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise'])
df_val

Unnamed: 0,Anger,Fear,Joy,Sadness,Surprise
0,0,1,0,0,1
1,0,1,0,0,1
2,0,1,0,0,1
3,0,1,0,1,0
4,0,0,0,0,0
...,...,...,...,...,...
111,0,0,0,0,1
112,0,0,1,0,1
113,0,1,0,1,0
114,0,0,0,1,0


In [76]:
# Rebuild the ids from row position (1-based, zero-padded to 5 digits).
# NOTE(review): this assumes the dev file's ids are sequential and in file order; confirm.
submission_ids = [f'eng_dev_track_a_{i+1:05}' for i in range(len(df_val))]
# Drop any 'id' column left by a previous run so the cell can be re-executed;
# DataFrame.insert raises if the column already exists.
df_val = df_val.drop(columns='id', errors='ignore')
df_val.insert(loc=0, column='id', value=submission_ids)
df_val

Unnamed: 0,id,Anger,Fear,Joy,Sadness,Surprise
0,eng_dev_track_a_00001,0,1,0,0,1
1,eng_dev_track_a_00002,0,1,0,0,1
2,eng_dev_track_a_00003,0,1,0,0,1
3,eng_dev_track_a_00004,0,1,0,1,0
4,eng_dev_track_a_00005,0,0,0,0,0
...,...,...,...,...,...,...
111,eng_dev_track_a_00112,0,0,0,0,1
112,eng_dev_track_a_00113,0,0,1,0,1
113,eng_dev_track_a_00114,0,1,0,1,0
114,eng_dev_track_a_00115,0,0,0,1,0


In [77]:
# Write the predictions to CSV without the positional index column.
df_val.to_csv('bal3.csv', index=False)