In [None]:
# !pip install --quiet datasets

In [None]:
import torch
import pandas as pd
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import os
from accelerate import Accelerator
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Checkpoint shared by the tokenizer and the classifier.
MODEL_NAME = "cardiffnlp/twitter-roberta-large-emotion-latest"
# Use the first GPU when one is visible, otherwise fall back to CPU instead of
# failing on a hard-coded "cuda:0".
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# A tokenizer holds no weights to place on a device, so no device_map is passed.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, device_map=DEVICE)
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [None]:
# Step 1: switch off gradient tracking for every weight of the loaded model.
for frozen_weight in model.parameters():
    frozen_weight.requires_grad_(False)
# Step 2: switch it back on for the classification head only, so that head
# stays trainable while the rest of the network is frozen.
for head_weight in model.classifier.parameters():
    head_weight.requires_grad_(True)

In [None]:
# Load the labelled CSV (relative to the working directory); later cells turn
# this frame into the train/test dataset.
df = pd.read_csv('eng_augmented.csv')

In [None]:
# Sanity check: Python type of a single label cell (first row, 'Anger' column).
type(df.iloc[0]['Anger'])

numpy.int64

In [None]:
# Emotion columns of the CSV, in the order used for every label vector built
# here: position 0 = Anger, 1 = Fear, 2 = Joy, 3 = Sadness, 4 = Surprise.
LABEL_COLUMNS = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']

# One list of floats per row (multi-hot target). The conversion is done on the
# whole block of columns at once rather than row by row with `df.apply`.
label_rows = df[LABEL_COLUMNS].astype(float).values.tolist()
# Wrapping in a Series keeps each inner list as a single cell of the new column.
df['labels'] = pd.Series(label_rows, index=df.index)

# Drop the identifier and the per-emotion columns now folded into `labels`.
df = df.drop(columns=['id'] + LABEL_COLUMNS)
df

Unnamed: 0,text,labels
0,But not very happy.,"[0.0, 0.0, 1.0, 1.0, 0.0]"
1,Well she's not gon na last the whole song like...,"[0.0, 0.0, 1.0, 0.0, 0.0]"
2,She sat at her Papa's recliner sofa only to mo...,"[0.0, 0.0, 0.0, 0.0, 0.0]"
3,"Yes, the Oklahoma city bombing.","[1.0, 1.0, 0.0, 1.0, 1.0]"
4,They were dancing to Bolero.,"[0.0, 0.0, 1.0, 0.0, 0.0]"
...,...,...
2763,"""Yeah, but did you just find that?","[0.0, 1.0, 0.0, 0.0, 1.0]"
2764,I did as little as possible with my right hand...,"[0.0, 0.0, 0.0, 0.0, 0.0]"
2765,"Okay that sucks, right?","[1.0, 0.0, 0.0, 1.0, 0.0]"
2766,"The spark leaped through his body into mine, a...","[0.0, 1.0, 0.0, 0.0, 1.0]"


In [None]:
from datasets import Dataset

# Convert the pandas frame built above into a Hugging Face `Dataset`.
# The whole frame is passed; no column selection happens in this call.
dataset = Dataset.from_pandas(df)

# Print the dataset summary (column names and row count) as a quick check.
print(dataset)

Dataset({
    features: ['text', 'labels'],
    num_rows: 2768
})


In [None]:
def preprocess_function(examples, max_length=256):
    """Tokenize a batch of texts and, when present, attach float label vectors.

    Args:
        examples: Batch mapping with a ``"text"`` entry (list of strings) and,
            optionally, a ``"labels"`` entry holding one sequence of numbers
            per example. Batches without ``"labels"`` (e.g. a prediction set)
            are tokenized only.
        max_length: Length every sequence is truncated/padded to. Defaults to
            256, the value that used to be hard-coded.

    Returns:
        The tokenizer output; a ``"labels"`` entry with one float tensor per
        example is added when the batch carries labels.
    """
    # Pad/truncate everything to the same fixed length.
    inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=max_length)

    # Labels are converted to float because the loss used by the model in this
    # notebook (binary cross-entropy with logits) works on float targets.
    if "labels" in examples:
        inputs["labels"] = [torch.tensor(label, dtype=torch.float) for label in examples["labels"]]

    return inputs

# Apply the tokenization/label conversion to the whole dataset, batch by batch,
# then display the resulting dataset summary.
encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset

Map:   0%|          | 0/2768 [00:00<?, ? examples/s]

Map: 100%|██████████| 2768/2768 [00:00<00:00, 7444.56 examples/s]


Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 2768
})

In [None]:
# Sanity check: number of entries in the first example's label vector.
print(len(encoded_dataset[0]["labels"]))

5


In [None]:
# Ask the dataset to hand back these three columns as torch tensors
# (the inputs and targets the model's forward() consumes).
encoded_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
# Sanity check after set_format: look at the first example's label vector.
print(encoded_dataset[0]["labels"])  # Should output a tensor of floats: e.g., tensor([1., 0., 0., ..., 0., 1.])


tensor([0., 0., 1., 1., 0.])


In [None]:
# Hold out 20% of the rows for evaluation. The split shuffles the data, so a
# fixed seed is passed to make the partition (and therefore the reported
# metrics) repeatable across kernel restarts.
SPLIT_SEED = 42
dataset = encoded_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

In [None]:
# Display the train/test split produced by the previous cell.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 2214
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 554
    })
})

In [None]:
def compute_metrics(eval_pred):
    """Score multi-label predictions for the Trainer's evaluation loop.

    Args:
        eval_pred: Pair ``(logits, labels)``; both are unpacked positionally.

    Returns:
        Dict with ``"accuracy"``, ``"f1"``, ``"precision"`` and ``"recall"``.
        F1, precision and recall are macro-averaged with ``zero_division=1``;
        accuracy is computed on the full label matrix.
    """
    raw_logits, gold = eval_pred

    # Each label gets its own sigmoid probability (multi-label setting); a
    # label counts as predicted when that probability exceeds 0.5.
    probabilities = torch.tensor(raw_logits).sigmoid().numpy()
    hard_preds = (probabilities > 0.5).astype(int)

    # Shared options for the three macro-averaged scores.
    macro_options = {"average": "macro", "zero_division": 1}

    return {
        "accuracy": accuracy_score(gold, hard_preds),
        "f1": f1_score(gold, hard_preds, **macro_options),
        "precision": precision_score(gold, hard_preds, **macro_options),
        "recall": recall_score(gold, hard_preds, **macro_options),
    }

In [None]:
class ModifiedModel(torch.nn.Module):
    """Wrap a sequence classifier and project its logits onto the task labels.

    The wrapped model's logits are fed through one extra linear layer so that
    the output width matches the number of labels of the downstream task.

    Args:
        original_model: Module called as ``original_model(input_ids,
            attention_mask=...)`` whose result exposes ``.logits``.
        num_labels: Width of the new output layer. Defaults to 5, the value
            that used to be hard-coded.
    """

    def __init__(self, original_model, num_labels=5):
        super().__init__()
        self.original_model = original_model
        # Take the input width from the wrapped model's config when it has
        # one; otherwise keep the previously hard-coded width of 11.
        base_config = getattr(original_model, "config", None)
        in_features = getattr(base_config, "num_labels", 11)
        self.fc = torch.nn.Linear(in_features, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Return projected logits and, when ``labels`` is given, the BCE loss.

        Returns:
            ``{"logits": ...}`` without labels, ``{"loss": ..., "logits": ...}``
            with labels. The loss is binary cross-entropy with logits, with
            the labels cast to float.
        """
        outputs = self.original_model(input_ids, attention_mask=attention_mask)
        # [batch, wrapped-head width] -> [batch, num_labels]
        logits = self.fc(outputs.logits)
        if labels is None:
            return {"logits": logits}

        loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, labels.float())
        return {"loss": loss, "logits": logits}

# Wrap the loaded classifier; this wrapped module is what gets trained and
# used for prediction in the cells below.
modified_model = ModifiedModel(model)

In [None]:
from transformers import TrainingArguments, Trainer

# Run configuration, grouped by concern.
training_args = TrainingArguments(
    # Output handling: nothing is checkpointed and no external logger is used.
    output_dir="./results",
    save_strategy="no",
    save_steps=0,
    report_to="none",
    # Schedule: 5 epochs, logging and evaluating once per epoch.
    num_train_epochs=5,
    logging_strategy="epoch",
    eval_strategy="epoch",
    # Optimisation settings.
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Trainer over the wrapped model, using the 80/20 split made earlier and the
# multi-label metric function defined above.
trainer = Trainer(
    args=training_args,
    model=modified_model,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

In [None]:
# Train the model with the Trainer configured above; the per-epoch table and
# the final TrainOutput are shown as this cell's output.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4711,0.362085,0.456679,0.686606,0.74764,0.643282
2,0.3616,0.345459,0.458484,0.71973,0.740677,0.702601
3,0.3461,0.338912,0.471119,0.719221,0.747496,0.699789
4,0.3374,0.332839,0.471119,0.711509,0.754147,0.677583
5,0.3344,0.33279,0.467509,0.720433,0.755472,0.692273




TrainOutput(global_step=350, training_loss=0.37011222294398716, metrics={'train_runtime': 266.1407, 'train_samples_per_second': 41.595, 'train_steps_per_second': 1.315, 'total_flos': 0.0, 'train_loss': 0.37011222294398716, 'epoch': 5.0})

In [None]:
# This cell repeats the Trainer setup and the train() call of the preceding
# cells on the same `modified_model`. Run unconditionally, it would train the
# already-trained model for 5 more epochs on every "Run All", so the repeat is
# opt-in: set the flag to True only to continue training on purpose.
CONTINUE_TRAINING = False

if CONTINUE_TRAINING:
    from transformers import TrainingArguments, Trainer

    # Same arguments as the first training run.
    training_args = TrainingArguments(
        output_dir="./results",
        save_strategy="no",
        save_steps=0,
        logging_strategy="epoch",
        eval_strategy="epoch",
        learning_rate=2e-4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
        report_to="none"   # Disable wandb logging if not needed
    )

    trainer = Trainer(
        model=modified_model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        compute_metrics=compute_metrics
    )
    # Continue training from the current weights of `modified_model`.
    trainer.train()

## Using the trained model to get predictions on the train and dev sets

In [None]:
# Location of the dev-set CSV. The original absolute path stays as the default
# so existing setups keep working; set the TBED_DEV_CSV environment variable to
# point at the file on another machine.
DEV_CSV_PATH = os.environ.get('TBED_DEV_CSV', '/DATAT/Projects/TBED/eng_a.csv')
df_val = pd.read_csv(DEV_CSV_PATH)
df_val

Unnamed: 0,id,text,Anger,Fear,Joy,Sadness,Surprise
0,eng_dev_track_a_00001,"My mouth fell open `` No, no, no... I..",,,,,
1,eng_dev_track_a_00002,You can barely make out your daughter's pale f...,,,,,
2,eng_dev_track_a_00003,But after blinking my eyes for a few times lep...,,,,,
3,eng_dev_track_a_00004,Slowly rising to my feet I came to the conclus...,,,,,
4,eng_dev_track_a_00005,I noticed this months after moving in and doin...,,,,,
...,...,...,...,...,...,...,...
111,eng_dev_track_a_00112,"""ARcH stop your progression.",,,,,
112,eng_dev_track_a_00113,"This 'star', starts to move across the sky.",,,,,
113,eng_dev_track_a_00114,and my feet hurt.,,,,,
114,eng_dev_track_a_00115,so i cried my eyes out and did the drawing.,,,,,


In [None]:
# Drop the id and the five emotion columns from the dev frame, then display
# what is left.
# NOTE(review): `df_val` is rebound here, so the ids read from the file are no
# longer available to later cells.
df_val = df_val.drop(columns=['id','Joy','Fear','Anger','Sadness','Surprise'])
df_val

Unnamed: 0,text
0,"My mouth fell open `` No, no, no... I.."
1,You can barely make out your daughter's pale f...
2,But after blinking my eyes for a few times lep...
3,Slowly rising to my feet I came to the conclus...
4,I noticed this months after moving in and doin...
...,...
111,"""ARcH stop your progression."
112,"This 'star', starts to move across the sky."
113,and my feet hurt.
114,so i cried my eyes out and did the drawing.


In [None]:
# Convert the dev frame into a Hugging Face `Dataset`. The whole frame is
# passed; no column selection happens in this call.
val_dataset = Dataset.from_pandas(df_val)

# Print the dataset summary (column names and row count) as a quick check.
print(val_dataset)

Dataset({
    features: ['text'],
    num_rows: 116
})


In [None]:
def preprocess_function_val(examples):
    """Tokenize a batch of texts for prediction; no labels are attached.

    Args:
        examples: Batch mapping with a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch, truncated/padded to 256 tokens.
    """
    texts = examples["text"]
    # Same tokenizer settings as the training-time preprocessing.
    tokenizer_options = dict(truncation=True, padding="max_length", max_length=256)
    return tokenizer(texts, **tokenizer_options)

# Tokenize the dev set batch by batch, then display the resulting summary.
encoded_dataset_val = val_dataset.map(preprocess_function_val, batched=True)
encoded_dataset_val

Map: 100%|██████████| 116/116 [00:00<00:00, 3942.33 examples/s]


Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 116
})

In [None]:
# Ask the dev dataset to hand back the two model inputs as torch tensors
# (there is no labels column here), then display the dataset summary.
encoded_dataset_val.set_format("torch", columns=["input_ids", "attention_mask"])
encoded_dataset_val

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 116
})

In [None]:
# Prediction-only setup: apart from the output directory and disabled external
# reporting, every TrainingArguments value is left at its library default.
val_training_args = TrainingArguments(report_to="none", output_dir="./results")

# Separate Trainer around the already-trained wrapped model, used here only
# for its predict() loop.
val_trainer = Trainer(
    args=val_training_args,
    tokenizer=tokenizer,
    model=modified_model,
)

# Forward the tokenized dev set through the model and keep the raw logits.
predictions = val_trainer.predict(encoded_dataset_val)
pred_logits = predictions.predictions

In [None]:
# Turn logits into 0/1 decisions per label: sigmoid, then threshold at 0.5
# (the same rule compute_metrics applies during evaluation).
pred_labels = (torch.sigmoid(torch.tensor(pred_logits)) > 0.5).int()

In [None]:
# Display the binary prediction matrix (one row per dev example).
pred_labels

tensor([[0, 1, 0, 0, 1],
        [0, 1, 0, 0, 1],
        [0, 1, 0, 0, 1],
        [0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [0, 1, 0, 1, 0],
        [0, 0, 1, 0, 0],
        [0, 1, 0, 1, 0],
        [0, 0, 1, 0, 0],
        [0, 1, 0, 1, 0],
        [0, 1, 0, 1, 1],
        [0, 1, 0, 0, 1],
        [0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0],
        [1, 1, 0, 1, 0],
        [0, 1, 0, 0, 1],
        [0, 1, 0, 1, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 1, 0, 1, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 1],
        [0, 1, 0, 1, 0],
        [0, 0, 0, 0, 1],
        [0, 1, 0, 0, 1],
        [0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 1, 0, 0, 1],
        [0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 1, 0],
        [0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 1, 0, 1, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0],


In [None]:
# Put the predictions in a frame whose columns follow the same emotion order
# used when the training label vectors were built (Anger, Fear, Joy, Sadness,
# Surprise).
# NOTE(review): this rebinds `df_val`, which until now held the dev texts.
df_val = pd.DataFrame(pred_labels, columns=['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise'])
df_val

Unnamed: 0,Anger,Fear,Joy,Sadness,Surprise
0,0,1,0,0,1
1,0,1,0,0,1
2,0,1,0,0,1
3,0,1,0,0,0
4,0,1,0,0,0
...,...,...,...,...,...
111,0,1,0,0,1
112,0,0,1,0,1
113,0,1,0,1,0
114,0,1,0,1,0


In [None]:
# Synthesise the submission ids as eng_dev_track_a_00001, _00002, ... in row
# order.
# NOTE(review): this assumes the dev file lists its ids sequentially in exactly
# this order; confirm against the `id` column of the source CSV.
submission_ids = [f'eng_dev_track_a_{i+1:05}' for i in range(len(df_val))]

# Insert the column once with its final values. If the cell is re-run and the
# column already exists, overwrite it instead of letting `insert` raise.
if 'id' in df_val.columns:
    df_val['id'] = submission_ids
else:
    df_val.insert(loc=0, column='id', value=submission_ids)
df_val

Unnamed: 0,id,Anger,Fear,Joy,Sadness,Surprise
0,eng_dev_track_a_00001,0,1,0,0,1
1,eng_dev_track_a_00002,0,1,0,0,1
2,eng_dev_track_a_00003,0,1,0,0,1
3,eng_dev_track_a_00004,0,1,0,0,0
4,eng_dev_track_a_00005,0,1,0,0,0
...,...,...,...,...,...,...
111,eng_dev_track_a_00112,0,1,0,0,1
112,eng_dev_track_a_00113,0,0,1,0,1
113,eng_dev_track_a_00114,0,1,0,1,0
114,eng_dev_track_a_00115,0,1,0,1,0


In [None]:
# Write the id + prediction columns to a CSV in the working directory, without
# the pandas index column.
df_val.to_csv('my_pred_after_FCLayer_and_lastlayer_twitterRoberta.csv', index=False)