LOAD LIBRARIES

In [3]:
import pandas as pd
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModel
from datasets import Dataset, DatasetDict
import evaluate

In [2]:
!pip install evaluate
!pip install datasets


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0

LOAD DATA AND PREPROCESS

In [4]:
from google.colab import files

In [5]:
# Upload the input files through Colab's upload helper; the recorded run saved
# train.csv and test.csv, which the next cells read by those exact names.
uploaded = files.upload()

Saving test.csv to test.csv
Saving train.csv to train.csv


In [6]:
# Read the two uploaded CSV files into DataFrames (training set first, then test set)
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [7]:
# Give test_df an integer 0 in every emotion column so it can flow through the
# same label-building step as the training data.
emotion_columns = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
                   'optimism', 'pessimism', 'sadness', 'surprise', 'trust']
test_df = test_df.assign(**dict.fromkeys(emotion_columns, 0))

In [8]:
def _normalise_tweets(tweets):
    """Lowercase a Series of tweets and delete every character that is not a-z or whitespace."""
    return tweets.str.lower().replace(r'[^a-z\s]', '', regex=True)


# Lowercase the text and keep only the letters a-z and whitespace
# (digits, punctuation and every other symbol are removed).
train_df['Tweet'] = _normalise_tweets(train_df['Tweet'])
test_df['Tweet'] = _normalise_tweets(test_df['Tweet'])

In [9]:
# Gather the eleven emotion columns of a row into one ordered list
def get_labels(row):
    """Return the row's emotion values as a list, always in the same column order.

    `row` may be anything indexable by column name (e.g. a DataFrame row).
    """
    ordered_emotions = ('anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
                        'optimism', 'pessimism', 'sadness', 'surprise', 'trust')
    return [row[emotion_name] for emotion_name in ordered_emotions]

# Store the per-row label list in a new 'labels' column on both frames
for frame in (train_df, test_df):
    frame['labels'] = frame.apply(get_labels, axis=1)

In [10]:
# Split training data into train and validation sets.
# test_size=0.1 holds out 10% of the rows for validation; the fixed random_state
# makes the shuffle, and therefore the split, repeatable across runs.
train_data, val_data = train_test_split(train_df, test_size=0.1, random_state=42)

CREATE HUGGINGFACE DATASET

In [11]:
# Convert to Hugging Face Dataset format, keeping only the text and label columns.
# preserve_index=False prevents the shuffled pandas index left behind by
# train_test_split from being stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_data[['Tweet', 'labels']], preserve_index=False)
val_dataset = Dataset.from_pandas(val_data[['Tweet', 'labels']], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[['Tweet', 'labels']], preserve_index=False)

In [12]:
# Bundle the three splits into one DatasetDict keyed by split name
emotion_datasets = DatasetDict(
    train=train_dataset,
    validation=val_dataset,
    test=test_dataset,
)

TOKENIZATION

In [13]:
# Load the tokenizer that matches the `roberta-base` checkpoint
tokenizer = AutoTokenizer.from_pretrained("roberta-base")


def tokenize_function(examples):
    """Tokenize a batch of tweets, padding or truncating each one to 128 tokens."""
    tweets = examples["Tweet"]
    return tokenizer(tweets, max_length=128, truncation=True, padding="max_length")


# Tokenize every split in batches
tokenized_datasets = emotion_datasets.map(tokenize_function, batched=True)

# Return PyTorch tensors, exposing only the columns listed here
tensor_columns = ["input_ids", "attention_mask", "labels"]
tokenized_datasets.set_format(type="torch", columns=tensor_columns)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Map:   0%|          | 0/6951 [00:00<?, ? examples/s]

Map:   0%|          | 0/773 [00:00<?, ? examples/s]

Map:   0%|          | 0/3259 [00:00<?, ? examples/s]

CREATE MODEL

In [14]:
# Multi-label emotion classifier: a pre-trained RoBERTa encoder plus a linear head
class EmotionModel(nn.Module):
    """Pre-trained `roberta-base` encoder followed by one linear output per emotion.

    Args:
        num_labels: number of independent binary labels to predict (default 11).
    """

    def __init__(self, num_labels=11):
        super(EmotionModel, self).__init__()
        # The attribute is called `bert` for historical reasons; it holds the
        # `roberta-base` checkpoint.
        self.bert = AutoModel.from_pretrained("roberta-base")
        # Linear head mapping the pooled encoder output to one score per label
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and the classification head.

        Args:
            input_ids: token ids passed straight to the encoder.
            attention_mask: attention mask passed straight to the encoder.
            labels: optional multi-hot targets; cast to float for the loss.

        Returns:
            dict with
              "loss":   binary cross-entropy averaged over all labels, or None
                        when `labels` is not given;
              "logits": per-label sigmoid probabilities (the key name is kept for
                        the Trainer; the values are probabilities in [0, 1], so
                        callers threshold them at 0.5).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Pooled sequence representation produced by the encoder's pooler layer.
        # The load-time warning reports the pooler weights as newly initialized,
        # so they are learned during fine-tuning together with the head.
        pooled_output = outputs.pooler_output
        logits = self.classifier(pooled_output)
        output = self.sigmoid(logits)

        loss = None
        if labels is not None:
            # BCEWithLogitsLoss fuses the sigmoid with the cross-entropy, which is
            # numerically stable; applying BCELoss to already-squashed outputs
            # saturates for confident predictions.
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels.float())

        return {"loss": loss, "logits": output}

INITIALIZE MODEL

In [15]:
# Initialize the model with its default of 11 output labels.
# The constructor loads the pre-trained `roberta-base` encoder weights.
model = EmotionModel()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TRAINING ARGUMENTS

In [16]:
# Training configuration consumed by the Trainer
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate on the validation split at the end of every epoch.
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers releases.
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Load the F1 metric from the `evaluate` library.
# NOTE(review): compute_metrics below calls sklearn's f1_score rather than this
# object, so `f1_metric` appears to be unused in the visible cells - confirm.
f1_metric = evaluate.load("f1")

def compute_metrics(pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    `pred` unpacks into (model outputs, reference labels). The model's "logits"
    entry already holds sigmoid probabilities, so 0.5 is used as the decision
    threshold for every label.
    """
    scores, references = pred
    binary_preds = (scores > 0.5).astype(int)

    # F1 is computed per label column and then averaged with equal weight
    macro_f1 = f1_score(
        y_true=references.tolist(),
        y_pred=binary_preds.tolist(),
        average='macro',
    )
    return {"f1_score": macro_f1}



Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

DEFINE TRAINER

In [17]:
# Build the Trainer (training itself starts in the next cell): it fits on the
# "train" split, evaluates on "validation", and reports whatever
# compute_metrics returns.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)


TRAINING AND VALIDATION

In [18]:
# Run fine-tuning; the recorded output shows validation loss and F1 reported
# once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.329646,0.374022
2,0.357200,0.313831,0.51683
3,0.276800,0.306406,0.54067


TrainOutput(global_step=1305, training_loss=0.30030041106359256, metrics={'train_runtime': 511.7417, 'train_samples_per_second': 40.749, 'train_steps_per_second': 2.55, 'total_flos': 0.0, 'train_loss': 0.30030041106359256, 'epoch': 3.0})

PREDICTIONS ON TEST SET

In [19]:
# Score the test split; the model's "logits" output holds sigmoid probabilities
test_output = trainer.predict(tokenized_datasets["test"])
predictions = test_output.predictions

# Turn probabilities into 0/1 flags with a 0.5 threshold
predicted_labels = (predictions > 0.5).astype(int)

SUBMISSION FILE

In [20]:
# Assemble the submission: the ID column followed by one 0/1 column per emotion
label_columns = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
                 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']
submission_df = test_df[["ID"]].copy()
submission_df[label_columns] = predicted_labels

# Write without the pandas index so the file contains only ID + label columns
submission_df.to_csv("submission.csv", index=False)

In [21]:
# Download the saved submission file to the local machine.
# `files` is already imported from google.colab in the data-loading section,
# so it is not re-imported here.
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>