In [1]:
!pip install transformers datasets

import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [2]:
!pip install emoji==0.6.0


Collecting emoji==0.6.0
  Downloading emoji-0.6.0.tar.gz (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-0.6.0-py3-none-any.whl size=49719 sha256=cb9590c1b0d866d573fe3c23ad8564b10bfd3c98f6672dd717441edd97bb0c92
  Stored in directory: /root/.cache/pip/wheels/b7/23/31/f9b93f25b95da9b91729c4cd5f35a2b692ab06f688f6759630
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.6.0


In [3]:
def load_data(file_path):
    """Read a tab-separated annotation file into a (text, label) DataFrame.

    Every line is stripped and split on tab characters. A line is kept only
    when it yields exactly three fields and its second field is one of
    "positive", "negative" or "neutral". The first field is ignored.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A pandas DataFrame with the columns "text" (third field) and
        "label" (second field), one row per accepted line, in file order.
    """
    accepted_labels = ("positive", "negative", "neutral")
    rows = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            # Anything that is not exactly three fields is skipped.
            if len(fields) != 3:
                continue
            _, sentiment, tweet = fields
            if sentiment in accepted_labels:
                rows.append((tweet, sentiment))
    return pd.DataFrame(rows, columns=["text", "label"])

# Input file: load_data keeps the 2nd tab-separated field as the label and the
# 3rd as the tweet text, so make sure this file has tweet texts.
DATA_PATH = "/content/SemEval2017-task4-dev.subtask-A.english.INPUT.txt"
# Using only 2000 samples for speed
N_SAMPLES = 2000
SEED = 42

df_all = load_data(DATA_PATH)
if df_all.empty:
    raise ValueError(f"No labeled rows could be read from {DATA_PATH}")

# DataFrame.sample raises when n is larger than the number of rows, so cap the
# request at what is actually available.
df = df_all.sample(n=min(N_SAMPLES, len(df_all)), random_state=SEED)


In [4]:
# String label <-> integer class id used by the classifiers.
label2id = {"positive": 0, "negative": 1, "neutral": 2}
id2label = {v: k for k, v in label2id.items()}

# Only encode while the column still holds the string labels. Mapping an
# already-encoded column would turn every value into NaN, which made this cell
# unsafe to run twice.
if df["label"].isin(list(label2id)).all():
    df["label"] = df["label"].map(label2id)
assert df["label"].isin(list(id2label)).all(), "unexpected values in df['label']"

# 80/20 train/validation split with a fixed seed.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42
)

# Wrap the splits as Hugging Face datasets with "text" and "label" columns.
train_dataset = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels.tolist()})
val_dataset = Dataset.from_dict({"text": val_texts.tolist(), "label": val_labels.tolist()})


In [5]:
def compute_metrics(pred):
    """Score a batch of predictions with accuracy and weighted F1.

    Args:
        pred: A pair that unpacks into (logits, labels). The predicted class
            of each row is the argmax of the logits along axis 1.

    Returns:
        A dict with the keys "accuracy" and "f1" (F1 averaged with
        average="weighted").
    """
    scores, gold = pred
    predicted = np.argmax(scores, axis=1)
    accuracy = accuracy_score(gold, predicted)
    weighted_f1 = f1_score(gold, predicted, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}


In [9]:
def train_model(model_name, max_length=128):
    """Fine-tune one pretrained checkpoint and return its validation metrics.

    Uses the notebook-level `train_dataset` / `val_dataset` (columns "text"
    and "label"), the `label2id` / `id2label` mappings and `compute_metrics`.

    Args:
        model_name: Checkpoint identifier passed to `from_pretrained`
            (for example "bert-base-uncased").
        max_length: Maximum number of tokens kept per example. An explicit
            value is required because some tokenizers define no default
            maximum, in which case `truncation=True` alone truncates nothing.

    Returns:
        The dict returned by `Trainer.evaluate()` on the validation set after
        training. Because `load_best_model_at_end=True`, it is computed with
        the checkpoint that scored the best "f1" and contains keys such as
        "eval_accuracy" and "eval_f1".
    """
    print(f"\n Training model: {model_name}\n")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Store the label names in the model config so saved checkpoints report
    # readable labels instead of LABEL_0 / LABEL_1 / LABEL_2.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
    )

    def tokenize_fn(batch):
        # No padding here: DataCollatorWithPadding pads each training batch to
        # its own longest sequence, so padding whole map() batches up front
        # would only add wasted tokens.
        return tokenizer(batch["text"], truncation=True, max_length=max_length)

    train_data = train_dataset.map(tokenize_fn, batched=True)
    val_data = val_dataset.map(tokenize_fn, batched=True)

    args = TrainingArguments(
        output_dir=f"./{model_name.replace('/', '_')}_output",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Keep disk usage bounded; the best checkpoint is still retained for
        # load_best_model_at_end.
        save_total_limit=1,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        weight_decay=0.01,
        logging_dir="./logs",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        # Disable external experiment trackers explicitly instead of relying
        # on the deprecated WANDB_DISABLED environment variable.
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_data,
        eval_dataset=val_data,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics
    )

    trainer.train()
    return trainer.evaluate()


In [10]:
import os

# Set the WANDB_DISABLED flag for this process before any Trainer is created.
os.environ.update(WANDB_DISABLED="true")


In [11]:
# Display name -> checkpoint identifier for every model in the comparison.
models = dict(
    BERT="bert-base-uncased",
    DistilBERT="distilbert-base-uncased",
    BERTweet="vinai/bertweet-base",
)

# Fine-tune the checkpoints one after another, collecting the metrics dict
# that train_model returns under each display name.
results = {}
for name, path in models.items():
    results.update({name: train_model(path)})

# Print final comparison
print("\n Model Comparison (Accuracy & F1 Score):")
for name, res in results.items():
    print(f"{name}: Accuracy = {res['eval_accuracy']:.4f}, F1 = {res['eval_f1']:.4f}")



🔍 Training model: bert-base-uncased



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.822743,0.61,0.555719
2,No log,0.701071,0.6825,0.676872
3,No log,0.759886,0.6825,0.675617
4,No log,0.921878,0.7025,0.696255
5,0.453000,1.075363,0.6975,0.698446
6,0.453000,1.24905,0.715,0.714258
7,0.453000,1.453124,0.695,0.686365
8,0.453000,1.494853,0.715,0.713765
9,0.453000,1.548358,0.71,0.70426
10,0.031400,1.522961,0.72,0.718685



🔍 Training model: distilbert-base-uncased



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.758335,0.6575,0.651257
2,No log,0.714727,0.66,0.64869
3,No log,0.718666,0.71,0.708347
4,No log,0.824036,0.695,0.693439
5,0.480600,0.978794,0.695,0.695517
6,0.480600,1.136544,0.6675,0.667946
7,0.480600,1.342097,0.6775,0.675878
8,0.480600,1.424716,0.66,0.661004
9,0.480600,1.481836,0.66,0.658599
10,0.054300,1.504583,0.66,0.660894



🔍 Training model: vinai/bertweet-base



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.777396,0.6575,0.644322
2,No log,0.658595,0.7075,0.701509
3,No log,0.674495,0.7225,0.721602
4,No log,0.79937,0.7275,0.72799
5,0.503200,0.888321,0.735,0.73521
6,0.503200,1.038971,0.7325,0.731306
7,0.503200,1.22008,0.7175,0.717543
8,0.503200,1.269213,0.7175,0.71656
9,0.503200,1.338239,0.715,0.7136
10,0.076400,1.32657,0.725,0.724052



📊 Model Comparison (Accuracy & F1 Score):
BERT: Accuracy = 0.7200, F1 = 0.7187
DistilBERT: Accuracy = 0.7100, F1 = 0.7083
BERTweet: Accuracy = 0.7350, F1 = 0.7352
