In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, set_seed
from sklearn.metrics import accuracy_score, f1_score

# Load data
# All three splits live in the same directory; keeping that location in one
# constant means a single line changes when the notebook runs elsewhere.
DATA_DIR = "C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/stop_words_removed"

train = pd.read_csv(f"{DATA_DIR}/filtered_train.csv")
dev = pd.read_csv(f"{DATA_DIR}/filtered_dev.csv")
test = pd.read_csv(f"{DATA_DIR}/filtered_test.csv")


# Map labels to integers
# The label names are sorted so that the label -> id assignment is deterministic
# rather than depending on the row order of the training file.
label_map = {label: i for i, label in enumerate(sorted(train['label'].dropna().unique()))}


def _encode_labels(frame, split_name):
    """Return the integer-encoded 'label' column of `frame`.

    Series.map() turns every value missing from `label_map` into NaN without
    complaint, which would silently convert the column to floats. Raise
    instead so a missing label, or a label that never occurs in the training
    split, is reported right here.

    Raises:
        ValueError: if any label in `frame` has no entry in `label_map`.
    """
    encoded = frame['label'].map(label_map)
    unmapped = encoded.isna()
    if unmapped.any():
        offenders = sorted(frame.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(
            f"{split_name} split contains labels that cannot be encoded: {offenders}"
        )
    return encoded.astype("int64")


train['label'] = _encode_labels(train, "train")
dev['label']   = _encode_labels(dev, "dev")
test['label']  = _encode_labels(test, "test")

train_ds = Dataset.from_pandas(train)
dev_ds   = Dataset.from_pandas(dev)
test_ds  = Dataset.from_pandas(test)

# Checkpoint selection (a smaller model is used here).
#   Option 1 (active):      DistilBERT multilingual
#   Option 2 (alternative): MiniLM multilingual
#       model_name = "microsoft/Multilingual-MiniLM-L12-H384"
model_name = "distilbert-base-multilingual-cased"

# Tokenizer that belongs to the selected checkpoint (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

def tokenize_fn(batch):
    """Tokenize the 'tweet' entries of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 64 tokens, so all
    encoded examples share one fixed length.
    """
    tweets = batch['tweet']
    encoded = tokenizer(
        tweets,
        max_length=64,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Tokenize every split in batches.
train_ds = train_ds.map(tokenize_fn, batched=True)
dev_ds   = dev_ds.map(tokenize_fn, batched=True)
test_ds  = test_ds.map(tokenize_fn, batched=True)

# Seed the RNGs *before* the model is built. The classification head is not part
# of the pretrained checkpoint and is freshly initialized by from_pretrained(),
# so seeding afterwards would leave those initial weights (and therefore the
# whole run) irreproducible.
set_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_map))

# Training setup

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class of each
    example is the argmax over the last axis of the logits.

    Returns a dict with the keys "accuracy" and "macro_f1".
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["macro_f1"] = f1_score(gold, predicted, average="macro")
    return metrics

# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    # Output location for anything the run writes.
    output_dir="./results/smaller_model",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    # Log and evaluate once per epoch.
    do_eval=True,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    # Checkpoint saving is switched off, so there is no best checkpoint to reload.
    save_strategy="no",
    load_best_model_at_end=False,
    # No reporting integrations; pinned-memory data loading disabled.
    report_to="none",
    dataloader_pin_memory=False,
)

# Bundle the model, the run configuration, the train/dev splits and the metric
# function into a Trainer. Training itself is started in a later cell.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=dev_ds,
    train_dataset=train_ds,
)

print("Setup complete. Now run trainer.train()")


Map:   0%|          | 0/7620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1275 [00:00<?, ? examples/s]

Map:   0%|          | 0/1837 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Setup complete. Now run trainer.train()




In [2]:
# Run fine-tuning with the Trainer configured above. The call is the last
# expression of the cell, so its return value (a TrainOutput) is displayed
# after the per-epoch training/validation table.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.8652,0.733312,0.705098,0.617087
2,0.6881,0.712873,0.721569,0.655875
3,0.5386,0.773131,0.715294,0.648394




TrainOutput(global_step=2859, training_loss=0.6973062083620256, metrics={'train_runtime': 7365.2606, 'train_samples_per_second': 3.104, 'train_steps_per_second': 0.388, 'total_flos': 378532342126080.0, 'train_loss': 0.6973062083620256, 'epoch': 3.0})