In [None]:
# Mount Google Drive under /content/drive (Colab only); the CSV paths used
# later in this notebook live below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !pip install transformers datasets scikit-learn torch

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer
from transformers.training_args import TrainingArguments
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import EarlyStoppingCallback

In [None]:
# Locations of the English subtask-2 CSV files on the mounted Google Drive.
train_path = "/content/drive/MyDrive/PMML/dev_phase/subtask2/train/eng.csv"
dev_path   = "/content/drive/MyDrive/PMML/dev_phase/subtask2/dev/eng.csv"

In [None]:
# Load the train and dev CSV files into pandas DataFrames.
df = pd.read_csv(train_path)
df_dev = pd.read_csv(dev_path)

In [None]:
# Preview the first rows of the training data.
df.head()

Unnamed: 0,id,text,political,racial/ethnic,religious,gender/sexual,other
0,eng_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0,0,0,0,0
1,eng_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0,0,0,0,0
2,eng_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0,0,0,0,0
3,eng_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0,0,0,0,0
4,eng_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0,0,0,0,0


In [None]:
# Preview the first rows of the dev data.
df_dev.head()

Unnamed: 0,id,text,political,racial/ethnic,religious,gender/sexual,other
0,eng_f66ca14d60851371f9720aaf4ccd9b58,God is with Ukraine and Zelensky,,,,,
1,eng_3a489aa7fed9726aa8d3d4fe74c57efb,"4 Dems, 2 Republicans Luzerne County Council s...",,,,,
2,eng_95770ff547ea5e48b0be00f385986483,Abuse Survivor Recounts Her Struggles at YWCA ...,,,,,
3,eng_2048ae6f9aa261c48e6d777bcc5b38bf,"After Rwanda, another deportation camp disaster",,,,,
4,eng_07781aa88e61e7c0a996abd1e5ea3a20,Another plea in Trump election interference probe,,,,,


In [None]:
# Hold out 20% of the labeled rows as an internal test split; the fixed
# random_state keeps the split identical across runs.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Wrap each pandas split as a Hugging Face Dataset, keyed by split name.
dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame)
        for split_name, frame in (("train", train_df), ("test", test_df))
    }
)

In [None]:
# Display the training split produced by train_test_split.
train_df

Unnamed: 0,id,text,political,racial/ethnic,religious,gender/sexual,other
1920,eng_2f764e2db995e29da3dabee9db02e446,In Gaza there are no Underground Railroads for...,1,1,0,0,0
2018,eng_ba3420871f1d9f55c1ee48381744ed6a,Not when it comes to the radical left.,1,0,0,0,0
1029,eng_3885745e6aa811eda4f1ae11108266d8,STOP RUSSIAN AGGRESSION AGAINST UKRAINE. StopP...,0,0,0,0,0
1621,eng_599cf608885a86719a39232c28af4381,Why didnt Hegseth meet with Democrats?,0,0,0,0,0
69,eng_4b0f94e8114dea01d392718b9155ffca,All in the Family: Sheriff's Son Is New DPS Of...,0,0,0,0,0
...,...,...,...,...,...,...,...
1638,eng_2133f1d07aa40912ff2f7126261ef900,Zelensky video grips my soul !,0,0,0,0,0
1095,eng_9813aaa606a61f1d1d1531c77cc19e9b,The price America paid for its first big immig...,0,0,0,0,0
1130,eng_cbb6f5591151fc895aebd763a9bdc848,This true crime shows newest season recounts t...,0,0,0,0,0
1294,eng_b9419b4a31d134433c0f9cf5fad58cf3,Wow that is a wild take but yeah it does seem ...,0,0,0,0,0


In [None]:
# Display the held-out test split produced by train_test_split.
test_df

Unnamed: 0,id,text,political,racial/ethnic,religious,gender/sexual,other
942,eng_7e05b370061f7182e2079178736a250d,Republicans propose splitting Gwinnett between...,0,0,0,0,0
2358,eng_d0ee97b42590bcd5d5ee6f04373fc320,Are you telling me that MAGA folks dont have d...,1,0,0,1,0
443,eng_dcc6c27bb2f1e5b99100dcd9f61ba5a7,GOP members reject housing department nominee due,0,0,0,0,0
1670,eng_934771e122ed79352c57e60b0b1746bb,whats your favorite conspiracy theory?,0,0,0,0,0
1977,eng_92a7ed1a58634e0acad602dd9e7ab564,lol coming from a paid new outlet by the dumps...,1,1,0,0,0
...,...,...,...,...,...,...,...
57,eng_6830f6ee4887cac1df0c0ac419e2dd26,"After Georgia indictments, Young tells Republi...",0,0,0,0,0
1756,eng_30bb246b4d3da4a244ba63631463a3ce,"But not for individual rights, which are now b...",1,0,0,0,0
1867,eng_68e4c856d161421faf794d35c41aac95,Human shields for what? They were protesting. ...,1,0,0,0,0
229,eng_f5582b3c45669c46714e9c1913e2d2c8,David Emanuel Academy Thanks Stillmore Police ...,0,0,0,0,0


# Fine-tune the XLM-RoBERTa model

In [None]:
# Checkpoint name passed to both AutoTokenizer and
# AutoModelForSequenceClassification below.
model_name = "xlm-roberta-base"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# 1️⃣ Tokenize the text
def preprocess(batch):
    """Tokenize the ``text`` field of ``batch`` with the module-level tokenizer.

    Sequences are truncated and padded to ``max_length=256``.

    Args:
        batch: Mapping with a ``"text"`` entry (a list of strings when called
            through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's output for ``batch["text"]``.
    """
    tokenizer_kwargs = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
    }
    return tokenizer(batch["text"], **tokenizer_kwargs)

# Run the tokenizer over every split in batched mode.
encoded_dataset = dataset.map(preprocess, batched=True)

# 2️⃣ Multi-label encoding
label_cols = ["political", "racial/ethnic", "religious", "gender/sexual", "other"]

def encode_labels(batch):
    """Attach a ``labels`` list holding the value of every label column as a float.

    Despite the parameter name, this is applied to one example at a time
    (``Dataset.map`` is called without ``batched=True``), so each
    ``batch[col]`` is a single scalar.

    Args:
        batch: Mapping containing one scalar per name in ``label_cols``.

    Returns:
        The same mapping, mutated in place, with ``batch["labels"]`` set to the
        label values in ``label_cols`` order.
    """
    label_vector = []
    for col in label_cols:
        label_vector.append(float(batch[col]))
    batch["labels"] = label_vector
    return batch

# Build the per-example `labels` vector, then
# 3️⃣ drop the raw text column and the original per-label columns.
encoded_dataset = encoded_dataset.map(encode_labels).remove_columns(
    ["text", *label_cols]
)

# 4️⃣ Format the dataset for PyTorch
encoded_dataset.set_format("torch")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/2140 [00:00<?, ? examples/s]

Map:   0%|          | 0/536 [00:00<?, ? examples/s]

Map:   0%|          | 0/2140 [00:00<?, ? examples/s]

Map:   0%|          | 0/536 [00:00<?, ? examples/s]

In [None]:
# 4️⃣ Number of labels for multi-label classification
num_labels = len(label_cols)   # i.e. 5

# 5️⃣ Load the model for multi-label classification
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="multi_label_classification"  # optional but recommended
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    """Compute multi-label evaluation metrics for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)`` as supplied by ``Trainer``; both
            are array-likes with one column per label.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        ``accuracy`` comes from ``accuracy_score`` on multi-label indicator
        arrays, i.e. a sample only counts as correct when every label matches.
        The other three are macro-averaged over the labels.
    """
    logits, labels = eval_pred

    # Sigmoid: turn each logit into an independent per-label probability.
    probs = torch.sigmoid(torch.tensor(logits)).numpy()

    # Threshold: a probability above 0.5 counts as a positive label.
    preds = (probs > 0.5).astype(int)

    # zero_division=0 scores a label with no predicted (or no true) positives
    # as 0 -- the same value the default produces -- but without emitting an
    # UndefinedMetricWarning on every evaluation pass.
    macro_kwargs = {"average": "macro", "zero_division": 0}

    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, **macro_kwargs),
        "recall": recall_score(labels, preds, **macro_kwargs),
        "f1": f1_score(labels, preds, **macro_kwargs),
    }

In [None]:
# Training configuration. Evaluation and checkpointing both happen once per
# epoch, so the checkpoint with the best "f1" (the key returned by
# compute_metrics) can be restored when training ends.
# The former `save_steps=500` was dropped: it only applies when
# save_strategy="steps" and had no effect with per-epoch saving.
training_args = TrainingArguments(
    output_dir="./results_xlm-roberta-base",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=30,
    weight_decay=0.01,
    report_to="none",            # disable external experiment loggers
    save_total_limit=2,          # cap the number of checkpoints kept on disk
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

In [None]:
# Build the Trainer. The internal "test" split doubles as the evaluation set
# used for per-epoch metrics and best-checkpoint selection.
# `processing_class=` replaces the deprecated `tokenizer=` keyword of recent
# transformers releases (the old spelling triggers a FutureWarning).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Fine-tune the model. This call must run: the evaluate / save / predict cells
# below all read the trained weights from `trainer`, so leaving it commented
# out makes a fresh "Restart & Run All" score and export an untrained
# classifier head. The recorded run took roughly 87 minutes.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2868,0.251774,0.669776,0.157692,0.084103,0.109699
2,0.2518,0.227791,0.673507,0.221988,0.137894,0.156998
3,0.2107,0.226972,0.684701,0.23861,0.154457,0.171887
4,0.1726,0.259858,0.688433,0.302265,0.114763,0.159636
5,0.1527,0.248152,0.710821,0.251394,0.188652,0.21418
6,0.1377,0.283208,0.68097,0.259035,0.152602,0.191172
7,0.1224,0.306917,0.688433,0.457813,0.139122,0.184803
8,0.0901,0.273636,0.699627,0.243037,0.18922,0.207318
9,0.0968,0.317271,0.69403,0.386715,0.184985,0.230665
10,0.0832,0.334918,0.677239,0.528802,0.194112,0.244538


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=4020, training_loss=0.07592047358033668, metrics={'train_runtime': 5211.9263, 'train_samples_per_second': 12.318, 'train_steps_per_second': 0.771, 'total_flos': 8446092373094400.0, 'train_loss': 0.07592047358033668, 'epoch': 30.0})

In [None]:
# Evaluate on the eval_dataset given to the Trainer (the internal test split)
# and print the resulting metrics dict.
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.40350791811943054, 'eval_accuracy': 0.6865671641791045, 'eval_precision': 0.5515599065345258, 'eval_recall': 0.2936842475758056, 'eval_f1': 0.3483490957358553, 'eval_runtime': 6.464, 'eval_samples_per_second': 82.921, 'eval_steps_per_second': 5.26, 'epoch': 30.0}


In [None]:
# Save the model and its tokenizer into the same directory.
# NOTE(review): the "f1_81" suffix does not match the macro F1 (~0.35) that
# trainer.evaluate() reports in this notebook -- confirm the directory name.
trainer.save_model("./saved_model_xlm-roberta_f1_81")
tokenizer.save_pretrained("./saved_model_xlm-roberta_f1_81")

('./saved_model_xlm-roberta_f1_81/tokenizer_config.json',
 './saved_model_xlm-roberta_f1_81/special_tokens_map.json',
 './saved_model_xlm-roberta_f1_81/sentencepiece.bpe.model',
 './saved_model_xlm-roberta_f1_81/added_tokens.json',
 './saved_model_xlm-roberta_f1_81/tokenizer.json')

## Evaluasi

In [None]:
import numpy as np

# Run inference on the internal test split.
predictions = trainer.predict(encoded_dataset["test"])
logits = predictions.predictions
labels = predictions.label_ids

# Multi-label decoding: every label is decided independently (sigmoid + 0.5
# threshold), matching compute_metrics. The previous np.argmax picked a single
# class index per row, which is single-label decoding and cannot be compared
# with the multi-hot `labels` matrix.
preds = (torch.sigmoid(torch.tensor(logits)).numpy() > 0.5).astype(int)

In [None]:
from sklearn.metrics import classification_report

# Apply sigmoid to logits to get probabilities
probs = torch.sigmoid(torch.tensor(logits)).numpy()

# Apply a threshold to convert probabilities to binary predictions
preds_multilabel = (probs > 0.5).astype(int)

# target_names prints the actual label names instead of the indices 0-4;
# zero_division=0 reports undefined precision/recall as 0 without warnings.
# (This cell previously appeared twice with identical code; one copy is kept.)
print(
    classification_report(
        labels,
        preds_multilabel,
        target_names=label_cols,
        zero_division=0,
    )
)

              precision    recall  f1-score   support

           0       0.72      0.72      0.72       195
           1       0.51      0.38      0.44        47
           2       0.75      0.15      0.25        20
           3       0.33      0.08      0.13        12
           4       0.44      0.13      0.20        31

   micro avg       0.67      0.55      0.60       305
   macro avg       0.55      0.29      0.35       305
weighted avg       0.64      0.55      0.57       305
 samples avg       0.25      0.22      0.23       305



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Predict on the dev data

In [None]:
# Wrap the dev DataFrame in a one-split DatasetDict.
dataset_dev = DatasetDict({"dev": Dataset.from_pandas(df_dev)})

# Tokenize with the same `preprocess` function used for training, then drop
# the raw text column.
encoded_dataset_dev = dataset_dev.map(preprocess, batched=True).remove_columns(["text"])

# Format the dataset for PyTorch.
encoded_dataset_dev.set_format("torch")

Map:   0%|          | 0/133 [00:00<?, ? examples/s]

In [None]:
# Predict
predictions_dev = trainer.predict(encoded_dataset_dev["dev"])
logits_dev = predictions_dev.predictions

# Sigmoid → multi-label probabilities
probs_dev = torch.sigmoid(torch.tensor(logits_dev)).numpy()

# Threshold 0.5 → 0/1 labels
preds_multilabel_dev = (probs_dev > 0.5).astype(int)

# Write each label's predictions into the matching df_dev column; columns of
# the prediction matrix follow the order of label_cols.
for col, column_preds in zip(label_cols, preds_multilabel_dev.T):
    df_dev[col] = column_preds

In [None]:
# Store each row's predictions as one comma-separated string, e.g. "1,0,1,0,0"
df_dev["labels"] = [
    ",".join(str(flag) for flag in row)
    for row in preds_multilabel_dev.tolist()
]

In [None]:
# Preview the dev rows with the predicted label columns filled in.
df_dev.head()

Unnamed: 0,id,text,political,racial/ethnic,religious,gender/sexual,other,labels
0,eng_f66ca14d60851371f9720aaf4ccd9b58,God is with Ukraine and Zelensky,0,0,0,0,0,0
1,eng_3a489aa7fed9726aa8d3d4fe74c57efb,"4 Dems, 2 Republicans Luzerne County Council s...",0,0,0,0,0,0
2,eng_95770ff547ea5e48b0be00f385986483,Abuse Survivor Recounts Her Struggles at YWCA ...,0,0,0,0,0,0
3,eng_2048ae6f9aa261c48e6d777bcc5b38bf,"After Rwanda, another deportation camp disaster",0,0,0,0,0,0
4,eng_07781aa88e61e7c0a996abd1e5ea3a20,Another plea in Trump election interference probe,0,0,0,0,0,0


In [None]:
# Save the dev predictions back to Google Drive. index=False keeps the
# DataFrame's positional row index out of the file; with the default it would
# be written as an extra, unnamed first column.
# NOTE(review): confirm the exact column layout the downstream consumer expects.
df_dev.to_csv(
    "/content/drive/MyDrive/PMML/dev_phase/subtask2/dev_xlm-roberta3.csv",
    index=False,
)