## 1. Load the Dataset


In [3]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

url = 'https://raw.githubusercontent.com/Polar-SemEval/trial-data/refs/heads/main/Trial_Data.csv'
try:
    df = pd.read_csv(url)
    print(f"Dataset loaded successfully. Shape: {df.shape}")
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit()

# Select only relevant columns
df = df[['text', 'polarization']]

df


Dataset loaded successfully. Shape: (338, 15)


Unnamed: 0,text,polarization
0,እንደምን አደራችሁ ቤዛ! የ እለቱን ቃል በ ፓስተር ፍቅሬ በላይ እነሆ! ...,0
1,@USER ተሌግራምን እንደ አማራጭ መጠቀም መልካም ይመስለኛል::,0
2,አንደኛው አመት የፕሪቶርያው ስምምነት ምክንያት በማድረግ ከህዝባዊ ወያነ ...,0
3,ግደል እንዳልልህ.ሃጢያት ነው አውቃለሁ አትግደል አልልህ.ጨካኝ ፍጡር ናቸ...,0
4,አለ ነገር አለ ነገር እንደው አለው ነገር እንዲህ ታመን ታመን አንገት ደ...,0
...,...,...
333,有些穆斯林特恶心吃猪肉的，不爱带面纱的女人也滚......他们的三观出问题了吗？,1
334,混血，眼睛是蓝色，混了白皮猪的血,1
335,对待黑蛆直接清空弹夹完事,1
336,我在非洲呆过很久，相信我，黑人不值得同情，他们就是劣等民族，没有羞耻心没有上进心，天生就该被...,1


## 2. Preprocessing & Split
### Ensure text is string and labels are integers

In [4]:

df['text'] = df['text'].astype(str)
df['polarization'] = df['polarization'].astype(int)

# Split into training and validation sets (80% train, 20% val)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['polarization'].tolist(),
    test_size=0.2,
    random_state=42
)


## 3. Tokenizer & Dataset Class
### Use multilingual BERT tokenizer

In [5]:

model_name = 'bert-base-multilingual-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

class PolarizationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = PolarizationDataset(train_texts, train_labels, tokenizer)
val_dataset = PolarizationDataset(val_texts, val_labels, tokenizer)



## 4. Model Initialization
### Load pre-trained BERT with a classification head on top (num_labels=2 for binary)

In [6]:

model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 5. Training Setup


In [7]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,              # Small dataset, 3-5 epochs usually enough
    per_device_train_batch_size=8,   # Adjust based on your VRAM
    per_device_eval_batch_size=16,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",           # Evaluate at end of every epoch (changed from evaluation_strategy)
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"                 # Disable wandb/mlflow logging for simple run
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)



## 6. Train and Evaluate


In [8]:
print("Starting training...")
trainer.train()

print("\nFinal Evaluation on Validation Set:")
eval_results = trainer.evaluate()
print(eval_results)



Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6618,0.72967,0.5,0.666667,0.5,1.0
2,0.5901,0.858271,0.5,0.622222,0.5,0.823529
3,0.4475,0.928698,0.558824,0.594595,0.55,0.647059



Final Evaluation on Validation Set:


{'eval_loss': 0.7296702861785889, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_precision': 0.5, 'eval_recall': 1.0, 'eval_runtime': 1.1305, 'eval_samples_per_second': 60.151, 'eval_steps_per_second': 4.423, 'epoch': 3.0}


In [None]:
# 7. (Optional) Save the model
# model.save_pretrained("./polarization_model")
# tokenizer.save_pretrained("./polarization_model")

# --- Test with new examples ---


In [9]:
import torch.nn.functional as F

def predict_polarization(text):
    # 1. Prepare the text (tokenize)
    # We use the same tokenizer as in training
    inputs = tokenizer(
        text,
        return_tensors="pt",       # Return PyTorch tensors
        truncation=True,           # Truncate if too long
        max_length=128,            # Same max_length as training
        padding=True               # Pad to max_length
    )

    # 2. Move inputs to the same device as the model (GPU or CPU)
    device = model.device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # 3. Get Model Predictions
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        outputs = model(**inputs)

    # 4. Convert Logits to Probabilities using Softmax
    logits = outputs.logits
    probs = F.softmax(logits, dim=-1)

    # 5. Get the most likely class (0 or 1) and its confidence score
    pred_label = torch.argmax(probs, dim=1).item()
    confidence = probs[0][pred_label].item()

    # Map 0/1 back to labels
    label_map = {0: "Not Polarized", 1: "Polarized"}

    return label_map[pred_label], confidence


sample_text_1 = "This group of people is a danger to our society and must be stopped."
sample_text_2 = "The economic policy was discussed in the parliament today."

result_1, conf_1 = predict_polarization(sample_text_1)
print(f"Text: '{sample_text_1}'\nPrediction: {result_1} (Confidence: {conf_1:.2f})\n")

result_2, conf_2 = predict_polarization(sample_text_2)
print(f"Text: '{sample_text_2}'\nPrediction: {result_2} (Confidence: {conf_2:.2f})\n")

Text: 'This group of people is a danger to our society and must be stopped.'
Prediction: Polarized (Confidence: 0.67)

Text: 'The economic policy was discussed in the parliament today.'
Prediction: Polarized (Confidence: 0.60)

