## Subtask 01: Binary classification

Classify each text as `polarized` or `non-polarized`.

In [1]:
import pandas as pd

# Trial data CSV from the Polar-SemEval GitHub repository (fetched over the network).
url = "https://raw.githubusercontent.com/Polar-SemEval/trial-data/refs/heads/main/Trial_Data.csv"
df = pd.read_csv(url)
# Show the first rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,text,lang,id,polarization,political,racial/ethnic,religious,gender/sexual,other,stereotype,vilification,dehumanization,extreme_language,lack_of_empathy,invalidation
0,እንደምን አደራችሁ ቤዛ! የ እለቱን ቃል በ ፓስተር ፍቅሬ በላይ እነሆ! ...,amh,amh_4a249cbbca6389d5120fe9b556f5ebf2,0,0,0,0,0,0,0,0,0,0,0,0
1,@USER ተሌግራምን እንደ አማራጭ መጠቀም መልካም ይመስለኛል::,amh,amh_01b5780721c676222b91ec97a797a066,0,0,0,0,0,0,0,0,0,0,0,0
2,አንደኛው አመት የፕሪቶርያው ስምምነት ምክንያት በማድረግ ከህዝባዊ ወያነ ...,amh,amh_7aabf3f2c41cbd032e6dbf56eccca9d7,0,0,0,0,0,0,0,0,0,0,0,0
3,ግደል እንዳልልህ.ሃጢያት ነው አውቃለሁ አትግደል አልልህ.ጨካኝ ፍጡር ናቸ...,amh,amh_16df45b84088cd0d09ca4fa3a0e5f35c,0,0,0,0,0,0,0,0,0,0,0,0
4,አለ ነገር አለ ነገር እንደው አለው ነገር እንዲህ ታመን ታመን አንገት ደ...,amh,amh_a28c14c4242c9387bc7a028cfad19c9f,0,0,0,0,0,0,0,0,0,0,0,0


In [2]:
# Column dtypes and non-null counts: a quick check for missing values before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   text              338 non-null    object
 1   lang              338 non-null    object
 2   id                338 non-null    object
 3   polarization      338 non-null    int64 
 4   political         338 non-null    int64 
 5   racial/ethnic     338 non-null    int64 
 6   religious         338 non-null    int64 
 7   gender/sexual     338 non-null    int64 
 8   other             338 non-null    int64 
 9   stereotype        338 non-null    int64 
 10  vilification      338 non-null    int64 
 11  dehumanization    338 non-null    int64 
 12  extreme_language  338 non-null    int64 
 13  lack_of_empathy   338 non-null    int64 
 14  invalidation      338 non-null    int64 
dtypes: int64(12), object(3)
memory usage: 39.7+ KB


### Split data

In [3]:
# Uncomment if scikit-learn is missing (%pip installs into the running kernel's environment):
# %pip install -q scikit-learn

In [4]:
from sklearn.model_selection import train_test_split

# Features are the raw texts; the target is the 0/1 `polarization` column.
x = df['text']
y = df['polarization']

# Hold out 20% of the rows for evaluation.
# `stratify=y` keeps the polarized / non-polarized ratio the same in both
# splits, which matters on a dataset of only a few hundred rows;
# `random_state` makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [5]:
# Sanity-check the split sizes: train texts, test texts, train labels, test labels.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(270,) (68,) (270,) (68,)


In [6]:
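# Uncomment if transformers / torch are missing (%pip installs into the running kernel's environment):
# %pip install -q transformers torch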

### Tokenizing

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizerFast

# Checkpoint identifier handed to `from_pretrained`.
MODEL_NAME = "xlm-roberta-base"
# Load the fast tokenizer that ships with that checkpoint.
tokenizer = XLMRobertaTokenizerFast.from_pretrained(MODEL_NAME)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [8]:


# Dataset
class PolarizationDataset(Dataset):
    """Map-style dataset yielding one tokenized text and its 0/1 label per index.

    Texts are truncated to ``max_length`` tokens but not padded here; padding
    is left to the DataLoader's ``collate_fn``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        """
        texts: pd.Series or list of strings
        labels: pd.Series or list of ints (0/1)
        tokenizer: transformers tokenizer
        max_length: maximum number of tokens kept per text

        Raises:
            ValueError: if ``texts`` and ``labels`` differ in length.
        """
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )

        # Wrapping in pd.Series lets plain lists through as well as Series;
        # resetting the index makes positional lookups independent of whatever
        # index the caller's Series carried (e.g. after train_test_split).
        self.texts = pd.Series(texts).reset_index(drop=True)
        self.labels = pd.Series(labels).reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the example at position ``idx``.

        Returns a dict with ``input_ids`` and ``attention_mask`` (1-D long
        tensors of the example's own length) and ``labels`` (0-D float tensor).
        """
        text = str(self.texts.iloc[idx])
        label = int(self.labels.iloc[idx])   # ensure 0/1 ints

        # Tokenize single example (we'll batch-pad in DataLoader using collate_fn)
        encoded = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_attention_mask=True,
            return_token_type_ids=False  # XLM-R doesn't use token_type_ids
        )

        # Convert to tensors here; collate_fn will stack them
        item = {
            "input_ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(encoded["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(label, dtype=torch.float)  # float for BCEWithLogitsLoss
        }
        return item

# Collate function to pad a batch (recommended)
def collate_fn(batch):
    """Pad a list of dataset items into a single batch.

    batch: list of dicts from ``__getitem__`` with keys ``input_ids``,
        ``attention_mask`` and ``labels``.

    Returns the tokenizer's padded encoding, with ``input_ids`` and
    ``attention_mask`` padded to the longest sequence in the batch, plus
    ``labels`` of shape (batch, 1).
    """
    # Gather the variable-length tensors column by column.
    features = {
        key: [example[key] for example in batch]
        for key in ("input_ids", "attention_mask")
    }
    targets = torch.stack([example["labels"] for example in batch])

    # The module-level tokenizer pads everything to the longest sequence here.
    encoded = tokenizer.pad(features, padding="longest", return_tensors="pt")

    # Extra axis gives shape (batch, 1) for BCEWithLogits.
    encoded["labels"] = targets.unsqueeze(1)
    return encoded

# Loader configuration, gathered in one place so it is easy to tune.
MAX_LENGTH = 256     # token budget per text; longer inputs are truncated
BATCH_SIZE = 16
NUM_WORKERS = 2      # adjust for your machine; 0 on Windows sometimes safer
SEED = 42            # same value as the train/test split's random_state

# Create dataset instances
train_dataset = PolarizationDataset(x_train, y_train, tokenizer, max_length=MAX_LENGTH)
test_dataset  = PolarizationDataset(x_test, y_test, tokenizer, max_length=MAX_LENGTH)

# Settings shared by both loaders. Pinned host memory only speeds up
# host-to-GPU copies, and requesting it without CUDA just emits a warning,
# so it is enabled only when a GPU is present.
loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
    num_workers=NUM_WORKERS,
    pin_memory=torch.cuda.is_available(),
)

# Create dataloaders
# A dedicated seeded generator makes the training shuffle order reproducible
# after a kernel restart.
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
    **loader_kwargs,
)

# Evaluation data keeps its original order.
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# Quick smoke test: fetch one batch
batch = next(iter(train_loader))
print({k: v.shape for k, v in batch.items()})
# expected keys: input_ids, attention_mask, labels
# e.g. {'input_ids': (BATCH_SIZE, seq_len), 'attention_mask': (BATCH_SIZE, seq_len), 'labels': (BATCH_SIZE, 1)}


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([16, 138]), 'attention_mask': torch.Size([16, 138]), 'labels': torch.Size([16, 1])}


In [9]:
from transformers import XLMRobertaForSequenceClassification

# Binary classification → 2 labels: polarized vs not polarized
# NOTE(review): the data pipeline in this notebook emits float labels shaped
# (batch, 1) "for BCEWithLogitsLoss", while this head produces 2 logits per
# example. The two do not line up as-is — confirm the training loop either
# converts labels to class indices (cross-entropy) or that a single-logit head
# is what was intended.
model = XLMRobertaForSequenceClassification.from_pretrained(
    MODEL_NAME,  # same checkpoint as the tokenizer, so the two cannot drift apart
    num_labels=2
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Model loaded on:", device)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on: cuda
