In [8]:
import csv
import os
import shutil
import zipfile
import json

import pandas as pd
import numpy as np
import torch

from collections import OrderedDict

from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split

from transformers.modeling_outputs import SequenceClassifierOutput

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModel,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup
)

import wandb
wandb.init(mode="disabled")

In [3]:
# @title Submission utils

def prepare_submission(
    item_ids: pd.Series,
    texts: pd.Series,
    labels,
    model,
    tokenizer,
    batch_size,
    device,
) -> list:
    """Run `model` over `texts` and build one submission row per input.

    Args:
        item_ids: Identifiers aligned position-by-position with `texts`.
        texts: Raw input strings to classify.
        labels: Label names. Not used here; kept so existing callers keep working.
        model: Callable returning an object with a `.logits` tensor when given
            `input_ids` and `attention_mask`. Expected to already live on `device`.
        tokenizer: Tokenizer handed to `PolarDataset` and `DataCollatorWithPadding`.
        batch_size: Number of texts per forward pass.
        device: Device the input tensors are moved to.

    Returns:
        A list of rows `[item_id, "0"|"1", ...]`, one per text, in input order.
        Each prediction is the logit passed through a sigmoid and thresholded at 0.5.
    """
    item_ids = item_ids.to_numpy()
    dev_dataset = PolarDataset(texts.tolist(), [], tokenizer, train=False)
    dataloader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=DataCollatorWithPadding(tokenizer)
    )

    outputs = []

    # Inference must not run with dropout active or with autograd recording;
    # the previous train/eval mode is restored afterwards so callers see no change.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for item in dataloader:
                # `idx` carries each example's dataset position, which maps back to its id.
                ids = item_ids[item["idx"].cpu().numpy()]
                logits = model(
                    input_ids=item["input_ids"].to(device),
                    attention_mask=item["attention_mask"].to(device),
                ).logits
                preds = (torch.sigmoid(logits) > 0.5).int().cpu()

                for item_id, pred in zip(ids, preds):
                    outputs.append([item_id] + [str(p) for p in pred.tolist()])
    finally:
        model.train(was_training)

    return outputs

def save_submission(filename, rows, *header):
    """Write a CSV file: one header line followed by one line per row.

    Args:
        filename: Destination path; an existing file is overwritten.
        rows: Iterable of row sequences.
        *header: Column names written as the first line.
    """
    with open(filename, "w", newline='') as out_file:
        sink = csv.writer(out_file)
        sink.writerow(header)
        for record in rows:
            sink.writerow(record)
    

def compile_submission(
    root,
    filename,
    subtask_id,
    langs,
    labels,
    model,
    tokenizer,
    batch_size,
    device,
):
    """Predict on every language's dev set and zip the prediction CSVs.

    For each language in `langs`, reads `<root>/subtask<subtask_id>/dev/<lang>.csv`
    (columns `id` and `text`), runs `prepare_submission`, and writes
    `subtask_<subtask_id>/pred_<lang>.csv`. The directory is then archived into
    `filename`, with entries stored under their `subtask_<subtask_id>/...` path.

    Args:
        root: Directory containing the `subtask<subtask_id>/dev` folder.
        filename: Path of the zip archive to create (overwritten if present).
        subtask_id: Subtask number used in input and output paths.
        langs: Language codes, one dev CSV each.
        labels: Label names; used as the CSV header after `id`.
        model, tokenizer, batch_size, device: Forwarded to `prepare_submission`.

    Note:
        Any existing `subtask_<subtask_id>` directory in the working directory is
        deleted first.
    """
    subtask_dir = f"subtask_{subtask_id}"

    # Start from an empty output directory so stale predictions never get zipped.
    if os.path.exists(subtask_dir):
        shutil.rmtree(subtask_dir)

    os.makedirs(subtask_dir)

    for lang in langs:
        dev = pd.read_csv(os.path.join(root, f'subtask{subtask_id}/dev/{lang}.csv'))

        submission = prepare_submission(
            dev['id'],
            dev['text'],
            labels,
            model,
            tokenizer,
            batch_size,
            device,
        )

        # Write straight into the output directory instead of dropping a temporary
        # file in the working directory and moving it afterwards.
        pred_path = os.path.join(subtask_dir, f"pred_{lang}.csv")
        save_submission(pred_path, submission, "id", *labels)

    with zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # `dirpath` (not `root`) so the walk does not shadow the `root` parameter.
        for dirpath, _, files in os.walk(subtask_dir):
            for file in files:
                zip_path = os.path.join(dirpath, file)
                zipf.write(zip_path, arcname=zip_path)

In [4]:
# @title PyTorch Dataset

class PolarDataset(torch.utils.data.Dataset):
  """Map-style dataset that tokenizes one text per item on access.

  Each item is a dict holding the tokenizer's outputs as 1-D tensors (no padding;
  truncated to `max_length`), an `idx` tensor with the item's position, and,
  when `train` is true, a float `labels` tensor built from `labels[idx]`.

  Args:
    texts: Sequence of input strings.
    labels: Sequence of labels aligned with `texts`; only read when `train` is true.
    tokenizer: Callable invoked as
      `tokenizer(text, truncation=..., padding=..., max_length=..., return_tensors='pt')`.
    train: Whether items should include `labels`.
    max_length: Truncation length passed to the tokenizer.
  """

  def __init__(self, texts, labels, tokenizer, train=True, max_length = 128):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.train = train

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    encoding = self.tokenizer(
        self.texts[idx],
        truncation=True,
        padding=False,
        max_length=self.max_length,
        return_tensors='pt'
    )

    # Drop only the leading batch dimension. A bare squeeze() would also collapse a
    # length-1 sequence into a 0-d tensor, which cannot be padded into a batch.
    item = {key: encoding[key].squeeze(0) for key in encoding.keys()}

    if self.train:
      item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)

    # Position of the example, so predictions can be mapped back to their ids.
    item['idx'] = torch.tensor(idx, dtype=torch.long)
    return item

In [5]:
# Directory holding the per-language training CSVs for subtask 1.
_TRAIN_DIR = '/kaggle/input/polar-semeval-2026-task-9-dev/subtask1/train'


def _read_and_split(lang):
    """Read one language's training CSV and split it 80/20, stratified on `polarization`.

    Returns a tuple `(full_frame, train_part, validation_part)`.
    """
    frame = pd.read_csv(f'{_TRAIN_DIR}/{lang}.csv')
    fit_rows, held_out_rows = train_test_split(
        frame,
        test_size=0.2,
        random_state=42,
        stratify=frame['polarization'],
    )
    return frame, fit_rows, held_out_rows


data_eng, train_eng, val_eng = _read_and_split('eng')
data_swa, train_swa, val_swa = _read_and_split('swa')
data_hau, train_hau, val_hau = _read_and_split('hau')

# Pool the three languages (eng, swa, hau order); each frame keeps its own row index.
train = pd.concat([train_eng, train_swa, train_hau])
val = pd.concat([val_eng, val_swa, val_hau])

In [6]:
# Checkpoint identifiers; the training loop passes each one to
# AutoTokenizer.from_pretrained and PolarModel, producing one submission zip per entry.
pretrained = [
    "google-bert/bert-base-multilingual-cased",
    "FacebookAI/xlm-roberta-base",
    "microsoft/mdeberta-v3-base",
    "jhu-clsp/mmBERT-base",
]

# Passed to compile_submission as `root`; dev CSVs are read from
# <ROOT_DIR>/subtask<id>/dev/<lang>.csv.
# NOTE(review): the training CSVs are loaded from
# /kaggle/input/polar-semeval-2026-task-9-dev/ — confirm the dev files really
# live under "./" and not under that same input directory.
ROOT_DIR = "./"

In [7]:
# @title Custom model
class PolarModel(torch.nn.Module):
    """Pretrained encoder topped with a feed-forward head trained with BCE-with-logits.

    The head is applied to the encoder's hidden state at sequence position 0.
    It consists of one `Linear -> Dropout(0.3) -> ReLU` group per entry in
    `hidden_layers`, followed by `Dropout(0.3)` and a final `Linear` producing
    `num_labels` logits.

    Args:
        checkpoint: Identifier passed to `AutoConfig` / `AutoModel.from_pretrained`.
        num_labels: Number of output logits.
        *hidden_layers: Widths of optional hidden layers in the head.
        weights: Optional `pos_weight` for `BCEWithLogitsLoss`.
    """

    def __init__(self, checkpoint, num_labels, *hidden_layers, weights=None):
        super().__init__()

        self.num_labels = num_labels
        self.criterion = torch.nn.BCEWithLogitsLoss(pos_weight=weights)

        self.config = AutoConfig.from_pretrained(checkpoint)
        self.base_model = AutoModel.from_pretrained(checkpoint, config=self.config)

        # Layer widths from the encoder output through every hidden layer.
        widths = [self.config.hidden_size, *hidden_layers]

        head = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            head.append(torch.nn.Linear(fan_in, fan_out))
            head.append(torch.nn.Dropout(0.3))
            head.append(torch.nn.ReLU())

        # With no hidden layers, widths[-1] is simply the encoder's hidden size.
        projection = torch.nn.Linear(widths[-1], self.num_labels)
        head.append(torch.nn.Dropout(0.3))
        head.append(projection)

        self.classifier = torch.nn.Sequential(*head)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Encode the batch, classify the first-position vector, optionally compute loss.

        Returns a `SequenceClassifierOutput` whose `loss` is `None` when no
        `labels` are given. For the loss, logits are reshaped to the shape of
        `labels`.
        """
        encoded = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(first_token_state)

        if labels is None:
            loss = None
        else:
            loss = self.criterion(logits.view(*labels.shape), labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoded.hidden_states,
            attentions=encoded.attentions,
        )

In [None]:
def compute_metrics(p):
    """Compute macro-F1 from raw logits for the Trainer's evaluation loop.

    Args:
        p: Object with `predictions` (logits as an array, or a tuple/list whose
            first element is the logits array) and `label_ids` (reference labels).

    Returns:
        `{'f1_macro': <float>}`, where predictions are `sigmoid(logits) > 0.5`.
    """
    # The model can return hidden states / attentions next to the logits, in which
    # case the predictions arrive as a tuple; only the first entry is the logits.
    logits = p.predictions[0] if isinstance(p.predictions, (tuple, list)) else p.predictions
    labels = np.asarray(p.label_ids)

    probs = torch.sigmoid(torch.from_numpy(np.asarray(logits)))
    preds = (probs > 0.5).int().numpy()

    # Logits come out as (N, num_labels) while single-label targets are (N,); align
    # them the same way the model's loss does rather than relying on implicit flattening.
    if preds.size == labels.size:
        preds = preds.reshape(labels.shape)

    return {'f1_macro': f1_score(labels, preds, average='macro')}

# Shared Trainer configuration, reused unchanged for every checkpoint in `pretrained`.
# Evaluation and checkpointing both happen once per epoch; the best checkpoint is
# chosen by lowest eval_loss (the f1_macro from compute_metrics is reported but does
# not drive model selection).
training_args = TrainingArguments(
        # NOTE(review): all runs write their checkpoints into the working directory;
        # confirm that checkpoint folders left by one model cannot interfere with
        # save_total_limit rotation or best-model loading of the next one.
        output_dir=f"./",
        num_train_epochs=10,
        learning_rate=1e-5,
        weight_decay=0.01,
        lr_scheduler_type='linear',
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        logging_steps=100,
        disable_tqdm=False
    )

# Use the GPU when one is visible to torch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Fine-tune one PolarModel per pretrained checkpoint on the pooled training split,
# then write a zipped dev-set submission named after the checkpoint.
for checkpoint in pretrained:

    # Re-seed NumPy and torch before every run so each checkpoint starts from the
    # same random state.
    np.random.seed(42)
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)

    # Last path component of the checkpoint id; used as the zip file's base name.
    experiment_label = f"{checkpoint.split('/')[-1]}"
    
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    train_dataset = PolarDataset(train['text'].tolist(), train['polarization'].tolist(), tokenizer)
    val_dataset = PolarDataset(val['text'].tolist(), val['polarization'].tolist(), tokenizer)
    
    # One output logit and no extra hidden layers in the classification head.
    model = PolarModel(checkpoint, 1)
    model.to(device)
    
    # NOTE(review): `training_args` is shared across iterations, so every checkpoint
    # trains into the same output_dir — confirm earlier runs' checkpoint folders
    # cannot be picked up or rotated away by later runs.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,    
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer)
    )
    
    # Stop when the tracked metric has not improved for 3 consecutive evaluations.
    trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))
    
    trainer.train()
    
    # NOTE(review): eval_results is assigned but not read again in this cell.
    eval_results = trainer.evaluate()
    
    # Predict on the dev CSVs of all three languages and zip the result.
    compile_submission(
        ROOT_DIR,
        f"{experiment_label}.zip",
        subtask_id=1,
        langs=["eng", "swa", "hau"],
        labels=['polarization'],
        model=model,
        tokenizer=tokenizer,
        batch_size=32,
        device=device,
    )