# Polar SemEval 2026 Task 9

## Setup

In [16]:
# @title Imports

import csv
import os
import shutil
import zipfile
import json

import pandas as pd
import numpy as np
import torch

from collections import OrderedDict
from pprint import pprint

from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
)

from transformers.modeling_outputs import SequenceClassifierOutput

In [17]:
# @title Training Class

class PolarDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Each item is a dict containing the tokenizer's output tensors with the
    leading batch axis removed, an ``idx`` tensor holding the item's position
    (so callers can look up the matching entry of ``ids``) and, when ``train``
    is true, a ``labels`` tensor.

    Args:
        ids: Per-example identifiers, aligned with ``texts``.
        texts: Raw input strings.
        labels: Per-example labels, aligned with ``texts``. Only read when
            ``train`` is true.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding=False, max_length=..., return_tensors='pt')`` and
            returning a mapping of tensors.
        train: Whether to attach ``labels`` to each item.
        max_length: Truncation length forwarded to the tokenizer.
    """

    def __init__(self, ids, texts, labels, tokenizer, train=True, max_length=256):
        self.ids = ids
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.train = train

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return its tensors."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Drop only the leading batch axis added by return_tensors='pt'.
        # A bare squeeze() would also drop the sequence axis when the text
        # encodes to a single token, yielding a 0-d tensor.
        item = {k: v.squeeze(0) for k, v in encoding.items()}

        if self.train:
            label = self.labels[idx]
            # A list label becomes a float vector; anything else is stored
            # as an integer tensor.
            dtype = torch.float if isinstance(label, list) else torch.long
            item['labels'] = torch.tensor(label, dtype=dtype)

        # Position of the example, so predictions can be mapped back to ids.
        item['idx'] = torch.tensor(idx, dtype=torch.long)

        return item


class PolarModel(torch.nn.Module):
    """Pretrained encoder followed by a feed-forward classification head.

    The head is applied to the first position of the encoder's
    ``last_hidden_state`` and produces ``num_labels`` logits. When labels are
    supplied to ``forward``, the loss is ``BCEWithLogitsLoss``.

    Args:
        checkpoint: Identifier passed to ``AutoConfig.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        num_labels: Number of output logits.
        *hidden_layers: Widths of optional hidden layers in the head. Each
            one adds ``Linear -> ReLU -> Dropout(0.3)`` before the output
            layer.
        full_training: When false, the encoder's parameters are frozen
            (``requires_grad = False``) so only the head receives gradients.
            The default keeps every parameter trainable.
        weights: Optional tensor forwarded as ``pos_weight`` to
            ``BCEWithLogitsLoss``.
    """

    def __init__(self, checkpoint, num_labels, *hidden_layers, full_training=True, weights=None):
        super().__init__()

        self.num_labels = num_labels
        self.criterion = torch.nn.BCEWithLogitsLoss(pos_weight=weights)

        self.config = AutoConfig.from_pretrained(checkpoint)
        self.base_model = AutoModel.from_pretrained(checkpoint, config=self.config)

        # Previously `full_training` was accepted but ignored; honour it by
        # freezing the encoder when the caller asks for head-only training.
        if not full_training:
            for param in self.base_model.parameters():
                param.requires_grad = False

        # Layer widths from the encoder output through every hidden layer.
        widths = [self.config.hidden_size, *hidden_layers]

        dense = []
        for in_features, out_features in zip(widths, widths[1:]):
            dense.append(torch.nn.Linear(in_features, out_features))
            dense.append(torch.nn.ReLU())
            dense.append(torch.nn.Dropout(0.3))

        output = torch.nn.Linear(widths[-1], self.num_labels)

        self.classifier = torch.nn.Sequential(
            *dense,
            torch.nn.Dropout(0.3),
            output,
        )

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and head; compute the loss when ``labels`` is given.

        Returns:
            A ``SequenceClassifierOutput`` carrying ``loss`` (``None`` when no
            labels were passed), ``logits`` and the encoder's
            ``hidden_states`` / ``attentions``.
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)

        # Classify from the representation at the first sequence position.
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])

        loss = None
        if labels is not None:
            loss = self.criterion(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

class PolarTrainer(Trainer):
    """``Trainer`` that optimizes with AdamW using layer-wise learning rates.

    The trained model must expose ``base_model.embeddings``,
    ``base_model.layers`` (an indexable sequence of layers) and
    ``classifier``.

    Args:
        base_lr: Reference learning rate for the backbone.
        lr_decay: Multiplicative factor applied once per layer of distance
            from the top of the backbone.
        classifier_lr: Learning rate of the classification head.
        *args: Forwarded unchanged to ``Trainer.__init__``.
        **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, base_lr=2e-5, lr_decay=0.95, classifier_lr=1e-4, *args, **kwargs):
        # Set before delegating so the values exist whenever the parent
        # class ends up calling create_optimizer().
        self.base_lr = base_lr
        self.lr_decay = lr_decay
        self.classifier_lr = classifier_lr
        super().__init__(*args, **kwargs)

    def __get_layer_wise_lr_param_groups(self, base_lr=2e-5, lr_decay=0.95, classifier_lr=1e-4):
        """Build optimizer parameter groups with one learning rate per layer.

        With ``n_layers = len(base_model.layers) + 1``, the embeddings get
        ``base_lr * lr_decay ** (n_layers + 1)`` and the layer at 1-based
        ``depth`` gets ``base_lr * lr_decay ** (n_layers + 1 - depth)``.
        The classifier uses ``classifier_lr``.

        Returns:
            A list of ``{'params': [...], 'lr': float}`` dicts.
        """
        # Read the trainer's own model. The previous code relied on a
        # module-level variable that happened to be called `model`.
        model = self.model
        backbone = model.base_model

        n_layers = len(backbone.layers) + 1

        param_groups = []
        assigned = set()

        def add_group(params, lr):
            # Keep each parameter in exactly one group (the optimizer rejects
            # duplicates) and skip groups that end up empty.
            fresh = [p for p in params if id(p) not in assigned]
            assigned.update(id(p) for p in fresh)
            if fresh:
                param_groups.append({'params': fresh, 'lr': lr})

        add_group(
            backbone.embeddings.parameters(),
            base_lr * (lr_decay ** (n_layers + 1)),
        )

        for depth in range(1, n_layers):
            add_group(
                backbone.layers[depth - 1].parameters(),
                base_lr * (lr_decay ** (n_layers + 1 - depth)),
            )

        add_group(model.classifier.parameters(), classifier_lr)

        # Any parameter not reached above (e.g. a final normalization module
        # on the backbone, if it has one) would otherwise be left out of the
        # optimizer and never updated. Train such parameters at base_lr.
        add_group(model.parameters(), base_lr)

        return param_groups

    def create_optimizer(self):
        """Create the AdamW optimizer once and return it.

        Learning rates come from the per-group values; betas, epsilon and
        weight decay are taken from ``self.args``.
        """
        if self.optimizer is None:
            param_groups = self.__get_layer_wise_lr_param_groups(
                base_lr=self.base_lr,
                lr_decay=self.lr_decay,
                classifier_lr=self.classifier_lr
            )

            self.optimizer = torch.optim.AdamW(
                param_groups,
                betas=(self.args.adam_beta1, self.args.adam_beta2),
                eps=self.args.adam_epsilon,
                weight_decay=self.args.weight_decay
            )

        return self.optimizer

    def create_scheduler(self, num_training_steps: int, optimizer=None):
        """Create a linear warmup/decay schedule once and return it.

        The number of warmup steps is ``num_training_steps`` multiplied by
        ``self.args.warmup_ratio``, truncated to an integer.
        """
        if self.lr_scheduler is None:
            if optimizer is None:
                optimizer = self.optimizer

            warmup_steps = int(num_training_steps * self.args.warmup_ratio)

            self.lr_scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=warmup_steps,
                num_training_steps=num_training_steps
            )

        return self.lr_scheduler

In [3]:
# @title Submission utils

def prepare_submission(dataset, model, thresh, batch_size, device):
    """Predict binary labels for every example and return submission rows.

    Args:
        dataset: Dataset exposing ``tokenizer`` and ``ids`` whose items
            carry ``input_ids``, ``attention_mask`` and ``idx``.
        model: Model whose output has a ``logits`` attribute.
        thresh: A label is predicted as 1 when its sigmoid probability is
            strictly greater than this value.
        batch_size: Number of examples per inference batch.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        A list of rows, each ``[example_id, pred_0, pred_1, ...]`` with the
        predictions rendered as strings.
    """
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=DataCollatorWithPadding(dataset.tokenizer),
    )

    model.eval()
    rows = []

    with torch.no_grad():
        for batch in loader:
            # Positions of this batch's examples within the dataset.
            positions = batch["idx"].tolist()

            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits

            decisions = (torch.sigmoid(logits) > thresh).int().tolist()

            rows.extend(
                [dataset.ids[position]] + [str(flag) for flag in flags]
                for position, flags in zip(positions, decisions)
            )

    return rows


def save_submission(filename, rows, header):
    """Write ``header`` followed by ``rows`` to ``filename`` as CSV.

    Args:
        filename: Path of the CSV file to create or overwrite.
        rows: Iterable of row sequences.
        header: Sequence of column names written as the first row.
    """
    # newline='' lets the csv module control line endings. The explicit
    # encoding keeps the output identical across platforms instead of
    # depending on the locale's default.
    with open(filename, "w", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)


def compile_submission(
    save_name,
    root_dir,
    subtask_id,
    languages,
    model,
    thresh,
    tokenizer,
    batch_size,
    device
):
    """Predict on each language's dev CSV and zip the prediction files.

    For every language, ``{root_dir}/subtask{subtask_id}/dev/{lang}.csv`` is
    read, predictions are written to ``subtask_{subtask_id}/pred_{lang}.csv``
    (relative to the current working directory), and the whole directory is
    then archived into ``save_name``.

    Args:
        save_name: Path of the zip archive to write.
        root_dir: Root folder containing the ``subtask{N}`` data folders.
        subtask_id: Subtask number used in the input and output paths.
        languages: Language codes to process.
        model: Model passed through to ``prepare_submission``.
        thresh: Decision threshold passed through to ``prepare_submission``.
        tokenizer: Tokenizer used to build the inference dataset.
        batch_size: Inference batch size.
        device: Device used for inference.
    """
    out_dir = f"subtask_{subtask_id}"

    # Always start from an empty output directory.
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)

    for lang in languages:
        dev_path = os.path.join(root_dir, f'subtask{subtask_id}/dev/{lang}.csv')
        dev = pd.read_csv(dev_path)

        unlabeled = PolarDataset(
            dev['id'].tolist(),
            dev['text'].tolist(),
            [],
            tokenizer,
            train=False,
        )
        rows = prepare_submission(unlabeled, model, thresh, batch_size, device)

        # Every column other than id/text is treated as a label column.
        label_columns = dev.columns.drop(["id", "text"]).tolist()

        save_submission(
            os.path.join(out_dir, f"pred_{lang}.csv"),
            rows,
            ["id"] + label_columns,
        )

    with zipfile.ZipFile(save_name, 'w', zipfile.ZIP_DEFLATED) as archive:
        for folder, _, names in os.walk(out_dir):
            for name in names:
                member = os.path.join(folder, name)
                # Store the file under the same relative path it has on disk.
                archive.write(member, arcname=member)

In [4]:
# @title Data Loading

def load_dataset(root_dir, subtask_id, languages, tokenizer, test_size=0.2, random_state=42, stratify=False):
    """Load, split and wrap the CSV data of one subtask.

    For each language, ``{root_dir}/subtask{subtask_id}/train/{lang}.csv`` is
    split into train and validation parts and
    ``{root_dir}/subtask{subtask_id}/dev/{lang}.csv`` is used as the test
    set. The per-language parts are then concatenated.

    Args:
        root_dir: Root folder containing the ``subtask{N}`` data folders.
        subtask_id: Subtask number used in the file paths.
        languages: Language codes to load.
        tokenizer: Tokenizer handed to every ``PolarDataset``.
        test_size: Fraction of each language's training data held out for
            validation.
        random_state: Seed for the split.
        stratify: When true, stratify each split on the concatenation of
            the label columns rendered as strings.

    Returns:
        A dict with ``train_dataset``, ``val_dataset``, ``test_dataset``
        (``PolarDataset`` instances), ``labels`` (label column names) and
        ``weights`` (float32 tensor with one negative/positive count ratio
        per label).
    """
    train_dfs, val_dfs, test_dfs = [], [], []

    for lang in languages:
        lang_train = pd.read_csv(os.path.join(root_dir, f'subtask{subtask_id}/train/{lang}.csv'))
        lang_test = pd.read_csv(os.path.join(root_dir, f'subtask{subtask_id}/dev/{lang}.csv'))

        stratify_key = None
        if stratify:
            label_cols = lang_train.columns.drop(["id", "text"]).tolist()
            # One string per row, e.g. "010", identifying its label combination.
            stratify_key = lang_train[label_cols].apply(lambda x: ''.join(x.astype(str)), axis=1)

        train_split, val_split = train_test_split(
            lang_train,
            test_size=test_size,
            random_state=random_state,
            stratify=stratify_key
        )

        train_dfs.append(train_split)
        val_dfs.append(val_split)
        test_dfs.append(lang_test)

    train = pd.concat(train_dfs, ignore_index=True)
    val = pd.concat(val_dfs, ignore_index=True)
    test = pd.concat(test_dfs, ignore_index=True)

    # Every column other than id/text is treated as a label column.
    labels = train.columns.drop(["id", "text"]).tolist()

    weights = []
    for label in labels:
        n_pos = int((train[label] == 1).sum())
        n_neg = int((train[label] == 0).sum())
        # Looking the counts up in value_counts() raised KeyError when a
        # label had no positives (or no negatives) in the training split.
        # Fall back to a neutral weight of 1.0 in that case.
        weights.append(n_neg / n_pos if n_pos > 0 and n_neg > 0 else 1.0)

    train_dataset = PolarDataset(train['id'].tolist(), train['text'].tolist(), train[labels].values.tolist(), tokenizer)
    val_dataset = PolarDataset(val['id'].tolist(), val['text'].tolist(), val[labels].values.tolist(), tokenizer)
    test_dataset = PolarDataset(test['id'].tolist(), test['text'].tolist(), [], tokenizer, train=False)

    return {
        "train_dataset": train_dataset,
        "val_dataset": val_dataset,
        "test_dataset": test_dataset,
        "labels": labels,
        # Explicit float32 so the tensor matches the usual dtype of logits
        # when used as a loss weight (it was previously inferred as float64).
        "weights": torch.tensor(weights, dtype=torch.float)
    }

In [5]:
# @title Global Config

# Root folder of the task data. load_dataset and compile_submission read
# `subtask{N}/train/{lang}.csv` and `subtask{N}/dev/{lang}.csv` beneath it.
ROOT_DIR = "/kaggle/input/polar-semeval-2026-task-9-dev"
# Language codes, used as CSV file stems
# (presumably English, Hausa and Swahili - confirm).
LANGUAGES = ["eng", "hau", "swa"]
# Checkpoint identifier passed to the tokenizer/config/model `from_pretrained` calls.
MODEL_NAME = "jhu-clsp/mmBERT-base"

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
# @title Training Config

def compute_metrics(p):
    """Return the macro-averaged F1 score of thresholded predictions.

    Args:
        p: Object with ``predictions`` (a numpy array of logits) and
            ``label_ids`` (the reference labels).

    Returns:
        ``{'f1_macro': score}`` where a label counts as predicted when its
        sigmoid probability is strictly greater than 0.5.
    """
    logits = torch.from_numpy(p.predictions)
    hard_preds = (torch.sigmoid(logits) > 0.5).int().numpy()
    macro_f1 = f1_score(p.label_ids, hard_preds, average='macro')
    return {'f1_macro': macro_f1}

# Tokenizer for MODEL_NAME, shared by every dataset built in this notebook.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Collator handed to the Trainer instances below; PolarDataset tokenizes with
# padding=False, so padding is left to this collator.
data_collator = DataCollatorWithPadding(tokenizer)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

## Exploration

In [19]:
# Print, for every subtask/language pair, the size of the train and validation
# splits and the mean of each label column (the share of positive examples).
for sb_id in (1, 2, 3):
    for lang in LANGUAGES:
        dataset = load_dataset(ROOT_DIR, sb_id, [lang], tokenizer)

        for split_name, split_key in (("Train", "train_dataset"), ("Val", "val_dataset")):
            split = dataset[split_key]
            label_means = np.array(split.labels).mean(axis=0)
            print(split_name, sb_id, lang, len(split), label_means)

Train 1 eng 2577 [0.37485448]
Val 1 eng 645 [0.32403101]
Train 1 hau 2920 [0.10684932]
Val 1 hau 731 [0.10943912]
Train 1 swa 5592 [0.49982117]
Val 1 swa 1399 [0.50679056]
Train 2 eng 2577 [0.36670547 0.08731083 0.03492433 0.02328289 0.03841676]
Val 2 eng 645 [0.31782946 0.08682171 0.03410853 0.01860465 0.04186047]
Train 2 hau 2920 [0.04691781 0.03047945 0.02842466 0.00753425 0.00410959]
Val 2 hau 731 [0.05608755 0.03556772 0.01367989 0.00957592 0.00273598]
Train 2 swa 5592 [0.02682403 0.35318312 0.03558655 0.02038627 0.08154506]
Val 2 swa 1399 [0.02573267 0.36311651 0.03431022 0.03002144 0.07076483]
Train 3 eng 2577 [0.15715949 0.27163368 0.12223516 0.2467986  0.11525029 0.18432286]
Val 3 eng 645 [0.12713178 0.24496124 0.11782946 0.20775194 0.09302326 0.17209302]
Train 3 hau 2920 [0.04349315 0.01027397 0.03390411 0.02876712 0.01027397 0.00273973]
Val 3 hau 731 [0.03967168 0.02051984 0.03830369 0.03556772 0.00273598 0.00136799]
Train 3 swa 5592 [0.39270386 0.41255365 0.125      0.23444

## Experiments

In [7]:
# @title Dataset

# Subtask whose CSV files feed the experiment cells below.
SUBTASK_ID = 1

# Each language is split train/validation with stratification on its label
# columns; the per-language parts are then concatenated.
dataset = load_dataset(ROOT_DIR, SUBTASK_ID, LANGUAGES, tokenizer, stratify=True)

train_dataset = dataset["train_dataset"]
val_dataset = dataset["val_dataset"]
# Names of the label columns; their count sets the model's output size.
labels = dataset["labels"]
# Per-label count ratios computed by load_dataset, moved to the compute device.
# NOTE(review): the experiment cells below build PolarModel without passing
# `weights=`, so this tensor is not used by them - confirm that is intended.
weights = dataset["weights"].to(DEVICE)

# Collects the trainer.evaluate() result of each run, keyed by run name.
evaluations = {}

In [None]:
# @title Baseline

# Baseline run: PolarModel with no extra hidden layers, trained with the
# stock Trainer (single learning rate, linear schedule).

# Config
BASE_LR = 1e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06

NUM_EPOCH = 10
BATCH_SIZE = 64

training_args = TrainingArguments(
    # Checkpoints are written to the current working directory.
    output_dir=f"./",

    num_train_epochs=NUM_EPOCH,
    learning_rate=BASE_LR,
    lr_scheduler_type="linear",
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,

    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True,

    # Evaluate and checkpoint once per epoch; at the end reload the
    # checkpoint with the lowest validation loss.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    logging_steps=100,
    disable_tqdm=False,
    report_to="none"
)

# Key under which this run's metrics are stored; also reused as the archive name.
run_name = f"mmbert-s1-baseline"

# Initialization
# Seed torch (CPU and CUDA) and numpy before the model is built.
torch.manual_seed(42)
torch.cuda.manual_seed(42)
np.random.seed(42)

# No hidden layers and no `weights=`: the loss is BCEWithLogitsLoss with
# pos_weight=None.
model = PolarModel(MODEL_NAME, len(labels))
model.to(DEVICE)

# Training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Early stopping with a patience of 3 (evaluation happens once per epoch).
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

trainer.train()
evaluations[run_name] = trainer.evaluate()

# Saving
# NOTE(review): save_name is the bare run name, so the zip archive is written
# without a ".zip" extension (the commented-out Submissions cells add one) -
# confirm this is intended.
compile_submission(
    save_name=run_name,
    root_dir=ROOT_DIR,
    subtask_id=SUBTASK_ID,
    languages=LANGUAGES,
    model=model,
    thresh=0.5,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
    device=DEVICE
)

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.23G [00:00<?, ?B/s]

  trainer = Trainer(


model.safetensors:   0%|          | 0.00/1.23G [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss


In [None]:
# @title Layers

# Hidden-layer sweep: train one PolarModel per classifier-head layout with the
# stock Trainer and record each run's evaluation metrics.

# Config
BASE_LR = 1e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06

NUM_EPOCH = 10
BATCH_SIZE = 64

# The same TrainingArguments object is reused for every layout in the sweep.
training_args = TrainingArguments(
    # Checkpoints are written to the current working directory.
    output_dir=f"./",

    num_train_epochs=NUM_EPOCH,
    learning_rate=BASE_LR,
    lr_scheduler_type="linear",
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,

    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True,

    # Evaluate and checkpoint once per epoch; at the end reload the
    # checkpoint with the lowest validation loss.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    logging_steps=100,
    disable_tqdm=False,
    report_to="none"
)

# Each entry lists the hidden-layer widths of the classifier head.
for hidden_layers in [[256], [512], [1024], [256, 256], [512, 512], [1024, 1024]]:

    # The list's repr becomes part of the name, e.g. "mmbert-s1-[256, 256]".
    run_name = f"mmbert-s1-{hidden_layers}"

    # Initialization
    # Re-seed before every run so each layout starts from the same RNG state.
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    np.random.seed(42)

    # No `weights=`: the loss is BCEWithLogitsLoss with pos_weight=None.
    model = PolarModel(MODEL_NAME, len(labels), *hidden_layers)
    model.to(DEVICE)

    # Training
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Early stopping with a patience of 3 (evaluation happens once per epoch).
    trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

    trainer.train()
    evaluations[run_name] = trainer.evaluate()

    # Saving
    # NOTE(review): save_name is the bare run name (no ".zip" extension, and
    # it contains brackets/spaces from the list repr) - confirm this is intended.
    compile_submission(
        save_name=run_name,
        root_dir=ROOT_DIR,
        subtask_id=SUBTASK_ID,
        languages=LANGUAGES,
        model=model,
        thresh=0.5,
        tokenizer=tokenizer,
        batch_size=BATCH_SIZE,
        device=DEVICE
    )

In [None]:
# @title Layer wise LR

# Layer-wise learning-rate run: same model as the baseline, trained with
# PolarTrainer so each backbone layer and the head get their own learning rate.

# Config
BASE_LR = 1e-5
LR_DECAY = 0.9
CLASSIFIER_LR = 1e-4
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06

NUM_EPOCH = 10
BATCH_SIZE = 64

training_args = TrainingArguments(
    # Checkpoints are written to the current working directory.
    output_dir=f"./",

    num_train_epochs=NUM_EPOCH,
    # PolarTrainer.create_optimizer assigns an explicit lr to every parameter
    # group it builds, taken from base_lr / lr_decay / classifier_lr.
    learning_rate=BASE_LR,
    weight_decay=WEIGHT_DECAY,
    # Read by PolarTrainer.create_scheduler to size the warmup phase.
    warmup_ratio=WARMUP_RATIO,

    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True,

    # Evaluate and checkpoint once per epoch; at the end reload the
    # checkpoint with the lowest validation loss.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    logging_steps=100,
    disable_tqdm=False,
    report_to="none"
)

# Key under which this run's metrics are stored; also reused as the archive name.
run_name = f"mmbert-s1-layer_wise"

# Initialization
# Seed torch (CPU and CUDA) and numpy before the model is built.
torch.manual_seed(42)
torch.cuda.manual_seed(42)
np.random.seed(42)

# No hidden layers and no `weights=`: the loss is BCEWithLogitsLoss with
# pos_weight=None.
model = PolarModel(MODEL_NAME, len(labels))
model.to(DEVICE)

# Training
trainer = PolarTrainer(
    base_lr=BASE_LR,
    lr_decay=LR_DECAY,
    classifier_lr=CLASSIFIER_LR,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Early stopping with a patience of 3 (evaluation happens once per epoch).
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

trainer.train()
evaluations[run_name] = trainer.evaluate()

# Saving
# NOTE(review): save_name is the bare run name, so the zip archive is written
# without a ".zip" extension (the commented-out Submissions cells add one) -
# confirm this is intended.
compile_submission(
    save_name=run_name,
    root_dir=ROOT_DIR,
    subtask_id=SUBTASK_ID,
    languages=LANGUAGES,
    model=model,
    thresh=0.5,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
    device=DEVICE
)

  super().__init__(*args, **kwargs)


In [None]:
# Pretty-print the metrics collected for every run. `indent` must be passed as
# a keyword; the previous `indent4` was an undefined name.
pprint(evaluations, indent=4)

## Submissions

In [None]:
# # @title Subtask 1

# torch.manual_seed(42)
# torch.cuda.manual_seed(42)
# np.random.seed(42)

# SUBTASK_ID = 1
# SAVE_NAME = f"mmbert-subtask-{SUBTASK_ID}.zip"

# # Config
# BASE_LR = 1e-5
# LR_DECAY = 0.9
# CLASSIFIER_LR = 1e-4
# WEIGHT_DECAY = 0.01
# WARMUP_RATIO = 0.06

# NUM_EPOCH = 10
# BATCH_SIZE = 64

# training_args = TrainingArguments(
#     output_dir=f"./",

#     num_train_epochs=NUM_EPOCH,
#     learning_rate=BASE_LR,
#     weight_decay=WEIGHT_DECAY,
#     warmup_ratio=WARMUP_RATIO,

#     per_device_train_batch_size=BATCH_SIZE,
#     per_device_eval_batch_size=BATCH_SIZE,
#     fp16=True,

#     eval_strategy="epoch",
#     save_strategy="epoch",
#     save_total_limit=2,
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_loss",
#     greater_is_better=False,

#     logging_steps=100,
#     disable_tqdm=False,
#     report_to="none"
# )

# # Initialization
# dataset = load_dataset(ROOT_DIR, SUBTASK_ID, LANGUAGES, tokenizer, stratify=True)

# train_dataset = dataset["train_dataset"]
# val_dataset = dataset["val_dataset"]
# labels = dataset["labels"]
# weights = dataset["weights"]

# model = PolarModel(MODEL_NAME, len(labels), 1024, weights=weights)
# model.to(DEVICE)

# # Training
# trainer = PolarTrainer(
#     base_lr=BASE_LR,
#     lr_decay=LR_DECAY,
#     classifier_lr=CLASSIFIER_LR,
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics
# )

# trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

# trainer.train()
# print(trainer.evaluate())

# # Saving
# compile_submission(
#     save_name=SAVE_NAME,
#     root_dir=ROOT_DIR,
#     subtask_id=SUBTASK_ID,
#     languages=LANGUAGES,
#     model=model,
#     thresh=0.5,
#     tokenizer=tokenizer,
#     batch_size=BATCH_SIZE,
#     device=DEVICE
# )

In [None]:
# # @title Subtask 2

# torch.manual_seed(42)
# torch.cuda.manual_seed(42)
# np.random.seed(42)

# SUBTASK_ID = 2
# SAVE_NAME = f"mmbert-subtask-{SUBTASK_ID}.zip"

# # Config
# BASE_LR = 1e-5
# LR_DECAY = 0.9
# CLASSIFIER_LR = 1e-4
# WEIGHT_DECAY = 0.01
# WARMUP_RATIO = 0.06

# NUM_EPOCH = 10
# BATCH_SIZE = 64

# training_args = TrainingArguments(
#     output_dir=f"./",

#     num_train_epochs=NUM_EPOCH,
#     learning_rate=BASE_LR,
#     weight_decay=WEIGHT_DECAY,
#     warmup_ratio=WARMUP_RATIO,

#     per_device_train_batch_size=BATCH_SIZE,
#     per_device_eval_batch_size=BATCH_SIZE,
#     fp16=True,

#     eval_strategy="epoch",
#     save_strategy="epoch",
#     save_total_limit=2,
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_loss",
#     greater_is_better=False,

#     logging_steps=100,
#     disable_tqdm=False,
#     report_to="none"
# )

# # Initialization
# dataset = load_dataset(ROOT_DIR, SUBTASK_ID, LANGUAGES, tokenizer, stratify=False)

# train_dataset = dataset["train_dataset"]
# val_dataset = dataset["val_dataset"]
# labels = dataset["labels"]
# weights = dataset["weights"]

# model = PolarModel(MODEL_NAME, len(labels), 1024, weights=weights)
# model.to(DEVICE)

# # Training
# trainer = PolarTrainer(
#     base_lr=BASE_LR,
#     lr_decay=LR_DECAY,
#     classifier_lr=CLASSIFIER_LR,
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics
# )

# trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

# trainer.train()
# print(trainer.evaluate())

# # Saving
# compile_submission(
#     save_name=SAVE_NAME,
#     root_dir=ROOT_DIR,
#     subtask_id=SUBTASK_ID,
#     languages=LANGUAGES,
#     model=model,
#     thresh=0.5,
#     tokenizer=tokenizer,
#     batch_size=BATCH_SIZE,
#     device=DEVICE
# )

In [None]:
# # @title Subtask 3

# torch.manual_seed(42)
# torch.cuda.manual_seed(42)
# np.random.seed(42)

# SUBTASK_ID = 3
# SAVE_NAME = f"mmbert-subtask-{SUBTASK_ID}.zip"

# # Config
# BASE_LR = 1e-5
# LR_DECAY = 0.9
# CLASSIFIER_LR = 1e-4
# WEIGHT_DECAY = 0.01
# WARMUP_RATIO = 0.06

# NUM_EPOCH = 10
# BATCH_SIZE = 64

# training_args = TrainingArguments(
#     output_dir=f"./",

#     num_train_epochs=NUM_EPOCH,
#     learning_rate=BASE_LR,
#     weight_decay=WEIGHT_DECAY,
#     warmup_ratio=WARMUP_RATIO,

#     per_device_train_batch_size=BATCH_SIZE,
#     per_device_eval_batch_size=BATCH_SIZE,
#     fp16=True,

#     eval_strategy="epoch",
#     save_strategy="epoch",
#     save_total_limit=2,
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_loss",
#     greater_is_better=False,

#     logging_steps=100,
#     disable_tqdm=False,
#     report_to="none"
# )

# # Initialization
# dataset = load_dataset(ROOT_DIR, SUBTASK_ID, LANGUAGES, tokenizer, stratify=False)

# train_dataset = dataset["train_dataset"]
# val_dataset = dataset["val_dataset"]
# labels = dataset["labels"]
# weights = dataset["weights"]

# model = PolarModel(MODEL_NAME, len(labels), 1024, weights=weights)
# model.to(DEVICE)

# # Training
# trainer = PolarTrainer(
#     base_lr=BASE_LR,
#     lr_decay=LR_DECAY,
#     classifier_lr=CLASSIFIER_LR,
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics
# )

# trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

# trainer.train()
# print(trainer.evaluate())

# # Saving
# compile_submission(
#     save_name=SAVE_NAME,
#     root_dir=ROOT_DIR,
#     subtask_id=SUBTASK_ID,
#     languages=LANGUAGES,
#     model=model,
#     thresh=0.5,
#     tokenizer=tokenizer,
#     batch_size=BATCH_SIZE,
#     device=DEVICE
# )

---