In [1]:
import os
import gc
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
    get_linear_schedule_with_warmup,
)
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

import torch._dynamo

# Let TorchDynamo log compilation failures instead of raising them, so the
# affected code keeps running uncompiled (presumably needed for ModernBERT's
# compiled layers on this environment — confirm).
torch._dynamo.config.suppress_errors = True

In [2]:
# --- Configuration ---
class CFG:
    """Central place for every tunable value used by the notebook."""

    # Input / output locations
    TRAIN_FILE = '/kaggle/input/llm-classification-finetuning/train.csv'
    TEST_FILE = '/kaggle/input/llm-classification-finetuning/test.csv'
    SAMPLE_SUBMISSION = '/kaggle/input/llm-classification-finetuning/sample_submission.csv'
    OUTPUT_DIR = '/kaggle/working/'

    # Backbone and tokenisation
    MODEL_NAME = '/kaggle/input/modernbert/transformers/base/1/modernbert-base'
    MAX_LENGTH = 1024
    NUM_CLASSES = 3

    # Optimisation
    BATCH_SIZE = 8
    EPOCHS = 2
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 0.01
    WARMUP_RATIO = 0.1

    # Runtime
    SEED = 42
    USE_FULL_DATA = True  # False -> train on a small debug subset
    DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    
# --- Seed every RNG for reproducibility ---
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch (CPU + CUDA) and pin cuDNN settings.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    # Python-level sources of randomness.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU, the current CUDA device, then all CUDA devices.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Request deterministic cuDNN kernels and turn off the autotuner.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Seed all RNGs once, using the seed from the configuration.
seed_everything(CFG.SEED)


In [3]:
# --- Data loading and preprocessing ---
train_df = pd.read_csv(CFG.TRAIN_FILE)
test_df = pd.read_csv(CFG.TEST_FILE)
sample_submission_df = pd.read_csv(CFG.SAMPLE_SUBMISSION)

# Map each one-hot winner column to its integer class id. The encoding below
# is derived from this mapping so the two can never drift apart.
label_mapping = {'winner_model_a': 0, 'winner_model_b': 1, 'winner_tie': 2}
winner_flags = train_df[list(label_mapping)].astype(int)

# Exactly one winner column must be set per row. Without this check an
# all-zero row would silently become class 0 and a multi-hot row would get
# the sum of several class ids.
malformed_rows = winner_flags.sum(axis=1).ne(1)
if malformed_rows.any():
    raise ValueError(
        f"{int(malformed_rows.sum())} training rows do not have exactly one winner column set"
    )

# Weighted sum of the one-hot flags -> class id per row (0, 1 or 2).
train_df['label_encoded'] = winner_flags.dot(pd.Series(label_mapping))

In [4]:
# For quick debugging, train on a small random subset instead of the full data.
if not CFG.USE_FULL_DATA:
    # Cap the sample size at the frame length: DataFrame.sample raises when
    # asked for more rows than exist (sampling without replacement).
    debug_rows = min(1000, len(train_df))
    train_df = train_df.sample(n=debug_rows, random_state=CFG.SEED).reset_index(drop=True)

# --- Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)

In [5]:
# --- Dataset ---
class LLMPreferenceDataset(Dataset):
    """Tokenised (prompt, response_a, response_b) rows for preference classification.

    Each row is rendered as ``PROMPT<sep>RESPONSE_A<sep>RESPONSE_B`` and
    tokenised to a fixed length of ``max_length``.

    Args:
        df: Frame with ``id``, ``prompt``, ``response_a`` and ``response_b``
            columns, plus ``label_encoded`` unless ``is_test`` is True.
        tokenizer: Callable tokenizer exposing ``sep_token``.
        max_length: Length every sequence is padded / truncated to.
        is_test: When True no labels are read and ``__getitem__`` also
            returns the row id.
    """

    def __init__(self, df, tokenizer, max_length, is_test=False):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test
        self.prompts = df['prompt'].values
        self.responses_a = df['response_a'].values
        self.responses_b = df['response_b'].values
        if not self.is_test:
            self.labels = df['label_encoded'].values
        self.ids = df['id'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded row at ``idx``.

        Returns:
            Train/validation mode: dict with ``input_ids``, ``attention_mask``
            and ``labels``. Test mode: ``(dict, row_id)`` where the dict has
            no ``labels`` entry.
        """
        prompt = str(self.prompts[idx])
        response_a = str(self.responses_a[idx])
        response_b = str(self.responses_b[idx])

        # Model input: "PROMPT [SEP] RESPONSE_A [SEP] RESPONSE_B"
        # NOTE(review): the three texts are concatenated before truncation, so
        # with the tokenizer's default (right-side) truncation a long prompt or
        # response_a can crop or drop response_b entirely — confirm acceptable.
        text = f"{prompt}{self.tokenizer.sep_token}{response_a}{self.tokenizer.sep_token}{response_b}"

        # Call the tokenizer directly; `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same arguments.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,  # let the tokenizer add its special tokens (e.g. [CLS]/[SEP])
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch axis of size 1; drop it so
        # the DataLoader can stack samples itself.
        item = {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0)
        }

        if self.is_test:
            # Test rows also carry their id so predictions can be matched back.
            return item, self.ids[idx]

        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item



In [6]:
# --- Train / validation split ---
feature_columns = ['id', 'prompt', 'response_a', 'response_b']
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df[feature_columns],
    train_df['label_encoded'],
    test_size=0.2,
    random_state=CFG.SEED,
    stratify=train_df['label_encoded'],
)

# Re-attach the labels; features and labels of each split share the same index.
train_split_df = train_texts.assign(label_encoded=train_labels)
val_split_df = val_texts.assign(label_encoded=val_labels)

# Datasets: train/validation carry labels, the test set carries row ids instead.
train_dataset = LLMPreferenceDataset(train_split_df, tokenizer, CFG.MAX_LENGTH)
val_dataset = LLMPreferenceDataset(val_split_df, tokenizer, CFG.MAX_LENGTH)
test_dataset = LLMPreferenceDataset(test_df, tokenizer, CFG.MAX_LENGTH, is_test=True)

# Loaders: only the training loader shuffles.
loader_options = {'batch_size': CFG.BATCH_SIZE, 'num_workers': 2}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

In [7]:
# --- Model ---
model_config = AutoConfig.from_pretrained(CFG.MODEL_NAME, num_labels=CFG.NUM_CLASSES)
model = AutoModelForSequenceClassification.from_pretrained(CFG.MODEL_NAME, config=model_config)
model.to(CFG.DEVICE)

# --- Optimizer and scheduler ---
adamw_options = {
    'lr': CFG.LEARNING_RATE,
    'betas': (0.9, 0.999),
    'eps': 1e-8,
    'weight_decay': CFG.WEIGHT_DECAY,
    'amsgrad': False,
}
optimizer = AdamW(params=model.parameters(), **adamw_options)

# The schedule is stepped once per batch, so its horizon is batches * epochs;
# the first WARMUP_RATIO share of those steps ramps the learning rate up.
steps_per_epoch = len(train_loader)
num_training_steps = steps_per_epoch * CFG.EPOCHS
num_warmup_steps = int(CFG.WARMUP_RATIO * num_training_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

criterion = nn.CrossEntropyLoss()

2025-05-25 19:03:03.686087: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748199783.859366      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748199783.908869      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/modernbert/transformers/base/1/modernbert-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'head.dense.weight', 'head.norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# --- Training and evaluation functions ---
def train_epoch(model, data_loader, optimizer, scheduler, device, criterion):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module whose forward returns an object with a ``logits`` attribute.
        data_loader: Sized iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        Mean of the per-batch losses.
    """
    model.train()
    total_loss = 0
    num_batches = len(data_loader)

    for batch_idx, batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Labels are not passed to the model: the training loss comes from
        # `criterion`, so a loss computed inside the model would be discarded.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping
        optimizer.step()
        scheduler.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        if batch_idx % 100 == 0:  # log every 100 batches
            print(f"  Batch {batch_idx}/{num_batches}, Train Loss: {batch_loss:.4f}")

    return total_loss / num_batches

def evaluate_epoch(model, data_loader, device, criterion):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module whose forward returns an object with a ``logits`` attribute.
        data_loader: Sized iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: Device the batch tensors are moved to.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        Tuple ``(avg_loss, competition_log_loss)``: the mean of the per-batch
        losses, and the multi-class log loss over all collected predictions.
    """
    model.eval()
    total_loss = 0
    all_preds_probs = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Labels are not passed to the model: the loss comes from
            # `criterion`, so a loss computed inside the model would be discarded.
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            total_loss += criterion(logits, labels).item()

            probs = torch.softmax(logits, dim=1).cpu().numpy()
            all_preds_probs.extend(probs)
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(data_loader)
    # log_loss needs the probability columns in class-id order 0, 1, 2
    # (winner_model_a, winner_model_b, winner_tie), which is the order the
    # labels were encoded in.
    competition_log_loss = log_loss(all_labels, all_preds_probs, labels=[0, 1, 2])

    return avg_loss, competition_log_loss



In [9]:
# --- Training loop ---
best_val_log_loss = float('inf')
checkpoint_name = f"{CFG.MODEL_NAME.replace('/', '_')}_best.bin"
best_model_path = os.path.join(CFG.OUTPUT_DIR, checkpoint_name)

for epoch in range(CFG.EPOCHS):
    epoch_number = epoch + 1
    print(f"--- Epoch {epoch_number}/{CFG.EPOCHS} ---")

    train_loss = train_epoch(model, train_loader, optimizer, scheduler, CFG.DEVICE, criterion)
    val_loss, val_log_loss = evaluate_epoch(model, val_loader, CFG.DEVICE, criterion)

    print(
        f"Epoch {epoch_number} Summary: ",
        f"  Train Loss: {train_loss:.4f}",
        f"  Val Loss: {val_loss:.4f}",
        f"  Val LogLoss (Competition Metric): {val_log_loss:.4f}",
        sep="\n",
    )

    # Keep only the weights with the lowest validation log loss seen so far.
    improved = val_log_loss < best_val_log_loss
    if improved:
        best_val_log_loss = val_log_loss
        torch.save(model.state_dict(), best_model_path)
        print(f"  New best model saved to {best_model_path} with LogLoss: {val_log_loss:.4f}")

    # Release cached GPU memory and collect garbage between epochs.
    torch.cuda.empty_cache()
    gc.collect()

print("--- Training Finished ---")
print(f"Best Validation LogLoss: {best_val_log_loss:.4f}")


--- Epoch 1/2 ---


W0525 19:03:28.797000 19 torch/_dynamo/convert_frame.py:1233] WON'T CONVERT compiled_embeddings /usr/local/lib/python3.11/dist-packages/transformers/models/modernbert/modeling_modernbert.py line 212 
W0525 19:03:28.797000 19 torch/_dynamo/convert_frame.py:1233] due to: 
W0525 19:03:28.797000 19 torch/_dynamo/convert_frame.py:1233] Traceback (most recent call last):
W0525 19:03:28.797000 19 torch/_dynamo/convert_frame.py:1233]   File "/usr/local/lib/python3.11/dist-packages/torch/_dynamo/convert_frame.py", line 1164, in __call__
W0525 19:03:28.797000 19 torch/_dynamo/convert_frame.py:1233]     result = self._inner_convert(
W0525 19:03:28.797000 19 torch/_dynamo/convert_frame.py:1233]              ^^^^^^^^^^^^^^^^^^^^
W0525 19:03:28.797000 19 torch/_dynamo/convert_frame.py:1233]   File "/usr/local/lib/python3.11/dist-packages/torch/_dynamo/convert_frame.py", line 547, in __call__
W0525 19:03:28.797000 19 torch/_dynamo/convert_frame.py:1233]     return _compile(
W0525 19:03:28.797000 19 t

  Batch 0/5748, Train Loss: 1.1410
  Batch 100/5748, Train Loss: 1.1956
  Batch 200/5748, Train Loss: 1.0706
  Batch 300/5748, Train Loss: 0.9472
  Batch 400/5748, Train Loss: 1.0984
  Batch 500/5748, Train Loss: 1.4391
  Batch 600/5748, Train Loss: 1.0269
  Batch 700/5748, Train Loss: 1.1317
  Batch 800/5748, Train Loss: 1.0631
  Batch 900/5748, Train Loss: 1.2957
  Batch 1000/5748, Train Loss: 1.1350
  Batch 1100/5748, Train Loss: 1.2368
  Batch 1200/5748, Train Loss: 1.2073
  Batch 1300/5748, Train Loss: 1.1618
  Batch 1400/5748, Train Loss: 1.0965
  Batch 1500/5748, Train Loss: 1.0170
  Batch 1600/5748, Train Loss: 1.0871
  Batch 1700/5748, Train Loss: 1.1295
  Batch 1800/5748, Train Loss: 1.1552
  Batch 1900/5748, Train Loss: 1.2208
  Batch 2000/5748, Train Loss: 1.1278
  Batch 2100/5748, Train Loss: 1.2109
  Batch 2200/5748, Train Loss: 1.0389
  Batch 2300/5748, Train Loss: 1.3978
  Batch 2400/5748, Train Loss: 1.3001
  Batch 2500/5748, Train Loss: 1.0444
  Batch 2600/5748, Train

In [10]:
# --- Prediction on the test data ---
# Load the best checkpoint. map_location remaps the stored tensors onto the
# configured device, so a checkpoint written on GPU still loads on a CPU-only
# session.
best_state_dict = torch.load(best_model_path, map_location=CFG.DEVICE)
model.load_state_dict(best_state_dict)
model.eval()

test_preds_probs = []
test_ids_ordered = []

with torch.no_grad():
    # In test mode the dataset yields (inputs, id) pairs, so each batch is a
    # (dict of tensors, batched ids) pair.
    for batch_data, batch_ids in test_loader:
        input_ids = batch_data['input_ids'].to(CFG.DEVICE)
        attention_mask = batch_data['attention_mask'].to(CFG.DEVICE)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1).cpu().numpy()

        test_preds_probs.extend(probs)
        test_ids_ordered.extend(batch_ids.cpu().tolist())

In [11]:
# --- Build the submission file ---
probability_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
submission_df = pd.DataFrame(test_preds_probs, columns=probability_columns)
# Put the row ids in front so the column order is id, a, b, tie.
submission_df.insert(0, 'id', test_ids_ordered)

submission_path = os.path.join(CFG.OUTPUT_DIR, 'submission.csv')
submission_df.to_csv(submission_path, index=False)

print(f"Submission file created at: {submission_path}")
print("Top 5 rows of submission:")
print(submission_df.head())

Submission file created at: /kaggle/working/submission.csv
Top 5 rows of submission:
        id  winner_model_a  winner_model_b  winner_tie
0   136060        0.215080        0.200190    0.584730
1   211333        0.427445        0.291357    0.281198
2  1233961        0.355653        0.291164    0.353183
