In [1]:
import os
import random
import pandas as pd
import numpy as np
import gc

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
    get_linear_schedule_with_warmup,
)

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

import torch._dynamo

torch._dynamo.config.suppress_errors = True

In [2]:
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [3]:
# --- Конфигурац ---
class CFG:
    MODEL_NAME = 'answerdotai/ModernBERT-base'
    MAX_LENGTH = 104
    BATCH_SIZE = 16
    EPOCHS = 4  
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 0.01
    WARMUP_RATIO = 0.1
    NUM_CLASSES = 2
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    SEED = 42
    OUTPUT_DIR = '/kaggle/working/'

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(CFG.SEED)

In [4]:
train_df.drop(['location'], axis=1, inplace=True)
test_df.drop(['location'], axis=1, inplace=True)

In [5]:
train_df.fillna({'keyword': train_df['keyword'].mode()[0]}, inplace=True)
test_df.fillna({'keyword': train_df['keyword'].mode()[0]}, inplace=True)

In [6]:
# --- Создание Dataset ---
class DisasterTweetDataset(Dataset):
    def __init__(self, df, tokenizer, max_length, is_test=False):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test
        self.texts = df['text'].values
        self.keywords = df['keyword'].values
        if not self.is_test:
            self.labels = df['target'].values
        self.ids = df['id'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        keyword = str(self.keywords[idx])

        # Формируем входной текст: "keyword [SEP] text [SEP]"
        text = f"{keyword}{self.tokenizer.sep_token}{text}"

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True, # Добавляет [CLS] и [SEP]
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        item = {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0)
        }
        
        current_id = self.ids[idx] # Сохраняем ID

        if not self.is_test:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
            return item
        else:
            # Для тестового набора данных возвращаем и ID
            return item, current_id


In [7]:
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME) 

# --- Разделение на обучающую и валидационную выборки ---
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df[['id','keyword', 'text']],
    train_df['target'],
    test_size=0.2,
    random_state=CFG.SEED,
    stratify=train_df['target']
)

train_split_df = pd.DataFrame({
    'id': train_texts['id'],
    'keyword': train_texts['keyword'],
    'text': train_texts['text'],
    'target': train_labels
})

val_split_df = pd.DataFrame({
    'id': val_texts['id'],
    'keyword': val_texts['keyword'],
    'text': val_texts['text'],
    'target': val_labels
})


train_dataset = DisasterTweetDataset(train_split_df, tokenizer, CFG.MAX_LENGTH)
val_dataset = DisasterTweetDataset(val_split_df, tokenizer, CFG.MAX_LENGTH)
test_dataset = DisasterTweetDataset(test_df, tokenizer, CFG.MAX_LENGTH, is_test=True)

train_loader = DataLoader(train_dataset, batch_size=CFG.BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=CFG.BATCH_SIZE, shuffle=False, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=CFG.BATCH_SIZE, shuffle=False, num_workers=2)

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

In [8]:
# --- Модель ---
model_config = AutoConfig.from_pretrained(
    CFG.MODEL_NAME, 
    num_labels=CFG.NUM_CLASSES
)

model = AutoModelForSequenceClassification.from_pretrained(
        CFG.MODEL_NAME,
        config=model_config
    )

model.to(CFG.DEVICE)

# --- Оптимизатор и скедулер ---
optimizer = AdamW(
    params=model.parameters(),      
    lr=CFG.LEARNING_RATE,                      
    betas=(0.9, 0.999),          
    eps=1e-8,                      
    weight_decay=CFG.WEIGHT_DECAY,
    amsgrad=False                 
)
num_training_steps = len(train_loader) * CFG.EPOCHS
num_warmup_steps = int(CFG.WARMUP_RATIO * num_training_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

criterion = nn.CrossEntropyLoss()

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

2025-05-30 17:28:09.797333: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748626089.985558      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748626090.045370      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# --- Функции обучения и оценки ---
def train_epoch(model, data_loader, optimizer, scheduler, device, criterion):
    model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)        
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        
        if batch_idx % 100 == 0:
            print(f"  Batch {batch_idx}/{len(data_loader)}, Train Loss: {loss.item():.4f}")

    return total_loss / len(data_loader)

def evaluate_epoch(model, data_loader, device, criterion):
    model.eval()
    total_loss = 0
    all_preds_probs = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            
            logits = outputs.logits
            loss = outputs.loss
            total_loss += loss.item()
            
            probs = torch.softmax(logits, dim=1).cpu().numpy()
            all_preds_probs.extend(probs)
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(data_loader)

    competition_log_loss = log_loss(all_labels, all_preds_probs, labels=[0, 1])

    return avg_loss, competition_log_loss



In [10]:
# --- Цикл трейнинга, рокки 10 бля ---
best_val_log_loss = float('inf')
best_model_path = os.path.join(CFG.OUTPUT_DIR, f"{CFG.MODEL_NAME.replace('/', '_')}_best.bin")

for epoch in range(CFG.EPOCHS):
    print(f"--- Epoch {epoch+1}/{CFG.EPOCHS} ---")
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, CFG.DEVICE, criterion)
    val_loss, val_log_loss = evaluate_epoch(model, val_loader, CFG.DEVICE, criterion)

    print(f"Epoch {epoch+1} Summary: ")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Val Loss: {val_loss:.4f}")
    print(f"  Val LogLoss (Competition Metric): {val_log_loss:.4f}")

    if val_log_loss < best_val_log_loss:
        best_val_log_loss = val_log_loss
        torch.save(model.state_dict(), best_model_path)
        print(f"  New best model saved to {best_model_path} with LogLoss: {val_log_loss:.4f}")
    
    torch.cuda.empty_cache()
    gc.collect()

print(f"--- Training Finished ---")
print(f"Best Validation LogLoss: {best_val_log_loss:.4f}")


--- Epoch 1/4 ---


W0530 17:28:29.582000 19 torch/_dynamo/convert_frame.py:1233] WON'T CONVERT compiled_embeddings /usr/local/lib/python3.11/dist-packages/transformers/models/modernbert/modeling_modernbert.py line 212 
W0530 17:28:29.582000 19 torch/_dynamo/convert_frame.py:1233] due to: 
W0530 17:28:29.582000 19 torch/_dynamo/convert_frame.py:1233] Traceback (most recent call last):
W0530 17:28:29.582000 19 torch/_dynamo/convert_frame.py:1233]   File "/usr/local/lib/python3.11/dist-packages/torch/_dynamo/convert_frame.py", line 1164, in __call__
W0530 17:28:29.582000 19 torch/_dynamo/convert_frame.py:1233]     result = self._inner_convert(
W0530 17:28:29.582000 19 torch/_dynamo/convert_frame.py:1233]              ^^^^^^^^^^^^^^^^^^^^
W0530 17:28:29.582000 19 torch/_dynamo/convert_frame.py:1233]   File "/usr/local/lib/python3.11/dist-packages/torch/_dynamo/convert_frame.py", line 547, in __call__
W0530 17:28:29.582000 19 torch/_dynamo/convert_frame.py:1233]     return _compile(
W0530 17:28:29.582000 19 t

  Batch 0/381, Train Loss: 0.8148
  Batch 100/381, Train Loss: 0.2705
  Batch 200/381, Train Loss: 0.4946
  Batch 300/381, Train Loss: 0.1455
Epoch 1 Summary: 
  Train Loss: 0.4840
  Val Loss: 0.3932
  Val LogLoss (Competition Metric): 0.3926
  New best model saved to /kaggle/working/answerdotai_ModernBERT-base_best.bin with LogLoss: 0.3926
--- Epoch 2/4 ---
  Batch 0/381, Train Loss: 0.4805
  Batch 100/381, Train Loss: 0.0980
  Batch 200/381, Train Loss: 0.3808
  Batch 300/381, Train Loss: 0.5372
Epoch 2 Summary: 
  Train Loss: 0.3325
  Val Loss: 0.4085
  Val LogLoss (Competition Metric): 0.4075
--- Epoch 3/4 ---
  Batch 0/381, Train Loss: 0.3469
  Batch 100/381, Train Loss: 0.2219
  Batch 200/381, Train Loss: 0.2173
  Batch 300/381, Train Loss: 0.2209
Epoch 3 Summary: 
  Train Loss: 0.2196
  Val Loss: 0.4245
  Val LogLoss (Competition Metric): 0.4225
--- Epoch 4/4 ---
  Batch 0/381, Train Loss: 0.0842
  Batch 100/381, Train Loss: 0.1980
  Batch 200/381, Train Loss: 0.0964
  Batch 300

In [11]:
# --- Предсказание на тестовых данных ---
model.load_state_dict(torch.load(best_model_path))
model.eval()

test_preds_classes = []
test_ids_ordered = []

with torch.no_grad():
    for batch_data, batch_ids in test_loader:
        input_ids = batch_data['input_ids'].to(CFG.DEVICE)
        attention_mask = batch_data['attention_mask'].to(CFG.DEVICE)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        predicted_classes_batch = torch.argmax(probs, dim=1).cpu().numpy()
        test_preds_classes.extend(predicted_classes_batch)
        
        test_ids_ordered.extend(batch_ids.cpu().tolist())

In [12]:
# --- Формирование файла сабмита ---
submission_df = pd.DataFrame({
    'id': test_ids_ordered,
    'target': test_preds_classes 
})

submission_path = os.path.join(CFG.OUTPUT_DIR, 'submission.csv')
submission_df.to_csv(submission_path, index=False)

print(f"Submission file created at: {submission_path}")
print("Top 5 rows of submission:")
print(submission_df.head())

Submission file created at: /kaggle/working/submission.csv
Top 5 rows of submission:
   id  target
0   0       1
1   2       1
2   3       1
3   9       1
4  11       1
