In [None]:
# Ячейка 1: Импорты и базовые настройки
from dotenv import load_dotenv
load_dotenv()
import os
import ast
import copy
import gc
import itertools
import joblib
import json
import math
import matplotlib.pyplot as plt
import multiprocessing
import numpy as np
import pandas as pd
import pickle
import random
import re
import scipy as sp
import string
import sys
import time
import warnings

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import roc_auc_score, classification_report, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

from tqdm.auto import tqdm

import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Silence every Python warning for the rest of the notebook session.
warnings.filterwarnings("ignore")

# Device selection: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Current device is: {device}")

In [None]:
# Cell 2: File paths and configuration
# MAIN_DIR is read from the environment (populated by load_dotenv() above).
main_dir = os.getenv("MAIN_DIR")
if not main_dir:
    # Without this check os.path.join(None, ...) fails with an opaque TypeError.
    raise RuntimeError(
        "Environment variable MAIN_DIR is not set; define it in the "
        "environment or in the .env file before running this notebook."
    )

data_dir = os.path.join(main_dir, "liar2")
extra_data_dir = os.path.join(main_dir, "liar-twitter")
output_dir = os.path.join(main_dir, "output")
# The output directory is created on demand; existing directories are left as is.
os.makedirs(output_dir, exist_ok=True)

class config:
    """Hyper-parameters and run settings shared by the cells below."""

    # --- model / tokenisation ---
    MODEL = "microsoft/deberta-v3-xsmall"
    MAX_LEN = 128
    NUM_CLASSES = 5  # one fewer than the raw label set: classes 0 and 1 are merged later

    # --- optimisation ---
    EPOCHS = 10
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 0.01
    GRADIENT_ACCUMULATION_STEPS = 4

    # --- data loading ---
    BATCH_SIZE_TRAIN = 32
    BATCH_SIZE_VALID = 32
    NUM_WORKERS = 0  # keep at 0 when memory is tight

    # --- reproducibility ---
    SEED = 42

In [None]:
# Cell 3: Seeding helper
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to `random`, `numpy.random`, the PyTorch
            CPU generator and all CUDA generators.

    Also switches cuDNN to deterministic mode and disables its benchmark
    autotuner.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # With benchmark mode on, cuDNN may pick a different convolution algorithm
    # on each run, which defeats the deterministic flag set above.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs with the notebook-wide seed before any model is initialised.
seed_everything(config.SEED)

In [None]:
# Cell 4.1: Load the liar2 splits
# The three CSV files are read in the fixed order train / valid / test.
train_df, valid_df, test_df = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ("train.csv", "valid.csv", "test.csv")
)

print(
    f"Train shape liar2: {train_df.shape}",
    f"Valid shape liar2: {valid_df.shape}",
    f"Test shape liar2: {test_df.shape}",
    sep="\n",
)
display(train_df.head())

In [None]:
# Cell 4.2: Load the liar-twitter data and split it 80/10/10
extra_df = pd.read_csv(os.path.join(extra_data_dir, "Liar_Dataset.csv"))

# String labels of this dataset mapped onto the integer scale 0..5.
label_mapping = {"pants-fire": 0, "FALSE": 1, "barely-true": 2, "half-true": 3, "mostly-true": 4, "TRUE": 5}

# Series.map() silently produces NaN for any value missing from the mapping;
# fail fast instead of letting NaN labels reach the stratified split and the
# training targets.
unmapped_labels = extra_df.loc[~extra_df['label'].isin(list(label_mapping)), 'label']
if not unmapped_labels.empty:
    raise ValueError(
        "Liar_Dataset.csv contains labels that are missing from label_mapping: "
        f"{sorted(unmapped_labels.astype(str).unique())}"
    )
extra_df['label'] = extra_df['label'].map(label_mapping)

# First split: 80% train, 20% held out (stratified by label).
extra_train_df, extra_test_valid_df = train_test_split(
    extra_df,
    test_size=0.2,
    stratify=extra_df['label'],
    random_state=config.SEED
)

# Second split: the held-out part is halved into validation and test.
extra_valid_df, extra_test_df = train_test_split(
    extra_test_valid_df,
    test_size=0.5,
    stratify=extra_test_valid_df['label'],
    random_state=config.SEED
)

print(f"Train shape liar-twitter: {extra_train_df.shape}")
print(f"Valid shape liar-twitter: {extra_valid_df.shape}")
print(f"Test shape liar-twitter: {extra_test_df.shape}")
display(extra_df.head())

In [None]:
# Cell 4.3: Merge the two datasets
# Only the statement text and its label are kept from either source.
keep_cols = ['statement', 'label']

extra_train_df, extra_valid_df, extra_test_df = (
    frame[keep_cols] for frame in (extra_train_df, extra_valid_df, extra_test_df)
)

# Append each liar-twitter split to the matching liar2 split with a fresh index.
train_df, valid_df, test_df = (
    pd.concat([base[keep_cols], extra], ignore_index=True)
    for base, extra in zip(
        (train_df, valid_df, test_df),
        (extra_train_df, extra_valid_df, extra_test_df),
    )
)

print(f"Train shape: {train_df.shape}")
print(train_df['label'].value_counts())
print(f"Valid shape: {valid_df.shape}")
print(f"Test shape: {test_df.shape}")

In [None]:
# Cell 5: Label remapping and class balancing

# Merge class 0 (pants-on-fire) into class 1 (false) and shift every label
# down by one so that classes start at 0. The model separates these two poorly
# and both are essentially "false": they differ only in how brazen the lie is,
# which is hard to tell apart without additional context.
#
# The guard makes the cell idempotent: the remap runs only while the labels are
# still on the raw 0..5 scale, so re-running the cell does not merge and shift
# them a second time. Assumes the raw training labels include the top class
# (config.NUM_CLASSES) -- TODO confirm for other datasets.
if train_df['label'].max() >= config.NUM_CLASSES:
    for split_df in (train_df, valid_df, test_df):
        split_df['label'] = split_df['label'].replace({0: 1}) - 1

# Undersampling: shrink the largest class down to the size of the
# second-largest one. The majority class is taken from the data rather than
# hard-coded, so resample(replace=False) is never asked for more rows than the
# class actually has.
label_counts = train_df['label'].value_counts()
majority_class = label_counts.idxmax()
max_samples = label_counts.nlargest(2).iloc[-1]

df_majority = train_df[train_df['label'] == majority_class]
df_minority = train_df[train_df['label'] != majority_class]

df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=max_samples,
                                   random_state=config.SEED)

train_df_balanced = pd.concat([df_majority_downsampled, df_minority])

# Oversampling: every class smaller than 15% of the original training set size
# is resampled with replacement up to that size.
min_samples = round(label_counts.sum() * 0.15)

dfs = []
# Iterate in order of first appearance: the concatenation order feeds the
# seeded shuffle below, so it must stay stable.
for label in train_df_balanced['label'].unique():
    df_class = train_df_balanced[train_df_balanced['label'] == label]
    if len(df_class) < min_samples:
        df_upsampled = resample(df_class,
                                replace=True,
                                n_samples=min_samples,
                                random_state=config.SEED)
        dfs.append(df_upsampled)
    else:
        dfs.append(df_class)

train_df_balanced = pd.concat(dfs).sample(frac=1, random_state=config.SEED).reset_index(drop=True)

print("Баланс классов до ресэмплинга:")
print(train_df['label'].value_counts())

print()
print(f"Андерсемплинг до: {max_samples}")
print(f"Оверсемплинг до: {min_samples}")
print()

print("Баланс классов после ресэмплинга:")
print(train_df_balanced['label'].value_counts())

In [None]:
# Cell 6: Tokenizer
# Loaded for the same checkpoint name (config.MODEL) that the model cell uses.
tokenizer = AutoTokenizer.from_pretrained(config.MODEL)

In [None]:
# Cell 7: Dataset for multi-class classification
class LiarDataset(Dataset):
    """Map-style dataset that tokenizes one statement per item.

    Args:
        df: DataFrame with a 'statement' column (missing values become empty
            strings) and a 'label' column.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length',
            truncation=True, return_tensors='pt')`` that returns a mapping of
            tensors with a leading batch dimension of 1.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = df['statement'].fillna("").values
        self.labels = df['label'].values

    def __len__(self):
        """Number of statements in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded statement at ``idx`` plus its label under 'labels'."""
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop the batch dimension the tokenizer adds for a single text.
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = tensor.squeeze(0)
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
# Cell 8: Build the datasets and data loaders
# Training reads the rebalanced frame and is shuffled every epoch.
train_dataset = LiarDataset(train_df_balanced, tokenizer, config.MAX_LEN)
train_loader = DataLoader(
    train_dataset,
    batch_size=config.BATCH_SIZE_TRAIN,
    shuffle=True,
    num_workers=config.NUM_WORKERS,
)

# Validation uses the untouched validation frame in its original order.
valid_dataset = LiarDataset(valid_df, tokenizer, config.MAX_LEN)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=config.BATCH_SIZE_VALID,
    shuffle=False,
    num_workers=config.NUM_WORKERS,
)

In [None]:
# Cell 9: Classifier with a `num_classes`-way output head
class CustomModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        model_name: Checkpoint name passed to AutoConfig / AutoModel.
        num_classes: Number of output logits.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.config.output_hidden_states = False
        self.model = AutoModel.from_pretrained(model_name, config=self.config)
        self.dropout = nn.Dropout(0.5)
        self.classifier = nn.Linear(self.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the first token's final hidden state."""
        encoder_out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state serves as the sequence summary (CLS token).
        cls_vector = encoder_out.last_hidden_state[:, 0]
        return self.classifier(self.dropout(cls_vector))

In [None]:
# Cell 10: Training and validation helpers
def train_epoch(model, dataloader, optimizer, criterion):
    """Run one optimisation pass over ``dataloader``.

    Each batch is moved to the global ``device``, followed by a full
    zero_grad / forward / backward / step cycle.

    Returns:
        Tuple ``(mean batch loss, accuracy over all samples seen)``.
    """
    model.train()
    batch_losses, batch_preds, batch_targets = [], [], []

    for batch in tqdm(dataloader, desc="Training"):
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        y_true = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(ids, mask)
        batch_loss = criterion(logits, y_true)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
        # The predicted class is the index of the largest logit.
        batch_preds.append(logits.argmax(dim=1).detach().cpu().numpy())
        batch_targets.append(y_true.detach().cpu().numpy())

    mean_loss = np.mean(batch_losses)
    all_preds = np.concatenate(batch_preds)
    all_targets = np.concatenate(batch_targets)
    accuracy = (all_preds == all_targets).mean()
    return mean_loss, accuracy

def valid_epoch(model, dataloader, criterion):
    """Evaluate ``model`` on ``dataloader`` with gradients disabled.

    Each batch is moved to the global ``device`` before the forward pass.

    Returns:
        Tuple ``(mean batch loss, accuracy over all samples seen)``.
    """
    model.eval()
    batch_losses, batch_preds, batch_targets = [], [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validation"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            y_true = batch['labels'].to(device)

            logits = model(ids, mask)
            batch_losses.append(criterion(logits, y_true).item())
            # The predicted class is the index of the largest logit.
            batch_preds.append(logits.argmax(dim=1).detach().cpu().numpy())
            batch_targets.append(y_true.detach().cpu().numpy())

    mean_loss = np.mean(batch_losses)
    all_preds = np.concatenate(batch_preds)
    all_targets = np.concatenate(batch_targets)
    accuracy = (all_preds == all_targets).mean()
    return mean_loss, accuracy

In [None]:
# Cell 11: Main training loop
model = CustomModel(config.MODEL, config.NUM_CLASSES).to(device)
# To continue from a previously saved checkpoint, uncomment:
# model.load_state_dict(torch.load(os.path.join(output_dir, "deberta_v3_xsmall_liar2.pth")))

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config.LEARNING_RATE,
    weight_decay=config.WEIGHT_DECAY,
)

# Class imbalance: weight each class by its inverse frequency in the balanced
# training frame, normalised so that the weights sum to 1.
class_counts = train_df_balanced['label'].value_counts().sort_index()
inverse_counts = 1.0 / torch.tensor(class_counts.values, dtype=torch.float)
class_weights = inverse_counts / inverse_counts.sum()
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))

# Unweighted alternative:
# criterion = nn.CrossEntropyLoss()

# Gradient scaler used together with autocast in the training loop.
scaler = torch.cuda.amp.GradScaler()

# Train in rounds of config.EPOCHS epochs; after each round the user is asked
# whether to run another one. The weights are written to disk once, after the
# final round.
looping = True

while looping:
    for epoch in range(config.EPOCHS):
        model.train()
        train_losses = []
        train_preds = []
        train_targets = []

        optimizer.zero_grad()

        for step, batch in enumerate(tqdm(train_loader, desc=f"Training Epoch {epoch+1}")):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with torch.cuda.amp.autocast():
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                # Scale the loss down so the accumulated gradients are an
                # average over the accumulation window rather than a sum.
                loss = loss / config.GRADIENT_ACCUMULATION_STEPS

            scaler.scale(loss).backward()

            # Step the optimizer once per accumulation window, and on the last
            # batch of the epoch so no gradients are left unapplied.
            if (step + 1) % config.GRADIENT_ACCUMULATION_STEPS == 0 or (step + 1) == len(train_loader):
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            # Undo the accumulation scaling so the logged loss is per batch.
            train_losses.append(loss.item() * config.GRADIENT_ACCUMULATION_STEPS)
            preds = outputs.argmax(dim=1).detach().cpu().numpy()
            train_preds.extend(preds)
            train_targets.extend(labels.detach().cpu().numpy())

        train_acc = np.mean(np.array(train_preds) == np.array(train_targets))
        train_loss = np.mean(train_losses)

        # Validation (no gradients)
        model.eval()
        val_losses = []
        val_preds = []
        val_targets = []

        with torch.no_grad():
            for batch in tqdm(valid_loader, desc="Validation"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)

                val_losses.append(loss.item())
                preds = outputs.argmax(dim=1).detach().cpu().numpy()
                val_preds.extend(preds)
                val_targets.extend(labels.detach().cpu().numpy())

        val_acc = np.mean(np.array(val_preds) == np.array(val_targets))
        val_loss = np.mean(val_losses)

        print(f"Epoch {epoch+1}/{config.EPOCHS} | Train loss: {train_loss:.4f} acc: {train_acc:.4f} | Val loss: {val_loss:.4f} acc: {val_acc:.4f}")
        print("Classification report (macro):")
        print(classification_report(val_targets, val_preds, digits=4, zero_division=0))
        print("Macro F1-score:", f1_score(val_targets, val_preds, average='macro'))
        print("Confusion matrix:")
        print(confusion_matrix(val_targets, val_preds))
        print("")

    user_input = input("Хотите продолжить тренировку? (yes/no): ").strip().upper()
    # Continue only when the answer starts with "Y" ("y", "yes", ...). A plain
    # substring test would also accept answers such as "no way" or "nay".
    looping = user_input.startswith("Y")

# Save the model weights
torch.save(model.state_dict(), os.path.join(output_dir, "deberta_v3_xsmall_liar2.pth"))
