# Задача классификации наличия матов в тексте отзыва

Импорт библиотек

In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

import torch

import re
from tqdm import tqdm

    
# Global random seed reused by the data split and the models below for reproducibility.
seed = 42

### Базовый класс

In [None]:
from abc import ABC, abstractmethod


class BaseTextClassifier(ABC):
    """Common interface for the binary text classifiers in this notebook.

    Subclasses must implement ``fit`` and ``predict``. ``predict_proba`` and
    ``evaluate`` come with default implementations built on ``predict``.
    """

    def __init__(self, verbose=True, threshold=0.5):
        """Store the shared settings.

        Args:
            verbose: When true, ``evaluate`` prints its reports.
            threshold: Decision threshold kept on the instance for subclasses
                that convert probabilities into labels.
        """
        self.threshold = threshold
        self.is_fitted = False
        self.verbose = verbose

    @abstractmethod
    def fit(self, X, y):
        """Train the model on texts ``X`` with labels ``y``."""

    @abstractmethod
    def predict(self, X):
        """Return one predicted label per element of ``X``."""

    def predict_proba(self, X):
        """Return one-hot pseudo-probabilities derived from ``predict``.

        This is only a fallback for models without real probabilities.

        Returns:
            Array of shape ``(len(X), 2)``; column 0 is class 0, column 1 is
            class 1.
        """
        # Coerce to an array: comparing a plain list with ``== 0`` yields a
        # single ``False`` and would leave every row at zero.
        predictions = np.asarray(self.predict(X))
        probs = np.zeros((len(X), 2))
        probs[predictions == 0, 0] = 1.0
        probs[predictions == 1, 1] = 1.0
        return probs

    def evaluate(self, X, y, model_name="Model"):
        """Compute the F1 score of ``predict(X)`` against ``y``.

        When ``self.verbose`` is true, also prints the score, the
        classification report and the confusion matrix.

        Returns:
            The F1 score as returned by ``f1_score``.
        """
        y_pred = self.predict(X)
        f1 = f1_score(y, y_pred)

        if self.verbose:
            print(f"\n{model_name} F1-score на validation: {f1:.4f}")
            print("\nClassification Report:")
            print(classification_report(y, y_pred))
            print("\nConfusion Matrix:")
            print(confusion_matrix(y, y_pred))

        return f1

## Извлечение данных

In [8]:
# Directory that holds the competition CSV files.
data_dir = 'data'
# Training data (with the label column) and the test data to be predicted.
train_df = pd.read_csv(f'{data_dir}/train.csv')
test_df = pd.read_csv(f'{data_dir}/test.csv')
# sample_submission = pd.read_csv('sample_submission.csv')

Разбиение на обучающую и валидационную выборки (X — тексты, y — метки)

In [9]:
from sklearn.model_selection import train_test_split

# Target column name and the share of labelled rows held out for validation.
target_column = "label"
test_size = 0.2

# Seed NumPy's global RNG as well as the splitter itself.
np.random.seed(seed)

# Stratify on the target so both parts keep the same class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'],
    train_df[target_column],
    stratify=train_df[target_column],
    test_size=test_size,
    random_state=seed,
)

# Report the split sizes and the class balance of each part.
print(
    f"Train : {X_train.shape} {y_train.shape}",
    f"Validation : {X_val.shape} {y_val.shape}",
    f"\nРаспределение классов в train: {y_train.value_counts()}",
    f"Распределение классов в val: {y_val.value_counts()}",
    sep="\n",
)

Train : (192127,) (192127,)
Validation : (48032,) (48032,)

Распределение классов в train: label
0    168369
1     23758
Name: count, dtype: int64
Распределение классов в val: label
0    42092
1     5940
Name: count, dtype: int64


### Просмотр данных

In [10]:
# Shapes of the raw train/test frames.
print(
    "Размеры данных:",
    f"Train: {train_df.shape}",
    f"Test: {test_df.shape}",
    sep="\n",
)
# Class balance over the full training frame.
print("\nРаспределение классов в train:", train_df['label'].value_counts(), sep="\n")

Размеры данных:
Train: (240159, 3)
Test: (60040, 2)

Распределение классов в train:
label
0    210461
1     29698
Name: count, dtype: int64


In [None]:
pd.set_option('display.max_colwidth', None)  # Show full column contents (so each review is visible in full)
pd.set_option('display.width', None)  # Remove the overall display-width limit


# Take the first 10 rows with label 0
train_df[train_df['label'] == 0].head(10)

In [None]:
# Take the first 10 rows with label 1
train_df[train_df['label'] == 1].head(10)

## Бейслайны

### Частотный бейслайн

**Частотный baseline** — не смотрит на текст и предсказывает метку случайно: класс 1 выбирается с вероятностью, равной его доле в обучающей выборке.

In [8]:
class FrequencyBaseline(BaseTextClassifier):
    """Baseline that ignores the text and samples labels from the class prior."""

    def __init__(self, verbose=True, random_state=None):
        """Create an unfitted baseline.

        Args:
            verbose: Print the learned class share during ``fit``.
            random_state: Seed for sampling in ``predict``. ``None`` falls
                back to the notebook-level ``seed``.
        """
        super().__init__(verbose=verbose)
        self.class_1_prob = None
        self.random_state = random_state

    def _require_fitted(self):
        """Raise ``ValueError`` when the model has not been fitted yet."""
        if not self.is_fitted:
            raise ValueError("Модель не обучена!")

    def fit(self, X, y):
        """Remember the share of class 1 in ``y``; ``X`` is not used.

        Returns:
            ``self``, to allow chaining.
        """
        self.class_1_prob = float(np.mean(y))
        if self.verbose:
            print(f"Доля класса 1 в train: {self.class_1_prob:.3f}")
        self.is_fitted = True
        return self

    def predict(self, X):
        """Draw one Bernoulli label per element of ``X`` using the learned share.

        Raises:
            ValueError: If the model has not been fitted.
        """
        self._require_fitted()

        # A private RandomState gives the same draws as reseeding the global
        # generator, but leaves NumPy's global random state untouched.
        rng_seed = seed if self.random_state is None else self.random_state
        rng = np.random.RandomState(rng_seed)
        return rng.binomial(1, self.class_1_prob, size=len(X))

    def predict_proba(self, X):
        """Return the learned class shares, repeated for every element of ``X``.

        Returns:
            Array of shape ``(len(X), 2)``; column 0 is class 0, column 1 is
            class 1.

        Raises:
            ValueError: If the model has not been fitted.
        """
        self._require_fitted()

        probs = np.zeros((len(X), 2))
        probs[:, 0] = 1 - self.class_1_prob  # class 0
        probs[:, 1] = self.class_1_prob      # class 1
        return probs

In [9]:
# Create the frequency baseline and fit it on the training split
baseline_model = FrequencyBaseline()
_ =baseline_model.fit(X_train, y_train)

Доля класса 1 в train: 0.124


In [10]:
# Score the baseline on the validation split with the shared evaluate() method
f1_baseline = baseline_model.evaluate(X_val, y_val, "Frequency Baseline")


Frequency Baseline F1-score на validation: 0.1280

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     42092
           1       0.13      0.13      0.13      5940

    accuracy                           0.78     48032
   macro avg       0.50      0.50      0.50     48032
weighted avg       0.78      0.78      0.78     48032


Confusion Matrix:
[[36904  5188]
 [ 5179   761]]


### Бейслайн: TF-IDF + логистическая регрессия

In [None]:
class TfIdfBaseline(BaseTextClassifier):
    """TF-IDF features (unigrams and bigrams) fed into a class-balanced logistic regression."""

    def __init__(self, verbose=True):
        super().__init__(verbose=verbose)

        vectorizer = TfidfVectorizer(
            max_features=18000,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.95,
            lowercase=True,
            token_pattern=r'\b\w+\b'
        )
        # The solver's own progress output follows the verbose flag.
        classifier = LogisticRegression(
            random_state=seed,
            class_weight='balanced',
            C=4.0,
            max_iter=1000,
            verbose=1 if verbose else 0
        )
        self.pipeline = Pipeline([('tfidf', vectorizer), ('classifier', classifier)])

    def _ensure_fitted(self):
        """Raise ``ValueError`` unless ``fit`` has completed."""
        if not self.is_fitted:
            raise ValueError("Модель не обучена!")

    def preprocess_text(self, texts):
        """Lowercase each text, replace punctuation with spaces and collapse whitespace.

        Shows a tqdm progress bar when ``self.verbose`` is true.

        Returns:
            List of cleaned strings, one per input element.
        """
        source = texts
        if self.verbose:
            print("Предобработка текстов...")
            source = tqdm(texts, desc="Processing texts")

        cleaned = []
        for raw in source:
            no_punct = re.sub(r'[^\w\s]', ' ', str(raw).lower())
            cleaned.append(re.sub(r'\s+', ' ', no_punct.strip()))
        return cleaned

    def fit(self, X, y):
        """Preprocess ``X`` and fit the TF-IDF + logistic regression pipeline.

        Returns:
            ``self``, to allow chaining.
        """
        if self.verbose:
            print("=== Обучение TF-IDF модели ===")

        prepared = self.preprocess_text(X)

        if self.verbose:
            print("Обучение логистической регрессии...")
        self.pipeline.fit(prepared, y)
        self.is_fitted = True

        if self.verbose:
            print("Обучение завершено!")
        return self

    def predict(self, X):
        """Return the pipeline's predicted labels for ``X``.

        Raises:
            ValueError: If the model has not been fitted.
        """
        self._ensure_fitted()
        return self.pipeline.predict(self.preprocess_text(X))

    def predict_proba(self, X):
        """Return the pipeline's class probabilities for ``X``.

        Raises:
            ValueError: If the model has not been fitted.
        """
        self._ensure_fitted()
        return self.pipeline.predict_proba(self.preprocess_text(X))


In [None]:

# Create the TF-IDF baseline and fit it on the training split
tfidf_model = TfIdfBaseline()
_ = tfidf_model.fit(X_train, y_train)


In [11]:
# Score on the validation split with the shared evaluate() method
f1_tfidf = tfidf_model.evaluate(X_val, y_val, "TF-IDF")  # F1 = 0.8156

Предобработка текстов...


Processing texts: 100%|██████████| 48032/48032 [00:00<00:00, 139746.90it/s]



TF-IDF F1-score на validation: 0.8156

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97     42092
           1       0.77      0.87      0.82      5940

    accuracy                           0.95     48032
   macro avg       0.87      0.92      0.89     48032
weighted avg       0.95      0.95      0.95     48032


Confusion Matrix:
[[40504  1588]
 [  756  5184]]


## Модели

### Оптимизация TF-IDF через GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

class OptimizedTfIdfBaseline(BaseTextClassifier):
    """TF-IDF + logistic regression tuned with a cross-validated grid search."""

    # Search space used when the caller does not supply one.
    DEFAULT_PARAM_GRID = {
        'tfidf__max_features': [20000, 25000],
        'classifier__C': [4.0, 4.5],
        'classifier__max_iter': [500, 800]
    }

    def __init__(self, verbose=True, param_grid=None, n_splits=2, n_jobs=11):
        """Build the base pipeline and store the search settings.

        Args:
            verbose: Print progress and pass verbosity on to scikit-learn.
            param_grid: Grid for ``GridSearchCV``, keyed by pipeline parameter
                name. ``None`` uses ``DEFAULT_PARAM_GRID``.
            n_splits: Number of stratified CV folds.
            n_jobs: Number of parallel workers for the grid search.
        """
        super().__init__(verbose=verbose)

        # Base pipeline; the grid search overrides the tuned parameters.
        self.pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                min_df=2,
                max_df=0.95,
                ngram_range=(1, 2),
                lowercase=True,
                token_pattern=r'\b\w+\b',
                sublinear_tf=True  # use 1 + log(tf) instead of raw term counts
            )),
            ('classifier', LogisticRegression(
                random_state=seed,
                class_weight='balanced',
                solver='liblinear',
                verbose=1 if verbose else 0
            ))
        ])

        grid = self.DEFAULT_PARAM_GRID if param_grid is None else param_grid
        # Copy so later edits by the caller do not change this instance.
        self.param_grid = {name: list(values) for name, values in grid.items()}
        self.n_splits = n_splits
        self.n_jobs = n_jobs

        self.best_model = None
        self.grid_search = None

    def _require_fitted(self):
        """Raise ``ValueError`` unless ``fit`` has completed."""
        if not self.is_fitted:
            raise ValueError("Модель не обучена!")

    def preprocess_text(self, texts):
        """Lowercase each text, replace punctuation with spaces and collapse whitespace.

        Shows a tqdm progress bar when ``self.verbose`` is true.

        Returns:
            List of cleaned strings, one per input element.
        """
        if self.verbose:
            print("Предобработка текстов...")
            iterator = tqdm(texts, desc="Processing texts")
        else:
            iterator = texts

        processed = []
        for text in iterator:
            text = re.sub(r'[^\w\s]', ' ', str(text).lower())
            processed.append(re.sub(r'\s+', ' ', text.strip()))
        return processed

    def fit(self, X, y):
        """Run the grid search on preprocessed ``X`` and keep the best pipeline.

        The search is scored by F1. Afterwards ``self.grid_search`` holds the
        fitted ``GridSearchCV`` and ``self.best_model`` its best estimator.

        Returns:
            ``self``, to allow chaining.
        """
        if self.verbose:
            print("=== Grid Search для TF-IDF модели ===")

        X_processed = self.preprocess_text(X)

        cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=seed)

        if self.verbose:
            total_combinations = 1
            for param_values in self.param_grid.values():
                total_combinations *= len(param_values)
            print(f"Всего комбинаций: {total_combinations}")
            print(f"С {self.n_splits}-fold CV: {total_combinations * self.n_splits} обучений")

        self.grid_search = GridSearchCV(
            self.pipeline,
            self.param_grid,
            cv=cv,
            scoring='f1',
            n_jobs=self.n_jobs,
            verbose=1 if self.verbose else 0
        )

        if self.verbose:
            print("Запуск Grid Search...")

        self.grid_search.fit(X_processed, y)

        self.best_model = self.grid_search.best_estimator_
        self.is_fitted = True

        if self.verbose:
            print("\nGrid Search завершен!")
            print(f"Лучший F1-score (CV): {self.grid_search.best_score_:.4f}")
            print("Лучшие параметры:")
            for param, value in self.grid_search.best_params_.items():
                print(f"  {param}: {value}")

        return self

    def predict(self, X):
        """Return labels predicted by the best pipeline found by the search.

        Raises:
            ValueError: If the model has not been fitted.
        """
        self._require_fitted()
        return self.best_model.predict(self.preprocess_text(X))

    def predict_proba(self, X):
        """Return class probabilities from the best pipeline found by the search.

        Raises:
            ValueError: If the model has not been fitted.
        """
        self._require_fitted()
        return self.best_model.predict_proba(self.preprocess_text(X))

In [None]:
# Run the grid-searched TF-IDF model on the training split
optimized_model = OptimizedTfIdfBaseline()
_ = optimized_model.fit(X_train, y_train)

In [24]:
# Evaluate on the validation split and report the F1 difference versus the plain TF-IDF baseline
f1_optimized = optimized_model.evaluate(X_val, y_val, "Optimized TF-IDF")
print(f"Улучшение по сравнению с базовым TF-IDF: {f1_optimized - f1_tfidf:.4f}")

Предобработка текстов...


Processing texts: 100%|██████████| 48032/48032 [00:00<00:00, 125644.36it/s]



Optimized TF-IDF F1-score на validation: 0.8086

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97     42092
           1       0.76      0.87      0.81      5940

    accuracy                           0.95     48032
   macro avg       0.87      0.91      0.89     48032
weighted avg       0.95      0.95      0.95     48032


Confusion Matrix:
[[40428  1664]
 [  779  5161]]
Улучшение по сравнению с базовым TF-IDF: -0.0017


### DistilBERT

In [None]:
class DistilBertBaseline(BaseTextClassifier):
    """Binary classifier fine-tuned from multilingual DistilBERT."""

    def __init__(self, verbose=True, threshold=0.4, batch_size=32):
        """Load the tokenizer and the pretrained model.

        Args:
            verbose: Print device and training information.
            threshold: Class-1 probability above which ``predict`` returns 1.
                The default of 0.4 is deliberately below 0.5 to favour recall
                of class 1.
            batch_size: Number of texts per forward pass at inference time.
        """
        super().__init__(verbose=verbose, threshold=threshold)
        self.batch_size = batch_size

        self.device = self._get_best_device(verbose)
        if verbose:
            print(f"Устройство: {self.device}")

        # Multilingual (not Russian-only) DistilBERT checkpoint.
        self.model_name = "distilbert-base-multilingual-cased"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=2
        ).to(self.device)

    def _get_best_device(self, verbose):
        """Return the CUDA device when available, otherwise the CPU device."""
        if torch.cuda.is_available():
            if verbose:
                print("Используем NVIDIA GPU (CUDA)")
            return torch.device('cuda')

        if verbose:
            print("Используем CPU")
        return torch.device('cpu')

    def _require_fitted(self):
        """Raise ``ValueError`` unless ``fit`` has completed."""
        if not self.is_fitted:
            raise ValueError("Модель не обучена!")

    def tokenize_texts(self, texts, to_device=True):
        """Tokenize ``texts`` into padded tensors truncated to 128 tokens.

        Args:
            texts: Iterable of texts; each element is converted with ``str``.
            to_device: Move the tensors to ``self.device`` (default). Pass
                ``False`` to keep them on the CPU.
        """
        encodings = self.tokenizer(
            [str(text) for text in texts],
            truncation=True,
            padding=True,
            max_length=128,
            return_tensors="pt"
        )
        return encodings.to(self.device) if to_device else encodings

    def fit(self, X, y):
        """Fine-tune the model on ``X``/``y`` with a class-weighted loss.

        Returns:
            ``self``, to allow chaining.
        """
        if self.verbose:
            print(f"Распределение классов: {np.bincount(y)}")

        # Keep the training tensors on the CPU: the Trainer moves each batch
        # to its device, and its DataLoader cannot pin CUDA tensors.
        train_encodings = self.tokenize_texts(X, to_device=False)

        class TextDataset(torch.utils.data.Dataset):
            """Pairs pre-tokenized encodings with their labels."""

            def __init__(self, encodings, labels):
                self.encodings = encodings
                self.labels = labels

            def __getitem__(self, idx):
                item = {key: val[idx] for key, val in self.encodings.items()}
                item['labels'] = torch.tensor(self.labels[idx])
                return item

            def __len__(self):
                return len(self.labels)

        train_dataset = TextDataset(train_encodings, np.asarray(y).tolist())

        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=5,
            per_device_train_batch_size=8,

            learning_rate=3e-5,
            warmup_steps=50,
            weight_decay=0.1,

            logging_steps=25,
            save_strategy="no",

            adam_epsilon=1e-6,
            max_grad_norm=1.0,

            logging_dir='./logs',
        )

        # Balance the classes by hand through loss weights.
        from sklearn.utils.class_weight import compute_class_weight

        class_weights = compute_class_weight(
            'balanced',
            classes=np.unique(y),
            y=y
        )
        # Built once here rather than on every training step.
        weight_tensor = torch.tensor(class_weights, dtype=torch.float)

        class WeightedTrainer(Trainer):
            """Trainer whose loss is a class-weighted cross-entropy."""

            def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
                labels = inputs.get("labels")
                outputs = model(**inputs)
                logits = outputs.get('logits')

                loss_fct = torch.nn.CrossEntropyLoss(
                    weight=weight_tensor.to(logits.device)
                )
                loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

                return (loss, outputs) if return_outputs else loss

        if self.verbose:
            print(f"Веса классов: {dict(zip(np.unique(y), class_weights))}")

        trainer = WeightedTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
        )

        trainer.train()
        self.is_fitted = True
        return self

    def predict(self, X):
        """Return 0/1 labels: 1 where the class-1 probability exceeds ``self.threshold``.

        Raises:
            ValueError: If the model has not been fitted.
        """
        probabilities = self.predict_proba(X)
        return (probabilities[:, 1] > self.threshold).astype(np.int64)

    def predict_proba(self, X):
        """Return softmax class probabilities of shape ``(len(X), 2)``.

        Inference runs in chunks of ``self.batch_size`` texts so that large
        inputs do not have to fit into a single forward pass.

        Raises:
            ValueError: If the model has not been fitted.
        """
        self._require_fitted()

        texts = list(X)
        chunks = []

        self.model.eval()
        with torch.no_grad():
            for start in range(0, len(texts), self.batch_size):
                inputs = self.tokenize_texts(texts[start:start + self.batch_size])
                logits = self.model(**inputs).logits
                chunks.append(torch.nn.functional.softmax(logits, dim=-1).cpu())

        if not chunks:
            return np.zeros((0, 2), dtype=np.float32)
        return torch.cat(chunks).numpy()

In [28]:
# Instantiate DistilBERT and fine-tune it on the first 1000 training examples.
distilbert_model = DistilBertBaseline()
_ = distilbert_model.fit(X_train[:1000], y_train[:1000])  # Try on a small sample first

Используем CPU
Устройство: cpu


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Распределение классов: [882 118]
Веса классов: {np.int64(0): np.float64(0.5668934240362812), np.int64(1): np.float64(4.237288135593221)}




Step,Training Loss


KeyboardInterrupt: 

In [None]:
# The bare string below is a scratch log of F1 scores from earlier runs.
'''
Attempt 1: 0.6786
Attempt 2:
'''
# Quick check on the first 200 validation examples only.
f1_distilbert = distilbert_model.evaluate(X_val[:200], y_val[:200], "DistilBERT")


DistilBERT F1-score на validation: 0.6786

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       430
           1       0.90      0.54      0.68        70

    accuracy                           0.93       500
   macro avg       0.92      0.77      0.82       500
weighted avg       0.93      0.93      0.92       500


Confusion Matrix:
[[426   4]
 [ 32  38]]


### RuBERT

In [None]:
class RuBert(BaseTextClassifier):
    """Binary classifier fine-tuned from ``DeepPavlov/rubert-base-cased``."""

    MAX_LENGTH = 96              # token limit used for training and inference
    INFERENCE_BATCH_SIZE = 64    # texts per forward pass in predict/predict_proba

    def __init__(self, verbose=True, **kwargs):
        """Load the tokenizer and model and place the model on the device.

        Args:
            verbose: Print device and training information.
            **kwargs: Forwarded to ``BaseTextClassifier`` (e.g. ``threshold``).
        """
        super().__init__(verbose=verbose, **kwargs)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if verbose:
            print(f"Устройство: {self.device}")
            print(f"GPU доступно: {torch.cuda.device_count()} устройств")

        self.model_name = "DeepPavlov/rubert-base-cased"
        # self.model_name = "ai-forever/ruBert-base"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, num_labels=2
        )

        if torch.cuda.device_count() > 1:
            if verbose:
                print(f"Используем {torch.cuda.device_count()} GPU с DataParallel")
            self.model = torch.nn.DataParallel(self.model)
            self.model.to(self.device)
            # Keep the unwrapped model for access to its own methods.
            self.base_model = self.model.module
        else:
            self.model.to(self.device)
            self.base_model = self.model

    def _require_fitted(self):
        """Raise ``ValueError`` unless ``fit`` has completed."""
        if not self.is_fitted:
            raise ValueError("Модель не обучена!")

    def fit(self, X, y):
        """Fine-tune the model on ``X``/``y`` with a class-weighted loss.

        Returns:
            ``self``, to allow chaining.
        """
        if self.verbose:
            print(f"Размер датасета: {len(X)} образцов")

        max_length = self.MAX_LENGTH

        class TextDataset(torch.utils.data.Dataset):
            """Tokenizes one text per ``__getitem__`` call, padded to ``max_length``."""

            def __init__(self, texts, labels, tokenizer):
                self.texts = texts.tolist() if hasattr(texts, 'tolist') else texts
                self.labels = labels.tolist() if hasattr(labels, 'tolist') else labels
                self.tokenizer = tokenizer

            def __getitem__(self, idx):
                text = str(self.texts[idx])
                encoding = self.tokenizer(
                    text, truncation=True, padding='max_length',
                    max_length=max_length, return_tensors="pt"
                )
                return {
                    'input_ids': encoding['input_ids'].flatten(),
                    'attention_mask': encoding['attention_mask'].flatten(),
                    'labels': torch.tensor(self.labels[idx], dtype=torch.long)
                }

            def __len__(self):
                return len(self.labels)

        train_dataset = TextDataset(X, y, self.tokenizer)

        training_args = TrainingArguments(
            output_dir='./results',

            num_train_epochs=6,
            per_device_train_batch_size=96,

            dataloader_num_workers=0,
            dataloader_pin_memory=False,
            # Mixed precision needs a GPU; requesting it on CPU raises an error.
            fp16=torch.cuda.is_available(),

            learning_rate=8e-6,
            warmup_steps=500,
            weight_decay=0.01,

            logging_steps=50,
            save_strategy="no",
            remove_unused_columns=False,
            report_to="none"
        )

        # Class balancing through loss weights.
        from sklearn.utils.class_weight import compute_class_weight
        class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
        # Built once here rather than on every training step.
        weight_tensor = torch.tensor(class_weights, dtype=torch.float)

        class WeightedTrainer(Trainer):
            """Trainer with class-weighted cross-entropy and periodic loss printing."""

            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)
                self.step = 0

            def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
                model_device = next(model.parameters()).device
                inputs = {k: v.to(model_device) for k, v in inputs.items()}
                labels = inputs["labels"]

                outputs = model(**inputs)

                loss_fct = torch.nn.CrossEntropyLoss(weight=weight_tensor.to(model_device))
                loss = loss_fct(outputs.logits.view(-1, 2), labels.view(-1))
                self.step += 1
                if self.step % 50 == 0:
                    print(f"iter {self.step}. loss: {loss}")
                return (loss, outputs) if return_outputs else loss

        if self.verbose:
            print(f"Веса классов: {dict(zip(np.unique(y), class_weights))}")

        # Hand the unwrapped model to the Trainer: it parallelizes across GPUs
        # itself, so passing the DataParallel wrapper would wrap it twice.
        # Both objects share the same parameters.
        trainer = WeightedTrainer(model=self.base_model, args=training_args, train_dataset=train_dataset)
        trainer.train()
        self.is_fitted = True
        return self

    def predict(self, X):
        """Return 0/1 labels: 1 where the class-1 probability exceeds ``self.threshold``.

        Raises:
            ValueError: If the model has not been fitted.
        """
        probabilities = self.predict_proba(X)
        return (probabilities[:, 1] > self.threshold).astype(np.int64)

    def predict_proba(self, X):
        """Return softmax class probabilities of shape ``(len(X), 2)``.

        Texts are processed in chunks of ``INFERENCE_BATCH_SIZE``.

        Raises:
            ValueError: If the model has not been fitted.
        """
        self._require_fitted()

        texts = X.tolist() if hasattr(X, 'tolist') else list(X)
        batch_size = self.INFERENCE_BATCH_SIZE
        chunks = []

        self.model.eval()
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                # Cast to str as the training dataset does, so non-string
                # values (e.g. NaN) do not break the tokenizer.
                batch_texts = [str(text) for text in texts[start:start + batch_size]]
                inputs = self.tokenizer(
                    batch_texts, truncation=True, padding=True,
                    max_length=self.MAX_LENGTH, return_tensors="pt"
                ).to(self.device)

                outputs = self.model(**inputs)
                batch_probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
                chunks.append(batch_probs.cpu().numpy())

        if not chunks:
            return np.zeros((0, 2), dtype=np.float32)
        return np.concatenate(chunks, axis=0)

In [None]:
# Instantiate RuBERT and fine-tune it on the first 2000 training examples.
rubert_model = RuBert()
_ = rubert_model.fit(X_train[:2000], y_train[:2000])

Устройство: cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Размер датасета: 2000 образцов
Ожидаемое время: ~0 минут
Веса классов: {np.int64(0): np.float64(0.5614823133071308), np.int64(1): np.float64(4.566210045662101)}


Step,Training Loss


KeyboardInterrupt: 

In [None]:
# Quick check on the first 500 validation examples only.
f1_rubert = rubert_model.evaluate(X_val[:500], y_val[:500], "RuBERT Test")

### AlBERT

In [None]:
class Albert(BaseTextClassifier):
    """Binary classifier fine-tuned from ``ai-forever/ru-en-RoSBERTa``.

    NOTE(review): despite the class name, the checkpoint loaded here is
    RoSBERTa, not ALBERT.
    """

    MAX_LENGTH = 96              # token limit used for training and inference
    INFERENCE_BATCH_SIZE = 32    # texts per forward pass in predict/predict_proba

    def __init__(self, verbose=True, output_dir=None, resume_from_checkpoint=None, **kwargs):
        """Load the tokenizer and model and place the model on the device.

        Args:
            verbose: Print device and training information.
            output_dir: Root directory for the Trainer output. ``None`` falls
                back to the notebook-level ``output_dir`` variable.
            resume_from_checkpoint: Optional checkpoint folder to resume from,
                e.g. '/kaggle/input/albert/pytorch/v1-6000it/1/checkpoint-6004'.
            **kwargs: Forwarded to ``BaseTextClassifier`` (e.g. ``threshold``).
        """
        super().__init__(verbose=verbose, **kwargs)
        self.output_dir = output_dir
        self.resume_from_checkpoint = resume_from_checkpoint

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if verbose:
            print(f"Устройство: {self.device}")
            print(f"GPU доступно: {torch.cuda.device_count()} устройств")

        self.model_name = "ai-forever/ru-en-RoSBERTa"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, num_labels=2
        )

        if torch.cuda.device_count() > 1:
            if verbose:
                print(f"Используем {torch.cuda.device_count()} GPU с DataParallel")
            self.model = torch.nn.DataParallel(self.model)
            self.model.to(self.device)
            # Keep the unwrapped model for access to its own methods.
            self.base_model = self.model.module
        else:
            self.model.to(self.device)
            self.base_model = self.model

    def _require_fitted(self):
        """Raise ``ValueError`` unless ``fit`` has completed."""
        if not self.is_fitted:
            raise ValueError("Модель не обучена!")

    def fit(self, X, y):
        """Fine-tune the model on ``X``/``y`` with a class-weighted loss.

        Returns:
            ``self``, to allow chaining.
        """
        if self.verbose:
            print(f"Размер датасета: {len(X)} образцов")

        max_length = self.MAX_LENGTH

        class TextDataset(torch.utils.data.Dataset):
            """Tokenizes one text per ``__getitem__`` call, padded to ``max_length``."""

            def __init__(self, texts, labels, tokenizer):
                self.texts = texts.tolist() if hasattr(texts, 'tolist') else texts
                self.labels = labels.tolist() if hasattr(labels, 'tolist') else labels
                self.tokenizer = tokenizer

            def __getitem__(self, idx):
                text = str(self.texts[idx])
                encoding = self.tokenizer(
                    text, truncation=True, padding='max_length',
                    max_length=max_length, return_tensors="pt"
                )
                return {
                    'input_ids': encoding['input_ids'].flatten(),
                    'attention_mask': encoding['attention_mask'].flatten(),
                    'labels': torch.tensor(self.labels[idx], dtype=torch.long)
                }

            def __len__(self):
                return len(self.labels)

        train_dataset = TextDataset(X, y, self.tokenizer)

        # NOTE(review): when no output_dir was given, this relies on a
        # module-level `output_dir` defined elsewhere in the notebook.
        results_root = self.output_dir if self.output_dir is not None else output_dir

        training_args = TrainingArguments(
            output_dir=f'{results_root}/results_albert',

            num_train_epochs=5,
            per_device_train_batch_size=64,

            dataloader_num_workers=0,
            dataloader_pin_memory=False,
            # Mixed precision needs a GPU; requesting it on CPU raises an error.
            fp16=torch.cuda.is_available(),

            learning_rate=1e-5,
            warmup_steps=500,
            weight_decay=0.02,

            logging_steps=50,
            save_strategy="no",
            remove_unused_columns=False,
            report_to="none"
        )

        # Class balancing through loss weights.
        from sklearn.utils.class_weight import compute_class_weight
        class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
        # Built once here rather than on every training step.
        weight_tensor = torch.tensor(class_weights, dtype=torch.float)

        class WeightedTrainer(Trainer):
            """Trainer with class-weighted cross-entropy and periodic loss printing."""

            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)
                self.step = 0

            def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
                model_device = next(model.parameters()).device
                inputs = {k: v.to(model_device) for k, v in inputs.items()}
                labels = inputs["labels"]
                outputs = model(**inputs)

                loss_fct = torch.nn.CrossEntropyLoss(weight=weight_tensor.to(model_device))
                loss = loss_fct(outputs.logits.view(-1, 2), labels.view(-1))
                self.step += 1
                # Report every 50th step only, not every step.
                if self.step % 50 == 0:
                    print(f"iter {self.step}. loss: {loss}")
                return (loss, outputs) if return_outputs else loss

        if self.verbose:
            print(f"Веса классов: {dict(zip(np.unique(y), class_weights))}")

        # Hand the unwrapped model to the Trainer: it parallelizes across GPUs
        # itself, so passing the DataParallel wrapper would wrap it twice.
        trainer = WeightedTrainer(model=self.base_model, args=training_args, train_dataset=train_dataset)
        # The Trainer only resumes when the checkpoint is passed to train();
        # setting it on TrainingArguments alone has no effect.
        trainer.train(resume_from_checkpoint=self.resume_from_checkpoint)
        self.is_fitted = True
        return self

    def predict(self, X):
        """Return 0/1 labels: 1 where the class-1 probability exceeds ``self.threshold``.

        Raises:
            ValueError: If the model has not been fitted.
        """
        probabilities = self.predict_proba(X)
        return (probabilities[:, 1] > self.threshold).astype(np.int64)

    def predict_proba(self, X):
        """Return softmax class probabilities of shape ``(len(X), 2)``.

        Texts are processed in chunks of ``INFERENCE_BATCH_SIZE``.

        Raises:
            ValueError: If the model has not been fitted.
        """
        self._require_fitted()

        texts = X.tolist() if hasattr(X, 'tolist') else list(X)
        batch_size = self.INFERENCE_BATCH_SIZE
        chunks = []

        self.model.eval()
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                # Cast to str as the training dataset does, so non-string
                # values (e.g. NaN) do not break the tokenizer.
                batch_texts = [str(text) for text in texts[start:start + batch_size]]
                inputs = self.tokenizer(
                    batch_texts, truncation=True, padding=True,
                    max_length=self.MAX_LENGTH, return_tensors="pt"
                ).to(self.device)

                outputs = self.model(**inputs)
                batch_probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
                chunks.append(batch_probs.cpu().numpy())

        if not chunks:
            return np.zeros((0, 2), dtype=np.float32)
        return np.concatenate(chunks, axis=0)

In [None]:
# Instantiate the model and fine-tune it on the first 3000 training examples.
albert_model = Albert()
_ = albert_model.fit(X_train[:3000], y_train[:3000])  # More data than for RuBERT

In [None]:
# Quick check on the first 1000 validation examples only.
f1_albert_test = albert_model.evaluate(X_val[:1000], y_val[:1000], "ALBERT Test")

In [None]:
print("ОПТИМИЗАЦИЯ THRESHOLD ДЛЯ ALBERT")

# Candidate decision thresholds: 0.40, 0.45, ... up to (not including) 0.95.
thresholds = np.round(np.arange(0.4, 0.95, 0.05), 3).tolist()

results = []

print(f"Тестируем thresholds: {thresholds}")
print("-" * 50)

# Run the model once: the probabilities do not depend on the threshold, so
# each candidate only needs a cheap comparison instead of a full inference pass.
X_tune, y_tune = X_val[:1000], y_val[:1000]
positive_probs = albert_model.predict_proba(X_tune)[:, 1]

for threshold in thresholds:
    # Same decision rule as the model's predict(): class 1 when p > threshold.
    f1 = f1_score(y_tune, (positive_probs > threshold).astype(int))

    results.append({
        'threshold': threshold,
        'f1_score': f1
    })

    print(f"Threshold {threshold} -> F1: {f1:.4f}")

best = max(results, key=lambda x: x['f1_score'])
print(f"Best: Threshold={best['threshold']} -> F1={best['f1_score']}")

# Keep the best threshold on the model for later predictions.
albert_model.threshold = best['threshold']

## Выгрузка решения

In [None]:
def create_submission(model, test_data, filename="submission.csv"):
    """
    Predict labels for the test set, write them to a CSV file and print a summary.

    Args:
        model: object with a ``predict`` method; it is called with the
            ``text`` column of ``test_data``
        test_data: DataFrame that provides the ``ID`` and ``text`` columns
        filename: path of the CSV file to write (the index is not written)

    Returns:
        DataFrame with the columns ``ID`` and ``label`` that was saved.
    """
    # Labels predicted for every test text
    predicted_labels = model.predict(test_data['text'])

    # Pair each test ID with its predicted label
    result_frame = pd.DataFrame({
        'ID': test_data['ID'],
        'label': predicted_labels
    })

    result_frame.to_csv(filename, index=False)

    # Short report: file name, shape, label distribution, first rows
    print(f"Файл {filename} создан!")
    print(f"Размер: {result_frame.shape}")
    print(f"Распределение предсказаний:")
    print(result_frame['label'].value_counts().sort_index())
    print(f"\nПример submission:")
    print(result_frame.head(10))

    return result_frame


In [None]:
# Build the submission file from the TF-IDF model's predictions on test_df;
# the CSV is written into data_dir and the resulting DataFrame is kept.
submission_tfidf = create_submission(
    model=tfidf_model, 
    test_data=test_df, 
    filename=f"{data_dir}/tfidf_baseline1_submission.csv"
)

Сохранение модели

In [None]:
import json
import torch
from pathlib import Path


def save_model_complete(model, model_name, save_dir="saved_models"):
    """
    Persist a classifier wrapper (weights, tokenizer, metadata) to disk.

    The files go to ``<save_dir>/<model_name>/``: a ``model`` directory and a
    ``tokenizer`` directory, both written through ``save_pretrained``, plus a
    ``metadata.json`` file describing the wrapper itself.

    Args:
        model: wrapper exposing ``model``, ``tokenizer``, ``model_name``,
            ``threshold``, ``device``, ``is_fitted`` and ``verbose``
        model_name: name of the sub-directory created under ``save_dir``
        save_dir: root directory for saved models

    Returns:
        Path of the directory the model was written to.
    """
    target_dir = Path(save_dir) / model_name
    target_dir.mkdir(parents=True, exist_ok=True)

    print(f"Сохраняем модель {model_name} в {target_dir}")

    weights_dir = target_dir / "model"
    tokenizer_dir = target_dir / "tokenizer"
    metadata_file = target_dir / "metadata.json"

    # A DataParallel-style wrapper keeps the real network in `.module`;
    # otherwise the network is saved directly.
    network = getattr(model.model, 'module', model.model)
    network.save_pretrained(weights_dir)

    model.tokenizer.save_pretrained(tokenizer_dir)

    # Wrapper-level state that save_pretrained does not cover
    wrapper_state = {
        'class_name': model.__class__.__name__,
        'model_name': model.model_name,
        'threshold': model.threshold,
        'device': str(model.device),
        'is_fitted': model.is_fitted,
        'verbose': model.verbose
    }

    with metadata_file.open('w') as fh:
        json.dump(wrapper_state, fh, indent=2)

    # NOTE: optimizer / trainer state is not saved here; resuming training
    # would require storing it separately.

    print(f"Модель сохранена:")
    print(f"Веса модели: {weights_dir}")
    print(f"Токенизатор: {tokenizer_dir}")
    print(f"Метаданные: {metadata_file}")

    return target_dir

def load_model_complete(model_class, save_path):
    """
    Restore a classifier wrapper previously written by save_model_complete.

    Args:
        model_class: wrapper class to instantiate; it is called as
            ``model_class(verbose=...)`` and the resulting instance must
            expose a ``device`` attribute
        save_path: directory containing ``metadata.json`` plus the ``model``
            and ``tokenizer`` sub-directories

    Returns:
        Instance of ``model_class`` with weights, tokenizer, threshold,
        fitted flag and (when present in the metadata) model name restored.

    Raises:
        FileNotFoundError: if ``metadata.json`` does not exist
        KeyError: if the metadata lacks ``verbose``, ``threshold`` or ``is_fitted``

    Warns:
        UserWarning: if the metadata records a different wrapper class than
            ``model_class``; loading still proceeds.
    """
    import warnings

    save_path = Path(save_path)

    with open(save_path / "metadata.json", 'r') as f:
        metadata = json.load(f)

    # save_model_complete records which wrapper class produced the checkpoint.
    # Loading it into another class still works mechanically, so only warn
    # instead of failing, but never let the mismatch pass silently.
    saved_class = metadata.get('class_name')
    if saved_class is not None and saved_class != model_class.__name__:
        warnings.warn(
            f"Модель была сохранена классом {saved_class}, "
            f"а загружается как {model_class.__name__}",
            stacklevel=2,
        )

    model = model_class(verbose=metadata['verbose'])

    # Replace whatever the constructor set up with the saved weights,
    # placed on the device chosen by the wrapper.
    model.model = AutoModelForSequenceClassification.from_pretrained(
        save_path / "model"
    ).to(model.device)

    model.tokenizer = AutoTokenizer.from_pretrained(save_path / "tokenizer")

    # Restore the wrapper-level state stored next to the weights.
    model.threshold = metadata['threshold']
    model.is_fitted = metadata['is_fitted']
    # Keep model_name in sync with the checkpoint so that re-saving the loaded
    # model writes the same metadata it was loaded from.
    if 'model_name' in metadata:
        model.model_name = metadata['model_name']

    print(f"Модель загружена из {save_path}")
    return model

# Persist the RuBERT model under <save_dir>/rubert_best, where save_dir is
# save_model_complete's default ("saved_models"); keep the returned directory path.
save_path = save_model_complete(rubert_model, "rubert_best")
# Example of restoring the model later from that directory:
# loaded_model = load_model_complete(RuBert, save_path)