In [77]:
import pandas as pd
import numpy as np
import re
import nltk
import pymorphy2
import patoolib
import os
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from transformers import BertTokenizer, BertModel
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GroupShuffleSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score, precision_recall_curve, classification_report, accuracy_score
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm
from bs4 import BeautifulSoup
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import StackingClassifier
import warnings
from sklearn.cluster import DBSCAN
warnings.filterwarnings("ignore")

# Загрузка данных

In [90]:
# Загрузка данных
train_groups = pd.read_csv('train_groups.csv')
test_groups = pd.read_csv('test_groups.csv')
sample_submission = pd.read_csv('sample_submission.csv')
docs_titles = pd.read_csv('docs_titles.tsv', sep='\t')

In [91]:
# Объединение заголовков с данными групп
train_data = train_groups.merge(docs_titles, on='doc_id')
test_data = test_groups.merge(docs_titles, on='doc_id', how='left')

In [92]:
# Проверка наличия пропусков и их заполнение
train_data['title'].fillna('', inplace=True)
test_data['title'].fillna('', inplace=True)

In [93]:
# Проверка, что пропуски были заполнены
print(train_data.isna().sum())
print(test_data.isna().sum())

pair_id     0
group_id    0
doc_id      0
target      0
title       0
dtype: int64
pair_id     0
group_id    0
doc_id      0
title       0
dtype: int64


# Предобработка данных


In [94]:
# Загрузка данных для NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [95]:
# Стоп-слова и шум
stop_words = set(stopwords.words('russian')) | set(stopwords.words('english'))

In [96]:
# Пробуем стеммер или леммер
stemmer = SnowballStemmer("russian")
morph = pymorphy2.MorphAnalyzer()


def preprocess_text(text):
    """
    Токенизация + лемматизация/стемминг текста
    """
    text = re.sub(r'<.*?>', '', text)  # Удаление HTML-тегов
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9\s]', '', text.lower())  # Удаление спецсимволов
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words and not token.isdigit()]
    tokens = [morph.parse(word)[0].normal_form for word in tokens]
    return tokens


train_data['title_processed'] = train_data['title'].apply(preprocess_text)
test_data['title_processed'] = test_data['title'].apply(preprocess_text)

# Векторизация BERT

In [97]:
# Векторизация с помощью BERT
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)


def get_bert_embeddings(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu()


train_data['title_embeddings'] = train_data['title'].apply(lambda x: get_bert_embeddings(x).numpy())
test_data['title_embeddings'] = test_data['title'].apply(lambda x: get_bert_embeddings(x).numpy())

In [98]:
# Создание фичей
def embeddings_to_features(data, column_prefix):
    """
    Преобразуем эмбеддинги в фичи
    """
    embeddings = np.stack(data[column_prefix + '_embeddings'].values)
    feature_names = [f"{column_prefix}_embedding_{i}" for i in range(embeddings.shape[1])]
    features_df = pd.DataFrame(embeddings, columns=feature_names, index=data.index)
    return features_df


train_features = embeddings_to_features(train_data, 'title')
test_features = embeddings_to_features(test_data, 'title')

In [99]:
def add_new_features(data):
    """
    Добавляем новые признаки: длина заголовка и число уникальных слов
    """
    data['title_length'] = data['title'].apply(lambda x: len(x.split()))
    data['unique_words'] = data['title_processed'].apply(lambda x: len(set(x)))
    return data


train_data = add_new_features(train_data)
test_data = add_new_features(test_data)

## Косинусные сходства Tfidf

In [100]:
def vectorize_group(group):
    """
    Векторизуем группу документов с кастомным токенайзером
    """
    vectorizer = TfidfVectorizer(tokenizer=preprocess_text)
    vectors = vectorizer.fit_transform(group['title'])
    return vectors

In [101]:
def cosine_matrix_group(group):
    """
    Вычисление матрицы косинусных расстояний для группы
    """
    tfidf_matrix = vectorize_group(group)
    cosine_matrix = cosine_similarity(tfidf_matrix)
    return cosine_matrix

In [102]:
def calc_cosine_similarity(data_grouped, count=10):
    """
    Возвращаем топ косинусных сходств для каждого документа группы
    """
    similarity_features_list = []
    for name, group in tqdm(data_grouped, desc="Processing groups"):
        cosine_matrix = cosine_matrix_group(group)
        for k, (idx, row) in enumerate(group.iterrows()):
            similarities = []
            for j in range(len(group)):
                if k == j:
                    continue
                similarities.append(cosine_matrix[k, j])
            top_similarities = sorted(similarities, reverse=True)[:count]
            similarity_record = [row['pair_id']] + top_similarities
            similarity_features_list.append(similarity_record)
    similarity_columns = ['pair_id'] + [f'top_{i + 1}_similarity' for i in range(count)]
    similarity_features = pd.DataFrame(similarity_features_list, columns=similarity_columns)
    return similarity_features

In [103]:
# Кластеризация
def clustering_features(data_grouped, eps=0.5, min_samples=5, metric='cosine'):
    """
    Возвращаем кластер для каждого документа группы
    """
    clustering_features_list = []
    for name, group in tqdm(data_grouped, desc="Processing groups"):
        cosine_matrix = cosine_matrix_group(group)
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(cosine_matrix)
        cluster_labels = dbscan.labels_
        for k, (idx, row) in enumerate(group.iterrows()):
            clustering_record = [row['pair_id'], cluster_labels[k]]
            clustering_features_list.append(clustering_record)
    clustering_columns = ['pair_id', 'cluster']
    clustering_features = pd.DataFrame(clustering_features_list, columns=clustering_columns)
    return clustering_features

# Объединение данных и фичей в датасет


In [104]:
# Разбиваем train и test по группам на основе id
train_data_grouped = train_data.groupby('group_id')
test_data_grouped = test_data.groupby('group_id')

In [105]:
# Фичи для трейна и теста
train_similarity_features = calc_cosine_similarity(train_data_grouped)
train_clustering_features = clustering_features(train_data_grouped)

test_similarity_features = calc_cosine_similarity(test_data_grouped)
test_clustering_features = clustering_features(test_data_grouped)

Processing groups: 100%|██████████| 129/129 [00:08<00:00, 15.99it/s]
Processing groups: 100%|██████████| 129/129 [00:09<00:00, 13.58it/s]
Processing groups: 100%|██████████| 180/180 [00:11<00:00, 15.22it/s]
Processing groups: 100%|██████████| 180/180 [00:12<00:00, 14.40it/s]


In [106]:
# Заполняем пропуски нулями
train_similarity_features = train_similarity_features.fillna(0)
test_similarity_features = test_similarity_features.fillna(0)

In [107]:
# Объединяем новые признаки с исходным датасетом
train_data = train_data.merge(train_similarity_features, on=['pair_id'])
train_data = train_data.merge(train_clustering_features, on=['pair_id'])

test_data = test_data.merge(test_similarity_features, on=['pair_id'])
test_data = test_data.merge(test_clustering_features, on=['pair_id'])

In [108]:
# Объединяем с BERT признаками
train_data = pd.concat([train_data, train_features], axis=1)
test_data = pd.concat([test_data, test_features], axis=1)

In [109]:
# Удаление ненужных столбцов, если они существуют
drop_columns = ['title_embeddings']
train_data.drop(columns=[col for col in drop_columns if col in train_data.columns], inplace=True)
test_data.drop(columns=[col for col in drop_columns if col in test_data.columns], inplace=True)

In [110]:
# Преобразование типов данных к float
for col in train_data.columns:
    if train_data[col].dtype == 'object':
        try:
            train_data[col] = train_data[col].astype(float)
        except ValueError:
            pass

for col in test_data.columns:
    if test_data[col].dtype == 'object':
        try:
            test_data[col] = test_data[col].astype(float)
        except ValueError:
            pass

In [111]:
# Удаление ненужных столбцов, если они существуют
drop_columns = ['title_embeddings', 'text_processed']
train_data.drop(columns=[col for col in drop_columns if col in train_data.columns], inplace=True)
test_data.drop(columns=[col for col in drop_columns if col in test_data.columns], inplace=True)

In [112]:
# Проверка на дубликаты
train_data = train_data.drop_duplicates(subset=['doc_id', 'pair_id', 'group_id'], keep='first')

X_train = train_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'target', 'title', 'title_processed'], axis=1)
y_train = train_data['target']

X_test = test_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'title', 'title_processed'], axis=1)

In [113]:
print(f"Length of train_groups: {len(train_groups)}")
print(f"Length of train_data after merging with docs_titles: {len(train_data)}")
print(f"Length of train_data after merging with additional_data: {len(train_data)}")
print(f"Length of X_train: {len(X_train)}")
print(f"Length of y_train: {len(y_train)}")

Length of train_groups: 11690
Length of train_data after merging with docs_titles: 11690
Length of train_data after merging with additional_data: 11690
Length of X_train: 11690
Length of y_train: 11690


In [114]:
# Убедитесь, что все переменные имеют одинаковую длину
assert len(X_train) == len(y_train) == len(train_data)

In [115]:
# Разделение данных на обучающую и валидационную выборки
splitter = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_indices, val_indices = next(splitter.split(X_train, y_train, groups=train_data['group_id']))

X_train_split = X_train.iloc[train_indices]
y_train_split = y_train.iloc[train_indices]

X_val_split = X_train.iloc[val_indices]
y_val_split = y_train.iloc[val_indices]

# Настройка параметров базовых моделей


In [116]:
# Проверка всех колонок в X_train_split на наличие строковых значений
for col in X_train_split.columns:
    if X_train_split[col].dtype == 'object':
        print(f"Column {col} contains non-numeric data.")

In [117]:
# Настройка параметров базовых моделей
knn_params = {'n_neighbors': [3, 5, 7, 10]}
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5, scoring='f1', n_jobs=-1)
knn_grid.fit(X_train_split, y_train_split)
best_knn = knn_grid.best_estimator_

In [118]:
# Настройка параметров для RandomForestClassifier
rf_params = {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20, 30]}
rf_grid = GridSearchCV(RandomForestClassifier(), rf_params, cv=5, scoring='f1', n_jobs=-1)
rf_grid.fit(X_train_split, y_train_split)
best_rf = rf_grid.best_estimator_

In [119]:
# Настройка параметров для SVC
svc_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svc_grid = GridSearchCV(SVC(probability=True), svc_params, cv=5, scoring='f1', n_jobs=-1)
svc_grid.fit(X_train_split, y_train_split)
best_svc = svc_grid.best_estimator_

In [120]:
# Настройка параметров CatBoost
cat_params = {
    'iterations': [200, 350, 500],
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5]
}

cat_model = CatBoostClassifier()
cat_grid = GridSearchCV(cat_model, cat_params, cv=5, scoring='f1', n_jobs=-1)
cat_grid.fit(X_train_split, y_train_split)
best_cat_params = cat_grid.best_params_

0:	learn: 0.6625102	total: 646ms	remaining: 2m 8s
1:	learn: 0.6304040	total: 782ms	remaining: 1m 17s
2:	learn: 0.6060923	total: 1.02s	remaining: 1m 7s
3:	learn: 0.5828368	total: 1.29s	remaining: 1m 3s
4:	learn: 0.5601979	total: 1.5s	remaining: 58.4s
5:	learn: 0.5407122	total: 1.74s	remaining: 56.2s
6:	learn: 0.5242454	total: 1.97s	remaining: 54.4s
7:	learn: 0.5100522	total: 2.21s	remaining: 52.9s
8:	learn: 0.4941718	total: 2.44s	remaining: 51.8s
9:	learn: 0.4839902	total: 2.74s	remaining: 52s
10:	learn: 0.4717192	total: 2.91s	remaining: 50s
11:	learn: 0.4620635	total: 3.05s	remaining: 47.8s
12:	learn: 0.4546223	total: 3.28s	remaining: 47.1s
13:	learn: 0.4461002	total: 3.51s	remaining: 46.7s
14:	learn: 0.4381080	total: 3.69s	remaining: 45.5s
15:	learn: 0.4307290	total: 3.94s	remaining: 45.4s
16:	learn: 0.4247813	total: 4.25s	remaining: 45.8s
17:	learn: 0.4206554	total: 4.41s	remaining: 44.6s
18:	learn: 0.4157221	total: 4.55s	remaining: 43.3s
19:	learn: 0.4122482	total: 4.69s	remaining: 

In [121]:
# Обучение CatBoost с лучшими параметрами на полном наборе данных
final_cat_model = CatBoostClassifier(**best_cat_params)
train_pool = Pool(X_train, y_train)
final_cat_model.fit(train_pool)

0:	learn: 0.6612932	total: 25.7ms	remaining: 5.11s
1:	learn: 0.6292015	total: 47.9ms	remaining: 4.74s
2:	learn: 0.6027518	total: 71.8ms	remaining: 4.71s
3:	learn: 0.5786959	total: 93.3ms	remaining: 4.57s
4:	learn: 0.5593865	total: 115ms	remaining: 4.47s
5:	learn: 0.5411594	total: 136ms	remaining: 4.41s
6:	learn: 0.5231093	total: 158ms	remaining: 4.36s
7:	learn: 0.5079343	total: 182ms	remaining: 4.36s
8:	learn: 0.4941117	total: 204ms	remaining: 4.33s
9:	learn: 0.4805685	total: 225ms	remaining: 4.28s
10:	learn: 0.4687752	total: 246ms	remaining: 4.23s
11:	learn: 0.4596826	total: 268ms	remaining: 4.2s
12:	learn: 0.4524586	total: 290ms	remaining: 4.17s
13:	learn: 0.4438950	total: 311ms	remaining: 4.13s
14:	learn: 0.4360872	total: 333ms	remaining: 4.11s
15:	learn: 0.4301115	total: 357ms	remaining: 4.11s
16:	learn: 0.4244383	total: 377ms	remaining: 4.06s
17:	learn: 0.4209692	total: 398ms	remaining: 4.02s
18:	learn: 0.4161540	total: 419ms	remaining: 3.99s
19:	learn: 0.4117041	total: 439ms	rema

<catboost.core.CatBoostClassifier at 0x3406d0640>

In [134]:
# Настройка параметров мета-модели
meta_params = {'C': [0.1, 1, 10]}
meta_grid = GridSearchCV(LogisticRegression(), meta_params, cv=5, scoring='f1', n_jobs=-1)
meta_grid.fit(X_train, y_train)
best_meta = meta_grid.best_estimator_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

# Блендинг

In [126]:
# Скейлинг данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [132]:
# Получение мета-фичей (предсказания базовых моделей на валидационной выборке)
val_predictions_knn = best_knn.predict(X_val_split)
val_predictions_rf = best_rf.predict(X_val_split)
val_predictions_svc = best_svc.predict(X_val_split)
val_predictions_cat = final_cat_model.predict(X_val_split)

# Создание мета-фичей
meta_features = np.column_stack((val_predictions_knn, val_predictions_rf, val_predictions_svc, val_predictions_cat))

# Обучение мета-модели на мета-фичах
meta_model = LogisticRegression()
meta_model.fit(meta_features, y_val_split)

# Получение мета-фичей на тестовой выборке
test_predictions_knn = best_knn.predict(X_test_scaled)
test_predictions_rf = best_rf.predict(X_test_scaled)
test_predictions_svc = best_svc.predict(X_test_scaled)
test_predictions_cat = final_cat_model.predict(X_test_scaled)

# Создание мета-фичей для тестовой выборки
test_meta_features = np.column_stack((test_predictions_knn, test_predictions_rf, test_predictions_svc, test_predictions_cat))

# Предсказание мета-модели на тестовой выборке
test_meta_predictions = meta_model.predict(test_meta_features)

# Стекинг


In [None]:
# Создание стекинг-классификатора
stacking_model = StackingClassifier(
    estimators=[
        ('knn', best_knn),
        ('rf', best_rf),
        ('svc', best_svc),
        ('cat', final_cat_model)
    ],
    final_estimator=best_meta,
    cv=5
)

stacking_model.fit(X_train, y_train)


In [None]:
# Предсказание и сохранение результатов
test_predictions = stacking_model.predict(X_test)


In [None]:
# Сохранение результатов
submission = test_groups[['pair_id']].copy()
submission['target'] = test_predictions
submission.to_csv('submission.csv', index=False)
print('Файл с предсказаниями создан: submission.csv')


In [None]:
# Важность признаков
import matplotlib.pyplot as plt

importances = final_cat_model.get_feature_importance(type='PredictionValuesChange')
feature_importances = pd.Series(importances, index=X_train.columns).sort_values()[-15:]
plt.figure(figsize=(10, 6))
plt.barh(feature_importances.index, feature_importances.values)
plt.title('CatBoost Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()