In [32]:
import pandas as pd
import numpy as np
import re
import nltk
import pymorphy2
import os
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from transformers import BertTokenizer, BertModel
from sklearn.cluster import DBSCAN
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GroupShuffleSplit, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score, precision_recall_curve, classification_report, accuracy_score
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, OneClassSVM
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

In [2]:
# Загрузка данных
train_groups = pd.read_csv('train_groups.csv')
test_groups = pd.read_csv('test_groups.csv')
sample_submission = pd.read_csv('sample_submission.csv')
docs_titles = pd.read_csv('docs_titles.tsv', sep='\t')

In [3]:
# Объединение заголовков с данными групп
train_data = train_groups.merge(docs_titles, on='doc_id')
test_data = test_groups.merge(docs_titles, on='doc_id', how='left')

In [4]:
# Обработка отсутствующих значений
train_data['title'].fillna('', inplace=True)
test_data['title'].fillna('', inplace=True)

In [5]:
# Предобработка данных
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('russian')) | set(stopwords.words('english'))
morph = pymorphy2.MorphAnalyzer()

def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)  # Удаление HTML-тегов
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9\s]', '', text.lower())  # Удаление спецсимволов
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words and not token.isdigit()]
    tokens = [morph.parse(word)[0].normal_form for word in tokens]
    return tokens

train_data['title_processed'] = train_data['title'].apply(preprocess_text)
test_data['title_processed'] = test_data['title'].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Векторизация с помощью BERT
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

def get_bert_embeddings(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

train_data['title_embeddings'] = train_data['title_processed'].apply(lambda x: get_bert_embeddings(' '.join(x)))
test_data['title_embeddings'] = test_data['title_processed'].apply(lambda x: get_bert_embeddings(' '.join(x)))

In [7]:
# Создание фичей
def embeddings_to_features(data, column_prefix):
    embeddings = np.stack(data[column_prefix + '_embeddings'].values)
    feature_names = [f"{column_prefix}_embedding_{i}" for i in range(embeddings.shape[1])]
    features_df = pd.DataFrame(embeddings, columns=feature_names, index=data.index)
    return features_df

train_features = embeddings_to_features(train_data, 'title')
test_features = embeddings_to_features(test_data, 'title')

def add_new_features(data):
    data['title_length'] = data['title'].apply(lambda x: len(x.split()))
    data['unique_words'] = data['title_processed'].apply(lambda x: len(set(x)))
    return data

train_data = add_new_features(train_data)
test_data = add_new_features(test_data)

In [8]:
# Косинусные сходства Tfidf
def vectorize_group(group):
    vectorizer = TfidfVectorizer(tokenizer=preprocess_text, ngram_range=(1, 2))
    vectors = vectorizer.fit_transform(group['title'])
    return vectors

def cosine_matrix_group(group):
    tfidf_matrix = vectorize_group(group)
    cosine_matrix = cosine_similarity(tfidf_matrix)
    return cosine_matrix

def calc_cosine_similarity(data_grouped, count=10):
    similarity_features_list = []
    for name, group in tqdm(data_grouped, desc="Processing groups"):
        cosine_matrix = cosine_matrix_group(group)
        for k, (idx, row) in enumerate(group.iterrows()):
            similarities = []
            for j in range(len(group)):
                if k == j:
                    continue
                similarities.append(cosine_matrix[k, j])
            top_similarities = sorted(similarities, reverse=True)[:count]
            similarity_record = [row['pair_id']] + top_similarities
            similarity_features_list.append(similarity_record)
    similarity_columns = ['pair_id'] + [f'top_{i + 1}_similarity' for i in range(count)]
    similarity_features = pd.DataFrame(similarity_features_list, columns=similarity_columns)
    return similarity_features

In [9]:
# Кластеризация
def clustering_features(data_grouped, eps=0.5, min_samples=5, metric='cosine'):
    clustering_features_list = []
    for name, group in tqdm(data_grouped, desc="Processing groups"):
        cosine_matrix = cosine_matrix_group(group)
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(cosine_matrix)
        cluster_labels = dbscan.labels_
        for k, (idx, row) in enumerate(group.iterrows()):
            clustering_record = [row['pair_id'], cluster_labels[k]]
            clustering_features_list.append(clustering_record)
    clustering_columns = ['pair_id', 'cluster']
    clustering_features = pd.DataFrame(clustering_features_list, columns=clustering_columns)
    return clustering_features

In [10]:
# Объединение данных и фичей в датасет
train_data_grouped = train_data.groupby('group_id')
test_data_grouped = test_data.groupby('group_id')

train_similarity_features = calc_cosine_similarity(train_data_grouped)
train_clustering_features = clustering_features(train_data_grouped)

test_similarity_features = calc_cosine_similarity(test_data_grouped)
test_clustering_features = clustering_features(test_data_grouped)

train_similarity_features = train_similarity_features.fillna(0)
test_similarity_features = test_similarity_features.fillna(0)

train_data = train_data.merge(train_similarity_features, on=['pair_id'])
train_data = train_data.merge(train_clustering_features, on=['pair_id'])

test_data = test_data.merge(test_similarity_features, on=['pair_id'])
test_data = test_data.merge(test_clustering_features, on=['pair_id'])

train_data = pd.concat([train_data, train_features], axis=1)
test_data = pd.concat([test_data, test_features], axis=1)

Processing groups: 100%|██████████| 129/129 [00:07<00:00, 16.72it/s]
Processing groups: 100%|██████████| 129/129 [00:08<00:00, 15.17it/s]
Processing groups: 100%|██████████| 180/180 [00:11<00:00, 16.11it/s]
Processing groups: 100%|██████████| 180/180 [00:12<00:00, 14.45it/s]


In [29]:
# Разбиение на train, val и test
X_train = train_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'target', 'title', 'title_processed', 'title_embeddings'], axis=1)
y_train = train_data['target']

X_test = test_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'title', 'title_processed', 'title_embeddings'], axis=1)

In [30]:
# Разбиение данных на обучающую и тестовую выборки
splitter = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_indices, val_indices = next(splitter.split(X_train, y_train, train_groups['group_id']))

X_train_split = X_train.iloc[train_indices]
y_train_split = y_train.iloc[train_indices]

X_val_split = X_train.iloc[val_indices]
y_val_split = y_train.iloc[val_indices]

In [None]:
# Обучение модели CatBoost с оптимизацией гиперпараметров
cat_params = {
    'iterations': [200, 350, 500],
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5]
}

cat_model = CatBoostClassifier()
cat_grid = GridSearchCV(cat_model, cat_params, cv=5, scoring='f1', n_jobs=-1)
cat_grid.fit(X_train_split, y_train_split)
best_cat_params = cat_grid.best_params_

0:	learn: 0.6863906	total: 176ms	remaining: 34.9s
1:	learn: 0.6793733	total: 242ms	remaining: 24s
2:	learn: 0.6734104	total: 482ms	remaining: 31.7s
3:	learn: 0.6669616	total: 821ms	remaining: 40.2s
4:	learn: 0.6607700	total: 992ms	remaining: 38.7s
5:	learn: 0.6551719	total: 1.14s	remaining: 36.7s
6:	learn: 0.6489529	total: 1.23s	remaining: 33.9s
7:	learn: 0.6428539	total: 1.37s	remaining: 32.9s
8:	learn: 0.6365929	total: 1.49s	remaining: 31.7s
9:	learn: 0.6308593	total: 1.68s	remaining: 31.9s
10:	learn: 0.6254323	total: 1.79s	remaining: 30.8s
11:	learn: 0.6199829	total: 2.04s	remaining: 32s
12:	learn: 0.6148420	total: 2.27s	remaining: 32.6s
13:	learn: 0.6093464	total: 2.43s	remaining: 32.3s
14:	learn: 0.6045219	total: 2.6s	remaining: 32.1s
15:	learn: 0.5998337	total: 2.79s	remaining: 32.1s
16:	learn: 0.5947188	total: 3.08s	remaining: 33.2s
17:	learn: 0.5898943	total: 3.22s	remaining: 32.5s
18:	learn: 0.5854635	total: 3.42s	remaining: 32.6s
19:	learn: 0.5815495	total: 3.6s	remaining: 32

In [None]:
# Обучение CatBoost с лучшими параметрами на полном наборе данных
final_cat_model = CatBoostClassifier(**best_cat_params)
train_pool = Pool(X_train_split, y_train_split)
val_pool = Pool(X_val_split, y_val_split)
final_cat_model.fit(train_pool, eval_set=val_pool)

In [None]:
# Предсказания на валидационной выборке
val_predictions = final_cat_model.predict(val_pool)

In [None]:
# Оценка модели на валидации
accuracy = accuracy_score(y_val_split, val_predictions)
f1 = f1_score(y_val_split, val_predictions)
report = classification_report(y_val_split, val_predictions)

print(f"Accuracy: {accuracy}")
print(f"F1-score: {f1}")
print(report)

In [20]:
# Стекинг
class DjStacking(BaseEstimator, ClassifierMixin):
    def __init__(self, models, ens_model):
        self.models = models
        self.ens_model = ens_model
        self.n = len(models)
        self.valid = None

    def fit(self, groups_ids, X, y, p=0.25, cv=3, err=0.001, random_state=None):
        if p > 0:
            splitter = GroupShuffleSplit(n_splits=1, test_size=p, random_state=random_state)
            train_indices, val_indices = next(splitter.split(X, y, groups=groups_ids))
            train = X[train_indices]
            y_train = y[train_indices]
            valid = X[val_indices]
            y_valid = y[val_indices]

            self.valid = np.zeros((valid.shape[0], self.n))
            for t, clf in enumerate(self.models):
                clf.fit(train, y_train)
                self.valid[:, t] = clf.predict_proba(valid)[:, 1]

            self.ens_model.fit(X=self.valid, y=y_valid)
        else:
            self.valid = err * np.random.randn(X.shape[0], self.n)
            for t, clf in enumerate(self.models):
                self.valid[:, t] += cross_val_predict(clf, X, y, cv=cv, n_jobs=-1, method='predict_proba')[:, 1]
                clf.fit(X, y)

            self.ens_model.fit(self.valid, y)
        return self


    def predict(self, X):
        X_meta = np.zeros((X.shape[0], self.n))
        for t, clf in enumerate(self.models):
            X_meta[:, t] = clf.predict_proba(X)[:, 1]

        return self.ens_model.predict(X_meta)

In [21]:
def predict(self, X):
        X_meta = np.zeros((X.shape[0], self.n))
        for t, clf in enumerate(self.models):
            X_meta[:, t] = clf.predict_proba(X)[:, 1]

        return self.ens_model.predict(X_meta)

In [22]:
# Скейлинг данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# Базовые алгоритмы
knn1 = KNeighborsClassifier(n_neighbors=3)
knn2 = KNeighborsClassifier(n_neighbors=10)
rf1 = RandomForestClassifier(n_estimators=100)
rf2 = RandomForestClassifier(n_estimators=200)
svc1 = SVC(probability=True)
cat1 = CatBoostClassifier(**model.get_params())

In [24]:
# Стекинг моделей
models = [knn1, knn2, rf1, rf2, cat1]
ens_model = LogisticRegression()

groups_ids = train_data['group_id'].values
stacking_model = DjStacking(models, ens_model)
stacking_model.fit(groups_ids, X_train_scaled, y_train.values)

Learning rate set to 0.025799
0:	learn: 0.6763576	total: 26.3ms	remaining: 26.3s
1:	learn: 0.6588844	total: 45.7ms	remaining: 22.8s
2:	learn: 0.6450084	total: 63.5ms	remaining: 21.1s
3:	learn: 0.6318052	total: 81.7ms	remaining: 20.4s
4:	learn: 0.6181883	total: 102ms	remaining: 20.2s
5:	learn: 0.6043114	total: 122ms	remaining: 20.2s
6:	learn: 0.5915089	total: 142ms	remaining: 20.2s
7:	learn: 0.5795474	total: 162ms	remaining: 20.1s
8:	learn: 0.5680198	total: 182ms	remaining: 20.1s
9:	learn: 0.5575447	total: 203ms	remaining: 20.1s
10:	learn: 0.5470999	total: 223ms	remaining: 20s
11:	learn: 0.5386027	total: 243ms	remaining: 20s
12:	learn: 0.5298803	total: 262ms	remaining: 19.9s
13:	learn: 0.5211658	total: 282ms	remaining: 19.8s
14:	learn: 0.5140012	total: 301ms	remaining: 19.8s
15:	learn: 0.5061922	total: 322ms	remaining: 19.8s
16:	learn: 0.5000817	total: 342ms	remaining: 19.8s
17:	learn: 0.4937222	total: 365ms	remaining: 19.9s
18:	learn: 0.4876231	total: 390ms	remaining: 20.1s
19:	learn: 

In [25]:
# Предсказание на тестовом наборе
test_predictions = stacking_model.predict(X_test_scaled)

In [26]:
# Сохранение результатов
submission = test_groups[['pair_id']].copy()
submission['target'] = test_predictions
submission.to_csv('submission.csv', index=False)

print('Файл с предсказаниями создан: submission.csv')

Файл с предсказаниями создан: submission.csv
