In [1]:
import catboost
import pandas as pd
import numpy as np
import re
import nltk
import pymorphy2
import os
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from transformers import BertTokenizer, BertModel
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GroupShuffleSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score, precision_recall_curve, classification_report, accuracy_score
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm
from bs4 import BeautifulSoup
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import StackingClassifier
import warnings
from sklearn.cluster import DBSCAN
import optuna
warnings.filterwarnings("ignore")



In [2]:
# Загрузка данных
train_groups = pd.read_csv('train_groups.csv')
test_groups = pd.read_csv('test_groups.csv')
sample_submission = pd.read_csv('sample_submission.csv')
docs_titles = pd.read_csv('docs_titles.tsv', sep='\t')

In [3]:
# Объединение заголовков с данными групп
train_data = train_groups.merge(docs_titles, on='doc_id')
test_data = test_groups.merge(docs_titles, on='doc_id', how='left')

In [4]:
# Проверка наличия пропусков и их заполнение
train_data['title'].fillna('', inplace=True)
test_data['title'].fillna('', inplace=True)

In [5]:
# Предобработка данных
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('russian')) | set(stopwords.words('english'))
morph = pymorphy2.MorphAnalyzer()

def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)  # Удаление HTML-тегов
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9\s]', '', text.lower())  # Удаление спецсимволов
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words and not token.isdigit()]
    tokens = [morph.parse(word)[0].normal_form for word in tokens]
    return ' '.join(tokens)  # Изменено на возврат строки

train_data['title_processed'] = train_data['title'].apply(preprocess_text)
test_data['title_processed'] = test_data['title'].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Векторизация с помощью BERT
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

def get_bert_embeddings(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

train_data['title_embeddings'] = train_data['title_processed'].apply(lambda x: get_bert_embeddings(x))
test_data['title_embeddings'] = test_data['title_processed'].apply(lambda x: get_bert_embeddings(x))

In [7]:
# Создание фичей
def embeddings_to_features(data, column_prefix):
    embeddings = np.stack(data[column_prefix + '_embeddings'].values)
    feature_names = [f"{column_prefix}_embedding_{i}" for i in range(embeddings.shape[1])]
    features_df = pd.DataFrame(embeddings, columns=feature_names, index=data.index)
    return features_df

train_features = embeddings_to_features(train_data, 'title')
test_features = embeddings_to_features(test_data, 'title')

def add_new_features(data):
    data['title_length'] = data['title'].apply(lambda x: len(x.split()))
    data['unique_words'] = data['title_processed'].apply(lambda x: len(set(x.split())))
    return data

train_data = add_new_features(train_data)
test_data = add_new_features(test_data)

In [8]:
# Косинусные сходства Tfidf
def vectorize_group(group):
    vectorizer = TfidfVectorizer(tokenizer=preprocess_text)
    vectors = vectorizer.fit_transform(group['title'])
    return vectors

def cosine_matrix_group(group):
    tfidf_matrix = vectorize_group(group)
    cosine_matrix = cosine_similarity(tfidf_matrix)
    return cosine_matrix

def calc_cosine_similarity(data_grouped, count=10):
    similarity_features_list = []
    for name, group in tqdm(data_grouped, desc="Processing groups"):
        cosine_matrix = cosine_matrix_group(group)
        for k, (idx, row) in enumerate(group.iterrows()):
            similarities = []
            for j in range(len(group)):
                if k == j:
                    continue
                similarities.append(cosine_matrix[k, j])
            top_similarities = sorted(similarities, reverse=True)[:count]
            similarity_record = [row['pair_id']] + top_similarities
            similarity_features_list.append(similarity_record)
    similarity_columns = ['pair_id'] + [f'top_{i + 1}_similarity' for i in range(count)]
    similarity_features = pd.DataFrame(similarity_features_list, columns=similarity_columns)
    return similarity_features

In [9]:
# Кластеризация
def clustering_features(data_grouped, eps=0.5, min_samples=5, metric='cosine'):
    clustering_features_list = []
    for name, group in tqdm(data_grouped, desc="Processing groups"):
        cosine_matrix = cosine_matrix_group(group)
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(cosine_matrix)
        cluster_labels = dbscan.labels_
        for k, (idx, row) in enumerate(group.iterrows()):
            clustering_record = [row['pair_id'], cluster_labels[k]]
            clustering_features_list.append(clustering_record)
    clustering_columns = ['pair_id', 'cluster']
    clustering_features = pd.DataFrame(clustering_features_list, columns=clustering_columns)
    return clustering_features

In [10]:
# Объединение данных и фичей в датасет
train_data_grouped = train_data.groupby('group_id')
test_data_grouped = test_data.groupby('group_id')

train_similarity_features = calc_cosine_similarity(train_data_grouped)
train_clustering_features = clustering_features(train_data_grouped)

test_similarity_features = calc_cosine_similarity(test_data_grouped)
test_clustering_features = clustering_features(test_data_grouped)

train_similarity_features = train_similarity_features.fillna(0)
test_similarity_features = test_similarity_features.fillna(0)

train_data = train_data.merge(train_similarity_features, on=['pair_id'])
train_data = train_data.merge(train_clustering_features, on=['pair_id'])

test_data = test_data.merge(test_similarity_features, on=['pair_id'])
test_data = test_data.merge(test_clustering_features, on=['pair_id'])

train_data = pd.concat([train_data, train_features], axis=1)
test_data = pd.concat([test_data, test_features], axis=1)

drop_columns = ['title_embeddings']
train_data.drop(columns=[col for col in drop_columns if col in train_data.columns], inplace=True)
test_data.drop(columns=[col for col in drop_columns if col in test_data.columns], inplace=True)

X_train = train_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'target', 'title', 'title_processed'], axis=1)
y_train = train_data['target']

X_test = test_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'title', 'title_processed'], axis=1)

Processing groups: 100%|██████████| 129/129 [00:07<00:00, 17.30it/s]
Processing groups: 100%|██████████| 129/129 [00:07<00:00, 16.22it/s]
Processing groups: 100%|██████████| 180/180 [00:10<00:00, 16.62it/s]
Processing groups: 100%|██████████| 180/180 [00:11<00:00, 15.44it/s]


In [11]:
# Скейлинг данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Подбор гиперпараметров для CatBoost с использованием Optuna
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-1, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 1e-3, 10),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 50),
    }

    train_pool = Pool(X_train_scaled, y_train)
    cv_results = catboost.cv(
        train_pool,
        params,
        fold_count=5,
        early_stopping_rounds=50,
        stratified=True,
        verbose=False,
        plot=False
    )

    return cv_results['test-F1-mean'].max()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, n_jobs=-1)

best_params = study.best_params

[I 2024-06-02 19:04:45,861] A new study created in memory with name: no-name-b6e1eec6-48ed-41ee-8e8c-8e9946d86ddf
[W 2024-06-02 19:04:45,878] Trial 0 failed with parameters: {'iterations': 165, 'depth': 6, 'learning_rate': 0.003951491874899117, 'l2_leaf_reg': 9.34040746386525, 'border_count': 70, 'random_strength': 6.902129825165415, 'bagging_temperature': 0.008493201002157516, 'od_type': 'Iter', 'od_wait': 25} because of the following error: CatBoostError('Parameter loss_function should be specified for cross-validation').
Traceback (most recent call last):
  File "/Users/taniyashuba/PycharmProjects/VK_ML_Project/venv/lib/python3.9/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/87/d35_l8056xbbpvj6g4knhvsm0000gn/T/ipykernel_46918/3805093963.py", line 16, in objective
    cv_results = catboost.cv(
  File "/Users/taniyashuba/PycharmProjects/VK_ML_Project/venv/lib/python3.9/site-packages/catboost/core.py", line 6803

CatBoostError: Parameter loss_function should be specified for cross-validation

In [None]:
# Обучение CatBoost с лучшими параметрами на полном наборе данных
final_cat_model = CatBoostClassifier(**best_params)
final_cat_model.fit(X_train_scaled, y_train)

In [None]:
# Настройка параметров мета-модели
meta_params = {'C': [0.1, 1, 10]}
meta_grid = GridSearchCV(LogisticRegression(), meta_params, cv=5, scoring='f1', n_jobs=-1)
meta_grid.fit(X_train_scaled, y_train)
best_meta = meta_grid.best_estimator_

In [None]:
# Настройка параметров базовых моделей
knn_params = {'n_neighbors': [3, 5, 7, 10]}
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5, scoring='f1', n_jobs=-1)
knn_grid.fit(X_train_scaled, y_train)
best_knn = knn_grid.best_estimator_

rf_params = {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20, 30]}
rf_grid = GridSearchCV(RandomForestClassifier(), rf_params, cv=5, scoring='f1', n_jobs=-1)
rf_grid.fit(X_train_scaled, y_train)
best_rf = rf_grid.best_estimator_

svc_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svc_grid = GridSearchCV(SVC(probability=True), svc_params, cv=5, scoring='f1', n_jobs=-1)
svc_grid.fit(X_train_scaled, y_train)
best_svc = svc_grid.best_estimator_

In [None]:
# Стекинг
stacking_model = StackingClassifier(
    estimators=[
        ('knn', best_knn),
        ('rf', best_rf),
        ('svc', best_svc),
        ('cat', final_cat_model)
    ],
    final_estimator=best_meta,
    cv=5
)

stacking_model.fit(X_train_scaled, y_train)

In [None]:
# Предсказание и сохранение результатов
test_predictions = stacking_model.predict(X_test_scaled)

In [None]:
# Сохранение результатов
submission = test_groups[['pair_id']].copy()
submission['target'] = test_predictions
submission.to_csv('submission.csv', index=False)
print('Файл с предсказаниями создан: submission.csv')

In [None]:
# Важность признаков
import matplotlib.pyplot as plt

importances = final_cat_model.get_feature_importance(type='PredictionValuesChange')
feature_importances = pd.Series(importances, index=X_train.columns).sort_values()[-15:]
plt.figure(figsize=(10, 6))
plt.barh(feature_importances.index, feature_importances.values)
plt.title('CatBoost Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()
