In [1]:
!pip install catboost
!pip install pymorphy2



In [2]:
import pandas as pd
import numpy as np
import re
import nltk
import pymorphy2
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from transformers import BertTokenizer, BertModel
from sklearn.cluster import DBSCAN
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score, precision_recall_curve, classification_report, accuracy_score
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm
from bs4 import BeautifulSoup
from sklearn.neighbors import KNeighborsClassifier



In [3]:
# Загрузка данных
train_groups = pd.read_csv('train_groups.csv')
test_groups = pd.read_csv('test_groups.csv')
sample_submission = pd.read_csv('sample_submission.csv')
docs_titles = pd.read_csv('docs_titles.tsv', sep='\t')

In [4]:
# Объединение заголовков с данными групп
train_data = train_groups.merge(docs_titles, on='doc_id')
test_data = test_groups.merge(docs_titles, on='doc_id', how='left')

In [5]:
# Обработка отсутствующих значений
train_data['title'].fillna('', inplace=True)
test_data['title'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['title'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['title'].fillna('', inplace=True)


In [6]:
# Загрузка данных для NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('russian')) | set(stopwords.words('english'))
# Пробуем стеммер или леммер
stemmer = SnowballStemmer("russian")
morph = pymorphy2.MorphAnalyzer()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)  # Удаление HTML-тегов
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9\s]', '', text.lower())  # Удаление спецсимволов
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words and not token.isdigit()]
    # tokens = [stemmer.stem(word) for word in tokens]
    tokens = [morph.parse(word)[0].normal_form for word in tokens]
    return tokens

train_data['title_processed'] = train_data['title'].apply(preprocess_text)
test_data['title_processed'] = test_data['title'].apply(preprocess_text)

In [8]:
# Векторизация с помощью BERT
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

def get_bert_embeddings(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu()

train_data['title_embeddings'] = train_data['title'].apply(lambda x: get_bert_embeddings(x).numpy())
test_data['title_embeddings'] = test_data['title'].apply(lambda x: get_bert_embeddings(x).numpy())

KeyboardInterrupt: 

In [None]:
def embeddings_to_features(data, column_prefix):
    embeddings = np.stack(data[column_prefix + '_embeddings'].values)
    feature_names = [f"{column_prefix}_embedding_{i}" for i in range(embeddings.shape[1])]
    features_df = pd.DataFrame(embeddings, columns=feature_names, index=data.index)
    return features_df

train_features = embeddings_to_features(train_data, 'title')
test_features = embeddings_to_features(test_data, 'title')

In [None]:
# Новые признаки
def add_new_features(data):
    # Длина заголовка
    data['title_length'] = data['title'].apply(lambda x: len(x.split()))
    # Количество уникальных слов
    # data['unique_words'] = data['title'].apply(lambda x: len(set(x.split())))
    data['unique_words'] = data['title_processed'].apply(lambda x: len(set(x)))
    return data

train_data = add_new_features(train_data)
test_data = add_new_features(test_data)

In [None]:
train_data

In [None]:
def vectorize_group(group):
    vectorizer = TfidfVectorizer(tokenizer=preprocess_text)
    vectors = vectorizer.fit_transform(group['title'])
    return vectors

In [None]:
grouped = train_data.groupby('group_id')

tfidf_train_df = pd.DataFrame()
similarity_features_list = []

for name, group in tqdm(grouped, desc="Processing groups"):
    tfidf_matrix = vectorize_group(group)
    group_tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])], index=group.index)
    tfidf_train_df = pd.concat([tfidf_train_df, group_tfidf_df])

    # Вычисляем матрицу косинусных расстояний
    cosine_sim_matrix = cosine_similarity(tfidf_matrix)

    dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine').fit(cosine_sim_matrix)
    cluster_labels = dbscan.labels_


    for k, (idx, row) in enumerate(group.iterrows()):
        all_dist = []

        for j in range(len(group)):
            if k == j:
                continue
            all_dist.append(cosine_sim_matrix[k, j])

        top_15_similarities = sorted(all_dist, reverse=True)[:10]
        top_15_similarities.append(cluster_labels[k])

        similarity_record = [row['pair_id'], row['group_id'], row['doc_id']] + top_15_similarities
        similarity_features_list.append(similarity_record)

# Создаем DataFrame для новых признаков
similarity_columns = ['pair_id', 'group_id', 'doc_id'] + [f'top_{i+1}_similarity' for i in range(11)]
similarity_features = pd.DataFrame(similarity_features_list, columns=similarity_columns)

# Заполняем пропущенные значения нулями
tfidf_train_df = tfidf_train_df.fillna(0)
similarity_features = similarity_features.fillna(0)

# Объединяем новые признаки с исходным датасетом
enhanced_train_data = train_data.merge(similarity_features, on=['pair_id', 'group_id', 'doc_id'])

In [None]:
train_data

In [None]:
enhanced_train_data = pd.concat([enhanced_train_data, train_features], axis=1)
enhanced_train_data

In [None]:
grouped = test_data.groupby('group_id')

tfidf_test_df = pd.DataFrame()
similarity_features_list = []

for name, group in tqdm(grouped, desc="Processing groups"):
    tfidf_matrix = vectorize_group(group)
    group_tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])], index=group.index)
    tfidf_test_df = pd.concat([tfidf_test_df, group_tfidf_df])

    # Вычисляем матрицу косинусных расстояний
    cosine_sim_matrix = cosine_similarity(tfidf_matrix)

    dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine').fit(cosine_sim_matrix)
    cluster_labels = dbscan.labels_

    for k, (idx, row) in enumerate(group.iterrows()):
        all_dist = []

        for j in range(len(group)):
            if k == j:
                continue
            all_dist.append(cosine_sim_matrix[k, j])

        top_15_similarities = sorted(all_dist, reverse=True)[:10]
        top_15_similarities.append(cluster_labels[k])

        similarity_record = [row['pair_id'], row['group_id'], row['doc_id']] + top_15_similarities
        similarity_features_list.append(similarity_record)

# Создаем DataFrame для новых признаков
similarity_columns = ['pair_id', 'group_id', 'doc_id'] + [f'top_{i+1}_similarity' for i in range(11)]
similarity_features = pd.DataFrame(similarity_features_list, columns=similarity_columns)

# Заполняем пропущенные значения нулями
tfidf_test_df = tfidf_test_df.fillna(0)
similarity_features = similarity_features.fillna(0)

# Объединяем новые признаки с исходным датасетом
enhanced_test_data = test_data.merge(similarity_features, on=['pair_id', 'group_id', 'doc_id'])
# enhanced_train_data = pd.concat([enhanced_train_data, tfidf_train_df], axis=1)

In [None]:
enhanced_test_data = pd.concat([enhanced_test_data, test_features], axis=1)

In [None]:
enhanced_test_data

In [None]:
# X_test = enhanced_test_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'title', 'title_processed', 'title_embeddings'], axis=1)
X_test = enhanced_test_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'title', 'title_processed'], axis=1)

In [None]:
X_test

In [None]:
# Объединение всех признаков
# train_features = pd.concat([similarity_features], axis=1)
# train_features = pd.concat([train_features, tfidf_train_df, train_data[['title_length', 'unique_words']]], axis=1)
# test_features = pd.concat([test_features, tfidf_test_df, test_data[['title_length', 'unique_words']]], axis=1)

In [None]:
# Подготовка данных для обучения
# X_train = enhanced_train_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'target', 'title', 'title_processed', 'title_embeddings'], axis=1)
X_train = enhanced_train_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'target', 'title', 'title_processed'], axis=1)
y_train = enhanced_train_data['target']
# X_test = test_features

In [None]:
X_train

In [None]:
# Разбиение данных на обучающую и тестовую выборки
splitter = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_indices, val_indices = next(splitter.split(X_train, y_train, train_groups['group_id']))

X_train_split = X_train.iloc[train_indices]
y_train_split = y_train.iloc[train_indices]

X_val_split = X_train.iloc[val_indices]
y_val_split = y_train.iloc[val_indices]

In [None]:
# # Скейлинг данных
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train_split)
# X_val_scaled = scaler.transform(X_val_split)

In [None]:
# Обучение модели CatBoost
model = CatBoostClassifier()
train_pool = Pool(X_train_split, y_train_split)
val_pool = Pool(X_val_split, y_val_split)
model.fit(train_pool, eval_set=val_pool)

In [None]:
# Предсказания на валидационной выборке
val_predictions = model.predict(val_pool)

In [None]:
# Оценка модели
accuracy = accuracy_score(y_val_split, val_predictions)
f1 = f1_score(y_val_split, val_predictions)
report = classification_report(y_val_split, val_predictions)

print(f"Accuracy: {accuracy}")
print(f"F1-score: {f1}")
print(report)

In [None]:
# Предсказание на тестовом наборе
# X_test_scaled = scaler.transform(X_test)
test_pool = Pool(X_test)
test_predictions = model.predict(test_pool)

In [None]:
# Сохранение результатов
submission = test_groups[['pair_id']].copy()
submission['target'] = test_predictions
submission.to_csv('/content/submission.csv', index=False)

print('Файл с предсказаниями создан: submission.csv')