#Загрузка моделей и библиотек

In [1]:
import tokenizer
!pip install catboost
!pip install pymorphy2



In [2]:
import pandas as pd
import numpy as np
import re
import nltk
import pymorphy2
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import DBSCAN
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score, classification_report, accuracy_score
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")



# Загрузка данных

In [3]:
# Загрузка данных
train_groups = pd.read_csv('train_groups.csv')
test_groups = pd.read_csv('test_groups.csv')
sample_submission = pd.read_csv('sample_submission.csv')
docs_titles = pd.read_csv('docs_titles.tsv', sep='\t')

In [4]:
# Объединение заголовков с данными групп
train_data = train_groups.merge(docs_titles, on='doc_id')
test_data = test_groups.merge(docs_titles, on='doc_id', how='left')

In [5]:
# Просмотр пропусков
train_data.isna().sum(), test_data.isna().sum()

(pair_id      0
 group_id     0
 doc_id       0
 target       0
 title       16
 dtype: int64,
 pair_id      0
 group_id     0
 doc_id       0
 title       92
 dtype: int64)

In [6]:
# Обработка отсутствующих значений
train_data['title'].fillna('', inplace=True)
test_data['title'].fillna('', inplace=True)

# Предобработка данных

In [7]:
# Загрузка данных для NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Стоп-слова и шум
stop_words = set(stopwords.words('russian')) | set(stopwords.words('english'))

# Пробуем стеммер или леммер
stemmer = SnowballStemmer("russian")
morph = pymorphy2.MorphAnalyzer()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
def preprocess_text(text):
    """
    Токенизация + лемматизация/стемминг текста
    """
    text = re.sub(r'<.*?>', '', text)  # Удаление HTML-тегов
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9\s]', '', text.lower())  # Удаление спецсимволов
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words and not token.isdigit()]
    tokens = [morph.parse(word)[0].normal_form for word in tokens]  # Лемматизация
    return tokens

train_data['title_processed'] = train_data['title'].apply(preprocess_text)
test_data['title_processed'] = test_data['title'].apply(preprocess_text)

# Юзанье Word2wec

In [9]:
from gensim.models import KeyedVectors

word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
w2v_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

def get_w2v_embeddings(tokens):
    vectors = [w2v_model[word] for word in tokens if word in w2v_model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(w2v_model.vector_size)

train_data['w2v_embeddings'] = train_data['title_processed'].apply(get_w2v_embeddings)
test_data['w2v_embeddings'] = test_data['title_processed'].apply(get_w2v_embeddings)

In [10]:
def embeddings_to_features(data, column_name):
    embeddings = np.stack(data[column_name].values)
    feature_names = [f"{column_name}_{i}" for i in range(embeddings.shape[1])]
    return pd.DataFrame(embeddings, columns=feature_names, index=data.index)

train_w2v_features = embeddings_to_features(train_data, 'w2v_embeddings')
test_w2v_features = embeddings_to_features(test_data, 'w2v_embeddings')

# Векторизация BERT

In [14]:
from transformers import BertTokenizer, BertModel

# Векторизация с помощью BERT
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

def get_bert_embeddings(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu()

In [15]:
# Применение BERT для векторизации заголовков
train_data['bert_embeddings'] = train_data['title'].apply(lambda x: get_bert_embeddings(x).numpy())
test_data['bert_embeddings'] = test_data['title'].apply(lambda x: get_bert_embeddings(x).numpy())

In [29]:
def embeddings_to_features_bert(data, column_name):
    embeddings = np.vstack(data[column_name].values)
    feature_names = [f"{column_name}_{i}" for i in range(embeddings.shape[1])]
    return pd.DataFrame(embeddings, columns=feature_names, index=data.index)

In [30]:
train_bert_features = embeddings_to_features(train_data, 'bert_embeddings')
test_bert_features = embeddings_to_features(test_data, 'bert_embeddings')

KeyError: 'bert_embeddings_embeddings'

# Создание фичей

In [17]:
def embeddings_to_features(data, column_prefix):
    """
    Преобразуем эмбеддинги в фичи
    """
    embeddings = np.stack(data[column_prefix + '_embeddings'].values)
    feature_names = [f"{column_prefix}_embedding_{i}" for i in range(embeddings.shape[1])]
    features_df = pd.DataFrame(embeddings, columns=feature_names, index=data.index)
    return features_df

train_features = embeddings_to_features(train_data, 'title')
test_features = embeddings_to_features(test_data, 'title')

KeyError: 'title_embeddings'

In [18]:
def add_new_features(data):
    """
    Добавляем новые признаки: длина заголовка и число уникальных слов
    """
    data['title_length'] = data['title'].apply(lambda x: len(x.split()))
    data['unique_words'] = data['title_processed'].apply(lambda x: len(set(x)))
    return data

train_data = add_new_features(train_data)
test_data = add_new_features(test_data)

In [19]:
def vectorize_group(group):
    """
    Векторизуем группу документов с кастомным токенайзером
    """
    vectorizer = TfidfVectorizer(tokenizer=preprocess_text)
    vectors = vectorizer.fit_transform(group['title'])
    return vectors

In [20]:
def data_to_tfidf(data_grouped):
    """
    Создание tfidf матрицы для данных по группам
    """
    tfidf_data = pd.DataFrame()
    for name, group in tqdm(data_grouped, desc="Processing groups"):
        # Для каждой группы получаем векторное представление
        tfidf_matrix = vectorize_group(group)
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])], index=group.index)
        # Соединяем в df все группы
        tfidf_data = pd.concat([tfidf_data, tfidf_df])
    return tfidf_data

In [21]:
def cosine_matrix_group(group):
    """
    Вычисление матрицы косинусных расстояний для группы
    """
    tfidf_matrix = vectorize_group(group)
    cosine_matrix = cosine_similarity(tfidf_matrix)
    return cosine_matrix

In [22]:
def calc_cosine_similarity(data_grouped, count=10):
    """
    Возвращаем топ косинусных сходств для каждого документа группы
    """
    similarity_features_list = []
    for name, group in tqdm(data_grouped, desc="Processing groups"):
        cosine_matrix = cosine_matrix_group(group)
        for k, (idx, row) in enumerate(group.iterrows()):
            similarities = []
            for j in range(len(group)):
                if k == j:
                    continue
                similarities.append(cosine_matrix[k, j])
            top_similarities = sorted(similarities, reverse=True)[:count]
            similarity_record = [row['pair_id']] + top_similarities
            similarity_features_list.append(similarity_record)
    similarity_columns = ['pair_id'] + [f'top_{i+1}_similarity' for i in range(count)]
    similarity_features = pd.DataFrame(similarity_features_list, columns=similarity_columns)
    return similarity_features

In [23]:
def clustering_features(data_grouped, eps=0.5, min_samples=5, metric='cosine'):
    """
    Возвращаем кластер для каждого документа группы
    """
    clustering_features_list = []
    for name, group in tqdm(data_grouped, desc="Processing groups"):
        cosine_matrix = cosine_matrix_group(group)
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(cosine_matrix)
        cluster_labels = dbscan.labels_
        for k, (idx, row) in enumerate(group.iterrows()):
            clustering_record = [row['pair_id'], cluster_labels[k]]
            clustering_features_list.append(clustering_record)
    clustering_columns = ['pair_id', 'cluster']
    clustering_features = pd.DataFrame(clustering_features_list, columns=clustering_columns)
    return clustering_features

# Объединение данных и фичей в датасет

In [24]:
# Разбиваем train и test по группам на основе id
train_data_grouped = train_data.groupby('group_id')
test_data_grouped = test_data.groupby('group_id')

In [25]:
# # Функция для вычисления RMS
# def fillna_with_rms(df):
#     rms_values = np.sqrt(np.nanmean(df**2, axis=0))
#     df_filled = df.copy()
#     for idx, col in enumerate(df.columns):
#         df_filled[col].fillna(rms_values[idx], inplace=True)
#     return df_filled

In [28]:
# Фичи для трейна и теста
train_similarity_features = calc_cosine_similarity(train_data_grouped)
train_clustering_features = clustering_features(train_data_grouped)

test_similarity_features = calc_cosine_similarity(test_data_grouped)
test_clustering_features = clustering_features(test_data_grouped)

# Замена пропусков на квадратично средние значения
train_similarity_features = train_similarity_features(0)
test_similarity_features = test_similarity_features(0)

TypeError: 'DataFrame' object is not callable

In [None]:
# Объединяем новые признаки с исходным датасетом
train_data = train_data.merge(train_similarity_features, on=['pair_id'])
train_data = train_data.merge(train_clustering_features, on=['pair_id'])
train_data = pd.concat([train_data, train_w2v_features, train_bert_features], axis=1)

test_data = test_data.merge(test_similarity_features, on=['pair_id'])
test_data = test_data.merge(test_clustering_features, on=['pair_id'])
test_data = pd.concat([test_data, test_w2v_features, test_bert_features], axis=1)

In [None]:
# Объединяем с BERT признаками
train_data = pd.concat([train_data, train_w2v_features], axis=1)
test_data = pd.concat([test_data, train_w2v_features], axis=1)

In [None]:
train_data

In [None]:
test_data

# Разбиение на train, val и test

In [None]:
# Определяем целевую переменную и признаки
X_train = train_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'target', 'title', 'title_processed', 'w2v_embeddings', 'bert_embeddings'])
y_train = train_data['target']

X_test = test_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'title', 'title_processed', 'w2v_embeddings', 'bert_embeddings'])

In [None]:
# Разбиваем данные на обучающие и валидационные
splitter = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_indices, val_indices = next(splitter.split(X_train, y_train, groups=train_data['group_id']))

X_train_split = X_train.iloc[train_indices]
y_train_split = y_train.iloc[train_indices]

X_val_split = X_train.iloc[val_indices]
y_val_split = y_train.iloc[val_indices]

In [None]:
# Скейлинг данных
scaler = StandardScaler()
X_train_split = scaler.fit_transform(X_train_split)
X_val_split = scaler.transform(X_val_split)
X_test = scaler.transform(X_test)

# Обучение и инференс модели

In [None]:
# Обучение модели CatBoost
model = CatBoostClassifier()
train_pool = Pool(X_train_split, y_train_split)
val_pool = Pool(X_val_split, y_val_split)
model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=100)

In [None]:
# Предсказания на валидационной выборке
val_predictions = model.predict(val_pool)

In [None]:
# Оценка модели на валидации
accuracy = accuracy_score(y_val_split, val_predictions)
f1 = f1_score(y_val_split, val_predictions)
report = classification_report(y_val_split, val_predictions)

print(f"Accuracy: {accuracy}")
print(f"F1-score: {f1}")
print(report)

In [None]:
# Предсказание на тестовом наборе
test_pool = Pool(X_test)
test_predictions = model.predict(test_pool)

In [None]:
# Сохранение результатов
submission = test_groups[['pair_id']].copy()
submission['target'] = test_predictions
submission.to_csv('submission.csv', index=False)

print('Файл с предсказаниями создан: submission.csv')