In [41]:
! pip install catboost



In [42]:
import pandas as pd

# Load the competition tables from the working directory
train_groups = pd.read_csv('train_groups.csv')
test_groups = pd.read_csv('test_groups.csv')
sample_submission = pd.read_csv('sample_submission.csv')
docs_titles = pd.read_csv('docs_titles.tsv', sep='\t')

# Attach titles to the group rows. A left join keeps every row of the group
# tables; the default inner join silently drops rows whose doc_id has no entry
# in docs_titles, which would leave fewer test rows than the submission needs.
# (assumes doc_id is unique in docs_titles, otherwise rows are duplicated — TODO confirm)
train_data = train_groups.merge(docs_titles, on='doc_id', how='left')
test_data = test_groups.merge(docs_titles, on='doc_id', how='left')

# Replace missing titles with empty strings. The result is assigned back:
# fillna(inplace=True) on a column selection is chained assignment, which
# pandas warns about and which does not modify the frame under Copy-on-Write.
train_data['title'] = train_data['title'].fillna('')
test_data['title'] = test_data['title'].fillna('')

In [43]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import BertTokenizer, BertModel
import torch
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
# Read the input tables; only the titles file is tab-separated
train_groups, test_groups, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train_groups.csv', 'test_groups.csv', 'sample_submission.csv')
)
docs_titles = pd.read_csv('docs_titles.tsv', sep='\t')

In [45]:
# Attach titles to the group rows. A left join keeps every row of the group
# tables; the default inner join silently drops rows whose doc_id has no entry
# in docs_titles, which would leave fewer test rows than the submission needs.
# (assumes doc_id is unique in docs_titles, otherwise rows are duplicated — TODO confirm)
train_data = train_groups.merge(docs_titles, on='doc_id', how='left')
test_data = test_groups.merge(docs_titles, on='doc_id', how='left')

In [46]:
# Replace missing titles with empty strings. The result is assigned back:
# fillna(inplace=True) on a column selection is chained assignment, which
# pandas warns about and which does not modify the frame under Copy-on-Write.
train_data['title'] = train_data['title'].fillna('')
test_data['title'] = test_data['title'].fillna('')

In [47]:
# Fetch the NLTK resources used by the text preprocessing below
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Lazily built NLTK resources shared by every preprocess_text call
_PREPROCESS_CACHE = {}


def _get_preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Turn a title into the set of its lower-cased, lemmatized non-stop-word tokens.

    Args:
        text: the title string, or a null value (None / NaN).

    Returns:
        A set of lemmatized tokens; an empty set when ``text`` is null.
    """
    if pd.isnull(text):
        return set()
    # The stop-word list and the lemmatizer do not depend on the input, so they
    # are built once and reused instead of being recreated for every row.
    stop_words, lemmatizer = _get_preprocess_resources()
    return {
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token not in stop_words
    }

# Store the normalised token set of every title in a new column.
# NOTE(review): no later visible cell reads 'title_processed' — confirm it is still needed.
train_data['title_processed'] = train_data['title'].map(preprocess_text)
test_data['title_processed'] = test_data['title'].map(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [48]:
# BERT vectorisation
BERT_MODEL_NAME = 'bert-base-uncased'
BERT_BATCH_SIZE = 32  # titles per forward pass; lower it if memory is tight

# Run on a GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
# Kept under its own name: a later cell binds `model` to the LightGBM
# classifier, so re-running this cell after training must not pick that up.
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME).to(device)
bert_model.eval()


def get_bert_embeddings(text):
    """Return the first-token ([CLS]) embedding of ``text`` as a CPU tensor.

    Args:
        text: a single string or a list of strings.

    Returns:
        The first-token slice of the last hidden state with size-1 axes
        squeezed out: a 1-D tensor for a single string, a 2-D tensor with one
        row per string for a list of several strings.
    """
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu()


def _embed_titles(titles, batch_size=BERT_BATCH_SIZE):
    """Embed a Series of titles in batches.

    One forward pass per batch replaces one forward pass per row, which is
    what made the row-wise ``apply`` impractically slow.

    Returns:
        A Series on the index of ``titles`` holding one 1-D numpy vector per title.
    """
    texts = titles.tolist()
    vectors = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        batch_vectors = get_bert_embeddings(batch).numpy()
        # squeeze() drops the batch axis for a single-title batch; restore it
        vectors.extend(batch_vectors.reshape(len(batch), -1))
    return pd.Series(vectors, index=titles.index, dtype=object)


train_data['title_embeddings'] = _embed_titles(train_data['title'])
test_data['title_embeddings'] = _embed_titles(test_data['title'])

def embeddings_to_features(data, column_prefix):
    """Expand a column of embedding vectors into one column per dimension.

    Args:
        data: DataFrame with a ``<column_prefix>_embeddings`` column whose
            cells are equal-length 1-D vectors.
        column_prefix: prefix of the source column and of the output columns.

    Returns:
        DataFrame on the index of ``data`` with columns
        ``<column_prefix>_embedding_0 … <column_prefix>_embedding_{d-1}``.
    """
    source_column = column_prefix + '_embeddings'
    matrix = np.stack(data[source_column].tolist())
    n_dims = matrix.shape[1]
    return pd.DataFrame(
        matrix,
        index=data.index,
        columns=[f"{column_prefix}_embedding_{dim}" for dim in range(n_dims)],
    )

# Expand the stored title embeddings of both frames into per-dimension columns
train_features, test_features = (
    embeddings_to_features(frame, 'title') for frame in (train_data, test_data)
)



KeyboardInterrupt: 

In [None]:
# Extra hand-crafted features
def add_new_features(data):
    """Add whitespace-token count columns to ``data`` in place and return it.

    Args:
        data: DataFrame with a string ``title`` column.

    Returns:
        The same DataFrame object, with ``title_length`` (number of
        whitespace-separated tokens) and ``unique_words`` (number of distinct
        tokens) added.
    """
    tokens_per_title = data['title'].apply(lambda title: title.split())
    # Title length in tokens
    data['title_length'] = tokens_per_title.apply(lambda tokens: len(tokens))
    # Number of distinct tokens
    data['unique_words'] = tokens_per_title.apply(lambda tokens: len(set(tokens)))
    return data

# Enrich both frames with the hand-crafted title features
train_data, test_data = add_new_features(train_data), add_new_features(test_data)

In [None]:
# TF-IDF vectorisation: fit on the training titles, reuse the fitted vocabulary for test
tfidf_vectorizer = TfidfVectorizer(max_features=100)
tfidf_train = tfidf_vectorizer.fit_transform(train_data['title']).toarray()
tfidf_test = tfidf_vectorizer.transform(test_data['title']).toarray()

# Both matrices come from the same fitted vectorizer, so they share one set of column names
tfidf_columns = [f'tfidf_{i}' for i in range(tfidf_train.shape[1])]
tfidf_train_df = pd.DataFrame(tfidf_train, index=train_data.index, columns=tfidf_columns)
tfidf_test_df = pd.DataFrame(tfidf_test, index=test_data.index, columns=tfidf_columns)

In [None]:
# Combine all features: BERT embedding columns, TF-IDF columns and the
# hand-crafted counts. The embedding columns are rebuilt from the stored
# vectors instead of being read from train_features/test_features, so
# re-running this cell does not append the TF-IDF and count columns again.
train_features = pd.concat(
    [
        embeddings_to_features(train_data, 'title'),
        tfidf_train_df,
        train_data[['title_length', 'unique_words']],
    ],
    axis=1,
)
test_features = pd.concat(
    [
        embeddings_to_features(test_data, 'title'),
        tfidf_test_df,
        test_data[['title_length', 'unique_words']],
    ],
    axis=1,
)

In [None]:
# Name the model inputs: the two feature matrices and the training labels
X_train, X_test = train_features, test_features
y_train = train_data['target']

In [None]:
# Hold out 20% of the training rows as a validation set (fixed seed)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train the LightGBM binary classifier on the training part of the split
model = lgb.LGBMClassifier(
    objective='binary',
    n_estimators=100,
    learning_rate=0.05,
    num_leaves=31,
)
model.fit(X_train_split, y_train_split)

In [None]:
# Evaluate the model: F1 on the held-out validation rows
y_val_pred = model.predict(X_val_split)
val_f1 = f1_score(y_val_split, y_val_pred)
print(f"F1 score на валидационном наборе: {val_f1:.4f}")

In [None]:
# Predict labels for the test feature matrix
test_predictions = model.predict(X_test)

In [None]:
# Save the results. pair_id is taken from test_data — the frame the test
# features (and therefore the predictions) were built from — so ids and
# predictions line up row by row even if the merge changed the number or order
# of rows relative to test_groups.
submission = test_data[['pair_id']].copy()
submission['target'] = test_predictions
# Written to the working directory, next to the input files and matching the
# message below; the former absolute '/mnt/data/' location is unlikely to exist
# on the Windows machine this notebook was run on.
submission.to_csv('submission.csv', index=False)

print('Файл с предсказаниями создан: submission.csv')