In [7]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score, classification_report, accuracy_score
from torch.utils.data import Dataset, DataLoader

In [8]:
# Загрузка данных
train_groups = pd.read_csv('train_groups.csv')
test_groups = pd.read_csv('test_groups.csv')
sample_submission = pd.read_csv('sample_submission.csv')
docs_titles = pd.read_csv('docs_titles.tsv', sep='\t')

In [9]:
# Объединение заголовков с данными групп
train_data = train_groups.merge(docs_titles, on='doc_id')
test_data = test_groups.merge(docs_titles, on='doc_id')

In [10]:
# Обработка отсутствующих значений
train_data['title'].fillna('', inplace=True)
test_data['title'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['title'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['title'].fillna('', inplace=True)


In [11]:
# Загрузка данных для NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('russian')) | set(stopwords.words('english'))
stemmer = SnowballStemmer("russian")

def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)  # Удаление HTML-тегов
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9\s]', '', text.lower())  # Удаление спецсимволов
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words and not token.isdigit()]
    tokens = [stemmer.stem(word) for word in tokens]
    return tokens

train_data['title_processed'] = train_data['title'].apply(preprocess_text)
test_data['title_processed'] = test_data['title'].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RangoPA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RangoPA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\RangoPA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Определите устройство (CPU или CUDA)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Загрузите токенайзер и модель на устройство
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

def get_bert_embeddings(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu()

train_data['title_embeddings'] = train_data['title'].apply(lambda x: get_bert_embeddings(x).numpy())
test_data['title_embeddings'] = test_data['title'].apply(lambda x: get_bert_embeddings(x).numpy())

def embeddings_to_features(data, column_prefix):
    embeddings = np.stack(data[column_prefix + '_embeddings'].values)
    feature_names = [f"{column_prefix}_embedding_{i}" for i in range(embeddings.shape[1])]
    features_df = pd.DataFrame(embeddings, columns=feature_names, index=data.index)
    return features_df

train_features = embeddings_to_features(train_data, 'title')
test_features = embeddings_to_features(test_data, 'title')

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [13]:
# Новые признаки
def add_new_features(data):
    # Длина заголовка
    data['title_length'] = data['title'].apply(lambda x: len(x.split()))
    # Количество уникальных слов
    data['unique_words'] = data['title'].apply(lambda x: len(set(x.split())))
    return data

train_data = add_new_features(train_data)
test_data = add_new_features(test_data)

In [14]:
# TF-IDF Векторизация
tfidf_vectorizer = TfidfVectorizer(max_features=100)
tfidf_train = tfidf_vectorizer.fit_transform(train_data['title']).toarray()
tfidf_test = tfidf_vectorizer.transform(test_data['title']).toarray()

tfidf_train_df = pd.DataFrame(tfidf_train, columns=[f'tfidf_{i}' for i in range(tfidf_train.shape[1])], index=train_data.index)
tfidf_test_df = pd.DataFrame(tfidf_test, columns=[f'tfidf_{i}' for i in range(tfidf_test.shape[1])], index=test_data.index)

In [15]:
# Объединение всех признаков
train_features = pd.concat([train_features, tfidf_train_df, train_data[['title_length', 'unique_words']]], axis=1)
test_features = pd.concat([test_features, tfidf_test_df, test_data[['title_length', 'unique_words']]], axis=1)

In [16]:
# Подготовка данных для обучения
X_train = train_features
y_train = train_data['target']
X_test = test_features

In [17]:
# Разбиение данных на обучающую и тестовую выборки
splitter = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_indices, val_indices = next(splitter.split(X_train, y_train, train_groups['group_id']))

X_train_split = X_train.iloc[train_indices]
y_train_split = y_train.iloc[train_indices]

X_val_split = X_train.iloc[val_indices]
y_val_split = y_train.iloc[val_indices]

In [18]:
# Скейлинг данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_split)
X_val_scaled = scaler.transform(X_val_split)

class TextDataset(Dataset):
    def __init__(self, features, labels=None):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long) if labels is not None else None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        return self.features[idx]

In [19]:
# Создание DataLoader'ов
batch_size = 32
train_dataset = TextDataset(X_train_scaled, y_train_split.values)
val_dataset = TextDataset(X_val_scaled, y_val_split.values)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [20]:
# Определение модели
class SimpleNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out

In [21]:
# Определение параметров модели
input_size = X_train_scaled.shape[1]
num_classes = 2
learning_rate = 0.001
num_epochs = 20

model = SimpleNN(input_size, num_classes)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [22]:
# Обучение модели
for epoch in range(num_epochs):
    model.train()
    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)
        outputs = model(features)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for features, labels in val_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            val_predictions.extend(predicted.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    val_f1 = f1_score(val_labels, val_predictions)
    print(f"Epoch [{epoch+1}/{num_epochs}], Validation F1-score: {val_f1:.4f}")

Epoch [1/20], Validation F1-score: 0.2965
Epoch [2/20], Validation F1-score: 0.3241
Epoch [3/20], Validation F1-score: 0.3268
Epoch [4/20], Validation F1-score: 0.3224
Epoch [5/20], Validation F1-score: 0.2790
Epoch [6/20], Validation F1-score: 0.3430
Epoch [7/20], Validation F1-score: 0.3446
Epoch [8/20], Validation F1-score: 0.2987
Epoch [9/20], Validation F1-score: 0.3446
Epoch [10/20], Validation F1-score: 0.3215
Epoch [11/20], Validation F1-score: 0.3192
Epoch [12/20], Validation F1-score: 0.3298
Epoch [13/20], Validation F1-score: 0.3176
Epoch [14/20], Validation F1-score: 0.3252
Epoch [15/20], Validation F1-score: 0.3086
Epoch [16/20], Validation F1-score: 0.3257
Epoch [17/20], Validation F1-score: 0.3190
Epoch [18/20], Validation F1-score: 0.3049
Epoch [19/20], Validation F1-score: 0.2949
Epoch [20/20], Validation F1-score: 0.2631


In [23]:
# Предсказание на тестовом наборе
X_test_scaled = scaler.transform(X_test)
test_dataset = TextDataset(X_test_scaled)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model.eval()
test_predictions = []
with torch.no_grad():
    for features in test_loader:
        features = features.to(device)
        outputs = model(features)
        _, predicted = torch.max(outputs.data, 1)
        test_predictions.extend(predicted.cpu().numpy())

In [24]:
# Сохранение результатов
submission = test_groups[['pair_id']].copy()
submission['target'] = test_predictions
submission.to_csv('/mnt/data/submission.csv', index=False)

print('Файл с предсказаниями создан: submission.csv')

ValueError: Length of values (16551) does not match length of index (16627)