In [120]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
from sklearn.utils.class_weight import compute_class_weight

In [121]:
train_groups = pd.read_csv('train_groups.csv')
test_groups = pd.read_csv('test_groups.csv')
docs_titles = pd.read_csv('docs_titles.tsv', sep='\t')

docs_titles['title'] = docs_titles['title'].fillna('')  # Замена пропущенных заголовков на пустую строку

train_data = train_groups.merge(docs_titles, on='doc_id', how='left')
test_data = test_groups.merge(docs_titles, on='doc_id', how='left')

In [122]:
# Проверка типов данных в столбце 'title'
print(train_data['title'].apply(type).value_counts())
print(test_data['title'].apply(type).value_counts())

title
<class 'str'>    11690
Name: count, dtype: int64
title
<class 'str'>      16551
<class 'float'>       76
Name: count, dtype: int64


In [123]:
# Преобразование всех значений в строку, заменив NaN на пустую строку
train_data['title'] = train_data['title'].astype(str)
test_data['title'] = test_data['title'].astype(str).fillna('')

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('russian')) | set(stopwords.words('english'))
stemmer = SnowballStemmer("russian")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [124]:
def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)  # Удаление HTML-тегов
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9\s]', '', text.lower())  # Удаление спецсимволов
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words and not token.isdigit()]
    tokens = [stemmer.stem(word) for word in tokens]
    processed_text = ' '.join(tokens)
    return processed_text if processed_text else 'empty'  # Заменить пустой текст на 'empty'


train_data['title_processed'] = train_data['title'].apply(preprocess_text)
test_data['title_processed'] = test_data['title'].apply(preprocess_text)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [144]:
def get_bert_embeddings(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].cpu().numpy().flatten()

In [145]:
# Обработка возможных пустых значений при векторизации
def safe_get_bert_embeddings(text):
    try:
        embeddings = get_bert_embeddings(text)
        if embeddings.size == 0:
            print(f"Empty embeddings for text: {text[:50]}...")
        return embeddings
    except Exception as e:
        print(f"Error processing text: {text[:50]}... Error: {e}")
        return np.zeros(768)  # Размерность вектора BERT


train_data['title_embeddings'] = train_data['title'].apply(lambda x: safe_get_bert_embeddings(x))
test_data['title_embeddings'] = test_data['title'].apply(lambda x: safe_get_bert_embeddings(x))

In [146]:
def embeddings_to_features(data, column_prefix):
    embeddings = [e for e in data[column_prefix + '_embeddings'].values if e is not None and len(e) > 0]
    if len(embeddings) == 0:
        raise ValueError('No embeddings found to stack')
    embeddings = np.stack(embeddings)
    feature_names = [f"{column_prefix}_embedding_{i}" for i in range(embeddings.shape[1])]
    features_df = pd.DataFrame(embeddings, columns=feature_names, index=data.index)
    return features_df


train_features = embeddings_to_features(train_data, 'title')
test_features = embeddings_to_features(test_data, 'title')

In [147]:
def add_new_features(data):
    # Длина заголовка
    data['title_length'] = data['title'].apply(lambda x: len(x.split()))
    # Количество уникальных слов
    data['unique_words'] = data['title'].apply(lambda x: len(set(x.split())))
    return data


train_data = add_new_features(train_data)
test_data = add_new_features(test_data)

In [148]:
def group_tfidf_vectorize(data, groups):
    tfidf_features = pd.DataFrame(index=data.index)
    tfidf_vectorizer = TfidfVectorizer(max_features=100)

    for group_id in groups:
        group_indices = data[data['group_id'] == group_id].index
        group_texts = data.loc[group_indices, 'title_processed']
        if group_texts.empty:
            continue
        tfidf_matrix = tfidf_vectorizer.fit_transform(group_texts)
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])],
                                index=group_indices)
        tfidf_features = pd.concat([tfidf_features, tfidf_df], axis=0)

    tfidf_features = tfidf_features.reset_index(drop=True)
    data = data.reset_index(drop=True)
    tfidf_features = tfidf_features.reindex(data.index)  # Убедиться, что индексы совпадают
    return tfidf_features


tfidf_train_df = group_tfidf_vectorize(train_data, train_data['group_id'].unique())
tfidf_test_df = group_tfidf_vectorize(test_data, test_data['group_id'].unique())

train_features = pd.concat([train_features, tfidf_train_df, train_data[['title_length', 'unique_words']]], axis=1)
test_features = pd.concat([test_features, tfidf_test_df, test_data[['title_length', 'unique_words']]], axis=1)

In [149]:
# Проверка данных на наличие NaN и бесконечных значений
print(train_features.isnull().sum().sum())
print(test_features.isnull().sum().sum())

print(np.isinf(train_features).sum().sum())
print(np.isinf(test_features).sum().sum())

1169000
1662700
0
0


In [150]:
# Замена NaN значений на 0
train_features = train_features.fillna(0)
test_features = test_features.fillna(0)

In [151]:
# Замена бесконечных значений на конечные
train_features = train_features.replace([np.inf, -np.inf], 0)
test_features = test_features.replace([np.inf, -np.inf], 0)

X_train = train_features
y_train = train_data['target']
X_test = test_features

In [152]:
# Проверка количества строк после обработки данных
print(f"Train features shape: {train_features.shape}")
print(f"Test features shape: {test_features.shape}")
print(f"Train labels shape: {y_train.shape}")

Train features shape: (11690, 870)
Test features shape: (16627, 870)
Train labels shape: (11690,)


In [153]:
# Проверка уникальности идентификаторов
test_group_ids = set(test_groups['doc_id'])
test_feature_ids = set(test_data['doc_id'])
missing_ids = test_group_ids - test_feature_ids
print(f"Missing IDs in test_features: {len(missing_ids)}")

Missing IDs in test_features: 0


In [154]:
# Восстановление отсутствующих строк
missing_data = test_groups[test_groups['doc_id'].isin(missing_ids)]
missing_data = missing_data.merge(docs_titles, on='doc_id', how='left')
missing_data['title'] = missing_data['title'].fillna('')
missing_data['title_processed'] = missing_data['title'].apply(preprocess_text)
missing_data['title_embeddings'] = missing_data['title'].apply(lambda x: safe_get_bert_embeddings(x))

In [155]:
# Debugging: Check the embeddings for missing data
print("Generating embeddings for missing data:")
missing_data['title_embeddings'] = missing_data['title'].apply(lambda x: safe_get_bert_embeddings(x))

Generating embeddings for missing data:


In [156]:
# Check the length of each embedding
embedding_lengths = missing_data['title_embeddings'].apply(lambda x: len(x))
print("Embedding lengths summary:")
print(embedding_lengths.describe())

Embedding lengths summary:
count       0
unique      0
top       NaN
freq      NaN
Name: title_embeddings, dtype: object


In [157]:
# Check for non-empty embeddings
non_empty_embeddings = missing_data['title_embeddings'].apply(lambda x: len(x) > 0).sum()
print(f"Number of non-empty embeddings: {non_empty_embeddings}")

Number of non-empty embeddings: 0


In [158]:
# If there are non-empty embeddings, proceed
if non_empty_embeddings > 0:
    missing_features = embeddings_to_features(missing_data, 'title')
    missing_tfidf_features = group_tfidf_vectorize(missing_data, missing_data['group_id'].unique())
    
    # Новые признаки для отсутствующих данных
    missing_data = add_new_features(missing_data)

    # Объединение всех признаков для отсутствующих данных
    missing_features = pd.concat([missing_features, missing_tfidf_features, missing_data[['title_length', 'unique_words']]],
                                 axis=1)

    # Замена NaN значений на 0 и бесконечных значений на конечные
    missing_features = missing_features.fillna(0)
    missing_features = missing_features.replace([np.inf, -np.inf], 0)

    # Добавление отсутствующих данных к основным признакам теста
    test_features = pd.concat([test_features, missing_features])
    
print(f"Test features shape after adding missing data: {test_features.shape}")
print(f"Test groups shape: {test_groups.shape}")

Test features shape after adding missing data: (16627, 870)
Test groups shape: (16627, 3)


In [159]:
splitter = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_indices, val_indices = next(splitter.split(X_train, y_train, groups=train_groups['group_id']))

X_train_split = X_train.iloc[train_indices]
y_train_split = y_train.iloc[train_indices]

X_val_split = X_train.iloc[val_indices]
y_val_split = y_train.iloc[val_indices]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_split)
X_val_scaled = scaler.transform(X_val_split)
X_test_scaled = scaler.transform(X_test)  # Don't forget to scale test data

In [160]:
class TextDataset(Dataset):
    def __init__(self, features, labels=None):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long) if labels is not None else None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        return self.features[idx]


batch_size = 32
train_dataset = TextDataset(X_train_scaled, y_train_split.values)
val_dataset = TextDataset(X_val_scaled, y_val_split.values)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [161]:
class SimpleNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        out = self.relu(out)
        out = self.fc4(out)
        return out


input_size = X_train_scaled.shape[1]
num_classes = 2
learning_rate = 0.001
num_epochs = 20

model = SimpleNN(input_size, num_classes)
model.to(device)


SimpleNN(
  (fc1): Linear(in_features=870, out_features=512, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=2, bias=True)
)

In [162]:
# Определение весов классов
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_split), y=y_train_split)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

In [163]:
# Обновление функции потерь с учетом весов классов
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    model.train()
    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)
        outputs = model(features)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for features, labels in val_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            val_predictions.extend(predicted.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    val_f1 = f1_score(val_labels, val_predictions, average='weighted')
    print(f"Epoch [{epoch + 1}/{num_epochs}], Validation F1-score: {val_f1:.4f}")

Epoch [1/20], Validation F1-score: 0.7501
Epoch [2/20], Validation F1-score: 0.7247
Epoch [3/20], Validation F1-score: 0.7336
Epoch [4/20], Validation F1-score: 0.7530
Epoch [5/20], Validation F1-score: 0.7075
Epoch [6/20], Validation F1-score: 0.7407
Epoch [7/20], Validation F1-score: 0.7043
Epoch [8/20], Validation F1-score: 0.7507
Epoch [9/20], Validation F1-score: 0.6944
Epoch [10/20], Validation F1-score: 0.7494
Epoch [11/20], Validation F1-score: 0.7303
Epoch [12/20], Validation F1-score: 0.7303
Epoch [13/20], Validation F1-score: 0.7264
Epoch [14/20], Validation F1-score: 0.7282
Epoch [15/20], Validation F1-score: 0.7307
Epoch [16/20], Validation F1-score: 0.7070
Epoch [17/20], Validation F1-score: 0.7416
Epoch [18/20], Validation F1-score: 0.7160
Epoch [19/20], Validation F1-score: 0.7482
Epoch [20/20], Validation F1-score: 0.7419


In [164]:
test_dataset = TextDataset(X_test_scaled)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model.eval()
test_predictions = []
with torch.no_grad():
    for features in test_loader:
        features = features.to(device)
        outputs = model(features)
        _, predicted = torch.max(outputs.data, 1)
        test_predictions.extend(predicted.cpu().numpy())

print(f"Количество предсказаний: {len(test_predictions)}")
print(f"Количество строк в test_groups: {len(test_groups)}")

Количество предсказаний: 16627
Количество строк в test_groups: 16627


In [166]:
if len(test_predictions) == len(test_groups):
    submission = test_groups[['pair_id']].copy()
    submission['target'] = test_predictions
    submission.to_csv('submission.csv', index=False)
    print('Файл с предсказаниями создан: submission.csv')
else:
    print("Ошибка: количество предсказаний не совпадает с количеством строк в test_groups")

Файл с предсказаниями создан: submission.csv
