In [170]:
!pip install catboost
!pip install pymorphy2
!pip install transformers



In [171]:
import pandas as pd
import numpy as np
import re
import nltk
import pymorphy2
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from transformers import BertTokenizer, BertModel
from sklearn.cluster import DBSCAN
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score, precision_recall_curve, classification_report, accuracy_score
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm
from bs4 import BeautifulSoup
from sklearn.neighbors import KNeighborsClassifier
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [172]:
# Загрузка данных
train_groups = pd.read_csv('train_groups.csv')
test_groups = pd.read_csv('test_groups.csv')
sample_submission = pd.read_csv('sample_submission.csv')
docs_titles = pd.read_csv('docs_titles.tsv', sep='\t')

In [183]:
# Обработка отсутствующих значений
train_data = train_groups.merge(docs_titles, on='doc_id')
test_data = test_groups.merge(docs_titles, on='doc_id', how='left')
train_data['title'].fillna('', inplace=True)
test_data['title'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['title'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['title'].fillna('', inplace=True)


In [184]:
# Загрузка данных для NLTK
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('russian')) | set(stopwords.words('english'))
morph = pymorphy2.MorphAnalyzer()

def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9\s]', '', text.lower())
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words and not token.isdigit()]
    tokens = [morph.parse(word)[0].normal_form for word in tokens]
    return ' '.join(tokens)

train_data['title_processed'] = train_data['title'].apply(preprocess_text)
test_data['title_processed'] = test_data['title'].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [201]:
# Векторизация с помощью BERT
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

def batch_get_bert_embeddings(texts, batch_size=32):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(batch_embeddings)
    return np.array(embeddings)

train_texts = train_data['title_processed'].tolist()
test_texts = test_data['title_processed'].tolist()

train_embeddings = batch_get_bert_embeddings(train_texts)
test_embeddings = batch_get_bert_embeddings(test_texts)

train_data['title_embeddings'] = list(train_embeddings)
test_data['title_embeddings'] = list(test_embeddings)

# Преобразование эмбеддингов в фичи
def embeddings_to_features(data, column_prefix):
    embeddings = np.array(data[column_prefix + '_embeddings'].tolist())  # Используем .tolist() для правильного преобразования
    feature_names = [f"{column_prefix}_embedding_{i}" for i in range(embeddings.shape[1])]
    features_df = pd.DataFrame(embeddings, columns=feature_names, index=data.index)
    return features_df

train_features = embeddings_to_features(train_data, 'title')
test_features = embeddings_to_features(test_data, 'title')

In [195]:
# Новые признаки
def add_new_features(data):
    data['title_length'] = data['title'].apply(lambda x: len(x.split()))
    data['unique_words'] = data['title_processed'].apply(lambda x: len(set(x.split())))
    return data

train_data = add_new_features(train_data)
test_data = add_new_features(test_data)

enhanced_train_data = pd.concat([train_data, train_features], axis=1)
enhanced_test_data = pd.concat([test_data, test_features], axis=1)

In [None]:
# Подготовка данных для обучения
X_train = enhanced_train_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'target', 'title', 'title_processed'], axis=1).values
y_train = enhanced_train_data['target'].values
X_test = enhanced_test_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'title', 'title_processed'], axis=1).values

splitter = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_indices, val_indices = next(splitter.split(X_train, y_train, groups=train_groups['group_id']))

X_train_split = X_train[train_indices]
y_train_split = y_train[train_indices]

X_val_split = X_train[val_indices]
y_val_split = y_train[val_indices]

In [196]:
# Убедитесь, что все данные в виде 2D массивов и преобразованы в float
X_train_split = np.vstack([np.array(x).astype(np.float32) for x in X_train_split])
X_val_split = np.vstack([np.array(x).astype(np.float32) for x in X_val_split])
X_test = np.vstack([np.array(x).astype(np.float32) for x in X_test])

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_split)
X_val_scaled = scaler.transform(X_val_split)
X_test_scaled = scaler.transform(X_test)

In [198]:
class TextDataset(Dataset):
    def __init__(self, features, labels=None):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long) if labels is not None else None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        return self.features[idx]

class SimpleNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        out = self.relu(out)
        out = self.fc4(out)
        return out

In [199]:
# Hyperparameters
input_size = X_train_scaled.shape[1]
num_classes = 2  # Assuming binary classification
learning_rate = 0.001
num_epochs = 20
batch_size = 32

In [200]:
# Prepare datasets and dataloaders
train_dataset = TextDataset(X_train_scaled, y_train_split.values)
val_dataset = TextDataset(X_val_scaled, y_val_split.values)
test_dataset = TextDataset(X_test_scaled)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [None]:
# Model, criterion, and optimizer
model = SimpleNN(input_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)
        outputs = model(features)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Validation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for features, labels in val_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            val_predictions.extend(predicted.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    val_f1 = f1_score(val_labels, val_predictions, average='weighted')
    print(f"Epoch [{epoch + 1}/{num_epochs}], Validation F1-score: {val_f1:.4f}")

In [None]:
# Test predictions
model.eval()
test_predictions = []
with torch.no_grad():
    for features in test_loader:
        features = features.to(device)
        outputs = model(features)
        _, predicted = torch.max(outputs.data, 1)
        test_predictions.extend(predicted.cpu().numpy())

In [None]:
# Saving predictions
submission = test_groups[['pair_id']].copy()
submission['target'] = test_predictions
submission.to_csv('/content/submission.csv', index=False)

print('Файл с предсказаниями создан: submission.csv')

In [167]:
def vectorize_group(group):
    vectorizer = TfidfVectorizer(tokenizer=preprocess_text)
    vectors = vectorizer.fit_transform(group['title'])
    return vectors

grouped = train_data.groupby('group_id')

tfidf_train_df = pd.DataFrame()
similarity_features_list = []

for name, group in tqdm(grouped, desc="Processing groups"):
    tfidf_matrix = vectorize_group(group)
    group_tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])], index=group.index)
    tfidf_train_df = pd.concat([tfidf_train_df, group_tfidf_df])

    cosine_sim_matrix = cosine_similarity(tfidf_matrix)
    dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine').fit(cosine_sim_matrix)
    cluster_labels = dbscan.labels_

    for k, (idx, row) in enumerate(group.iterrows()):
        all_dist = []

        for j in range(len(group)):
            if k == j:
                continue
            all_dist.append(cosine_sim_matrix[k, j])

        top_15_similarities = sorted(all_dist, reverse=True)[:10]
        top_15_similarities.append(cluster_labels[k])

        similarity_record = [row['pair_id'], row['group_id'], row['doc_id']] + top_15_similarities
        similarity_features_list.append(similarity_record)

similarity_columns = ['pair_id', 'group_id', 'doc_id'] + [f'top_{i+1}_similarity' for i in range(11)]
similarity_features = pd.DataFrame(similarity_features_list, columns=similarity_columns)

tfidf_train_df = tfidf_train_df.fillna(0)
similarity_features = similarity_features.fillna(0)

enhanced_train_data = train_data.merge(similarity_features, on=['pair_id', 'group_id', 'doc_id'])
enhanced_train_data = pd.concat([enhanced_train_data, train_features], axis=1)

grouped = test_data.groupby('group_id')

tfidf_test_df = pd.DataFrame()
similarity_features_list = []

for name, group in tqdm(grouped, desc="Processing groups"):
    tfidf_matrix = vectorize_group(group)
    group_tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])], index=group.index)
    tfidf_test_df = pd.concat([tfidf_test_df, group_tfidf_df])

    cosine_sim_matrix = cosine_similarity(tfidf_matrix)
    dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine').fit(cosine_sim_matrix)
    cluster_labels = dbscan.labels_

    for k, (idx, row) in enumerate(group.iterrows()):
        all_dist = []

        for j in range(len(group)):
            if k == j:
                continue
            all_dist.append(cosine_sim_matrix[k, j])

        top_15_similarities = sorted(all_dist, reverse=True)[:10]
        top_15_similarities.append(cluster_labels[k])

        similarity_record = [row['pair_id'], row['group_id'], row['doc_id']] + top_15_similarities
        similarity_features_list.append(similarity_record)

similarity_columns = ['pair_id', 'group_id', 'doc_id'] + [f'top_{i+1}_similarity' for i in range(11)]
similarity_features = pd.DataFrame(similarity_features_list, columns=similarity_columns)

tfidf_test_df = tfidf_test_df.fillna(0)
similarity_features = similarity_features.fillna(0)

enhanced_test_data = test_data.merge(similarity_features, on=['pair_id', 'group_id', 'doc_id'])
enhanced_test_data = pd.concat([enhanced_test_data, test_features], axis=1)

X_test = enhanced_test_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'title', 'title_processed'], axis=1)

X_train = enhanced_train_data.drop(columns=['doc_id', 'pair_id', 'group_id', 'target', 'title', 'title_processed'], axis=1)
y_train = enhanced_train_data['target']

splitter = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_indices, val_indices = next(splitter.split(X_train, y_train, train_groups['group_id']))

X_train_split = X_train.iloc[train_indices]
y_train_split = y_train.iloc[train_indices]

X_val_split = X_train.iloc[val_indices]
y_val_split = y_train.iloc[val_indices]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_split)
X_val_scaled = scaler.transform(X_val_split)
X_test_scaled = scaler.transform(X_test)

class TextDataset(Dataset):
    def __init__(self, features, labels=None):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long) if labels is not None else None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        return self.features[idx]

class SimpleNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        out = self.relu(out)
        out = self.fc4(out)
        return out

# Hyperparameters
input_size = X_train_scaled.shape[1]
num_classes = 2  # Assuming binary classification
learning_rate = 0.001
num_epochs = 20
batch_size = 32

# Prepare datasets and dataloaders
train_dataset = TextDataset(X_train_scaled, y_train_split.values)
val_dataset = TextDataset(X_val_scaled, y_val_split.values)
test_dataset = TextDataset(X_test_scaled)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Model, criterion, and optimizer
model = SimpleNN(input_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)
        outputs = model(features)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for features, labels in val_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            val_predictions.extend(predicted.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    val_f1 = f1_score(val_labels, val_predictions, average='weighted')
    print(f"Epoch [{epoch + 1}/{num_epochs}], Validation F1-score: {val_f1:.4f}")

# Test predictions
model.eval()
test_predictions = []
with torch.no_grad():
    for features in test_loader:
        features = features.to(device)
        outputs = model(features)
        _, predicted = torch.max(outputs.data, 1)
        test_predictions.extend(predicted.cpu().numpy())

# Saving predictions
submission = test_groups[['pair_id']].copy()
submission['target'] = test_predictions
submission.to_csv('/content/submission.csv', index=False)

print('Файл с предсказаниями создан: submission.csv')




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['title'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['title'].fillna('', inplace=True)
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/taniyashuba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nl

ValueError: setting an array element with a sequence.