In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%capture
! pip install transformers sentencepiece

In [None]:
%%capture
!git lfs install
!git clone https://huggingface.co/cointegrated/rubert-tiny2

In [None]:
import random
import numpy as np
import pandas as pd
import torch
from torch import nn
from collections import OrderedDict
from transformers import AutoTokenizer, AutoModel

In [None]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed = 42
set_seed(seed)

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

In [None]:
# Hyperparameters
num_epochs = 15
batch_size = 64
learning_rate = 1e-4

In [None]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
        self.base = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

        # freezing base model weights
        for param in self.base.parameters():
            param.requires_grad = False

        n_dim = 312
        self.head = nn.Sequential(OrderedDict( [('dropout', torch.nn.Dropout(.2)),
                                                ('fc_1' , nn.Linear(n_dim, n_dim//2)),
                                                ('relu_1' , nn.ReLU()),
                                                ('batchnorm_1' , nn.BatchNorm1d(n_dim//2, eps=1e-12)),
                                                ('dropout', torch.nn.Dropout(.2)),
                                                ('fc_3' , nn.Linear(n_dim//2, 2, bias=False))
                    ]))

    def forward(self, tokens):
        model_output = self.base(**tokens)
        result = self.head(model_output.pooler_output)
        return result
    
    def get_loss(self, texts, labels):
        targets = labels.long().to(device)  # Convert labels to long integers
        tokens = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
        outputs = self.forward(tokens)
        return criterion(outputs, targets)
    
    def eval_loss(self, dataloader):
        batch_indx = np.random.randint(len(dataloader)+1, size=batch_size)
        batch_texts = [dataloader.dataset[i][0] for i in batch_indx]
        batch_labels = torch.Tensor([train_dataloader.dataset[i][1] for i in batch_indx])
        return self.get_loss(batch_texts, batch_labels).item()

In [None]:
# model_path = './CrossEncoderModel'
model = Model().to(device)

In [None]:
# Unpickle dataset
df = pd.read_pickle('/kaggle/input/data-not-clean1/data_not_clean1.pkl')
df.head()

In [None]:
# Train-test split

from sklearn.model_selection import train_test_split

df_filtered = df[['clean_query', 'clean_text', 'label']]

# Группируем по запросам
grouped = df_filtered.groupby('clean_query').agg({'clean_text': list, 'label': list}).reset_index()

train, test_val = train_test_split(grouped, test_size=0.2, random_state=42)

# Разворачиваем списки текстов и меток обратно в строки для каждой подвыборки
train = train.explode(['clean_text', 'label']).reset_index(drop=True)
test_val = test_val.explode(['clean_text', 'label']).reset_index(drop=True)

# Проверка результата
print("Train size:", len(train))
print("Test/Validation size:", len(test_val))

In [None]:
train.head()

In [None]:
X_train, y_train  = (train['clean_query']+' [SEP] '+train['clean_text']).to_numpy(), train['label'].to_numpy()
X_test, y_test = (test_val['clean_query']+' [SEP] '+test_val['clean_text']).to_numpy(), test_val['label'].to_numpy()

In [None]:
from torch.utils.data import Dataset

class PandasDataset(Dataset):
    def __init__(self, df):
        self.dataframe = df.reset_index()

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        return list(self.dataframe.iloc[index])[1:]

In [None]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(PandasDataset(pd.DataFrame([X_train, y_train]).T),
                              batch_size=batch_size)
test_dataloader = DataLoader(PandasDataset(pd.DataFrame([X_test, y_test]).T),
                              batch_size=batch_size)

In [None]:
# Hyperparameters
num_epochs = 6
batch_size = 64
learning_rate = 1e-4
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.008)

In [23]:
import time

# Training loop
total_step = len(train_dataloader)
best_val_loss = float('inf')
for epoch in range(num_epochs):
    model.train()
    epoch_start_time = time.time()  # Start timing the epoch
    train_loss = 0.0
    for texts, labels in train_dataloader:
        loss = model.get_loss(texts, labels)
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        train_loss += loss.item() * len(texts)  # Accumulate loss

    train_loss /= len(train_dataloader.dataset)  # Compute average loss

    # Evaluate loss after each epoch
    model.eval()
    with torch.no_grad():
        test_loss = sum(model.get_loss(batch[0], batch[1]).item() * len(batch[0]) for batch in test_dataloader) / len(test_dataloader.dataset)

    if test_loss < best_val_loss:
        best_val_loss = test_loss
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'epoch': epoch,
            'loss': loss
        }, 'best_model.ckpt')
        print('\nsave_model\n')

    epoch_duration = time.time() - epoch_start_time  # Calculate epoch duration
    print(f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Time: {epoch_duration:.2f} seconds')


save_model

Epoch [1/6], Train Loss: 0.6711, Test Loss: 0.6022, Time: 112.27 seconds

save_model

Epoch [2/6], Train Loss: 0.6007, Test Loss: 0.5825, Time: 112.13 seconds

save_model

Epoch [3/6], Train Loss: 0.5830, Test Loss: 0.5812, Time: 112.10 seconds

save_model

Epoch [4/6], Train Loss: 0.5781, Test Loss: 0.5811, Time: 112.23 seconds

save_model

Epoch [5/6], Train Loss: 0.5733, Test Loss: 0.5789, Time: 112.28 seconds
Epoch [6/6], Train Loss: 0.5720, Test Loss: 0.5796, Time: 111.84 seconds


In [None]:
model2 = Model().to(device)

checkpoint = torch.load('/kaggle/input/to_fine_tuning/pytorch/default/1/best_model.ckpt', weights_only=True)

model2.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

# Если вы планируете использовать модель для обучения
#model.train()

# Или если планируете использовать только для инференса
model2.eval()

In [18]:
def testing(model_f):
    correct, total = 0, 0
    with torch.no_grad():
        for texts, labels in test_dataloader:
            labels = labels.to(device).long() # Crucial: Convert labels to long
            tokens = model_f.tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
            outputs = model_f(tokens)
            _, predicted = torch.max(outputs, 1) # Correct way to get predictions
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        print(f'Accuracy: {100 * correct / total:.2f}%')

In [None]:
testing(model)

In [19]:
testing(model2)

Accuracy: 71.74%


### Ranking

In [20]:
def custom_metric(df_sorted):
    n = len(df_sorted)
    k = df_sorted['label'].sum()
    if k == 0:
        return 1
    top_k = max(3, int(k))
    
    top_k_answers = df_sorted.iloc[:top_k]
    
    correct_in_top_k = top_k_answers['label'].sum()
    
    score = correct_in_top_k / k
    return score


In [26]:
def metric_with_weight(df_sorted):
    n = len(df_sorted)
    k = df_sorted['label'].sum()

    if k == 0:
        return 1
    
    # Количество позиций для оценки
    top_k = max(3, int(k))

    # Отбираем первые top_k ответов после ранжирования
    top_k_answers = df_sorted.iloc[:top_k]
    
    # Считаем количество правильных ответов среди первых top_k
    correct_in_top_k = top_k_answers['label'].sum()

    # Рассчитываем процент правильных ответов среди первых top_k
    base_score = correct_in_top_k / k

    # Уверенность модели - добавляем веса на основе отклонения similarity от среднего
    avg_similarity = df_sorted['relevance'].mean()
    similarity_deviation = abs(df_sorted['relevance'] - avg_similarity)

    # Присваиваем вес каждому ответу: чем больше отклонение от среднего, тем больше вес
    weights = 1 + similarity_deviation / avg_similarity

    # Применяем веса к правильным ответам в первых top_k позициях
    weighted_correct_in_top_k = (top_k_answers['label'] * weights[:top_k]).sum()

    # Рассчитываем итоговый score с учетом весов
    weighted_score = weighted_correct_in_top_k / k

    return weighted_score


In [27]:
def metric_first_true(df_sorted):
    n = len(df_sorted)
    k = df_sorted['label'].sum()

    if k == 0:
        return 1
    for i in range(n):
        if df_sorted['label'][i]==1:
            return i+1


In [28]:
def rank_answers_by_relevance(df, model_f):
    relevance_scores = []
    
    for _, row in df.iterrows():
        query = row['clean_query']
        answer = row['clean_text']
        
        inputs = model_f.tokenizer(
            query,
            answer,
            add_special_tokens=True,
            return_tensors='pt',
            max_length=512,
            truncation=True,
            padding='max_length'
        ).to(device)

        with torch.no_grad():
            outputs = model_f(inputs)
            relevance_score = torch.softmax(outputs, dim=1)[0][1].item()
            relevance_scores.append(relevance_score)
    
    df['relevance'] = relevance_scores
    df_sorted = df.sort_values(by='relevance', ascending=False).reset_index(drop=True)
    
    return df_sorted


In [29]:

res = []
for query, text in df.groupby('clean_query'):
    ranked_df = rank_answers_by_relevance(text, model2)
    score1 = custom_metric(ranked_df)
    score2 = metric_with_weight(ranked_df)
    score3 = metric_first_true(ranked_df)
    n = len(ranked_df)
    k = ranked_df['label'].sum()
    res.append([query, score1, score2, score3, n, k])



In [30]:
summ_score1 = sum([elem[1] for elem in res]) / len(res)
summ_score2 = sum([elem[2] for elem in res]) / len(res)
summ_score3 = sum([elem[3] for elem in res]) / len(res)
print("Average Score:", summ_score1,summ_score2,summ_score3 )


Average Score: 0.618837995646903 0.8999355470062163 2.447292966773486


In [3]:
Неожиданно metric_with_weight побила baseline результаты!!!!

SyntaxError: invalid syntax (3874190903.py, line 1)