In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hackathon-data/data_not_clean.pkl
/kaggle/input/hackathon-data-clear/data.pkl


In [2]:
%%capture
! pip install transformers sentencepiece

In [3]:
%%capture
!git lfs install
!git clone https://huggingface.co/cointegrated/rubert-tiny2

In [4]:
import random
import numpy as np
import pandas as pd
import torch
from torch import nn
from collections import OrderedDict
from transformers import AutoTokenizer, AutoModel

In [5]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed = 42
set_seed(seed)

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
# Hyperparameters
num_epochs = 100
batch_size = 16
learning_rate = 3*1e-6

In [8]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
        self.base = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
        n_dim = 312
        self.head = nn.Sequential(OrderedDict( [('dropout', torch.nn.Dropout(.2)),
                                                ('fc_1' , nn.Linear(n_dim, n_dim*2)),
                                                ('relu_1' , nn.ReLU()),
                                                ('batchnorm_1' , nn.BatchNorm1d(n_dim*2, eps=1e-12)),
                                                ('fc_2' , nn.Linear(n_dim*2, n_dim)),
                                                ('relu_2' , nn.ReLU()),
                                                ('batchnorm_2' , nn.BatchNorm1d(n_dim, eps=1e-12)),
                                                ('fc_3' , nn.Linear(n_dim, 2, bias=False))
                    ]))

    def forward(self, tokens):
        model_output = self.base(**tokens)
        result = self.head(model_output.pooler_output)
        return result
    
    def get_loss(self, texts, labels):
        # нужно дублировать метки, потому что на последнем слое 2 нейрона
        inv_labels = torch.ones(labels.shape) - labels
        targets = torch.stack((labels, inv_labels), dim=1).detach().clone().to(device)
        
        # токенизация и форвард-пасс
        tokens = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
        outputs = self.forward(tokens)
        
        return criterion(outputs, targets)
    
    def eval_loss(self, dataloader):
        batch_indx = np.random.randint(len(dataloader)+1, size=batch_size)
        batch_texts = [dataloader.dataset[i][0] for i in batch_indx]
        batch_labels = torch.Tensor([train_dataloader.dataset[i][1] for i in batch_indx])
        return self.get_loss(batch_texts, batch_labels).item()

In [9]:
# model_path = './CrossEncoderModel'
model = Model().to(device)

tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

In [10]:
# Unpickle dataset
df = pd.read_pickle('/kaggle/input/hackathon-data/data_not_clean.pkl')
df.head()

Unnamed: 0,query,text,label,clean_text,clean_query,embedding_text,embedding_query
0,Когда был спущен на воду первый миноносец «Спо...,Зачислен в списки ВМФ СССР 19 августа 1952 год...,1,Зачислен в списки ВМФ СССР 19 августа 1952 год...,Когда был спущен на воду первый миноносец «Спо...,"[-0.0030975677, -0.018142669, -0.0058722952, 0...","[0.00064380514, 0.0074218363, -0.03353223, -0...."
1,Как долго существовало британское телевизионно...,"Хрустальный лабиринт (""The Crystal Maze"") — бр...",1,"Хрустальный лабиринт (""The Crystal Maze"") — бр...",Как долго существовало британское телевизионно...,"[0.0026477594, -0.026646728, 0.0009579654, -0....","[-0.029795218, -0.01173853, -0.00032150946, -0..."
2,Когда родилась Князева Марина Леонидовна?,Князева Марина Леонидовна (род. 7 мая 1952 г.)...,1,Князева Марина Леонидовна (род. 7 мая 1952 г.)...,Когда родилась Князева Марина Леонидовна?,"[-0.036747612, -0.012604811, -0.0109199695, -0...","[-0.036575466, -0.010551005, -0.04117768, -0.0..."
3,Кто был главным художником мира Зен?,"В книге ""Half-Life 2: Raising the Bar"" художни...",1,"В книге ""Half-Life 2: Raising the Bar"" художни...",Кто был главным художником мира Зен?,"[-0.02514373, -0.023727695, -0.04738828, 0.011...","[-0.022960061, -0.013048667, -0.018877652, -0...."
4,Как звали предполагаемого убийцу Джона Кеннеди?,В 1966 году окружной прокурор Нового Орлеана Д...,1,В 1966 году окружной прокурор Нового Орлеана Д...,Как звали предполагаемого убийцу Джона Кеннеди?,"[0.0074619167, -0.024880972, -0.026498705, 0.0...","[-0.0124044325, -0.0020067864, -0.030558525, 0..."


In [11]:
# Train-test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split((df['clean_query']+' [SEP] '+df['clean_text']).to_numpy(), df['label'].to_numpy(), test_size=0.2, random_state=seed)
X_train.shape, X_test.shape

((37607,), (9402,))

In [12]:
from torch.utils.data import Dataset

class PandasDataset(Dataset):
    def __init__(self, df):
        self.dataframe = df.reset_index()

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        return list(self.dataframe.iloc[index])[1:]

In [13]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(PandasDataset(pd.DataFrame([X_train, y_train]).T),
                              batch_size=batch_size)
test_dataloader = DataLoader(PandasDataset(pd.DataFrame([X_test, y_test]).T),
                              batch_size=batch_size)

In [14]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.008)

In [15]:
# Uncomment to train model
total_step = len(train_dataloader)

best_val_loss = 10**8
train_losses_history = []
test_losses_history = []
for epoch in range(num_epochs):
    model.train()
    for i, (texts, labels) in enumerate(train_dataloader):
        # Forward pass
        loss = model.get_loss(texts, labels)
        
        # Backprpagation and optimizer step
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1)
        optimizer.step()
        # print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, i+1, total_step, loss.item()))
    
    model.eval()
    train_loss = model.eval_loss(train_dataloader)
    test_loss = model.eval_loss(test_dataloader)
    print(f'\n Epoch [{epoch + 1}/{num_epochs}], Train Loss [{train_loss:.4f}], Test Loss [{test_loss:.4f}]')
    train_losses_history.append(train_loss)
    test_losses_history.append(test_loss)
    if test_loss <= best_val_loss:
        best_val_loss = test_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': best_val_loss,
        }, 'model.ckpt')


torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': best_val_loss,
}, 'model_final.ckpt')
torch.save(torch.Tensor(train_losses_history), 'train_loss_history')
torch.save(torch.Tensor(test_losses_history), 'test_loss_history')


 Epoch [1/100], Train Loss [0.6007], Test Loss [0.5700]

 Epoch [2/100], Train Loss [0.5486], Test Loss [0.6104]

 Epoch [3/100], Train Loss [0.5696], Test Loss [0.6962]

 Epoch [4/100], Train Loss [0.7883], Test Loss [0.5723]

 Epoch [5/100], Train Loss [0.6195], Test Loss [0.5934]

 Epoch [6/100], Train Loss [0.5343], Test Loss [0.7023]

 Epoch [7/100], Train Loss [0.5362], Test Loss [0.4758]

 Epoch [8/100], Train Loss [0.4991], Test Loss [0.5059]

 Epoch [9/100], Train Loss [0.4688], Test Loss [0.5951]

 Epoch [10/100], Train Loss [0.3703], Test Loss [0.8918]

 Epoch [11/100], Train Loss [0.4852], Test Loss [0.5391]

 Epoch [12/100], Train Loss [0.7102], Test Loss [0.7903]

 Epoch [13/100], Train Loss [0.2869], Test Loss [0.6655]

 Epoch [14/100], Train Loss [0.3839], Test Loss [0.6862]

 Epoch [15/100], Train Loss [0.4857], Test Loss [0.4716]

 Epoch [16/100], Train Loss [0.6902], Test Loss [0.8503]

 Epoch [17/100], Train Loss [0.5500], Test Loss [0.5551]

 Epoch [18/100], Train

In [16]:
# Uncomment to test the model
# with torch.no_grad():
#     correct = 0
#     total = 0
#     for texts, labels in test_dataloader:
#         labels = labels.to(device)
#         tokens = model.tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
#         outputs = model(tokens)
#         _, predicted = torch.max(outputs.data, 1)
#         total += labels.size(0)
#         correct += (predicted == labels).sum().item()

#     print(f'Accuracy: {100 * correct / total} %')

# # Save the model checkpoint
# torch.save(model.state_dict(), 'model.ckpt')