## Импорт библиотек

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from dateutil import parser
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import math

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.nn.functional as F

from sklearn import preprocessing

## Загрузка датасета

In [2]:
gardening_val = pd.read_csv('gardening_test.csv')
gardening_train = pd.read_csv('gardening_train.csv')

## Функции для работы с датасетом

In [3]:
def convert_dataset(_train_dataset, _val_dataset, val_data_target, key):
    train_dataset, val_dataset = {}, {}
    
    for receipt_id, item_id in zip(_train_dataset['receipt_id'], _train_dataset[key]):
        if receipt_id not in train_dataset:
            train_dataset[receipt_id] = []

        train_dataset[receipt_id].append(int(item_id))

    for receipt_id, item_id in zip(_val_dataset['receipt_id'], _val_dataset[key]):
        if receipt_id not in val_dataset:
            val_dataset[receipt_id] = []

        val_dataset[receipt_id].append(int(item_id))

    # for receipt_id in val_dataset:
    #     val_dataset[receipt_id].append(int(val_data_target[val_data_target['receipt_id'] == receipt_id][key].tolist()[0]))

    val_keys = val_dataset.keys()
    train_dataset, val_dataset = list(train_dataset.values()), list(val_dataset.values())

    for i in range(len(train_dataset)):
        train_dataset[i] = train_dataset[i][-6:]
        
        for _ in range(6 - len(train_dataset[i])):
            train_dataset[i].insert(0, -1)

        for j in range(len(train_dataset[i])):
            train_dataset[i][j] += 1

    for i in range(len(val_dataset)):
        val_dataset[i] = val_dataset[i][-6:]
        
        for _ in range(6 - len(val_dataset[i])):
            val_dataset[i].insert(0, -1)

        for j in range(len(val_dataset[i])):
            val_dataset[i][j] += 1

    return train_dataset, val_dataset, val_keys

In [4]:
le1 = preprocessing.LabelEncoder()
le2 = preprocessing.LabelEncoder()
le3 = preprocessing.LabelEncoder()

In [5]:
def create_dataset(dataset, val_data, val_data_target):
    unique_train_item_ids = dataset['item_id'].unique()
    unique_train_good = dataset['good'].unique()
    unique_train_brand = dataset['brand'].unique()
    
    le1.fit(dataset['item_id'])
    dataset['item_id'] = le1.transform(dataset['item_id'])

    le2.fit(dataset['good'])
    dataset['good'] = le2.transform(dataset['good'])

    le3.fit(dataset['brand'])
    dataset['brand'] = le3.transform(dataset['brand'])
    
    val_data['item_id'] = [le1.transform([i])[0] if i in unique_train_item_ids else len(unique_train_item_ids) for i in val_data['item_id']]
    # val_data_target['item_id'] = [le1.transform([i])[0] if i in unique_train_item_ids else len(unique_train_item_ids) for i in val_data_target['item_id']]
    
    val_data['good'] = [le2.transform([i])[0] if i in unique_train_good else len(unique_train_good) for i in val_data['good']]
    # val_data_target['good'] = [le2.transform([i])[0] if i in unique_train_good else len(unique_train_good) for i in val_data_target['good']]
    
    val_data['brand'] = [le3.transform([i])[0] if i in unique_train_brand else len(unique_train_brand) for i in val_data['brand']]
    # val_data_target['brand'] = [le3.transform([i])[0] if i in unique_train_brand else len(unique_train_brand) for i in val_data_target['brand']]

    try:
        dataset['item_id'][np.random.choice(list(range(len(dataset))), int(len(val_data['item_id']) * val_data['item_id'].value_counts()[len(unique_train_item_ids)] / len(val_data)))] = len(unique_train_item_ids)
    except:
        ...
    
    dataset = dataset.sort_values(by=['local_date'], key=lambda x: x.apply(lambda y: parser.parse(y).timestamp()))
    val_data = val_data.sort_values(by=['local_date'], key=lambda x: x.apply(lambda y: parser.parse(y).timestamp()))

    train_dataset_item_id, val_dataset_item_id, val_keys = convert_dataset(dataset, val_data, val_data_target, 'item_id')
    train_dataset_good, val_dataset_good, _ = convert_dataset(dataset, val_data, val_data_target, 'good')
    train_dataset_brand, val_dataset_brand, _ = convert_dataset(dataset, val_data, val_data_target, 'brand')

    train_dataset = [[a, b, c] for a, b, c in zip(train_dataset_item_id, train_dataset_good, train_dataset_brand)]
    val_dataset = [[a, b, c] for a, b, c in zip(val_dataset_item_id, val_dataset_good, val_dataset_brand)]

    return train_dataset, val_dataset, val_keys

## Архитектура модели

In [6]:
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset

class TransformerModel(nn.Module):
    def __init__(self, ntoken1: int, ntoken2: int, ntoken3: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        
        self.pos_encoder1 = PositionalEncoding(d_model, dropout)
        encoder_layers1 = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder1 = TransformerEncoder(encoder_layers1, nlayers)
        self.embedding1 = nn.Embedding(ntoken1, d_model)
        self.d_model1 = d_model
        self.linear1 = nn.Linear(d_model, ntoken1)

        self.pos_encoder2 = PositionalEncoding(d_model, dropout)
        encoder_layers2 = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder2 = TransformerEncoder(encoder_layers2, nlayers)
        self.embedding2 = nn.Embedding(ntoken2, d_model)
        self.d_model2 = d_model
        self.linear2 = nn.Linear(d_model, ntoken1)

        self.pos_encoder3 = PositionalEncoding(d_model, dropout)
        encoder_layers3 = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder3 = TransformerEncoder(encoder_layers3, nlayers)
        self.embedding3 = nn.Embedding(ntoken3, d_model)
        self.d_model3 = d_model
        self.linear3 = nn.Linear(d_model, ntoken1)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        
        self.embedding1.weight.data.uniform_(-initrange, initrange)
        self.linear1.bias.data.zero_()
        self.linear1.weight.data.uniform_(-initrange, initrange)

        self.embedding2.weight.data.uniform_(-initrange, initrange)
        self.linear2.bias.data.zero_()
        self.linear2.weight.data.uniform_(-initrange, initrange)

        self.embedding3.weight.data.uniform_(-initrange, initrange)
        self.linear3.bias.data.zero_()
        self.linear3.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, good: Tensor, brand: Tensor, mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            good: Tensor, shape ``[seq_len, batch_size]``
            brand: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[batch_size, ntoken]``
        """
        src = self.embedding1(src) * math.sqrt(self.d_model1)
        src = self.pos_encoder1(src)
        src_output = self.transformer_encoder1(src, mask)
        src_output = self.linear1(src_output.mean(0))

        good = self.embedding2(good) * math.sqrt(self.d_model2)
        good = self.pos_encoder2(good)
        good_output = self.transformer_encoder2(good, mask)
        good_output = self.linear2(good_output.mean(0))

        brand = self.embedding3(brand) * math.sqrt(self.d_model3)
        brand = self.pos_encoder3(brand)
        brand_output = self.transformer_encoder3(brand, mask)
        brand_output = self.linear3(brand_output.mean(0))

        output = torch.stack([src_output, good_output, brand_output])
        
        return output

In [7]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [8]:
device = 'cuda'

In [9]:
# %%time
train_dataset, val_dataset, val_keys = create_dataset(gardening_train, gardening_val, None)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['item_id'][np.random.choice(list(range(len(dataset))), int(len(val_data['item_id']) * val_data['item_id'].value_counts()[len(unique_train_item_ids)] / len(val_data)))] = len(unique_train_item_ids)


## Инициализация параметров модели

In [10]:
model = torch.load('model.pt')

## Валидация

In [11]:
model.eval()
val_x = torch.stack([torch.stack([torch.tensor(i[0][:-1]), torch.tensor(i[1][:-1]), torch.tensor(i[2][:-1])]) for i in val_dataset]).transpose(0, 1).transpose(1, 2)
output = model(*val_x.to(device))
df = pd.DataFrame({'receipt_id': gardening_val['receipt_id'].unique().tolist(), 'item_id': le1.inverse_transform([min(i, 12766) for i in torch.argmax(output[0], dim=1).detach().cpu().tolist()])})
df.to_csv(f'submit.csv', index=False, sep=';')