Public Leaderboard:
 - Task 1: R1 - 0.265826  | MRR - 0.193179  | Precision - 0.426043
 - Task 2: R1 - 0.012731  | MRR - 0.006671  | Precision - 0.138875

In [37]:
# One-time data setup, kept commented out so that re-running the notebook does not
# download anything. Uncomment to create data/, fetch the four source files,
# unpack the two archives and delete the archives afterwards.
# !mkdir data

# !wget https://storage.yandexcloud.net/datasouls-ods/materials/0433a4ca/transactions.zip -P data
# !wget https://storage.yandexcloud.net/datasouls-ods/materials/0554f0cf/clickstream.zip -P data
# !wget https://storage.yandexcloud.net/datasouls-ods/materials/acfacf11/train_matching.csv -P data
# !wget https://storage.yandexcloud.net/datasouls-ods/materials/b99fed70/puzzle.csv -P data
    
# !unzip data/transactions.zip -d data
# !unzip data/clickstream.zip -d data

# !rm data/transactions.zip
# !rm data/clickstream.zip

In [41]:
import gc
import pickle
import sys

import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

print(catboost.__version__, np.__version__, pd.__version__)

import multiprocessing
from psutil import virtual_memory

0.26 1.20.1 1.4.1


In [3]:
def read_cl_data(input_folder='data', all_dicts=None):
    """Load the clickstream log and replace string user ids with compact integer codes.

    Reads ``{input_folder}/clickstream.csv``, parses ``timestamp`` into datetimes,
    fits a ``LabelEncoder`` on ``user_id`` and overwrites the column with
    ``encoder index + 1`` (so code 0 is never assigned to a user from the file).

    Parameters
    ----------
    input_folder : str
        Directory that contains ``clickstream.csv``.
    all_dicts : dict, optional
        Registry of fitted encoders. The fitted encoder is stored under the key
        ``'rtk_le'``. A passed-in dict is updated in place; when omitted, a fresh
        dict is created for this call.

    Returns
    -------
    tuple
        ``(clickstream, all_dicts)`` - the prepared frame and the encoder registry.
    """
    # A literal `{}` default would be created once and shared by every call that
    # omits the argument, leaking encoders between unrelated calls.
    if all_dicts is None:
        all_dicts = {}
    clickstream = pd.read_csv(f'{input_folder}/clickstream.csv')
    clickstream['timestamp'] = pd.to_datetime(clickstream['timestamp'])
    all_dicts['rtk_le'] = LabelEncoder().fit(clickstream['user_id'])
    clickstream['user_id'] = all_dicts['rtk_le'].transform(clickstream['user_id'])+1
    # NOTE(review): int16 tops out at 32767 - this assumes fewer distinct users and
    # category ids than that; confirm against the data before reusing elsewhere.
    clickstream_dtypes = {'user_id':np.int16, 'cat_id':np.int16, 'new_uid':np.int32}
    clickstream = clickstream.astype(clickstream_dtypes)
    return clickstream, all_dicts

In [4]:
def read_tr_data(input_folder='data', all_dicts=None):
    """Load the bank transactions and replace string user ids with compact integer codes.

    Reads ``{input_folder}/transactions.csv``, parses ``transaction_dttm`` into
    datetimes, fits a ``LabelEncoder`` on ``user_id`` and overwrites the column
    with ``encoder index + 1`` (so code 0 is never assigned to a user from the file).

    Parameters
    ----------
    input_folder : str
        Directory that contains ``transactions.csv``.
    all_dicts : dict, optional
        Registry of fitted encoders. The fitted encoder is stored under the key
        ``'bank_le'``. A passed-in dict is updated in place; when omitted, a fresh
        dict is created for this call.

    Returns
    -------
    tuple
        ``(transactions, all_dicts)`` - the prepared frame and the encoder registry.
    """
    # A literal `{}` default would be created once and shared by every call that
    # omits the argument, leaking encoders between unrelated calls.
    if all_dicts is None:
        all_dicts = {}
    transactions = pd.read_csv(f'{input_folder}/transactions.csv')
    transactions['transaction_dttm'] = pd.to_datetime(transactions['transaction_dttm'])
    all_dicts['bank_le'] = LabelEncoder().fit(transactions['user_id'])
    transactions['user_id'] = all_dicts['bank_le'].transform(transactions['user_id'])+1
    # NOTE(review): int16 / int8 assume the encoded ids, MCC codes and currency
    # codes fit those ranges - confirm against the data before reusing elsewhere.
    transactions_dtypes = {'user_id':np.int16, 'mcc_code':np.int16, 'currency_rk':np.int8}
    transactions = transactions.astype(transactions_dtypes)
    return transactions, all_dicts

In [5]:
def read_train_data(all_dicts, input_folder='data'):
    """Load the known bank<->rtk matches and encode both id columns as integers.

    Reads ``{input_folder}/train_matching.csv``. ``bank`` becomes
    ``bank_le index + 1``. ``rtk`` becomes ``rtk_le index + 1`` for real ids and
    ``0`` for rows whose rtk value is the literal ``'0'``.

    Parameters
    ----------
    all_dicts : dict
        Must contain fitted encoders under ``'bank_le'`` and ``'rtk_le'``.
    input_folder : str
        Directory that contains ``train_matching.csv``.

    Returns
    -------
    pandas.DataFrame
        The matching table with integer ``bank`` and ``rtk`` columns.
    """
    train = pd.read_csv(f'{input_folder}/train_matching.csv')
    train['bank'] = all_dicts['bank_le'].transform(train['bank'])+1
    # Build the encoded column separately instead of writing integers into the
    # string column in place: that relied on silent upcasting to a mixed
    # str/int object column. Comparing on the string form also covers a file in
    # which every rtk is 0 and the column is therefore parsed as numeric.
    has_match = (train['rtk'].astype(str) != '0').to_numpy()
    rtk_codes = np.zeros(len(train), dtype=np.int64)
    if has_match.any():
        rtk_codes[has_match] = all_dicts['rtk_le'].transform(train.loc[has_match, 'rtk'])+1
    train['rtk'] = rtk_codes
    return train

In [6]:
def new_feats(clickstream, time_col, naming):
    """Build per-user hour-of-day activity shares.

    Adds an ``hour`` column to ``clickstream`` (the input frame is modified),
    counts events per ``user_id`` and hour, and converts the counts to shares
    of the user's total. The total itself is kept in a trailing ``summs`` column.

    Parameters
    ----------
    clickstream : pandas.DataFrame
        Event log with a ``user_id`` column and a datetime column ``time_col``.
    time_col : str
        Name of the datetime column.
    naming : str
        Prefix for the output columns, which are named ``{naming}_h_{hour}``
        plus ``{naming}_h_summs``.

    Returns
    -------
    pandas.DataFrame
        One row per ``user_id``.
    """
    clickstream['hour'] = clickstream[time_col].dt.hour
    hourly_counts = clickstream.pivot_table(
        index='user_id',
        columns='hour',
        values=time_col,
        aggfunc='count',
    ).fillna(0)
    # Row totals are taken before normalising so they reflect raw event counts.
    totals = hourly_counts.sum(axis=1)
    shares = hourly_counts.div(totals, axis=0)
    shares['summs'] = totals
    shares.columns = [f'{naming}_h_' + str(label) for label in shares.columns]
    return shares

In [7]:
def get_baseline_embed(clickstream, time_col, cat_col, naming, aggfunc):
    """Aggregate one value column per user and category into a wide sparse frame.

    Pivots ``clickstream`` so that each ``user_id`` is a row and each
    (aggregation, category) pair is a column named
    ``{naming}_{aggregation}-{category}``. An all-zero row with index ``0`` is
    appended, and the frame is stored as sparse int32 with fill value 0.

    Parameters
    ----------
    clickstream : pandas.DataFrame
        Event table with ``user_id``, ``time_col`` and ``cat_col`` columns.
    time_col : str
        Column whose values are aggregated.
    cat_col : str
        Column whose distinct values become the output columns.
    naming : str
        Prefix for the output column names.
    aggfunc : list
        Aggregations passed to ``pivot_table``. The column naming indexes the
        third level of the resulting column MultiIndex, so this must be a list.

    Returns
    -------
    pandas.DataFrame
        Sparse int32 frame indexed by ``user_id`` plus the extra row ``0``.
    """
    clickstream_embed = clickstream.pivot_table(index = 'user_id', 
                            values=[time_col],
                            columns=[cat_col],
                            aggfunc=aggfunc).fillna(0)
    clickstream_embed.columns = [f'{naming}_{str(i[0])}-{str(i[2])}' for i in clickstream_embed.columns]
    # Row 0 must be explicit zeros. np.empty returns uninitialised memory, which
    # put arbitrary values into the features of id 0.
    clickstream_embed.loc[0] = np.zeros(len(clickstream_embed.columns))
    # NOTE(review): casting to int32 truncates fractional aggregates such as
    # 'mean' - confirm that this loss of precision is intended.
    dtype = pd.SparseDtype(np.int32, fill_value=0)
    clickstream_embed = clickstream_embed.astype(dtype)
    return clickstream_embed

### Data loading and preprocessing

In [8]:
%%time
# Read the clickstream and label-encode the user ids (compact integer ids reduce RAM usage)
clickstream, all_dicts = read_cl_data(input_folder='data', all_dicts={})
# Hour-of-day features: per-user share of events in each hour
cl_sv = new_feats(clickstream, 'timestamp', 'click')
# Baseline embeddings: per-user event counts for each category
clickstream_embed = get_baseline_embed(clickstream, 'timestamp', 'cat_id', 'rtk', aggfunc = ['count'])
# Drop the reference to the raw log; only the aggregated frames are used below
del clickstream

CPU times: user 2min 14s, sys: 16.7 s, total: 2min 31s
Wall time: 2min 31s


In [10]:
%%time
# Read the transactions and label-encode the user ids (compact integer ids reduce RAM usage);
# the existing all_dicts is passed in so that it ends up holding both encoders
transactions, all_dicts = read_tr_data(input_folder='data', all_dicts=all_dicts)
# Hour-of-day features: per-user share of transactions in each hour
tr_sv = new_feats(transactions, 'transaction_dttm', 'trans')
# Baseline embeddings: per-user sum, mean and count of the transaction amount for each MCC code
bankclient_embed = get_baseline_embed(transactions, 'transaction_amt', 'mcc_code', 'bank', aggfunc = ['sum','mean', 'count'])
# Drop the reference to the raw table; only the aggregated frames are used below
del transactions

CPU times: user 30.1 s, sys: 1.3 s, total: 31.4 s
Wall time: 32 s


In [45]:
%%time
# Read the training matches; both id columns are encoded with the encoders stored in all_dicts
train = read_train_data(all_dicts, input_folder='data')

CPU times: user 61 ms, sys: 13 µs, total: 61.1 ms
Wall time: 59.6 ms


### Training

In [46]:
%%time
# Build the training sample so that each correct match comes with 10 sampled candidate matches

# Number of candidate rtk ids sampled per bank client
k=10
# bank id -> rtk id recorded in the training table
cor_dict = train.set_index('bank')['rtk'].to_dict()

# Bank clients that have a match (rtk != 0) and the rtk ids matched to them
train_bank_ids = train[(train.rtk!=0)]['bank'].unique()
train_rtk_ids = train[train.bank.isin(train_bank_ids)]['rtk'].drop_duplicates()
df_train = pd.DataFrame(train_bank_ids, columns=['bank'])
# For every bank client: the true rtk id followed by k ids sampled from the matched rtk ids.
# The bank id is used as the random seed, so the sample is the same on every run.
# The sample can contain the true id again; such repeats are removed by drop_duplicates below.
df_train['rtk'] = df_train['bank'].apply(lambda x: [cor_dict[x]] + train_rtk_ids.sample(k, random_state=x).values.tolist() )

# One row per (bank, rtk) candidate pair
df_train = df_train.explode('rtk')

# A pair is labelled 1 when the same 'bank_rtk' key is present in the training table
train['bank+rtk'] = train['bank'].astype('str')+'_'+train['rtk'].astype('str')
df_train['bank+rtk'] = df_train['bank'].astype('str')+'_'+df_train['rtk'].astype('str')
df_train['target'] = df_train['bank+rtk'].isin(train['bank+rtk']).astype('int')

# Keep each pair once
df_train = df_train.drop_duplicates('bank+rtk').reset_index(drop=True)

CPU times: user 6.46 s, sys: 19.9 ms, total: 6.48 s
Wall time: 6.48 s


In [47]:
%%time
# Attach the features to every candidate pair: bank-side frames are joined on 'bank',
# rtk-side frames on 'rtk'. Left joins keep every pair; missing features become 0.
X_train=df_train.merge(bankclient_embed, how='left', left_on='bank', right_index=True
                          ).merge(clickstream_embed, how='left', left_on='rtk', right_index=True
                           ).merge(cl_sv, how='left', left_on='rtk', right_index=True
                           ).merge(tr_sv, how='left', left_on='bank', right_index=True
                           ).fillna(0)
# Everything except the id columns, the pair key and the label is a model feature
full_list_of_features = X_train.drop(['bank', 'rtk', 'bank+rtk', 'target'], axis=1).columns
print(len(full_list_of_features))

1610
CPU times: user 9.92 s, sys: 0 ns, total: 9.92 s
Wall time: 9.92 s


In [35]:
# Fit a CatBoost classifier on all features with 'target' as the label;
# verbose=50 prints the training loss every 50 iterations.
cb = CatBoostClassifier(verbose=50)
cb.fit( Pool(X_train[full_list_of_features], X_train['target']))

Learning rate set to 0.090298
0:	learn: 0.6139189	total: 232ms	remaining: 3m 51s
50:	learn: 0.3058380	total: 7.57s	remaining: 2m 20s
100:	learn: 0.3046477	total: 15.5s	remaining: 2m 18s
150:	learn: 0.3033156	total: 23.3s	remaining: 2m 10s
200:	learn: 0.3016363	total: 31.1s	remaining: 2m 3s
250:	learn: 0.3001244	total: 39s	remaining: 1m 56s
300:	learn: 0.2984244	total: 46.9s	remaining: 1m 48s
350:	learn: 0.2968242	total: 54.8s	remaining: 1m 41s
400:	learn: 0.2952564	total: 1m 2s	remaining: 1m 33s
450:	learn: 0.2937465	total: 1m 10s	remaining: 1m 25s
500:	learn: 0.2923515	total: 1m 18s	remaining: 1m 17s
550:	learn: 0.2908689	total: 1m 26s	remaining: 1m 10s
600:	learn: 0.2893044	total: 1m 34s	remaining: 1m 2s
650:	learn: 0.2878003	total: 1m 41s	remaining: 54.6s
700:	learn: 0.2864665	total: 1m 49s	remaining: 46.7s
750:	learn: 0.2850800	total: 1m 57s	remaining: 38.9s
800:	learn: 0.2835652	total: 2m 5s	remaining: 31.1s
850:	learn: 0.2822532	total: 2m 12s	remaining: 23.3s
900:	learn: 0.280928

<catboost.core.CatBoostClassifier at 0x7fa6a9ba3fd0>

In [36]:
# Save the fitted model to disk; the prediction cell loads it back from this file
cb.save_model('open_sol_model_1.cbm')

### Puzzle

In [38]:
# Load the puzzle set and show its shape and first two rows
puzzle = pd.read_csv('./data/puzzle.csv')
print(puzzle.shape)
puzzle.head(2)

(4952, 2)


Unnamed: 0,bank,rtk
0,6dd66e8624da427da6b558903a5772b8,56f386b1a1a9455b9118cd290b0627d7
1,224a2325b44a4326bc539e3f1a6e713b,ca77a88196ef4dd786fb390340cf226e


In [39]:
# Encode the puzzle ids with the encoders fitted on the transactions and the clickstream
# (+1 keeps the codes consistent with the ids used in the feature frames)
puzzle['bank'] = all_dicts['bank_le'].transform(puzzle['bank'])+1
puzzle['rtk'] = all_dicts['rtk_le'].transform(puzzle['rtk'])+1

In [40]:
# Candidate grid for the puzzle: every unique bank id paired with every unique rtk id.
puzzle_bank_ids = puzzle['bank'].drop_duplicates()
puzzle_rtk_ids = puzzle['rtk'].drop_duplicates()

df_puzzle = pd.DataFrame(puzzle_bank_ids, columns=['bank'])
# Each bank row gets the complete list of rtk ids; explode() then yields one row per pair.
df_puzzle['rtk'] = df_puzzle['bank'].apply(lambda _bank_id: puzzle_rtk_ids.values.tolist())
df_puzzle = df_puzzle.explode('rtk').reset_index(drop=True)

In [42]:
%%time
batch = 50
clf = CatBoostClassifier()
clf.load_model('open_sol_model_1.cbm')
for i in tqdm_notebook(range(len(puzzle_bank_ids)//batch+1)):
    bank_batch = puzzle_bank_ids[i*batch:(i+1)*batch]
    X_puzzle=df_puzzle[df_puzzle.bank.isin(bank_batch)].merge(bankclient_embed, how='left', left_on='bank', right_index=True
                                        ).merge(clickstream_embed, how='left', left_on='rtk', right_index=True
                                          ).merge(tr_sv, how='left', left_on='bank', right_index=True
                                            ).merge(cl_sv, how='left', left_on='rtk', right_index=True
                                              ).fillna(0)
    df_puzzle.loc[df_puzzle.bank.isin(bank_batch), 'predicts'] = clf.predict_proba(X_puzzle[clf.feature_names_])[:,1]
    del X_puzzle
    
df_puzzle['pred_rank'] = df_puzzle.groupby('bank')['predicts'].rank(ascending=False)

This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/100 [00:00<?, ?it/s]

CPU times: user 26min 26s, sys: 7.25 s, total: 26min 33s
Wall time: 23min 34s


In [44]:
%%time
# Keep the candidates with pred_rank <= 100, sorted by bank and score (both descending)
sub = df_puzzle[df_puzzle.pred_rank<=100].copy(deep=True).sort_values(by=['bank', 'predicts'], ascending=False)
# Map the integer codes back to the original ids (codes were stored as encoder index + 1)
sub['rtk'] = all_dicts['rtk_le'].inverse_transform(sub['rtk'].astype(int)-1)
sub['bank'] = all_dicts['bank_le'].inverse_transform(sub['bank']-1)

# One row per bank id holding the list of its rtk ids
sub = sub.pivot_table(index='bank', values='rtk', aggfunc=list)
# Turn each list into its text form and strip the quote characters around the ids
sub['rtk'] = sub['rtk'].apply(lambda x: str(x))
sub = sub.replace("'", '', regex=True)
# Write the result with the list column renamed to 'rtk_list'
sub.rename({'rtk':'rtk_list'}, axis=1).to_csv('puzzle_solution.csv', index=True)

CPU times: user 1.16 s, sys: 20.1 ms, total: 1.18 s
Wall time: 1.14 s
