In [1]:
from math import sqrt
import pandas as pd
import numpy as np

from pathlib import Path
from collections import defaultdict
import pickle
import gc

from tqdm import tqdm
from pandarallel import pandarallel

from scipy.sparse import csr_matrix, coo_matrix
import implicit
import catboost

import sys
sys.path.append("..")
from src.utils import *
from src.dataset import *
from src.trending import *

from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from catboost import Pool, CatBoostClassifier, cv
from catboost.utils import get_roc_curve, create_cd
from catboost.eval.catboost_evaluation import CatboostEvaluation

pd.set_option('display.max_colwidth', 255)
tqdm.pandas()
pandarallel.initialize(progress_bar=True, nb_workers=8, use_memory_fs=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [None]:
SEED = 1
N = 12
TEST_ON = 1
cv_iteration = 0

dataset = Dataset(skip_days=7 * cv_iteration, test_days=7 * TEST_ON)
train, test = dataset.get_train_and_test()
articles = dataset.get_articles()
customers = dataset.get_customers()
print("Dataset created")

In [3]:
user_factors = pd.read_csv("../tmp/customer_factors_train.csv", index_col=False)
item_factors = pd.read_csv("../tmp/article_factors_train.csv", index_col=False)

user_factors["als_customer_features"] = (
    user_factors["als_customer_features"].apply(lambda x: 
                           np.fromstring(
                               x.replace('\n','')
                                .replace('[','')
                                .replace(']','')
                                .replace('  ',' '), sep=' '))
)

item_factors["als_article_features"] = (
    item_factors["als_article_features"].apply(lambda x: 
                           np.fromstring(
                               x.replace('\n','')
                                .replace('[','')
                                .replace(']','')
                                .replace('  ',' '), sep=' '))
)

for i in tqdm(range(20)):
    user_factors["als_customer_features_" + str(i)] = (
        user_factors["als_customer_features"].apply(lambda x: x[i]).astype(np.uint8)
    )
    
for i in tqdm(range(20)):
    item_factors["als_article_features_" + str(i)] = (
        item_factors["als_article_features"].apply(lambda x: x[i]).astype(np.uint8)
    )
    
del user_factors["als_customer_features"], item_factors["als_article_features"]

100%|██████████| 20/20 [00:04<00:00,  4.72it/s]
100%|██████████| 20/20 [00:00<00:00, 66.12it/s]


In [4]:
train_data = pd.read_csv("../input/train_data.csv", index_col=False)
train_data = (
    train_data.merge(customers, on="customer_id", how="inner")
              .merge(articles, on="article_id", how="inner", suffixes=("_customer", "_article"))
              .merge(user_factors, on="customer_id", how="inner")
              .merge(item_factors, on="article_id", how="inner")
)
train_data = train_data.sort_values(by="customer_id")

del customers, articles, user_factors, item_factors
gc.collect()

train_data

Unnamed: 0,customer_id,article_id,purchase_score,similar_parent_purchase_score,similarity,popular_rank,pairs_parent_purchase_score,general_popular_count,general_popular_count_rank,target,...,als_article_features_10,als_article_features_11,als_article_features_12,als_article_features_13,als_article_features_14,als_article_features_15,als_article_features_16,als_article_features_17,als_article_features_18,als_article_features_19
17770335,0,96765,-9999,7,21,-9999,-9999,-9999,-9999,0.0,...,186,174,132,142,140,168,168,130,63,76
25480545,0,80338,-9999,0,15,-9999,-9999,-9999,-9999,0.0,...,149,111,129,154,154,0,167,255,50,191
27015022,0,76591,-9999,-9999,-9999,-9999,0,-9999,-9999,0.0,...,144,121,134,84,105,14,123,68,38,168
13689697,0,96962,-9999,0,24,-9999,-9999,-9999,-9999,0.0,...,216,101,167,70,147,8,196,140,134,211
869928,0,103703,-9999,-9999,-9999,-9999,-9999,441,27,0.0,...,143,157,120,47,255,119,154,222,65,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11566295,1371978,82628,-9999,-9999,-9999,47,-9999,567,8,0.0,...,178,167,184,131,161,190,171,208,19,137
13270738,1371978,104527,-9999,-9999,-9999,-9999,-9999,457,23,0.0,...,102,148,111,16,251,96,146,255,58,67
8438707,1371979,46351,-9999,-9999,-9999,40,-9999,-9999,-9999,0.0,...,121,146,98,79,133,111,154,255,118,0
16923661,1371979,94657,-9999,-9999,-9999,-9999,-9999,363,42,0.0,...,164,220,182,31,154,113,160,255,60,44


In [5]:
cust_count = train_data.groupby("customer_id").size()
cust_stop_list = cust_count[cust_count > 1023].index.to_list()

train_data = train_data[~train_data["customer_id"].isin(cust_stop_list)]
train_data.shape

(29617074, 99)

In [6]:
train_users, test_users = train_test_split(train_data["customer_id"].unique(), test_size=0.2)
print(train_users.shape[0], test_users.shape[0])

X_train = train_data[train_data["customer_id"].isin(train_users)]
X_test = train_data[train_data["customer_id"].isin(test_users)]

y_train = X_train["target"]
y_test = X_test["target"]

train_groups = X_train["customer_id"]
test_groups = X_test["customer_id"]

X_train.drop(["target", "customer_id", "article_id"], axis=1, inplace=True)
X_test.drop(["target", "customer_id", "article_id"], axis=1, inplace=True)

1097576 274394


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(["target", "customer_id", "article_id"], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop(["target", "customer_id", "article_id"], axis=1, inplace=True)


In [7]:
cat_features = [name for name, dtype in zip(X_train.dtypes.index,
                                            X_train.dtypes.to_list()) 
                if dtype == "object"]
cat_features

['club_member_status',
 'fashion_news_frequency',
 'age_group',
 'common_group',
 'sex',
 'price_group',
 'product_type_name',
 'product_group_name',
 'graphical_appearance_name',
 'colour_group_name',
 'perceived_colour_value_name',
 'perceived_colour_master_name',
 'index_name',
 'index_group_name',
 'garment_group_name',
 'product_code_name',
 'department_no_name',
 'section_no_name']

In [8]:
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features, group_id=train_groups)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features, group_id=test_groups)

In [9]:
del train_data
gc.collect()

0

In [14]:
SEED = 1
params = {
    'iterations': 10000,
#     'learning_rate': 0.01,
#     "l2_leaf_reg": 3, 
#     "random_strength": 1,
#     "bagging_temperature": 1,
#     "max_depth": 12,
#     "one_hot_max_size": 2, 
#     "rsm": 1,
#     "boosting_type": "Ordered",
#     "leaf_estimation_method": "Newton",
    "early_stopping_rounds": 100,
    'loss_function': "YetiRank",
    'custom_metric': ["MAP:top=12"],
    'use_best_model': True,
    "eval_metric": "MAP:top=12",
    "random_seed": SEED,
    'task_type': 'GPU',
    "max_ctr_complexity": 1
    }

clf = catboost.CatBoostRanker(**params, 
                         cat_features=cat_features, 
                         verbose=100)

clf.fit(train_pool, 
        eval_set=test_pool, plot=True)

clf.save_model("../input/catboost_ranker_yeti_emb_3.cbm") # 0.0071281, 0.0074242, 0.0074746 0.0082425

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Groupwise loss function. OneHotMaxSize set to 10


Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric MAP:top=12 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric MAP:top=12 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.0047095	test: 0.0047944	best: 0.0047944 (0)	total: 747ms	remaining: 2h 4m 25s
100:	learn: 0.0068399	test: 0.0068863	best: 0.0068863 (100)	total: 1m	remaining: 1h 38m 34s
200:	learn: 0.0070697	test: 0.0070599	best: 0.0070612 (199)	total: 2m	remaining: 1h 37m 59s
300:	learn: 0.0071851	test: 0.0071836	best: 0.0071858 (299)	total: 3m 1s	remaining: 1h 37m 15s
400:	learn: 0.0072579	test: 0.0072536	best: 0.0072545 (395)	total: 4m 1s	remaining: 1h 36m 25s
500:	learn: 0.0073397	test: 0.0073303	best: 0.0073304 (499)	total: 5m 2s	remaining: 1h 35m 36s
600:	learn: 0.0073990	test: 0.0073915	best: 0.0073915 (600)	total: 6m 3s	remaining: 1h 34m 46s
700:	learn: 0.0074774	test: 0.0074574	best: 0.0074574 (700)	total: 7m 4s	remaining: 1h 33m 57s
800:	learn: 0.0075294	test: 0.0074989	best: 0.0074989 (800)	total: 8m 6s	remaining: 1h 33m 4s
900:	learn: 0.0075769	test: 0.0075594	best: 0.0075595 (894)	total: 9m 7s	remaining: 1h 32m 13s
1000:	learn: 0.0076257	test: 0.0076096	best: 0.0076113 (997)	t

In [2]:
clf = catboost.CatBoostRanker()
clf.load_model("../input/catboost_ranker_yeti_emb_3.cbm")

<catboost.core.CatBoostRanker at 0x7f62c0463400>

In [3]:
SEED = 1
N = 12
TEST_ON = 0
cv_iteration = 0

dataset = Dataset(skip_days=7 * cv_iteration, test_days=7 * TEST_ON)
train, test = dataset.get_train_and_test()
articles = dataset.get_articles()
customers = dataset.get_customers()
print("Dataset created")

Dataset created


In [4]:
user_factors = pd.read_csv("../tmp/customer_factors_test.csv", index_col=False)
item_factors = pd.read_csv("../tmp/article_factors_test.csv", index_col=False)

user_factors["als_customer_features"] = (
    user_factors["als_customer_features"].apply(lambda x: 
                           np.fromstring(
                               x.replace('\n','')
                                .replace('[','')
                                .replace(']','')
                                .replace('  ',' '), sep=' '))
)

item_factors["als_article_features"] = (
    item_factors["als_article_features"].apply(lambda x: 
                           np.fromstring(
                               x.replace('\n','')
                                .replace('[','')
                                .replace(']','')
                                .replace('  ',' '), sep=' '))
)

for i in tqdm(range(20)):
    user_factors["als_customer_features_" + str(i)] = (
        user_factors["als_customer_features"].apply(lambda x: x[i]).astype(np.uint8)
    )
    
for i in tqdm(range(20)):
    item_factors["als_article_features_" + str(i)] = (
        item_factors["als_article_features"].apply(lambda x: x[i]).astype(np.uint8)
    )
    
del user_factors["als_customer_features"], item_factors["als_article_features"]

100%|██████████| 20/20 [00:04<00:00,  4.73it/s]
100%|██████████| 20/20 [00:00<00:00, 65.59it/s]


In [5]:
test_data = pd.read_csv("../input/test_data.csv", index_col=False, dtype=np.int32)

In [6]:
i = 0
batch_size = 100000
customer_id_list = []
customer_preds = []

customer_list = customers["customer_id"].to_list()

while batch_size * i <= customers.shape[0]:
    test_data_slice = (
        test_data[test_data["customer_id"].isin(customer_list[batch_size * i: batch_size * (i + 1)])]
            .merge(customers, on="customer_id")
            .merge(articles, on="article_id", suffixes=("_customer", "_article"))
            .merge(user_factors, on="customer_id", how="inner")
            .merge(item_factors, on="article_id", how="inner")
    )
        
    test_data_slice["pred"] = clf.predict(test_data_slice.drop(["customer_id", "article_id"], axis=1))
    test_data_slice["customer_id"] = test_data_slice["customer_id"].map(dataset.customers_num2id)
    test_data_slice["article_id"] = test_data_slice["article_id"].map(dataset.articles_num2id)
    
    train_values = test_data_slice[["customer_id", "article_id", "pred"]].values

    cust_preds = defaultdict(list)
    for line in train_values:
        cust_preds[line[0]].append(line[1:])


    for cust in cust_preds:
        customer_id_list.append(cust)
        sorted_preds = sorted(cust_preds[cust], key=lambda x: -x[1])[:12]
        joinded_preds = " ".join([pred[0] for pred in sorted_preds])
        customer_preds.append(joinded_preds)
    
    print(i)
    i += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13


In [7]:
new_sub = pd.DataFrame(data={"customer_id": customer_id_list, "prediction": customer_preds})
new_sub

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,0568601043 0568601043 0568601006 0568601006 0568601044 0568601044 0779781015 0568601044 0850917001 0751471001 0884319001 0797710001
1,0085558e99d076a43ca09c37f3b8e5bd8e2fe8ab51b59559964887ed47994717,0762846027 0762846006 0372860001 0751471001 0914441004 0762846026 0930380001 0788575002 0762846031 0573085028 0762846029 0908924002
2,0090c0bb62e94f069ee7892db19d2fb546ff3c7cf7b79d6bae8fde9e9b88b2d0,0872537001 0872537004 0929275001 0909370001 0930380001 0793012001 0865799006 0793012001 0935541001 0881942001 0918525001 0918525001
3,00b358a9e9d630cca13667215d71a49ce26aba305270008a58e945090d72432b,0715828013 0924243001 0924243002 0923758001 0918522001 0915529003 0850917001 0866731001 0768912001 0909370001 0579541001 0928206001
4,00c2a836f9a471934133fc03f5ae900c0469ed69682fae63d8d9a63a3c9ecd27,0870328003 0918525001 0870328003 0870328003 0870328003 0156231001 0924243001 0855080009 0855080001 0918522001 0855080009 0855080001
...,...,...
1371975,ff3ab6541dee0fca89a8f6462f8e987c6c75e1cad6c2039c08e4492ce989eebe,0751471001 0924243001 0850917001 0924243002 0923758001 0918522001 0751471043 0573085028 0783346001 0915529003 0928206001 0762846027
1371976,ff59e924dc36d7204fbc771a20713b3a973f4128a96a2f9a4a879febb8bb888a,0924243001 0751471001 0850917001 0924243002 0923758001 0918522001 0751471043 0573085028 0783346001 0928206001 0915529003 0762846027
1371977,ff79f42c8ddbcdf82cf8cc6995e83a572e0d40f52ac67879843d9398932cff19,0751471001 0924243001 0850917001 0924243002 0923758001 0918522001 0751471043 0573085028 0783346001 0915529003 0928206001 0762846027
1371978,ff8dadd17c4b1fa56a50214d4336f71f721d14a8f3a667b785607eaff86d6ab6,0751471001 0924243001 0850917001 0924243002 0923758001 0918522001 0751471043 0573085028 0783346001 0915529003 0928206001 0762846027


In [8]:
new_sub.to_csv('../output/6.cb_ranking_yeti_1.csv', index=False) 