In [1]:
from math import sqrt
import pandas as pd
import numpy as np

from pathlib import Path
from collections import defaultdict
import pickle
import gc

from tqdm import tqdm
from pandarallel import pandarallel

from scipy.sparse import csr_matrix, coo_matrix
import implicit
import catboost

import sys
sys.path.append("..")
from src.utils import *
from src.dataset import *
from src.trending import *

from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from catboost import Pool, CatBoostClassifier, cv
from catboost.utils import get_roc_curve, create_cd
from catboost.eval.catboost_evaluation import CatboostEvaluation

pd.set_option('display.max_colwidth', 255)
tqdm.pandas()
pandarallel.initialize(progress_bar=True, nb_workers=8, use_memory_fs=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
SEED = 1
N = 12
TEST_ON = 1
cv_iteration = 0

dataset = Dataset(skip_days=7 * cv_iteration, test_days=7 * TEST_ON)
train, test = dataset.get_train_and_test()
articles = dataset.get_articles()
customers = dataset.get_customers()
print("Dataset created")

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
user_factors = pd.read_csv("../tmp/customer_factors_train.csv", index_col=False)
item_factors = pd.read_csv("../tmp/article_factors_train.csv", index_col=False)

user_factors["als_customer_features"] = (
    user_factors["als_customer_features"].apply(lambda x: 
                           np.fromstring(
                               x.replace('\n','')
                                .replace('[','')
                                .replace(']','')
                                .replace('  ',' '), sep=' '))
)

item_factors["als_article_features"] = (
    item_factors["als_article_features"].apply(lambda x: 
                           np.fromstring(
                               x.replace('\n','')
                                .replace('[','')
                                .replace(']','')
                                .replace('  ',' '), sep=' '))
)

for i in tqdm(range(20)):
    user_factors["als_customer_features_" + str(i)] = (
        user_factors["als_customer_features"].apply(lambda x: x[i]).astype(np.uint8)
    )
    
for i in tqdm(range(20)):
    item_factors["als_article_features_" + str(i)] = (
        item_factors["als_article_features"].apply(lambda x: x[i]).astype(np.uint8)
    )
    
del user_factors["als_customer_features"], item_factors["als_article_features"]

In [None]:
train_data = pd.read_csv("../tmp/train_data.csv", index_col=False, dtype=np.int32)
train_data = (
    train_data.merge(customers, on="customer_id", how="inner")
              .merge(articles, on="article_id", how="inner", suffixes=("_customer", "_article"))
              .merge(user_factors, on="customer_id", how="inner")
              .merge(item_factors, on="article_id", how="inner")
)
train_data = train_data.sort_values(by="customer_id")

del customers, articles, user_factors, item_factors
gc.collect()

train_data

In [None]:
cust_count = train_data.groupby("customer_id").size()
cust_stop_list = cust_count[cust_count > 1023].index.to_list()

train_data = train_data[~train_data["customer_id"].isin(cust_stop_list)]
train_data.shape

In [None]:
train_users, test_users = train_test_split(train_data["customer_id"].unique(), test_size=0.4)
print(train_users.shape[0], test_users.shape[0])

X_train = train_data[train_data["customer_id"].isin(train_users)]
X_test = train_data[train_data["customer_id"].isin(test_users)]

y_train = X_train["target"]
y_test = X_test["target"]

train_groups = X_train["customer_id"]
test_groups = X_test["customer_id"]

X_train.drop(["target", "customer_id", "article_id"], axis=1, inplace=True)
X_test.drop(["target", "customer_id", "article_id"], axis=1, inplace=True)

In [7]:
cat_features = [name for name, dtype in zip(X_train.dtypes.index,
                                            X_train.dtypes.to_list()) 
                if dtype == "object"]
cat_features

['club_member_status',
 'fashion_news_frequency',
 'age_group',
 'common_group',
 'sex',
 'price_group',
 'product_type_name',
 'product_group_name',
 'graphical_appearance_name',
 'colour_group_name',
 'perceived_colour_value_name',
 'perceived_colour_master_name',
 'index_name',
 'index_group_name',
 'garment_group_name',
 'product_code_name',
 'department_no_name',
 'section_no_name']

In [8]:
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features, group_id=train_groups)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features, group_id=test_groups)

del train_data
gc.collect()

0

In [9]:
SEED = 1
params = {
    'iterations': 10000,
#     'learning_rate': 0.01,
#     "l2_leaf_reg": 3, 
#     "random_strength": 1,
#     "bagging_temperature": 1,
#     "max_depth": 12,
#     "one_hot_max_size": 2, 
#     "rsm": 1,
#     "boosting_type": "Ordered",
#     "leaf_estimation_method": "Newton",
    "early_stopping_rounds": 100,
    'loss_function': "Logloss",
    'custom_metric': ["MAP:top=12"],
    'use_best_model': True,
    "eval_metric": "MAP:top=12",
    "random_seed": SEED,
    'task_type': 'GPU',
    "max_ctr_complexity": 1
    }

# clf = catboost.CatBoostRanker(**params, 
#                          cat_features=cat_features, 
#                          verbose=100)

clf = catboost.CatBoostClassifier(**params, 
                                  cat_features=cat_features, 
                                  verbose=100)

clf.fit(train_pool, 
        eval_set=test_pool, plot=True)

clf.save_model("../input/catboost_clf_1.cbm") # 0.0071281, 0.0074242, 0.0074746 0.0082425

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.011808


Metric MAP:top=12 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric MAP:top=12 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.0020409	test: 0.0020277	best: 0.0020277 (0)	total: 522ms	remaining: 1h 27m 1s
100:	learn: 0.0061405	test: 0.0061361	best: 0.0061361 (100)	total: 30.3s	remaining: 49m 26s
200:	learn: 0.0062236	test: 0.0061975	best: 0.0062014 (196)	total: 1m 1s	remaining: 49m 41s
300:	learn: 0.0062982	test: 0.0062578	best: 0.0062578 (300)	total: 1m 32s	remaining: 49m 36s
400:	learn: 0.0064032	test: 0.0063667	best: 0.0063671 (399)	total: 2m 3s	remaining: 49m 9s
500:	learn: 0.0064420	test: 0.0064003	best: 0.0064005 (499)	total: 2m 34s	remaining: 48m 50s
600:	learn: 0.0064714	test: 0.0064236	best: 0.0064274 (589)	total: 3m 5s	remaining: 48m 24s
700:	learn: 0.0065015	test: 0.0064629	best: 0.0064639 (698)	total: 3m 36s	remaining: 47m 52s
800:	learn: 0.0065482	test: 0.0064916	best: 0.0064916 (800)	total: 4m 7s	remaining: 47m 24s
900:	learn: 0.0065786	test: 0.0065141	best: 0.0065151 (892)	total: 4m 38s	remaining: 46m 52s
1000:	learn: 0.0066214	test: 0.0065480	best: 0.0065486 (995)	total: 5m 9s	remai

In [11]:
for imp_, name in sorted(zip(clf.feature_importances_, clf.feature_names_), key=lambda x: -x[0]):
    print(imp_, name)

10.681962711087003 als_similarity
8.31619271082638 last_days_ago_customer
7.364222671720977 Ladieswear_count
4.890365779990257 sales_sum_customer
3.754346655159426 sales_channel_2_flg_sum_customer
3.1794548397938893 price_min_customer
2.7462296610725687 als_customer_features_2
2.564076221834813 Divided_count
2.4262961661456064 first_days_ago_article
2.370862365467399 price_mean_customer
1.9705782933715108 purchase_score
1.5617784833344432 als_customer_features_19
1.4091324864667256 Baby/Children_count
1.3401116305978358 has_children
1.2912134155612796 sales_sum_article
1.2452366219904774 first_days_ago_customer
1.1526529388225848 price_max_customer
1.1460777614429427 mean_article_count_on_date
1.1054707369113572 als_customer_features_17
1.0873029659013795 sales_channel_1_percent_customer
1.0157144331844647 als_customer_features_0
0.9393560945038555 price_std_customer
0.9344127625535987 als_customer_features_7
0.9226476576332179 Active
0.8833748321360689 als_customer_features_16
0.83604

In [19]:
X_train[cat_features]

Unnamed: 0,club_member_status,fashion_news_frequency,age_group,common_group,sex,price_group,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,index_name,index_group_name,garment_group_name,product_code_name,department_no_name,section_no_name
14469532,ACTIVE,NONE,45-54,Lady,Woman,medium,Trousers,Garment Lower body,Check,Black,Dark,Black,Ladieswear,Ladieswear,Trousers,751471: Pluto RW slacks (1),1722: Trouser,15: Womens Everyday Collection
6839386,ACTIVE,NONE,45-54,Lady,Woman,medium,Socks,Socks & Tights,Solid,White,Light,White,Lingeries/Tights,Ladieswear,Socks and Tights,Other,3611: Shopbasket Socks,"62: Womens Nightwear, Socks & Tigh"
18231666,ACTIVE,NONE,45-54,Lady,Woman,medium,Skirt,Garment Lower body,All over pattern,Dark Orange,Medium Dusty,Brown,Ladieswear,Ladieswear,Skirts,Other,1422: Skirt,15: Womens Everyday Collection
31027734,ACTIVE,NONE,45-54,Lady,Woman,medium,Trousers,Garment Lower body,Solid,Black,Dark,Black,Ladieswear,Ladieswear,Trousers,Other,1717: Trouser,11: Womens Tailoring
30766523,ACTIVE,NONE,45-54,Lady,Woman,medium,Garment Set,Garment Full body,All over pattern,Light Beige,Dusty Light,Beige,Baby Sizes 50-98,Baby/Children,Jersey Fancy,Other,6564: Newborn,44: Baby Essentials & Complements
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29538255,ACTIVE,Regularly,16-21,Divided,Woman,medium,Underwear bottom,Underwear,Solid,Black,Dark,Black,Lingeries/Tights,Ladieswear,"Under-, Nightwear",Other,1339: Clean Lingerie,61: Womens Lingerie
33518069,ACTIVE,Regularly,16-21,Divided,Woman,medium,Sweater,Garment Upper body,Solid,Black,Dark,Black,Divided,Divided,Knitwear,Other,1647: Tops Knitwear,53: Divided Collection
26646740,ACTIVE,Regularly,16-21,Divided,Woman,medium,Hoodie,Garment Upper body,Melange,Grey,Dusty Light,Grey,Divided,Divided,Jersey Basic,Other,1643: Basic 1,51: Divided Basics
5234726,ACTIVE,Regularly,16-21,Divided,Woman,medium,Trousers,Garment Lower body,Solid,Light Blue,Light,Blue,Divided,Divided,Trousers,Other,1747: Trousers,53: Divided Collection
