In [4]:
from math import sqrt
import pandas as pd
import numpy as np

from pathlib import Path
from collections import defaultdict

from tqdm import tqdm
from pandarallel import pandarallel

from scipy.sparse import csr_matrix, coo_matrix
import implicit

import sys
sys.path.append("..")
from src.utils import *
from src.dataset import *
from src.trending import *

# Notebook-wide display and progress-bar configuration.
pd.options.display.max_colwidth = 255  # widen text columns in rendered frames
tqdm.pandas()
pandarallel.initialize(
    nb_workers=8,
    progress_bar=True,
    use_memory_fs=False,
)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [5]:
# --- Experiment configuration -------------------------------------------------
SEED = 1
N = 12

# Validation window: both values are multiplied by 7 when the Dataset is built.
TEST_ON = 1
cv_iteration = 0

# Settings forwarded to get_similar_items / get_prediction further down.
min_w1_count_for_actual_article = 10
similar_count_for_article = 10
purchase_value_limit = 3000

In [6]:
# Build the dataset for the configured fold; skip_days / test_days are whole
# multiples of 7 derived from cv_iteration and TEST_ON.
dataset = Dataset(
    skip_days=cv_iteration * 7,
    test_days=TEST_ON * 7,
)
train, test = dataset.get_train_and_test()
articles = dataset.get_articles()
customers = dataset.get_customers()
print("Dataset created")

Dataset created


In [7]:
# Re-bind `train` to the output of add_quotient, then build purchase_dict from it.
# NOTE(review): this cell overwrites `train`, so re-running it feeds the already
# processed frame back into add_quotient — confirm the helper tolerates that.
train = add_quotient(train=train)
purchase_dict = get_purchase_dict(df=train)
print("Get purchase dict")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3943502), Label(value='0 / 3943502…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ldbw'] = df['t_dat'].parallel_apply(lambda d: last_ts - (last_ts - d).floor('7D'))


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3943502), Label(value='0 / 3943502…

100%|██████████| 27101148/27101148 [00:14<00:00, 1838901.56it/s]

Get purchase dict





In [9]:
# Build group_popular_dict from the training transactions and the customer table.
# NOTE(review): presumably maps a customer group to its popular articles —
# confirm against the helper's definition.
group_popular_dict = get_group_popular_dict(df=train, customers=customers)
print("Get popular dict")

100%|██████████| 5/5 [00:02<00:00,  2.06it/s]

Get popular dict





In [13]:
# Compute the similar-article lookup using the thresholds from the config cell.
similar_article_dict = get_similar_items(
    train=train,
    articles=articles,
    customers=customers,
    min_w1_count_for_actual_article=min_w1_count_for_actual_article,
    similar_num_for_article=similar_count_for_article,
)
print("Get similar articles")

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 105542/105542 [00:42<00:00, 2474.97it/s]


Get similar articles


In [28]:
# Persist the three lookup dictionaries so they can be reloaded without recomputing.
artifacts_to_save = [
    ("../input/purchase_dict_v1.pickle", purchase_dict),
    ("../input/similar_article_dict_v1.pickle", similar_article_dict),
    ("../input/group_popular_dict_v1.pickle", group_popular_dict),
]
for artifact_path, artifact in artifacts_to_save:
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

In [14]:
# Load the precomputed pairs object; .item() unwraps the single stored object.
# NOTE: allow_pickle=True unpickles file contents — only load trusted files.
pairs_array = np.load('../input/pairs_cudf.npy', allow_pickle=True)
pairs = pairs_array.item()
print("Get pairs")

Get pairs


In [29]:
# Reload the cached lookup dictionaries, one target variable per file.
# Each file must be bound to its own name: assigning all three loads to
# `purchase_dict` would leave it holding the group-popular data and would never
# restore `similar_article_dict` / `group_popular_dict`.
# NOTE: pickle.load executes code embedded in the file — only load trusted files.
with open("../input/purchase_dict_v1.pickle", "rb") as f:
    purchase_dict = pickle.load(f)

with open("../input/similar_article_dict_v1.pickle", "rb") as f:
    similar_article_dict = pickle.load(f)

with open("../input/group_popular_dict_v1.pickle", "rb") as f:
    group_popular_dict = pickle.load(f)

In [24]:
# Generate predictions from the lookup tables built above.
test_sub = get_prediction(
    customers=customers,
    purchase_dict=purchase_dict,
    pairs=pairs,
    similar_article_dict=similar_article_dict,
    group_popular_dict=group_popular_dict,
    purchase_value_limit=purchase_value_limit,
)
print("Predict done")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=171498), Label(value='0 / 171498')…

Predict done


In [26]:
# Write only the submission columns, without the index.
submission_columns = ["customer_id", "prediction"]
test_sub[submission_columns].to_csv('../output/2.trending_test_7.csv', index=False)

In [25]:
# Score the predictions: join them with the held-out purchases, compute the
# per-customer average precision, and report the mean.
true = get_true_articles(test).reset_index()
sub = test_sub.merge(true, how="inner", on="customer_id")
sub["ap"] = sub.apply(avg_precision_at_k, axis="columns")
map_score = sub["ap"].mean()
print("MAP: ", map_score)

MAP:  0.025589942958553172


In [17]:
# Turn the space-separated prediction string into one row per candidate article.
sub["prediction_list"] = sub["prediction"].str.split(" ")
sub = sub.explode("prediction_list").rename(columns={"prediction_list": "article_id"})
sub = sub[["customer_id", "article_id", "true"]]

In [23]:
def get_target(line):
    """Return 1 if ``line["article_id"]`` is contained in ``line["true"]``, else 0."""
    is_hit = line["article_id"] in line["true"]
    return 1 if is_hit else 0

# Label every (customer, candidate article) row with get_target.
sub["target"] = sub.loc[:, ["article_id", "true"]].apply(get_target, axis="columns")

In [26]:
# The ground-truth column is no longer needed once `target` has been derived.
sub.drop(columns=["true"], inplace=True)

In [31]:
# Display the labelled candidate table (customer_id, article_id, target).
sub

Unnamed: 0,customer_id,article_id,target
0,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,0111586001,0
0,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,0816083001,0
0,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,0916468001,0
0,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,0754751001,0
0,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,0158340001,0
...,...,...,...
68983,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b,0867948008,0
68983,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b,0895610005,0
68983,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b,0865076006,0
68983,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b,0707269007,0


In [36]:
# Attach customer features, then article features; overlapping column names get
# the "_customer" / "_article" suffixes.
sub_with_customers = sub.merge(customers, on="customer_id")
train_data = sub_with_customers.merge(
    articles,
    on="article_id",
    suffixes=("_customer", "_article"),
)
train_data

Unnamed: 0,customer_id,article_id,target,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,age_group,...,price_max_article,price_mean_article,price_std_article,sales_channel_1_flg_sum_article,sales_channel_2_flg_sum_article,sales_sum_article,sales_channel_1_ratio_article,last_days_ago_article,first_days_ago_article,mean_count_on_customer
0,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,0111586001,0,0.0,0.0,ACTIVE,NONE,27.0,Other,22-29,...,0.016983,0.012034,0.002472,13150.0,992.0,14142.0,0.929854,0.0,726.0,1.794898
1,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758ec41f54f8c7f4729793,0111586001,0,0.0,0.0,ACTIVE,NONE,26.0,Other,22-29,...,0.016983,0.012034,0.002472,13150.0,992.0,14142.0,0.929854,0.0,726.0,1.794898
2,0064008cf954059e6f7801b03d1ed892b3d1d5a17ebd776300e4af3b5cad6c48,0111586001,0,0.0,0.0,ACTIVE,NONE,23.0,Other,22-29,...,0.016983,0.012034,0.002472,13150.0,992.0,14142.0,0.929854,0.0,726.0,1.794898
3,01283598b576c36586143e9d6c0d41101c46840b140c398bfc074529ade9a86f,0111586001,0,0.0,0.0,ACTIVE,,19.0,Other,16-21,...,0.016983,0.012034,0.002472,13150.0,992.0,14142.0,0.929854,0.0,726.0,1.794898
4,0156132aa76b9c3efd6191ced3f038defe167b98add8c70317decd697b39e78f,0111586001,0,0.0,0.0,ACTIVE,NONE,35.0,Other,30-44,...,0.016983,0.012034,0.002472,13150.0,992.0,14142.0,0.929854,0.0,726.0,1.794898
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
827803,ffd988e8d1123f4ffde60525bd255d6a710ebe3aecd12190f64726cef150829e,0860301003,0,0.0,0.0,ACTIVE,NONE,33.0,Other,30-44,...,0.030492,0.014887,0.005980,0.0,12.0,12.0,0.000000,2.0,87.0,1.000000
827804,ffd988e8d1123f4ffde60525bd255d6a710ebe3aecd12190f64726cef150829e,0699082006,0,0.0,0.0,ACTIVE,NONE,33.0,Other,30-44,...,0.025407,0.024170,0.003037,74.0,2499.0,2573.0,0.028760,11.0,601.0,1.318135
827805,ffe1791c9c6e3df9aafeebc77cf2cf03dd0123ac76ef90fb37c7b7c3fe6442e7,0853740002,1,0.0,0.0,ACTIVE,NONE,45.0,Other,45-54,...,0.033881,0.033449,0.000980,2.0,17.0,19.0,0.105263,0.0,27.0,1.000000
827806,ffee12091eb3aa2337defe250d7f6b23dbbf9d496fdd7c39a351d16056432189,0875898003,0,1.0,1.0,ACTIVE,Regularly,52.0,Other,45-54,...,0.016932,0.015915,0.001637,0.0,10.0,10.0,0.000000,3.0,21.0,1.111111


In [65]:
# "<customer_id>_<article_id>" key per row. Built with vectorized string
# concatenation rather than a row-wise apply, which is far slower on a frame of
# this size and yields the same strings.
groups = train_data["customer_id"] + "_" + train_data["article_id"]

# Feature matrix: everything except the label and the two identifier columns.
X = train_data.drop(columns=["target", "customer_id", "article_id"])
y = train_data["target"]

In [43]:
# Categorical features are the object-dtype columns of the feature matrix.
# Derived from `X` (defined in the previous cell) rather than `X_train`, which is
# only created by the later train/test split — reading `X_train` here fails on a
# fresh kernel and can reflect stale columns. `X_train` has the same columns as `X`.
cat_features = [name for name, dtype in X.dtypes.items() if dtype == "object"]
cat_features

['club_member_status',
 'fashion_news_frequency',
 'postal_code',
 'age_group',
 'common_group',
 'sex',
 'price_group',
 'prediction',
 'product_type_name',
 'product_group_name',
 'graphical_appearance_name',
 'colour_group_name',
 'perceived_colour_value_name',
 'perceived_colour_master_name',
 'index_name',
 'index_group_name',
 'garment_group_name',
 'detail_desc',
 'product_code_name',
 'department_no_name',
 'section_no_name']

In [66]:
from sklearn.model_selection import train_test_split

In [67]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split (and every metric computed from it) is reproducible across runs.
# NOTE(review): the split is row-wise, so rows of one customer can fall on both
# sides — confirm whether a customer-level split is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

In [70]:
SEED = 1

# CatBoost settings. Hyperparameters currently disabled (library defaults apply):
#   learning_rate=0.01, l2_leaf_reg=3, random_strength=1, bagging_temperature=1,
#   max_depth=12, one_hot_max_size=2, rsm=1, boosting_type="Ordered",
#   leaf_estimation_method="Newton"
params = dict(
    iterations=3000,
    early_stopping_rounds=100,
    loss_function="Logloss",
    custom_metric=["AUC", "Logloss", "Accuracy", "Precision", "Recall"],
    use_best_model=True,
    eval_metric="AUC",
    random_seed=SEED,
    task_type="GPU",
)

# Progress is logged every 100 iterations; the held-out split is the eval set.
clf = catboost.CatBoostClassifier(cat_features=cat_features, verbose=100, **params)
clf.fit(X_train, y_train, eval_set=(X_test, y_test))

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.026014
0:	learn: 0.5571409	test: 0.5495629	best: 0.5495629 (0)	total: 43ms	remaining: 2m 8s
100:	learn: 0.7230175	test: 0.7256847	best: 0.7256847 (100)	total: 5.18s	remaining: 2m 28s
200:	learn: 0.7368787	test: 0.7428179	best: 0.7428179 (200)	total: 10.6s	remaining: 2m 28s
300:	learn: 0.7461511	test: 0.7499255	best: 0.7499255 (300)	total: 16s	remaining: 2m 23s
400:	learn: 0.7517797	test: 0.7539111	best: 0.7539111 (400)	total: 21.3s	remaining: 2m 17s
500:	learn: 0.7556519	test: 0.7565326	best: 0.7565326 (500)	total: 26.4s	remaining: 2m 11s
600:	learn: 0.7597088	test: 0.7591854	best: 0.7591854 (600)	total: 31.9s	remaining: 2m 7s
700:	learn: 0.7636332	test: 0.7611772	best: 0.7611772 (700)	total: 37.2s	remaining: 2m 2s
800:	learn: 0.7660454	test: 0.7627499	best: 0.7627542 (798)	total: 42.7s	remaining: 1m 57s
900:	learn: 0.7684370	test: 0.7635413	best: 0.7635413 (900)	total: 48.2s	remaining: 1m 52s
1000:	learn: 0.7707857	test: 0.7648104	best: 0.7648104 (1000)	total: 5

<catboost.core.CatBoostClassifier at 0x7f906db93250>