In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from scipy.sparse import csr_matrix

from tqdm import tqdm
from datetime import datetime, timedelta

import logging
logging.basicConfig(level=logging.INFO)
logging.info("test")

import sys
sys.path.append("..")
from src.dataset import *

INFO:root:test


In [2]:
dm = Dataset(train_days=365, test_days=7)

In [3]:
train, test = dm.get_train_test(train_days = 30)

In [4]:
articles = dm.get_articles()

In [5]:
customers = dm.get_customers()

## Crop data 

In [6]:
articles_crop = articles[(articles["last_days_ago"] <= 7) & 
                         (articles["sales_sum"] >= 100)]

In [7]:
train = train[train["article_id"].isin(articles_crop["article_id"].tolist())]
test = test[test["article_id"].isin(articles_crop["article_id"].tolist())]

In [8]:
pos_train = train[["customer_id", "article_id"]]
pos_test = test[["customer_id", "article_id"]]

In [9]:
pos_train["rating"] = 1
pos_test["rating"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_train["rating"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_test["rating"] = 1


In [10]:
customer_list = train["customer_id"].unique().tolist()
article_list = articles_crop["article_id"].to_list()

### Model 

In [11]:
def get_neg_train(train, N):
    pos_cust_article = set(train["customer_id"].str.cat(train["article_id"], sep="_").to_list())
    
    neg_customers = []
    neg_articles = []

    cust_len = len(customer_list)
    art_len = len(article_list)

    for _ in tqdm(range(N)):

        customer = customer_list[np.random.randint(0, cust_len)]
        article = article_list[np.random.randint(0, art_len)]

        if f"{customer}_{article}" not in pos_cust_article:
            neg_customers.append(customer)
            neg_articles.append(article)
    
    neg_train = pd.DataFrame({"customer_id": neg_customers, "article_id": neg_articles})
    neg_train["rating"] = 0
    return neg_train

In [12]:
neg_train = get_neg_train(pos_train, N=pos_train.shape[0] * 4)
neg_test = get_neg_train(pos_test, N=pos_test.shape[0] * 4)

100%|██████████| 3966624/3966624 [00:10<00:00, 383380.82it/s]
100%|██████████| 723792/723792 [00:01<00:00, 395847.15it/s]


In [13]:
train_full = pd.concat([pos_train, neg_train])
test_full = pd.concat([pos_test, neg_test])

In [14]:
train_full = (
    train_full.merge(customers, on="customer_id", how="inner")
            .merge(articles, on="article_id", how="inner", suffixes=["_customer", "_article"])
)

test_full = (
    test_full.merge(customers, on="customer_id", how="inner")
            .merge(articles, on="article_id", how="inner", suffixes=["_customer", "_article"])
)

In [15]:
train_full = train_full.drop(["detail_desc", "customer_id", "article_id"], axis=1)
# customer_article_test_df = test_full[["customer_id", "article_id"]]
test_ids = test_full[["customer_id", "article_id"]]
test_full = test_full.drop(["detail_desc", "customer_id", "article_id"], axis=1)

### Model 

In [16]:
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from catboost import Pool, CatBoostClassifier, cv
from catboost.utils import get_roc_curve, create_cd
from catboost.eval.catboost_evaluation import CatboostEvaluation

In [17]:
cat_features = [name for name, dtype in zip(train_full.dtypes.index,
                                            train_full.dtypes.to_list()) 
                if dtype == "object"]
cat_features

['club_member_status',
 'fashion_news_frequency',
 'postal_code',
 'age_group',
 'common_group',
 'sex',
 'price_group',
 'product_type_name',
 'product_group_name',
 'graphical_appearance_name',
 'colour_group_name',
 'perceived_colour_value_name',
 'perceived_colour_master_name',
 'index_name',
 'index_group_name',
 'garment_group_name',
 'product_code_name',
 'department_no_name',
 'section_no_name']

In [18]:
X_train = train_full.drop(["rating"], axis=1)
y_train = train_full["rating"]
X_test = test_full.drop(["rating"], axis=1)
y_test = test_full["rating"]

In [19]:
SEED = 1
params = {
    'iterations': 1000,
    'learning_rate': 0.01,
#     "l2_leaf_reg": 3, 
#     "random_strength": 1,
#     "bagging_temperature": 1,
#     "max_depth": 12,
#     "one_hot_max_size": 2, 
#     "rsm": 1,
#     "boosting_type": "Ordered",
#     "leaf_estimation_method": "Newton",
    "early_stopping_rounds": 100,
    'loss_function': "Logloss",
    'custom_metric': ["AUC", "Logloss", "Accuracy", "Precision", "Recall"],
    'use_best_model': True,
    "eval_metric": "AUC",
    "random_seed": SEED,
    'task_type': 'GPU'
    }

clf = CatBoostClassifier(**params, 
                         cat_features=cat_features, 
                         verbose=10)

clf.fit(X_train, y_train, 
        eval_set=(X_test, y_test))

0:	learn: 0.8091927	test: 0.8191183	best: 0.8191183 (0)	total: 410ms	remaining: 6m 49s
10:	learn: 0.8182204	test: 0.8278583	best: 0.8278583 (10)	total: 4.56s	remaining: 6m 50s
20:	learn: 0.8203228	test: 0.8301050	best: 0.8301050 (20)	total: 8.71s	remaining: 6m 45s
30:	learn: 0.8233300	test: 0.8346887	best: 0.8347144 (29)	total: 12.7s	remaining: 6m 37s
40:	learn: 0.8257394	test: 0.8367092	best: 0.8367092 (40)	total: 16.8s	remaining: 6m 32s
50:	learn: 0.8279839	test: 0.8375804	best: 0.8377335 (49)	total: 20.8s	remaining: 6m 26s
60:	learn: 0.8296018	test: 0.8384773	best: 0.8384773 (60)	total: 24.8s	remaining: 6m 21s
70:	learn: 0.8318939	test: 0.8386209	best: 0.8391624 (64)	total: 28.8s	remaining: 6m 17s
80:	learn: 0.8344502	test: 0.8395787	best: 0.8395787 (80)	total: 32.9s	remaining: 6m 13s
90:	learn: 0.8360047	test: 0.8401227	best: 0.8403054 (86)	total: 37s	remaining: 6m 9s
100:	learn: 0.8370702	test: 0.8401288	best: 0.8404239 (98)	total: 41.1s	remaining: 6m 5s
110:	learn: 0.8380751	test

<catboost.core.CatBoostClassifier at 0x7fc05cc7b880>

In [20]:
for name, imp in sorted(zip(clf.feature_names_, clf.feature_importances_), key=lambda x: -x[1]):
    if imp != 0:
        print(round(imp, 5), name)

27.37211 first_days_ago_article
23.56083 sales_sum_article
18.32526 last_days_ago_article
9.21969 mean_article_count_on_date
4.70955 sales_channel_2_flg_sum_customer
3.07467 sales_channel_1_ratio_article
2.59476 sales_channel_1_ratio_customer
2.35827 price_mean_customer
2.09349 sales_sum_customer
1.6104 sales_channel_1_flg_sum_article
1.56765 sales_channel_2_flg_sum_article
1.23049 price_mean_article
0.89073 price_std_article
0.71573 product_type_name
0.28708 last_days_ago_customer
0.14976 price_group
0.11491 price_max_customer
0.07354 Ladieswear_count
0.05107 department_no_name


In [21]:
test_ids["pred"] = clf.predict_proba(X_test)[:, 1]

In [22]:
article_top_list = test_ids.groupby("article_id")["pred"].mean().sort_values()[-50:].index.tolist()

In [23]:
article_top = articles_crop[articles_crop["article_id"].isin(article_top_list)]

In [24]:
customers["1"] = 1
article_top["1"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  article_top["1"] = 1


In [25]:
del train, test, test_full, train_full
del articles, articles_crop
del dm
del X_test, X_train
del pos_test, pos_train, neg_test, neg_train

import gc
gc.collect()

248

In [26]:
batch_size = 100000
i = 0
submition = None
while i + batch_size <= customers.shape[0]:
    print(i)
    scoring_list = (
        customers.iloc[i:i + batch_size]
                .merge(article_top, on="1", how="inner", suffixes=["_customer", "_article"])
    )
    scoring_subm = scoring_list[["customer_id", "article_id"]]
    scoring_list = scoring_list.drop(["detail_desc", "customer_id", "article_id", "1"], axis=1)
    
    scoring_subm["pred"] = clf.predict_proba(scoring_list)[:, 1]
    submition = pd.concat([submition, scoring_subm]) if submition is not None else scoring_subm
    
    i += batch_size

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000


In [27]:
tqdm.pandas()

In [31]:
submition[submition["pred"] > 0.7]

Unnamed: 0,customer_id,article_id,pred
44,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0918292001,0.720921
54,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0751471043,0.738263
79,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0896152002,0.736266
92,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0915529003,0.704610
93,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0916468003,0.740883
...,...,...,...
4999692,f2978674110d49ab0090ea22b309dccf1058b009da98f5...,0915529003,0.729514
4999693,f2978674110d49ab0090ea22b309dccf1058b009da98f5...,0916468003,0.761574
4999694,f2978674110d49ab0090ea22b309dccf1058b009da98f5...,0918292001,0.778050
4999695,f2978674110d49ab0090ea22b309dccf1058b009da98f5...,0918292004,0.712623


In [None]:
subm = (
        submition[submition["pred"] > 0.75]
        .groupby("customer_id")[["article_id", "pred"]]
        .progress_apply(lambda x: x.sort_values("pred", ascending=False).head(12))
        .reset_index().drop(["level_1", "pred"], axis=1)
        .groupby("customer_id")["article_id"].progress_apply(lambda x: " ".join(x))
        .reset_index()
)
subm

100%|██████████| 182472/182472 [00:28<00:00, 12020.91it/s]

In [None]:
subm.to_csv("../output/35.model_neg_samp.csv", index=False, header=True)