In [1]:
from math import sqrt
import pandas as pd
import numpy as np

from pathlib import Path
from collections import defaultdict
import pickle
import gc

from tqdm import tqdm
from pandarallel import pandarallel

from scipy.sparse import csr_matrix, coo_matrix
import implicit
import catboost

import sys
sys.path.append("..")
from src.utils import *
from src.dataset import *
from src.trending import *

from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from catboost import Pool, CatBoostClassifier, cv
from catboost.utils import get_roc_curve, create_cd
from catboost.eval.catboost_evaluation import CatboostEvaluation

pd.set_option('display.max_colwidth', 255)
tqdm.pandas()
pandarallel.initialize(progress_bar=True, nb_workers=8, use_memory_fs=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# TRAIN 

## Сборка датасета  

In [2]:
# Run configuration for the TRAIN split.
# NOTE(review): SEED and N are not referenced by any later cell visible in this
# notebook (the candidate helpers use their own default N=20) — confirm whether
# they are still needed.
SEED = 1
N = 20
# TEST_ON is multiplied by 7 below, so 1 requests test_days=7.
TEST_ON = 1

# cv_iteration is multiplied by 7 below, so 0 requests skip_days=0.
cv_iteration = 0

# Dataset comes from the star-imported src.dataset module; presumably
# test_days holds out the most recent days as the test part — TODO confirm.
dataset = Dataset(skip_days=7 * cv_iteration, test_days=7 * TEST_ON)
train, test = dataset.get_train_and_test()
articles = dataset.get_articles()
customers = dataset.get_customers()
print("Dataset created")

Dataset created


In [None]:
def get_actual_articles(train, days=7, min_count=20):
    """List the articles that sold often enough in the most recent window.

    Args:
        train: transactions frame with a datetime column "t_dat" and an
            "article_id" column.
        days: window length; rows with t_dat >= max(t_dat) - days are kept.
        min_count: an article is returned only if it has strictly more than
            this many rows inside the window.

    Returns:
        list of article ids, in ascending article_id order (groupby order).
    """
    window_start = train["t_dat"].max() - pd.Timedelta(days=days)
    recent_rows = train.loc[train["t_dat"] >= window_start]
    sales_per_article = recent_rows.groupby("article_id").size()
    frequent_enough = sales_per_article > min_count
    return sales_per_article.loc[frequent_enough].index.to_list()

# Articles with more than the default 20 sales in the last 7 days of `train`.
# NOTE(review): `actual_articles` is not used by any later cell visible here.
actual_articles = get_actual_articles(train)
len(actual_articles)

## Trending 

In [4]:
# add_quotient / get_purchase_dict come from the star-imported src modules.
# The flattening below only relies on purchase_dict being a nested mapping:
# customer id -> article id -> score.
train = add_quotient(train=train)
purchase_dict = get_purchase_dict(df=train)

# Flatten the nested mapping into three parallel columns (one entry per
# customer/article pair); scores are truncated to int as they are collected.
customer_col, article_col, score_col = [], [], []
for customer in purchase_dict:
    scored_articles = purchase_dict[customer]
    for article in scored_articles:
        customer_col.append(customer)
        article_col.append(article)
        score_col.append(int(scored_articles[article]))

purch_columns = {
    "customer_id": customer_col,
    "article_id": article_col,
    "purchase_score": score_col,
}
purch_data = pd.DataFrame(purch_columns, dtype=np.uint32)

# Release the Python-side copies before moving on; the frame now owns the data.
del purch_columns, customer_col, article_col, score_col, purchase_dict
gc.collect()

print("Get purchase dict")

purch_data.to_csv("../tmp/purchase_data_train.csv", index=False)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3943502), Label(value='0 / 3943502…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ldbw'] = df['t_dat'].parallel_apply(lambda d: last_ts - (last_ts - d).floor('7D'))


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3943502), Label(value='0 / 3943502…

100%|██████████| 27101148/27101148 [00:18<00:00, 1443221.91it/s]


Get purchase dict


In [5]:
# Keep only the customer/article pairs whose purchase score is non-zero and
# persist them for the train split.
has_score = purch_data["purchase_score"].ne(0)
purch_data_act = purch_data.loc[has_score]
del has_score
purch_data_act.to_csv("../tmp/purchase_data_actual_train.csv", index=False)

## Popular 

In [2]:
def get_general_count_popular(train, customers, N=20, days=7):
    """Pair every customer with the most-purchased articles of the recent window.

    Args:
        train: transactions frame with a datetime "t_dat" column and an
            "article_id" column.
        customers: frame with a "customer_id" column. It is not modified.
        N: maximum number of articles to keep.
        days: window length; rows with t_dat >= max(t_dat) - days are counted.

    Returns:
        DataFrame with columns customer_id, article_id, general_popular_count
        (rows per article inside the window) and general_popular_count_rank
        (0 = most purchased). One row per (customer, top article).
    """
    window_start = train["t_dat"].max() - pd.Timedelta(days=days)
    general_count_popular = (
        train[train["t_dat"] >= window_start]
            .groupby("article_id").size().nlargest(N)
            .reset_index()
            .rename({0: "general_popular_count"}, axis=1)
    )
    # nlargest may return fewer than N rows (fewer distinct articles in the
    # window), so the rank is sized from the actual result instead of N.
    general_count_popular["general_popular_count_rank"] = np.arange(len(general_count_popular))
    general_count_popular["key"] = 1

    # Cross join through a constant key. The key is attached to a temporary
    # copy so the caller's `customers` frame is never mutated (previously a
    # "key" column was added to it and then deleted, which also destroyed any
    # pre-existing "key" column and leaked the column if the merge failed).
    customer_keys = customers[["customer_id"]].assign(key=1)
    general_count_popular = (
        customer_keys.merge(general_count_popular, on="key", how="inner")
            .drop(["key"], axis=1)
    )

    return general_count_popular

In [3]:
def get_general_trending_sum_popular(train, customers, N=20):
    """Pair every customer with the articles that have the largest summed tr_score.

    Args:
        train: transactions frame with "article_id" and "tr_score" columns.
        customers: frame with a "customer_id" column. It is not modified.
        N: maximum number of articles to keep.

    Returns:
        DataFrame with columns customer_id, article_id,
        general_popular_trending_sum (uint32) and
        general_popular_trending_sum_rank (uint8, 0 = largest sum).
        One row per (customer, top article).
    """
    general_trending_popular = (
        train.groupby("article_id")['tr_score'].sum().nlargest(N)
            .reset_index()
            .rename({"tr_score": "general_popular_trending_sum"}, axis=1)
    )
    # The sum is stored as an unsigned integer, so any fractional part is dropped.
    general_trending_popular["general_popular_trending_sum"] = (
        general_trending_popular["general_popular_trending_sum"].astype(np.uint32)
    )
    # Size the rank from the actual result: nlargest returns fewer than N rows
    # when there are fewer than N distinct articles.
    general_trending_popular["general_popular_trending_sum_rank"] = (
        np.arange(len(general_trending_popular)).astype(np.uint8)
    )
    general_trending_popular["key"] = 1

    # Cross join through a constant key attached to a temporary copy, so the
    # caller's `customers` frame is left untouched.
    customer_keys = customers[["customer_id"]].assign(key=1)
    general_trending_popular = (
        customer_keys.merge(general_trending_popular, on="key", how="inner")
            .drop(["key"], axis=1)
    )

    return general_trending_popular

In [4]:
def get_group_trending_mean_popular(train, customers, N=20):
    """Pair every customer with the top articles of their age group by mean tr_score.

    Args:
        train: transactions frame with "customer_id", "article_id" and
            "tr_score" columns.
        customers: frame with "customer_id" and "age_group" columns.
        N: maximum number of articles kept per age group.

    Returns:
        DataFrame with columns customer_id, article_id,
        group_popular_trending_mean (uint32) and
        group_popular_trending_mean_rank (uint8, 0 = best within the group).
    """
    group_art_sum = (
        train.merge(customers[["customer_id", "age_group"]], on="customer_id", how="inner")
            .groupby(['article_id', "age_group"])['tr_score'].mean()
            .reset_index()
            .groupby(["age_group"])[["article_id", "tr_score"]]
            .apply(lambda x: x.nlargest(N, "tr_score"))
            .reset_index()
            .rename({"tr_score": "group_popular_trending_mean"}, axis=1)
            .drop(["level_1"], axis=1)
    )

    # Rank = position inside each age group (rows are already ordered by
    # nlargest). Counting per group stays correct when a group has fewer than
    # N articles; tiling range(N) once per group raised a length mismatch there.
    group_art_sum["group_popular_trending_mean_rank"] = (
        group_art_sum.groupby("age_group").cumcount().astype(np.uint8)
    )
    # NOTE(review): casting a mean to uint32 drops its fractional part — confirm
    # this truncation is intended.
    group_art_sum["group_popular_trending_mean"] = group_art_sum["group_popular_trending_mean"].astype(np.uint32)

    # Fan the per-group candidates out to every customer of that group.
    group_art_sum = (
        customers[["customer_id", "age_group"]].merge(group_art_sum, on="age_group", how="inner")
            .drop(["age_group"], axis=1)
    )

    return group_art_sum

In [5]:
def get_group_trending_sum_popular(train, customers, N=20):
    """Pair every customer with the top articles of their age group by summed tr_score.

    Args:
        train: transactions frame with "customer_id", "article_id" and
            "tr_score" columns.
        customers: frame with "customer_id" and "age_group" columns.
        N: maximum number of articles kept per age group.

    Returns:
        DataFrame with columns customer_id, article_id,
        group_popular_trending_sum (uint32) and
        group_popular_trending_sum_rank (uint8, 0 = best within the group).
    """
    group_art_sum = (
        train.merge(customers[["customer_id", "age_group"]], on="customer_id", how="inner")
            .groupby(['article_id', "age_group"])['tr_score'].sum()
            .reset_index()
            .groupby(["age_group"])[["article_id", "tr_score"]]
            .apply(lambda x: x.nlargest(N, "tr_score"))
            .reset_index()
            .rename({"tr_score": "group_popular_trending_sum"}, axis=1)
            .drop(["level_1"], axis=1)
    )

    # Rank = position inside each age group (rows are already ordered by
    # nlargest). Counting per group stays correct when a group has fewer than
    # N articles; tiling range(N) once per group raised a length mismatch there.
    group_art_sum["group_popular_trending_sum_rank"] = (
        group_art_sum.groupby("age_group").cumcount().astype(np.uint8)
    )
    # Stored as an unsigned integer, so any fractional part of the sum is dropped.
    group_art_sum["group_popular_trending_sum"] = group_art_sum["group_popular_trending_sum"].astype(np.uint32)

    # Fan the per-group candidates out to every customer of that group.
    group_art_sum = (
        customers[["customer_id", "age_group"]].merge(group_art_sum, on="age_group", how="inner")
            .drop(["age_group"], axis=1)
    )

    return group_art_sum

In [6]:
def get_group_count_popular(train, customers, N=20, days=7):
    """Pair every customer with the most-purchased recent articles of their age group.

    Args:
        train: transactions frame with a datetime "t_dat" column plus
            "customer_id" and "article_id" columns.
        customers: frame with "customer_id" and "age_group" columns.
        N: maximum number of articles kept per age group.
        days: window length; rows with t_dat >= max(t_dat) - days are counted.

    Returns:
        DataFrame with columns customer_id, article_id, group_popular_count
        (uint32) and group_popular_count_rank (uint8, 0 = most purchased
        within the group).
    """
    window_start = train["t_dat"].max() - pd.Timedelta(days=days)
    group_art_sum = (
        train[train["t_dat"] >= window_start]
            .merge(customers[["customer_id", "age_group"]], on="customer_id", how="inner")
            .groupby(['article_id', "age_group"]).size()
            .reset_index()
            .rename({0: "group_popular_count"}, axis=1)
            .groupby(["age_group"])[["article_id", "group_popular_count"]]
            .apply(lambda x: x.nlargest(N, "group_popular_count"))
            .reset_index()
            .drop(["level_1"], axis=1)
    )

    # Rank = position inside each age group (rows are already ordered by
    # nlargest). Counting per group stays correct when a group has fewer than
    # N articles; tiling range(N) once per group raised a length mismatch there.
    group_art_sum["group_popular_count_rank"] = (
        group_art_sum.groupby("age_group").cumcount().astype(np.uint8)
    )
    group_art_sum["group_popular_count"] = group_art_sum["group_popular_count"].astype(np.uint32)

    # Fan the per-group candidates out to every customer of that group.
    group_art_sum = (
        customers[["customer_id", "age_group"]].merge(group_art_sum, on="age_group", how="inner")
            .drop(["age_group"], axis=1)
    )

    return group_art_sum

In [11]:
# Build the five "popular" candidate tables for the train split. Each helper
# returns one row per (customer_id, article_id) candidate with a score column
# and a rank column.
# The trending variants read train["tr_score"]; presumably that column is added
# by add_quotient in the Trending section — TODO confirm.
general_count_popular = get_general_count_popular(train, customers)
general_trending_sum_popular = get_general_trending_sum_popular(train, customers)
group_trending_mean_popular = get_group_trending_mean_popular(train, customers)
group_trending_sum_popular = get_group_trending_sum_popular(train, customers)
group_count_popular = get_group_count_popular(train, customers)

### Pairs 

In [12]:
def get_pairs_data(dataset, purch_data_act, pairs_path='../input/pairs_cudf.npy'):
    """Build "bought together" candidates from each customer's scored purchases.

    Args:
        dataset: object exposing `articles_id2num`, a mapping from the string
            article id to its numeric id.
        purch_data_act: frame with customer_id, article_id and purchase_score
            columns (the customer's scored purchases).
        pairs_path: path to a .npy file holding a pickled mapping
            parent article -> child article.

    Returns:
        DataFrame with columns customer_id, pairs_parent_purchase_score (the
        score of the purchased parent article) and article_id (the paired
        child article). A customer/child pair appears once per purchased
        parent that maps to that child.

    Raises:
        KeyError: if a parent or child id from the pairs file is missing from
            `dataset.articles_id2num`.
    """
    # NOTE(security): allow_pickle=True unpickles the file content, which can
    # execute arbitrary code — only load pairs files from a trusted source.
    pairs = np.load(pairs_path, allow_pickle=True).item()

    # Lookups use "0" + str(id); presumably the ids stored in the pairs file
    # lost the leading zero of the original article id — TODO confirm.
    art_parent_list = []
    art_child_list = []
    for art_parent in pairs:
        art_parent_list.append(dataset.articles_id2num["0" + str(art_parent)])
        art_child_list.append(dataset.articles_id2num["0" + str(pairs[art_parent])])

    parent_to_child = pd.DataFrame({"article_id_parent": art_parent_list,
                                    "article_id_child": art_child_list}, dtype=np.uint32)

    # Join purchases to the pairs on the purchased (parent) article, then
    # replace the parent id by the child id so the result lists candidates.
    return (
        purch_data_act
            .merge(parent_to_child.rename({"article_id_parent": "article_id"}, axis=1),
                   on="article_id", how="inner")
            .drop(["article_id"], axis=1)
            .rename({"article_id_child": "article_id",
                     "purchase_score": "pairs_parent_purchase_score"}, axis=1)
    )


pairs_data = get_pairs_data(dataset, purch_data_act)
gc.collect()

print("Get pairs")
pairs_data.to_csv("../tmp/pairs_train.csv", index=False)

Get pairs


### Merge 

In [13]:
# Outer-join every candidate source onto the scored purchases, one source at
# a time, always on the (customer_id, article_id) key. A pair that is missing
# from a source gets NaN in that source's columns.
candidate_sources = (
    general_count_popular,
    general_trending_sum_popular,
    group_trending_mean_popular,
    group_trending_sum_popular,
    group_count_popular,
    pairs_data,
)
merge_data = purch_data_act
for candidate_source in candidate_sources:
    merge_data = merge_data.merge(candidate_source, on=["customer_id", "article_id"], how='outer')
del candidate_sources, candidate_source
merge_data

Unnamed: 0,customer_id,article_id,purchase_score,general_popular_count,general_popular_count_rank,general_popular_trending_sum,general_popular_trending_sum_rank,group_popular_trending_mean,group_popular_trending_mean_rank,group_popular_trending_sum,group_popular_trending_sum_rank,group_popular_count,group_popular_count_rank,pairs_parent_purchase_score
0,0,16003,330.0,,,,,,,,,,,28864.0
1,0,16003,330.0,,,,,,,,,,,6.0
2,0,16023,28864.0,,,,,,,,,,,
3,0,65667,48.0,,,,,,,,,,,
4,0,78719,7.0,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92345332,1218691,93412,,,,,,,,,,,,108770.0
92345333,1219679,28970,,,,,,,,,,,,95755.0
92345334,1238517,95228,,,,,,,,,,,,54385.0
92345335,1359561,95228,,,,,,,,,,,,12707.0


In [18]:
# Written without the index, like every other export in this notebook
# (including the test-split export of the same table), so the train and test
# files have the same columns. The index of the merged frame carries no information.
merge_data.to_csv("../tmp/purch_popular_pairs_train.csv", index=False)

# TEST 

In [7]:
# Run configuration for the TEST split. This rebinds dataset/train/test/
# articles/customers, replacing the objects created in the TRAIN section.
# NOTE(review): SEED and N are not referenced by any later cell visible in this
# notebook (the candidate helpers use their own default N=20) — confirm whether
# they are still needed.
SEED = 1
N = 20
# TEST_ON is multiplied by 7 below, so 0 requests test_days=0.
TEST_ON = 0

# cv_iteration is multiplied by 7 below, so 0 requests skip_days=0.
cv_iteration = 0

# Dataset comes from the star-imported src.dataset module; presumably
# test_days=0 means nothing is held out and `train` covers all data — TODO confirm.
dataset = Dataset(skip_days=7 * cv_iteration, test_days=7 * TEST_ON)
train, test = dataset.get_train_and_test()
articles = dataset.get_articles()
customers = dataset.get_customers()
print("Dataset created")

Dataset created


In [8]:
# add_quotient / get_purchase_dict come from the star-imported src modules.
# The flattening below only relies on purchase_dict being a nested mapping:
# customer id -> article id -> score.
train = add_quotient(train=train)
purchase_dict = get_purchase_dict(df=train)

# Flatten the nested mapping into three parallel columns (one entry per
# customer/article pair); scores are truncated to int as they are collected.
customer_col, article_col, score_col = [], [], []
for customer in purchase_dict:
    scored_articles = purchase_dict[customer]
    for article in scored_articles:
        customer_col.append(customer)
        article_col.append(article)
        score_col.append(int(scored_articles[article]))

purch_columns = {
    "customer_id": customer_col,
    "article_id": article_col,
    "purchase_score": score_col,
}
purch_data = pd.DataFrame(purch_columns, dtype=np.uint32)

# Release the Python-side copies before moving on; the frame now owns the data.
del purch_columns, customer_col, article_col, score_col, purchase_dict
gc.collect()

print("Get purchase dict")

purch_data.to_csv("../tmp/purchase_data_test.csv", index=False)

# Keep only the pairs whose purchase score is non-zero and persist them too.
has_score = purch_data["purchase_score"].ne(0)
purch_data_act = purch_data.loc[has_score]
del has_score
purch_data_act.to_csv("../tmp/purchase_data_actual_test.csv", index=False)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3973541), Label(value='0 / 3973541…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ldbw'] = df['t_dat'].parallel_apply(lambda d: last_ts - (last_ts - d).floor('7D'))


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3973541), Label(value='0 / 3973541…

100%|██████████| 27306439/27306439 [00:18<00:00, 1503451.83it/s]


Get purchase dict


In [9]:
# Build the five "popular" candidate tables for the test split, using the
# `train` and `customers` frames rebound in the TEST section. Each helper
# returns one row per (customer_id, article_id) candidate with a score column
# and a rank column.
general_count_popular = get_general_count_popular(train, customers)
general_trending_sum_popular = get_general_trending_sum_popular(train, customers)
group_trending_mean_popular = get_group_trending_mean_popular(train, customers)
group_trending_sum_popular = get_group_trending_sum_popular(train, customers)
group_count_popular = get_group_count_popular(train, customers)

In [10]:
# The pairs file holds a pickled mapping: parent article -> child article.
# NOTE(security): allow_pickle=True unpickles the file content — only load
# pairs files from a trusted source.
pairs = np.load('../input/pairs_cudf.npy', allow_pickle=True).item()

# Translate both sides of every pair to numeric article ids. Lookups use
# "0" + str(id); presumably the stored ids lost their leading zero — TODO confirm.
parent_child_nums = [
    (
        dataset.articles_id2num["0" + str(parent)],
        dataset.articles_id2num["0" + str(pairs[parent])],
    )
    for parent in pairs
]
parent_to_child = pd.DataFrame(
    {
        "article_id_parent": [parent_num for parent_num, _ in parent_child_nums],
        "article_id_child": [child_num for _, child_num in parent_child_nums],
    },
    dtype=np.uint32,
)

# Join purchases to the pairs on the purchased (parent) article, then replace
# the parent id by the child id so the result lists candidate articles.
purchases_with_child = purch_data_act.merge(
    parent_to_child.rename({"article_id_parent": "article_id"}, axis=1),
    on="article_id",
    how="inner",
)
pairs_data = (
    purchases_with_child
        .drop(["article_id"], axis=1)
        .rename({"article_id_child": "article_id",
                 "purchase_score": "pairs_parent_purchase_score"}, axis=1)
)

del pairs, parent_child_nums, parent_to_child, purchases_with_child
gc.collect()

print("Get pairs")
pairs_data.to_csv("../tmp/pairs_test.csv", index=False)

Get pairs


In [None]:
# Outer-join every candidate source onto the scored purchases, one source at
# a time, always on the (customer_id, article_id) key. A pair that is missing
# from a source gets NaN in that source's columns.
candidate_sources = (
    general_count_popular,
    general_trending_sum_popular,
    group_trending_mean_popular,
    group_trending_sum_popular,
    group_count_popular,
    pairs_data,
)
merge_data = purch_data_act
for candidate_source in candidate_sources:
    merge_data = merge_data.merge(candidate_source, on=["customer_id", "article_id"], how='outer')
del candidate_sources, candidate_source
merge_data

In [None]:
# Persist the merged candidate table for the test split; the frame index is
# not written.
merge_data.to_csv("../tmp/purch_popular_pairs_test.csv", index=False)