In [1]:
from math import sqrt
import pandas as pd
import numpy as np

from pathlib import Path
from collections import defaultdict
import pickle
import gc

from tqdm import tqdm
from pandarallel import pandarallel

from scipy.sparse import csr_matrix, coo_matrix
import implicit
import catboost

import sys
sys.path.append("..")
from src.utils import *
from src.dataset import *
from src.trending import *
from src.level_1 import get_general_count_popular

from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from catboost import Pool, CatBoostClassifier, cv
from catboost.utils import get_roc_curve, create_cd
from catboost.eval.catboost_evaluation import CatboostEvaluation

# Show up to 255 characters per column value when DataFrames are displayed.
pd.set_option('display.max_colwidth', 255)
# Register tqdm's progress-bar variants of the pandas apply methods.
tqdm.pandas()
# Start pandarallel with 8 workers and progress bars. With use_memory_fs=False it
# transfers data between processes over pipes (see the INFO lines in the output).
pandarallel.initialize(progress_bar=True, nb_workers=8, use_memory_fs=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# Отбор кандидатов 

In [2]:
# Run configuration.
SEED = 1
N = 12
# Dataset receives test_days = 7 * TEST_ON, so TEST_ON counts 7-day blocks.
TEST_ON = 1

# Parameters forwarded to the candidate generators below.
min_w1_count_for_actual_article = 20
similar_count_for_article = 5
popular_count = 50

# Dataset receives skip_days = 7 * cv_iteration.
cv_iteration = 0

dataset = Dataset(skip_days=7 * cv_iteration, test_days=7 * TEST_ON)
train, test = dataset.get_train_and_test()
articles = dataset.get_articles()
customers = dataset.get_customers()
print("Dataset created")

# --- Candidate source 1: articles each customer already purchased ------------
train = add_quotient(train=train)
purchase_dict = get_purchase_dict(df=train)

# Flatten {customer_id: {article_id: score}} into three parallel columns.
cust_list = []
art_list = []
purch_score_list = []
for cust_id, art_scores in purchase_dict.items():
    for art_id, score in art_scores.items():
        cust_list.append(cust_id)
        art_list.append(art_id)
        purch_score_list.append(int(score))

purch_data = pd.DataFrame({"customer_id": cust_list,
                           "article_id": art_list,
                           "purchase_score": purch_score_list},
                          dtype=np.uint32)

del cust_list, art_list, purch_score_list, purchase_dict
gc.collect()

print("Get purchase dict")

# --- Candidate source 2: articles similar to purchased ones ------------------
similar_article_dict = get_similar_items(
    train=train,
    articles=articles,
    customers=customers,
    min_w1_count_for_actual_article=min_w1_count_for_actual_article,
    similar_num_for_article=similar_count_for_article
)

# One row per (parent, child) pair; entries whose score (art_info[1]) is 0 are skipped.
art_parent_list = []
art_child_list = []
art_child_score = []
for art_parent, similar_items in similar_article_dict.items():
    for art_info in similar_items:
        if art_info[1] != 0:
            art_parent_list.append(art_parent)
            art_child_list.append(art_info[0])
            art_child_score.append(int(art_info[1]))

similar_data = pd.DataFrame({"article_id_parent": art_parent_list,
                             "article_id_child": art_child_list,
                             "similarity": art_child_score}, dtype=np.uint32)

# Join purchases to their similar articles: the child becomes the candidate
# article_id and the parent's purchase score is kept as a feature.
similar_purch_data = (
    purch_data.merge(similar_data.rename({"article_id_parent": "article_id"}, axis=1),
                     on="article_id", how="inner")
        .drop(["article_id"], axis=1)
        .rename({"article_id_child": "article_id",
                 "purchase_score": "similar_parent_purchase_score"}, axis=1)
)
del similar_article_dict, art_parent_list, art_child_list, art_child_score, similar_data
gc.collect()

print("Get similar articles")

# --- Candidate source 3: popular articles per age group ----------------------
group_popular_dict = get_group_popular_dict(df=train, customers=customers, N=popular_count)

# The rank is the 0-based position of the article in its age group's list.
age_list = []
art_list = []
rank_list = []
for age_group, group_articles in group_popular_dict.items():
    for i, art_id in enumerate(group_articles):
        age_list.append(age_group)
        art_list.append(art_id)
        rank_list.append(i)

popular_data = pd.DataFrame({"age_group": age_list,
                             "article_id": art_list,
                             "popular_rank": rank_list})
# Pick the smallest unsigned dtype that can hold the largest rank. This is uint8
# for lists of up to 256 articles (the current setting), and it widens
# automatically instead of silently wrapping around when popular_count is raised.
rank_dtype = np.min_scalar_type(max(rank_list, default=0))
popular_data = popular_data.astype({"article_id": np.uint32, "popular_rank": rank_dtype})

# Give every customer the popular list of their age group.
popular_cust_data = (
    customers[["customer_id", "age_group"]]
        .merge(popular_data, on="age_group", how="inner")
        .drop(["age_group"], axis=1)
)

del group_popular_dict, age_list, art_list, rank_list, rank_dtype, popular_data
gc.collect()

print("Get group popular dict")

# --- Candidate source 4: generally popular articles --------------------------
general_popular = get_general_count_popular(train, customers, popular_count)
print("Get popular dict")

# --- Candidate source 5: precomputed article pairs ---------------------------
# SECURITY: allow_pickle=True unpickles arbitrary Python objects. Only load this
# file if it was produced by a trusted pipeline.
pairs = np.load('../input/pairs_cudf.npy', allow_pickle=True).item()

# Keys and values are converted to strings with a leading "0" before the lookup in
# dataset.articles_id2num.
# NOTE(review): presumably this restores the zero-padded article_id strings - confirm.
art_parent_list = []
art_child_list = []
for art_parent, art_child in pairs.items():
    art_parent_list.append(dataset.articles_id2num["0" + str(art_parent)])
    art_child_list.append(dataset.articles_id2num["0" + str(art_child)])

pairs_data = pd.DataFrame({"article_id_parent": art_parent_list,
                           "article_id_child": art_child_list}, dtype=np.uint32)

# Same parent -> child join as for the similar articles above.
pairs_purch_data = (
    purch_data.merge(pairs_data.rename({"article_id_parent": "article_id"}, axis=1),
                     on="article_id", how="inner")
        .drop(["article_id"], axis=1)
        .rename({"article_id_child": "article_id",
                 "purchase_score": "pairs_parent_purchase_score"}, axis=1)
)

del pairs, art_parent_list, art_child_list, pairs_data
gc.collect()

print("Get pairs")

Dataset created


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3943502), Label(value='0 / 3943502…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ldbw'] = df['t_dat'].parallel_apply(lambda d: last_ts - (last_ts - d).floor('7D'))


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3943502), Label(value='0 / 3943502…

100%|██████████| 27101148/27101148 [00:19<00:00, 1409196.33it/s]


Get purchase dict


  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 105542/105542 [00:35<00:00, 2945.38it/s]


Get similar articles


100%|██████████| 5/5 [00:02<00:00,  2.09it/s]


Get group popular dict
Get popular dict
Get pairs


In [None]:
# Outer-join every candidate source on (customer_id, article_id). Each source is
# deleted as soon as it has been merged in, to release memory.
train_data = purch_data.merge(similar_purch_data, on=["customer_id", "article_id"], how="outer")
del purch_data, similar_purch_data
gc.collect()
print("1")

train_data = train_data.merge(popular_cust_data, on=["customer_id", "article_id"], how="outer")
del popular_cust_data
gc.collect()
print("1")

train_data = train_data.merge(pairs_purch_data, on=["customer_id", "article_id"], how="outer")
del pairs_purch_data
gc.collect()
print("1")

train_data = train_data.merge(general_popular, on=["customer_id", "article_id"], how="outer")
del general_popular
gc.collect()
print("1")

# A feature is missing when the pair did not come from that source; mark it with
# the sentinel -9999 so every column can be stored as int32.
train_data = train_data.fillna(-9999).astype(np.int32)

# Label frame: every (customer, article) pair present in the test split is a positive.
# .assign() returns a new frame, so the column is never written into a slice of
# `test` (the previous `test_data["target"] = 1` was a chained assignment that
# raises SettingWithCopyWarning).
test_data = test[["customer_id", "article_id"]].assign(target=1)

# Candidates that do not appear in the test split get target 0.
# NOTE(review): pairs repeated in `test` multiply the matching candidate rows in
# this left join - confirm whether that is intended.
train_data = (
    train_data.merge(test_data, on=["customer_id", "article_id"], how="left")
        .fillna({"target": 0.0})
)

print("Train data is done")

1
1
1


## Сохранение и загрузка 

## Подготовка данных 

In [4]:
# Mean of the target column, and number of positive rows divided by the number of test rows.
(train_data["target"].mean(), train_data["target"].sum() / len(test))

(0.00039770682368718216, 0.3222490855599619)

In [5]:
# Mean of the target column, and number of positive rows divided by the number of test rows.
(train_data["target"].mean(), train_data["target"].sum() / len(test))

(0.0003625857974308287, 0.4111547120190087)

In [5]:
# Mean of the target column, and number of positive rows divided by the number of test rows.
(train_data["target"].mean(), train_data["target"].sum() / len(test))

(0.0003909527814606927, 0.4010927506439572)

In [4]:
# Mean of the target column, and number of positive rows divided by the number of test rows.
(train_data["target"].mean(), train_data["target"].sum() / len(test))

(0.00035390453865012417, 0.4122408046240081)

In [5]:
# Mean of the target column, and number of positive rows divided by the number of test rows.
(train_data["target"].mean(), train_data["target"].sum() / len(test))

(0.0003909609031228094, 0.40110107319265453)

In [4]:
# Mean of the target column, and number of positive rows divided by the number of test rows.
(train_data["target"].mean(), train_data["target"].sum() / len(test))

(0.0003578507187709223, 0.43945970013857044)

In [6]:
# (rows, columns) of train_data.
train_data.shape

(272501021, 8)

In [6]:
# (rows, columns) of train_data.
train_data.shape

(246543840, 8)

In [5]:
# (rows, columns) of train_data.
train_data.shape

(279922943, 8)

In [6]:
# (rows, columns) of train_data.
train_data.shape

(246543834, 8)

In [5]:
# (rows, columns) of train_data.
train_data.shape

(295114679, 10)

In [6]:
# Downsample the negatives: keep a random subset whose size is one tenth of the
# total row count, and keep every positive row.
# random_state=SEED makes the subset (and the CSV written from it) reproducible;
# without it every run draws a different sample.
# NOTE: this overwrites train_data, so re-running the cell samples again from the
# already reduced frame.
train_neg = train_data[train_data["target"] == 0].sample(
    train_data.shape[0] // 10, random_state=SEED
)
train_pos = train_data[train_data["target"] == 1]
train_data = pd.concat([train_neg, train_pos])
train_data

Unnamed: 0,customer_id,article_id,purchase_score,similar_parent_purchase_score,similarity,popular_rank,pairs_parent_purchase_score,general_popular_count,general_popular_count_rank,target
21796082,957784,50151,0,-9999,-9999,-9999,-9999,-9999,-9999,0.0
2556321,111997,12790,0,-9999,-9999,-9999,-9999,-9999,-9999,0.0
200500968,307891,9773,-9999,-9999,-9999,44,-9999,-9999,-9999,0.0
213526450,1316443,67625,-9999,-9999,-9999,37,-9999,-9999,-9999,0.0
159179183,848934,67736,-9999,0,14,-9999,-9999,-9999,-9999,0.0
...,...,...,...,...,...,...,...,...,...,...
295104445,1371691,101192,-9999,-9999,-9999,-9999,-9999,371,39,1.0
295104446,1371691,101192,-9999,-9999,-9999,-9999,-9999,371,39,1.0
295105343,1371717,104072,-9999,-9999,-9999,-9999,-9999,819,2,1.0
295105348,1371717,103793,-9999,-9999,-9999,-9999,-9999,519,14,1.0


In [7]:
# Write the downsampled, labelled candidates to CSV without the index column.
train_data.to_csv("../input/train_data.csv", index=False)
# To resume from the saved file instead of rebuilding the candidates, uncomment:
# train_data = pd.read_csv("../input/train_data.csv", index_col=False)

# Предсказание с переранжированием 

In [2]:
# Run configuration.
SEED = 1
N = 12
# Dataset receives test_days = 7 * TEST_ON, which is 0 here.
# NOTE(review): presumably this means no hold-out split for the final prediction - confirm.
TEST_ON = 0

# Parameters forwarded to the candidate generators below.
min_w1_count_for_actual_article = 20
similar_count_for_article = 5
popular_count = 50

# Dataset receives skip_days = 7 * cv_iteration.
cv_iteration = 0

dataset = Dataset(skip_days=7 * cv_iteration, test_days=7 * TEST_ON)
train, test = dataset.get_train_and_test()
articles = dataset.get_articles()
customers = dataset.get_customers()
print("Dataset created")

# --- Candidate source 1: articles each customer already purchased ------------
train = add_quotient(train=train)
purchase_dict = get_purchase_dict(df=train)

# Flatten {customer_id: {article_id: score}} into three parallel columns.
cust_list = []
art_list = []
purch_score_list = []
for cust_id, art_scores in purchase_dict.items():
    for art_id, score in art_scores.items():
        cust_list.append(cust_id)
        art_list.append(art_id)
        purch_score_list.append(int(score))

purch_data = pd.DataFrame({"customer_id": cust_list,
                           "article_id": art_list,
                           "purchase_score": purch_score_list},
                          dtype=np.uint32)

del cust_list, art_list, purch_score_list, purchase_dict
gc.collect()

print("Get purchase dict")

# --- Candidate source 2: articles similar to purchased ones ------------------
similar_article_dict = get_similar_items(
    train=train,
    articles=articles,
    customers=customers,
    min_w1_count_for_actual_article=min_w1_count_for_actual_article,
    similar_num_for_article=similar_count_for_article
)

# One row per (parent, child) pair; entries whose score (art_info[1]) is 0 are skipped.
art_parent_list = []
art_child_list = []
art_child_score = []
for art_parent, similar_items in similar_article_dict.items():
    for art_info in similar_items:
        if art_info[1] != 0:
            art_parent_list.append(art_parent)
            art_child_list.append(art_info[0])
            art_child_score.append(int(art_info[1]))

similar_data = pd.DataFrame({"article_id_parent": art_parent_list,
                             "article_id_child": art_child_list,
                             "similarity": art_child_score}, dtype=np.uint32)

# Join purchases to their similar articles: the child becomes the candidate
# article_id and the parent's purchase score is kept as a feature.
similar_purch_data = (
    purch_data.merge(similar_data.rename({"article_id_parent": "article_id"}, axis=1),
                     on="article_id", how="inner")
        .drop(["article_id"], axis=1)
        .rename({"article_id_child": "article_id",
                 "purchase_score": "similar_parent_purchase_score"}, axis=1)
)
del similar_article_dict, art_parent_list, art_child_list, art_child_score, similar_data
gc.collect()

print("Get similar articles")

# --- Candidate source 3: popular articles per age group ----------------------
group_popular_dict = get_group_popular_dict(df=train, customers=customers, N=popular_count)

# The rank is the 0-based position of the article in its age group's list.
age_list = []
art_list = []
rank_list = []
for age_group, group_articles in group_popular_dict.items():
    for i, art_id in enumerate(group_articles):
        age_list.append(age_group)
        art_list.append(art_id)
        rank_list.append(i)

popular_data = pd.DataFrame({"age_group": age_list,
                             "article_id": art_list,
                             "popular_rank": rank_list})
# Pick the smallest unsigned dtype that can hold the largest rank. This is uint8
# for lists of up to 256 articles (the current setting), and it widens
# automatically instead of silently wrapping around when popular_count is raised.
rank_dtype = np.min_scalar_type(max(rank_list, default=0))
popular_data = popular_data.astype({"article_id": np.uint32, "popular_rank": rank_dtype})

# Give every customer the popular list of their age group.
popular_cust_data = (
    customers[["customer_id", "age_group"]]
        .merge(popular_data, on="age_group", how="inner")
        .drop(["age_group"], axis=1)
)

del group_popular_dict, age_list, art_list, rank_list, rank_dtype, popular_data
gc.collect()

print("Get group popular dict")

# --- Candidate source 4: generally popular articles --------------------------
general_popular = get_general_count_popular(train, customers, popular_count)
print("Get popular dict")

# --- Candidate source 5: precomputed article pairs ---------------------------
# SECURITY: allow_pickle=True unpickles arbitrary Python objects. Only load this
# file if it was produced by a trusted pipeline.
pairs = np.load('../input/pairs_cudf.npy', allow_pickle=True).item()

# Keys and values are converted to strings with a leading "0" before the lookup in
# dataset.articles_id2num.
# NOTE(review): presumably this restores the zero-padded article_id strings - confirm.
art_parent_list = []
art_child_list = []
for art_parent, art_child in pairs.items():
    art_parent_list.append(dataset.articles_id2num["0" + str(art_parent)])
    art_child_list.append(dataset.articles_id2num["0" + str(art_child)])

pairs_data = pd.DataFrame({"article_id_parent": art_parent_list,
                           "article_id_child": art_child_list}, dtype=np.uint32)

# Same parent -> child join as for the similar articles above.
pairs_purch_data = (
    purch_data.merge(pairs_data.rename({"article_id_parent": "article_id"}, axis=1),
                     on="article_id", how="inner")
        .drop(["article_id"], axis=1)
        .rename({"article_id_child": "article_id",
                 "purchase_score": "pairs_parent_purchase_score"}, axis=1)
)

del pairs, art_parent_list, art_child_list, pairs_data
gc.collect()

print("Get pairs")

Dataset created


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3973541), Label(value='0 / 3973541…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ldbw'] = df['t_dat'].parallel_apply(lambda d: last_ts - (last_ts - d).floor('7D'))


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3973541), Label(value='0 / 3973541…

100%|██████████| 27306439/27306439 [00:17<00:00, 1557306.83it/s]


Get purchase dict


  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 105542/105542 [01:00<00:00, 1753.60it/s]


Get similar articles


100%|██████████| 5/5 [00:06<00:00,  1.31s/it]


Get group popular dict
Get popular dict
Get pairs


In [3]:
# The candidate frames are already built, so drop the raw inputs before the merges below.
del dataset
del articles, customers
del train, test
gc.collect()

0

In [4]:
# Outer-join all candidate sources on (customer_id, article_id), deleting each
# source right after it has been merged in.
train_data = pd.merge(
    purch_data, similar_purch_data, how="outer", on=["customer_id", "article_id"]
)
del purch_data, similar_purch_data
gc.collect()
print("1")

train_data = pd.merge(
    train_data, popular_cust_data, how="outer", on=["customer_id", "article_id"]
)
del popular_cust_data
gc.collect()
print("1")

train_data = pd.merge(
    train_data, pairs_purch_data, how="outer", on=["customer_id", "article_id"]
)
del pairs_purch_data
gc.collect()
print("1")

train_data = pd.merge(
    train_data, general_popular, how="outer", on=["customer_id", "article_id"]
)
del general_popular
gc.collect()
print("1")

# Features missing for a pair become the sentinel -9999; every column is stored as int32.
train_data = train_data.fillna(value=-9999).astype("int32")

1
1
1
1


In [6]:
# Display the merged candidate frame (pandas truncates the rendered rows).
train_data

Unnamed: 0,customer_id,article_id,purchase_score,similar_parent_purchase_score,similarity,popular_rank,pairs_parent_purchase_score,general_popular_count,general_popular_count_rank
0,0,99,0,-9999,-9999,-9999,-9999,-9999,-9999
1,0,16003,294,294,204,9,13753,-9999,-9999
2,0,16003,294,294,204,9,0,-9999,-9999
3,0,16023,13753,13753,116,-9999,-9999,-9999,-9999
4,0,16023,13753,0,69,-9999,-9999,-9999,-9999
...,...,...,...,...,...,...,...,...,...
299276561,1371979,77254,-9999,-9999,-9999,-9999,-9999,327,44
299276562,1371979,99023,-9999,-9999,-9999,-9999,-9999,326,45
299276563,1371979,102762,-9999,-9999,-9999,-9999,-9999,320,47
299276564,1371979,97255,-9999,-9999,-9999,-9999,-9999,317,48


In [7]:
# Write the candidate features to CSV without the index column. No target column
# is added to train_data in this section.
train_data.to_csv("../input/test_data.csv", index=False)