In [1]:
from math import sqrt
import pandas as pd
import numpy as np

from pathlib import Path
from collections import defaultdict
import pickle
import gc

from tqdm import tqdm
from pandarallel import pandarallel

from scipy.sparse import csr_matrix, coo_matrix
import implicit
import catboost

import sys
sys.path.append("..")
from src.utils import *
from src.dataset import *
from src.trending import *
from src.level_1 import *

from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from catboost import Pool, CatBoostClassifier, cv
from catboost.utils import get_roc_curve, create_cd
from catboost.eval.catboost_evaluation import CatboostEvaluation

pd.set_option('display.max_colwidth', 255)
tqdm.pandas()
pandarallel.initialize(progress_bar=False, nb_workers=8, use_memory_fs=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# TRAIN

In [2]:
SEED = 1
N = 12
TEST_ON = 1

cv_iteration = 0

dataset = Dataset(skip_days=7 * cv_iteration, test_days=7 * TEST_ON)
train, test = dataset.get_train_and_test()
articles = dataset.get_articles()
customers = dataset.get_customers()

train = add_quotient(train=train)
print("Dataset created")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ldbw'] = df['t_dat'].parallel_apply(lambda d: last_ts - (last_ts - d).floor('7D'))


Dataset created


In [3]:
actual_articles = get_actual_articles(train, min_count=10, days=7)
len(actual_articles)

4920

## Purchase 

In [4]:
purch_data = get_purchase_data(train)
purch_data.to_csv("../tmp/purchase_data_train.csv", index=False)

purch_actual_data = purch_data[purch_data["article_id"].isin(actual_articles)]
purch_data.to_csv("../tmp/purchase_data_actual_train.csv", index=False)

train.drop(["count", "count_targ", "quotient", "ldbw"], axis=1, inplace=True)
gc.collect()

100%|██████████| 27101148/27101148 [00:24<00:00, 1094874.67it/s]


Unnamed: 0,customer_id,article_id,purchase_score
0,0,99,0
1,0,16003,330
2,0,16023,28864
3,0,23996,0
4,0,29516,0
...,...,...,...
27101143,1371978,85872,25
27101144,1371978,89864,5
27101145,1371978,95506,110
27101146,1371978,99149,311


## Popular 

In [7]:
popular_count = 75

general_count_popular = get_general_count_popular(train, customers, N=popular_count)

general_trending_sum_popular = get_general_trending_sum_popular(train, customers, N=popular_count)
general_trending_sum_popular = (
    general_trending_sum_popular[general_trending_sum_popular["article_id"].isin(actual_articles)]
)
popular_data = general_count_popular.merge(general_trending_sum_popular, 
                                           on=["customer_id", "article_id"], how='outer')
del general_count_popular, general_trending_sum_popular
print(gc.collect())

group_trending_mean_popular = get_group_trending_mean_popular(train, customers, N=popular_count)
group_trending_mean_popular = (
    group_trending_mean_popular[group_trending_mean_popular["article_id"].isin(actual_articles)]
)
popular_data = popular_data.merge(group_trending_mean_popular, 
                                  on=["customer_id", "article_id"], how='outer')
del group_trending_mean_popular
print(gc.collect())

group_trending_sum_popular = get_group_trending_sum_popular(train, customers, N=popular_count)
group_trending_sum_popular = (
    group_trending_sum_popular[group_trending_sum_popular["article_id"].isin(actual_articles)]
)
popular_data = popular_data.merge(group_trending_sum_popular, 
                                  on=["customer_id", "article_id"], how='outer')
del group_trending_sum_popular
print(gc.collect())

group_count_popular = get_group_count_popular(train, customers, N=popular_count)
popular_data = popular_data.merge(group_count_popular, 
                                  on=["customer_id", "article_id"], how='outer')
del group_count_popular
print(gc.collect())

popular_data = popular_data.fillna(-1).astype(np.int32)
popular_data.to_csv("../tmp/popular_data_train.csv", index=False)
popular_data

0
0
0
0


## Similar 

In [13]:
min_w1_count_for_actual_article = 50
similar_count_for_article = 5

similar_purch_data = get_similar_data(purch_data, train, articles, customers, 
                                      min_w1_count_for_actual_article=min_w1_count_for_actual_article, 
                                      similar_count_for_article=similar_count_for_article)
similar_purch_data.to_csv("../tmp/similar_purch_data_train.csv", index=False)
similar_purch_data

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 105542/105542 [00:30<00:00, 3465.30it/s]


Unnamed: 0,customer_id,similar_parent_purchase_score,article_id,als_similarity
0,0,0,58588,84
1,0,0,57129,80
2,0,0,75482,60
3,0,0,32723,48
4,0,0,71465,43
...,...,...,...,...
135505735,1371893,0,78964,40
135505736,1371893,0,13042,34
135505737,1371893,0,97541,34
135505738,1371893,0,9,15


### Merge 

In [None]:
purch_actual_data = pd.read_csv("../tmp/purchase_data_actual_train.csv", index_col=False)
popular_data = pd.read_csv("../tmp/popular_data_train.csv", index_col=False)
similar_purch_data = pd.read_csv("../tmp/similar_purch_data_train.csv", index_col=False)

In [27]:
i = 0
batch_size = 100000
TEST = 0

customer_list = customers["customer_id"].to_list()
test["target"] = 1

while batch_size * i <= customers.shape[0]:
    customer_cur = customer_list[batch_size * i: batch_size * (i + 1)]
    
    purch_actual_data_cur = purch_actual_data[purch_actual_data["customer_id"].isin(customer_cur)]
    popular_data_cur = popular_data[popular_data["customer_id"].isin(customer_cur)]
    similar_purch_data_cur = similar_purch_data[similar_purch_data["customer_id"].isin(customer_cur)]
    
    train_data_cur = (
        purch_actual_data_cur
            .merge(popular_data_cur, on=["customer_id", "article_id"], how='outer')
            .merge(similar_purch_data_cur, on=["customer_id", "article_id"], how='outer')
    )
    train_data_cur = train_data_cur.fillna(-1).astype(np.int32)
    
    if not TEST:
        train_data_cur = (
            train_data_cur.merge(test[["customer_id", "article_id", "target"]], 
                                 on=["customer_id", "article_id"], how="left")
                .fillna({"target": 0.0})
        )

        train_neg = train_data_cur[train_data_cur["target"] == 0].sample(train_data_cur.shape[0] // 10)
        train_pos = train_data_cur[train_data_cur["target"] == 1]
        train_data_cur = pd.concat([train_neg, train_pos])

    del purch_actual_data_cur, popular_data_cur, similar_purch_data_cur, train_neg, train_pos
    gc.collect()
    
    train_data_cur.to_csv(f"../tmp/train_data_p{i}.csv", index=False)
    
    i += 1
    print(i)

1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [29]:
i = 0
batch_size = 100000

train_data = None
while batch_size * i <= customers.shape[0]:
    train_data_cur = pd.read_csv(f"../tmp/train_data_p{i}.csv", index_col=False, dtype=np.int32)
    train_data = pd.concat([train_data, train_data_cur]) if train_data is not None else train_data_cur
    print(train_data.shape)
    
    i += 1
    print(i)

(2484619, 16)
1
(4964037, 16)
2
(7446233, 16)
3
(9918622, 16)
4
(12399177, 16)
5
(14879595, 16)
6
(17360117, 16)
7
(19844334, 16)
8
(22324132, 16)
9
(24796103, 16)
10
(27275074, 16)
11
(29746812, 16)
12
(32223705, 16)
13
(34006840, 16)
14


In [31]:
train_data.target.mean(), train_data.target.sum() / test.shape[0]

(0.003734954497389349, 0.5285401001202609)

In [32]:
train_data.to_csv(f"../tmp/train_data.csv", index=False)

# TEST 

In [3]:
SEED = 1
N = 12
TEST_ON = 0

cv_iteration = 0

dataset = Dataset(skip_days=7 * cv_iteration, test_days=7 * TEST_ON)
train, test = dataset.get_train_and_test()
articles = dataset.get_articles()
customers = dataset.get_customers()

train = add_quotient(train=train)
print("Dataset created")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ldbw'] = df['t_dat'].parallel_apply(lambda d: last_ts - (last_ts - d).floor('7D'))


Dataset created


In [4]:
actual_articles = get_actual_articles(train, min_count=10, days=7)
len(actual_articles)

4664

In [5]:
purch_data = get_purchase_data(train)
purch_data.to_csv("../tmp/purchase_data_test.csv", index=False)

purch_actual_data = purch_data[purch_data["article_id"].isin(actual_articles)]
purch_data.to_csv("../tmp/purchase_data_actual_test.csv", index=False)

train.drop(["count", "count_targ", "quotient", "ldbw"], axis=1, inplace=True)
gc.collect()

100%|██████████| 27306439/27306439 [00:38<00:00, 701860.45it/s]


0

In [6]:
popular_count = 75

general_count_popular = get_general_count_popular(train, customers, N=popular_count)

general_trending_sum_popular = get_general_trending_sum_popular(train, customers, N=popular_count)
general_trending_sum_popular = (
    general_trending_sum_popular[general_trending_sum_popular["article_id"].isin(actual_articles)]
)
popular_data = general_count_popular.merge(general_trending_sum_popular, 
                                           on=["customer_id", "article_id"], how='outer')
del general_count_popular, general_trending_sum_popular
print(gc.collect())

group_trending_mean_popular = get_group_trending_mean_popular(train, customers, N=popular_count)
group_trending_mean_popular = (
    group_trending_mean_popular[group_trending_mean_popular["article_id"].isin(actual_articles)]
)
popular_data = popular_data.merge(group_trending_mean_popular, 
                                  on=["customer_id", "article_id"], how='outer')
del group_trending_mean_popular
print(gc.collect())

group_trending_sum_popular = get_group_trending_sum_popular(train, customers, N=popular_count)
group_trending_sum_popular = (
    group_trending_sum_popular[group_trending_sum_popular["article_id"].isin(actual_articles)]
)
popular_data = popular_data.merge(group_trending_sum_popular, 
                                  on=["customer_id", "article_id"], how='outer')
del group_trending_sum_popular
print(gc.collect())

group_count_popular = get_group_count_popular(train, customers, N=popular_count)
popular_data = popular_data.merge(group_count_popular, 
                                  on=["customer_id", "article_id"], how='outer')
del group_count_popular
print(gc.collect())

popular_data = popular_data.fillna(-1).astype(np.int32)
popular_data.to_csv("../tmp/popular_data_test.csv", index=False)
popular_data

0
0
0
0


Unnamed: 0,customer_id,article_id,general_popular_count,general_popular_count_rank,general_popular_trending_sum,general_popular_trending_sum_rank,group_popular_trending_mean,group_popular_trending_mean_rank,group_popular_trending_sum,group_popular_trending_sum_rank,group_popular_count,group_popular_count_rank
0,0,104553,970,0,117634326,0,-1,-1,16855136,3,149,0
1,0,104554,706,1,93317706,5,-1,-1,16065272,4,133,1
2,0,104527,705,2,84838591,6,-1,-1,21844367,1,129,3
3,0,104072,683,3,84039710,7,-1,-1,12295919,18,99,10
4,0,103108,619,4,64732972,17,-1,-1,-1,-1,62,43
...,...,...,...,...,...,...,...,...,...,...,...,...
205041544,1371979,102746,-1,-1,-1,-1,-1,-1,-1,-1,34,67
205041545,1371979,104147,-1,-1,-1,-1,-1,-1,-1,-1,34,68
205041546,1371979,104839,-1,-1,-1,-1,-1,-1,-1,-1,34,69
205041547,1371979,105306,-1,-1,-1,-1,-1,-1,-1,-1,34,70


In [7]:
min_w1_count_for_actual_article = 50
similar_count_for_article = 5

similar_purch_data = get_similar_data(purch_data, train, articles, customers, 
                                      min_w1_count_for_actual_article=min_w1_count_for_actual_article, 
                                      similar_count_for_article=similar_count_for_article)
similar_purch_data.to_csv("../tmp/similar_purch_data_test.csv", index=False)
similar_purch_data

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 105542/105542 [00:12<00:00, 8132.88it/s]


Unnamed: 0,customer_id,similar_parent_purchase_score,article_id,als_similarity
0,0,0,57129,82
1,0,0,58588,59
2,0,0,75482,40
3,0,0,71465,39
4,0,0,57133,32
...,...,...,...,...
136532190,1371893,0,78964,25
136532191,1371893,0,75,24
136532192,1371893,0,13042,11
136532193,1371893,0,67910,10


In [4]:
purch_actual_data = pd.read_csv("../tmp/purchase_data_actual_test.csv", index_col=False, dtype=np.int32)
popular_data = pd.read_csv("../tmp/popular_data_test.csv", index_col=False, dtype=np.int32)
similar_purch_data = pd.read_csv("../tmp/similar_purch_data_test.csv", index_col=False, dtype=np.int32)

In [5]:
i = 0
batch_size = 100000
TEST = 1

customer_list = customers["customer_id"].to_list()
test["target"] = 1

while batch_size * i <= customers.shape[0]:
    customer_cur = customer_list[batch_size * i: batch_size * (i + 1)]
    
    purch_actual_data_cur = purch_actual_data[purch_actual_data["customer_id"].isin(customer_cur)]
    popular_data_cur = popular_data[popular_data["customer_id"].isin(customer_cur)]
    similar_purch_data_cur = similar_purch_data[similar_purch_data["customer_id"].isin(customer_cur)]
    
    train_data_cur = (
        purch_actual_data_cur
            .merge(popular_data_cur, on=["customer_id", "article_id"], how='outer')
            .merge(similar_purch_data_cur, on=["customer_id", "article_id"], how='outer')
    )
    train_data_cur = train_data_cur.fillna(-1).astype(np.int32)
    
    if not TEST:
        train_data_cur = (
            train_data_cur.merge(test[["customer_id", "article_id", "target"]], 
                                 on=["customer_id", "article_id"], how="left")
                .fillna({"target": 0.0})
        )

        train_neg = train_data_cur[train_data_cur["target"] == 0].sample(train_data_cur.shape[0] // 10)
        train_pos = train_data_cur[train_data_cur["target"] == 1]
        train_data_cur = pd.concat([train_neg, train_pos])
        del train_neg, train_pos

    del purch_actual_data_cur, popular_data_cur, similar_purch_data_cur
    gc.collect()
    
    train_data_cur.to_csv(f"../tmp/test_data_p{i}.csv", index=False)
    
    i += 1
    print(i)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
