In [1]:
from math import sqrt
import pandas as pd
import numpy as np

from pathlib import Path
from collections import defaultdict
import pickle
import gc

from tqdm import tqdm
from pandarallel import pandarallel

from scipy.sparse import csr_matrix, coo_matrix
import implicit
import catboost

import sys
sys.path.append("..")
from src.utils import *
from src.dataset import *
from src.trending import *

from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from catboost import Pool, CatBoostClassifier, cv
from catboost.utils import get_roc_curve, create_cd
from catboost.eval.catboost_evaluation import CatboostEvaluation

pd.set_option('display.max_colwidth', 255)
tqdm.pandas()
pandarallel.initialize(progress_bar=True, nb_workers=8, use_memory_fs=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# TRAIN 

In [2]:
SEED = 1
N = 12
TEST_ON = 1

cv_iteration = 0

dataset = Dataset(skip_days=7 * cv_iteration, test_days=7 * TEST_ON)
train, test = dataset.get_train_and_test()
articles = dataset.get_articles()
customers = dataset.get_customers()
print("Dataset created")

Dataset created


## Similars 

In [3]:
min_w1_count_for_actual_article=10
similar_count_for_article = 5

purch_data = pd.read_csv("../tmp/purchase_data_train.csv", index_col=False, dtype=np.uint32)

similar_article_dict = get_similar_items(
    train=train, 
    articles=articles, 
    customers=customers,
    min_w1_count_for_actual_article = min_w1_count_for_actual_article, 
    similar_num_for_article = similar_count_for_article
)

art_parent_list = []
art_child_list = []
art_child_score = []
for art_parent in similar_article_dict:
    for art_info in similar_article_dict[art_parent]:
        if art_info[1] != 0:
            art_parent_list.append(art_parent)
            art_child_list.append(art_info[0])
            art_child_score.append(int(art_info[1]))
        
similar_data = pd.DataFrame({"article_id_parent": art_parent_list, 
                             "article_id_child": art_child_list, 
                             "als_similarity": art_child_score}, dtype=np.uint32)

similar_purch_data = (
    purch_data.merge(similar_data.rename({"article_id_parent": "article_id"}, axis=1), 
                     on="article_id", how="inner")
        .drop(["article_id"], axis=1)
        .rename({"article_id_child": "article_id", 
                 "purchase_score": "similar_parent_purchase_score"}, axis=1)
)
del similar_article_dict, art_parent_list, art_child_list, art_child_score, similar_data
gc.collect()

print("Get similar articles")

similar_purch_data.to_csv("../tmp/als_similarity_train.csv", index=False)

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 105542/105542 [00:40<00:00, 2586.64it/s]


Get similar articles


In [6]:
similar_purch_data[similar_purch_data.similar_parent_purchase_score != 0]

Unnamed: 0,customer_id,similar_parent_purchase_score,article_id,als_similarity
10,3031,6,1300,16
11,3031,6,2912,16
12,3031,6,2060,14
13,3031,6,1309,13
14,3031,6,61076,8
...,...,...,...,...
135505680,1369176,78899,103269,14
135505681,1369176,78899,78528,11
135505682,1369176,78899,100477,7
135505683,1369176,78899,99771,5


## User, item factors 

In [5]:
factors = 20
iterations = 400
regularization = 0.01
random_state = 1

dm = ImplicitDatasetMaker(articles, customers)
train_csr = dm.get_coo_matrix(train).tocsr()

als = implicit.als.AlternatingLeastSquares(
    factors=factors, 
    iterations=iterations, 
    regularization=regularization,
    use_gpu=True,
    num_threads=16,
    random_state=random_state
)

als.fit(train_csr, show_progress=True)

  0%|          | 0/400 [00:00<?, ?it/s]

In [47]:
art_list = []
factors_list = []
for article_id, article_num in tqdm(dm.articles_id2num.items()):
    article_factors = als.item_factors[article_num].to_numpy()
    article_factors = (
        (article_factors - article_factors.min()) / 
        (article_factors.max() - article_factors.min())
    )
    article_factors *= 255
    article_factors = article_factors.astype(np.uint8)
    art_list.append(article_id)
    factors_list.append(article_factors[0])
    
article_factors = pd.DataFrame({"article_id": art_list, 
                                "als_article_features": factors_list})

article_factors.to_csv("../tmp/article_factors_train.csv", index=False)

  (article_factors - article_factors.min()) /
100%|██████████| 105542/105542 [00:01<00:00, 83467.42it/s]


In [51]:
cust_list = []
factors_list = []
for customer_id, customer_num in tqdm(dm.customers_id2num.items()):
    customer_factors = als.user_factors[customer_num].to_numpy()
    customer_factors = (
        (customer_factors - customer_factors.min()) / 
        (customer_factors.max() - customer_factors.min())
    )
    customer_factors *= 255
    customer_factors = customer_factors.astype(np.uint8)
    cust_list.append(customer_id)
    factors_list.append(customer_factors[0])
    
customer_factors = pd.DataFrame({"customer_id": cust_list, 
                                "als_customer_features": factors_list})

customer_factors.to_csv("../tmp/customer_factors_train.csv", index=False)

  (customer_factors - customer_factors.min()) /
100%|██████████| 1371980/1371980 [00:51<00:00, 26673.29it/s]


# TEST 

In [None]:
SEED = 1
N = 20
TEST_ON = 0

cv_iteration = 0

dataset = Dataset(skip_days=7 * cv_iteration, test_days=7 * TEST_ON)
train, test = dataset.get_train_and_test()
articles = dataset.get_articles()
customers = dataset.get_customers()
print("Dataset created")

In [None]:
min_w1_count_for_actual_article=10
similar_count_for_article = 3

purch_data = pd.read_csv("../tmp/purchase_data_test.csv", index_col=False, dtype=np.uint32)

similar_article_dict = get_similar_items(
    train=train, 
    articles=articles, 
    customers=customers,
    min_w1_count_for_actual_article = min_w1_count_for_actual_article, 
    similar_num_for_article = similar_count_for_article
)

art_parent_list = []
art_child_list = []
art_child_score = []
for art_parent in similar_article_dict:
    for art_info in similar_article_dict[art_parent]:
        if art_info[1] != 0:
            art_parent_list.append(art_parent)
            art_child_list.append(art_info[0])
            art_child_score.append(int(art_info[1]))
        
similar_data = pd.DataFrame({"article_id_parent": art_parent_list, 
                             "article_id_child": art_child_list, 
                             "als_similarity": art_child_score}, dtype=np.uint32)

similar_purch_data = (
    purch_data.merge(similar_data.rename({"article_id_parent": "article_id"}, axis=1), 
                     on="article_id", how="inner")
        .drop(["article_id"], axis=1)
        .rename({"article_id_child": "article_id", 
                 "purchase_score": "similar_parent_purchase_score"}, axis=1)
)
del similar_article_dict, art_parent_list, art_child_list, art_child_score, similar_data
gc.collect()

print("Get similar articles")

similar_purch_data.to_csv("../tmp/als_similarity_test.csv", index=False)

In [57]:
factors = 20
iterations = 400
regularization = 0.01
random_state = 1

dm = ImplicitDatasetMaker(articles, customers)
train_csr = dm.get_coo_matrix(train).tocsr()

als = implicit.als.AlternatingLeastSquares(
    factors=factors, 
    iterations=iterations, 
    regularization=regularization,
    use_gpu=True,
    num_threads=16,
    random_state=random_state
)

als.fit(train_csr, show_progress=True)

  0%|          | 0/400 [00:00<?, ?it/s]

In [58]:
art_list = []
factors_list = []
for article_id, article_num in tqdm(dm.articles_id2num.items()):
    article_factors = als.item_factors[article_num].to_numpy()
    article_factors = (
        (article_factors - article_factors.min()) / 
        (article_factors.max() - article_factors.min())
    )
    article_factors *= 255
    article_factors = article_factors.astype(np.uint8)
    art_list.append(article_id)
    factors_list.append(article_factors[0])
    
article_factors = pd.DataFrame({"article_id": art_list, 
                                "als_article_features": factors_list})
article_factors.to_csv("../tmp/article_factors_test.csv", index=False)

  (article_factors - article_factors.min()) /
100%|██████████| 105542/105542 [00:04<00:00, 26013.25it/s]


In [59]:
cust_list = []
factors_list = []
for customer_id, customer_num in tqdm(dm.customers_id2num.items()):
    customer_factors = als.user_factors[customer_num].to_numpy()
    customer_factors = (
        (customer_factors - customer_factors.min()) / 
        (customer_factors.max() - customer_factors.min())
    )
    customer_factors *= 255
    customer_factors = customer_factors.astype(np.uint8)
    cust_list.append(customer_id)
    factors_list.append(customer_factors[0])
    
customer_factors = pd.DataFrame({"customer_id": cust_list, 
                                "als_customer_features": factors_list})
customer_factors.to_csv("../tmp/customer_factors_test.csv", index=False)

  (customer_factors - customer_factors.min()) /
100%|██████████| 1371980/1371980 [00:15<00:00, 86597.04it/s]
