In [1]:
import pandas as pd
import numpy as np
from math import sqrt
from pathlib import Path
from tqdm import tqdm
from pandarallel import pandarallel

from collections import defaultdict
from scipy.sparse import csr_matrix, coo_matrix
import implicit
from implicit.evaluation import mean_average_precision_at_k

In [2]:
# Enable tqdm's pandas integration (progress_apply and friends).
tqdm.pandas()
# Start the pandarallel worker pool that backs every parallel_apply call in
# this notebook; 8 workers, with a progress bar per call.
pandarallel.initialize(nb_workers=8, progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Замена старых элементов на новые 

In [3]:
# article_id is read as a string so that its leading zero survives parsing.
articles = pd.read_csv("../input/articles_proc.csv", dtype={"article_id": str})
customers = pd.read_csv("../input/customers_proc.csv")

# Full transaction log; t_dat is parsed to datetime for the date arithmetic
# performed later in the notebook.
transactions_full = pd.read_csv(
    "../input/transactions_full.csv",
    parse_dates=["t_dat"],
    dtype={"article_id": str},
)

In [43]:
# Submission template (presumably one row per customer to predict for).
sub = pd.read_csv('../input/sample_submission.csv')
# The .npy file wraps a pickled Python object; .item() unwraps the 0-d array.
# NOTE(review): allow_pickle=True will execute whatever the pickle contains,
# so this file must only ever come from a trusted source.
pairs = np.load('../input/pairs_cudf.npy', allow_pickle=True).item()

In [4]:
class DatasetMaker:
    """Translate raw customer/article IDs to matrix indices and slice transactions by date.

    Parameters
    ----------
    articles : DataFrame with an ``article_id`` column.
    customers : DataFrame with a ``customer_id`` column.

    Attributes
    ----------
    articles_num2id / articles_id2num : dict
        Position <-> article ID, positions assigned in order of first appearance.
    customers_num2id / customers_id2num : dict
        Same mapping for customers.
    data_shape : tuple
        ``(number of customer rows, number of article rows)`` used as the
        interaction-matrix shape.
    """

    def __init__(self, 
                 articles, 
                 customers):        
        self.articles_num2id = dict(enumerate(articles["article_id"].unique()))
        self.articles_id2num = {id_: num for num, id_  in self.articles_num2id.items()}

        self.customers_num2id = dict(enumerate(customers["customer_id"].unique()))
        self.customers_id2num = {id_: num for num, id_ in self.customers_num2id.items()}
        
        self.data_shape = (customers.shape[0], articles.shape[0])
    
    def get_coo_matrix(self, data):
        """Build a customers x articles COO matrix with a 1 for every row of ``data``.

        ``data`` must have ``customer_id`` and ``article_id`` columns whose
        values are all known to this DatasetMaker.
        """
        row_idx = data["customer_id"].map(self.customers_id2num)
        col_idx = data["article_id"].map(self.articles_id2num)
        return coo_matrix(
            (np.ones(data.shape[0]), (row_idx, col_idx)),
            shape=self.data_shape,
            dtype=np.uint8
        )
        
    def split_data(self, data, val_days: int = 7):
        """Split ``data`` into (train, validation) by date.

        The validation part holds the rows whose ``t_dat`` is within
        ``val_days`` days of the latest ``t_dat``; everything earlier is train.
        """
        # `days=` is required: a bare integer passed to pd.Timedelta is
        # interpreted as nanoseconds, which made the validation window ~empty.
        val_split_date = data['t_dat'].max() - pd.Timedelta(days=val_days)
        
        data_train = data[data['t_dat'] < val_split_date]
        data_val = data[data['t_dat'] >= val_split_date]
        return data_train, data_val
    
    def limit_data(self, data, min_days_ago: int = 30, max_days_ago: int = 0):
        """Keep rows whose ``t_dat`` lies between ``min_days_ago`` and
        ``max_days_ago`` days before the latest ``t_dat`` (both ends inclusive).
        """
        latest = data['t_dat'].max()
        min_split_date = latest - pd.Timedelta(days=min_days_ago)
        max_split_date = latest - pd.Timedelta(days=max_days_ago)
        
        return data[data['t_dat'].between(min_split_date, max_split_date)]

In [5]:
# --- ALS configuration ---
# 9999 weeks is roughly 191 years, so the look-back window keeps the whole history.
train_weeks = 9999
factors = 200
iterations = 5
regularization = 0.01
random_state = 1

dm = DatasetMaker(articles, customers)
data = dm.limit_data(transactions_full, 
                     min_days_ago=7 * (train_weeks), 
                     max_days_ago=0)
# customers x articles interaction matrix in CSR form.
train = dm.get_coo_matrix(data).tocsr()

als = implicit.als.AlternatingLeastSquares(
    factors=factors, 
    iterations=iterations, 
    regularization=regularization,
    use_gpu=True,
    num_threads=16,
    # Use the configured seed instead of a second hard-coded literal, so that
    # changing `random_state` above actually changes the model.
    random_state=random_state
)

als.fit(train, show_progress=True)

  0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
# Per-article sale counts over the rows dated on or after (latest date - 7 days),
# keeping only the articles sold more than 10 times in that window.
last_date = transactions_full["t_dat"].max()
article_counter_w1 = {
    article_id: n_sold
    for article_id, n_sold in (
        transactions_full.loc[transactions_full["t_dat"] >= last_date - pd.Timedelta(days=7)]
        .groupby("article_id")
        .size()
        .to_dict()
        .items()
    )
    if n_sold > 10
}
article_counter_w1

{'0111565001': 39,
 '0111586001': 52,
 '0111593001': 44,
 '0111609001': 13,
 '0123173001': 96,
 '0146730001': 35,
 '0148033001': 60,
 '0153115019': 23,
 '0153115020': 27,
 '0153115021': 11,
 '0156224001': 12,
 '0156231001': 141,
 '0156231002': 11,
 '0158340001': 298,
 '0160442007': 154,
 '0160442010': 139,
 '0160442043': 24,
 '0179208001': 77,
 '0179950001': 24,
 '0179950002': 26,
 '0179950017': 73,
 '0182909001': 32,
 '0186262001': 19,
 '0186262006': 19,
 '0189616001': 61,
 '0189616006': 23,
 '0189616032': 51,
 '0189626001': 12,
 '0194037001': 39,
 '0200182001': 18,
 '0200182002': 12,
 '0201219001': 41,
 '0201219003': 23,
 '0201219016': 16,
 '0201219017': 20,
 '0214844001': 22,
 '0214844002': 24,
 '0214844003': 13,
 '0224606019': 11,
 '0226959007': 18,
 '0226959008': 22,
 '0228257001': 97,
 '0228257002': 13,
 '0228257003': 21,
 '0228257004': 12,
 '0228257008': 15,
 '0237347045': 56,
 '0237347059': 41,
 '0237347060': 47,
 '0237347063': 36,
 '0240561001': 44,
 '0243937001': 11,
 '024393

In [7]:
# Number of article IDs kept in each prediction string.
N = 12
df = transactions_full[['t_dat', 'customer_id', 'article_id']].copy()
last_ts = df['t_dat'].max()

# 'ldbw': last day of the 7-day bucket (counted backwards from last_ts) that the
# purchase falls into. Computed as one vectorised timedelta floor instead of a
# per-row parallel_apply lambda; the values are the same.
df['ldbw'] = last_ts - (last_ts - df['t_dat']).dt.floor('7D')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3973541), Label(value='0 / 3973541…

In [8]:
# Number of purchases of each article within each 7-day bucket ('ldbw').
weekly_sales = (
    df.drop(columns='customer_id')
    .groupby(['ldbw', 'article_id'])
    .count()
    .rename(columns={'t_dat': 'count'})
    .reset_index()
)

# Sales of the most recent bucket only, i.e. the bucket that ends on last_ts.
last_day = last_ts.strftime('%Y-%m-%d')
weekly_sales_targ = weekly_sales.loc[
    weekly_sales['ldbw'] == last_day, ['article_id', 'count']
].rename(columns={"count": "count_targ"})

# article_id -> number of sales in the most recent bucket.
last_day_count = dict(
    zip(weekly_sales_targ["article_id"], weekly_sales_targ["count_targ"])
)

In [10]:
# Attach to every purchase the article's sales count in the purchase's own
# 7-day bucket ('count') and in the most recent bucket ('count_targ').
df = (
    df.merge(weekly_sales, on=['ldbw', 'article_id'])
    .merge(weekly_sales_targ, on='article_id', how="left")
)

# Articles with no sales in the most recent bucket get 0. Plain assignment is
# used because an in-place fillna on a selected column is a chained operation
# that does not update `df` under pandas copy-on-write.
df['count_targ'] = df['count_targ'].fillna(0)
# Ratio of the article's sales in the most recent 7-day bucket to its sales in
# the bucket in which this purchase was made.
df['quotient'] = df['count_targ'] / df['count']
df

Unnamed: 0,t_dat,customer_id,article_id,ldbw,count,count_targ,quotient
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001,2018-09-25,40,0.0,0.0
1,2018-09-20,3681748607f3287d2c3a65e00bb5fb153de30e9becf158...,0663713001,2018-09-25,40,0.0,0.0
2,2018-09-20,4ef5967ff17bf474bffebe5b16bd54878e1d4105f7b4ed...,0663713001,2018-09-25,40,0.0,0.0
3,2018-09-20,6b7b10d2d47516c82a6f97332478dab748070f09693f09...,0663713001,2018-09-25,40,0.0,0.0
4,2018-09-20,8ac137752bbe914aa4ae6ad007a9a0c5b67a1ab2b2d474...,0663713001,2018-09-25,40,0.0,0.0
...,...,...,...,...,...,...,...
31788319,2020-09-22,fd5ce8716faf00f6a83616f609e0403ac516727d4ca4aa...,0548659003,2020-09-22,1,1.0,1.0
31788320,2020-09-22,fd5ce8716faf00f6a83616f609e0403ac516727d4ca4aa...,0807775001,2020-09-22,1,1.0,1.0
31788321,2020-09-22,fdcfd2235f001a0b4e1f480b2a196dc5f5cd93f8a4c45c...,0771844001,2020-09-22,1,1.0,1.0
31788322,2020-09-22,fef6e2e0ab0493e18c1a76f90ed286c2c6c3bc8ac4ca33...,0892354003,2020-09-22,1,1.0,1.0


In [9]:
# Matrix indices of the articles in article_counter_w1; only these are allowed
# as "similar" candidates.
actual_article_list = [
    dm.articles_id2num[article_id] for article_id in article_counter_w1
]

# article_id -> list of (similar article_id, ALS similarity * recent sales).
similar_article_dict = defaultdict(list)
for article_id, article_num in tqdm(dm.articles_id2num.items()):
    items, scores = als.similar_items(
        itemid=article_num, 
        N=10, 
        items=actual_article_list
    )
    for item, score in zip(items, scores):
        article_id_simular = dm.articles_num2id[item]
        # article_counter_w1 is built from an inclusive 8-day window while
        # last_day_count only covers the final 7-day bucket, so a candidate can
        # be missing from last_day_count; weight it as 0 instead of raising.
        recent_sales = last_day_count.get(article_id_simular, 0)
        similar_article_dict[article_id].append(
            (article_id_simular, score * recent_sales)
        )

# Best-weighted candidate first.
for ranked in similar_article_dict.values():
    ranked.sort(key=lambda pair: -pair[1])

100%|██████████| 105542/105542 [00:55<00:00, 1902.26it/s]


In [13]:
def get_tr_score(line, reference_ts=None):
    """Score one purchase row: trend quotient times a time-decay factor.

    Parameters
    ----------
    line : mapping / Series with ``t_dat`` (timestamp) and ``quotient`` (number).
    reference_ts : timestamp, optional
        Date the purchase age is measured from. Defaults to the notebook-level
        ``last_ts``.

    Returns
    -------
    ``quotient * max(0, decay)``, where ``decay`` depends only on the age of the
    purchase in whole days (ages below one day count as one day).
    """
    if reference_ts is None:
        reference_ts = last_ts

    x = max(1, (reference_ts - line['t_dat']).days)
    a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
    y = a / np.sqrt(x) + b * np.exp(-c*x) - d  # time-decay coefficient
    return line['quotient'] * max(0, y)

In [17]:
# Score every purchase row with get_tr_score, row by row (axis=1), on the
# pandarallel worker pool.
df["tr_score"] = df.parallel_apply(get_tr_score, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3973541), Label(value='0 / 3973541…

In [22]:
# Total tr_score per (customer, article) pair, as rows of
# [customer_id, article_id, score].
cust_art_score = (
    df.groupby(["customer_id", "article_id"])["tr_score"]
    .sum()
    .reset_index()
    .to_numpy()
)

# customer_id -> {article_id: summed score}
purchase_dict = {}
for cust_id, art_id, score in tqdm(cust_art_score):
    purchase_dict.setdefault(cust_id, {})[art_id] = score

In [36]:
# Sum of the trend quotient per article over all purchases; the N articles with
# the largest totals are the fallback recommendation for every customer.
target_sales = df.groupby('article_id')['quotient'].sum()
general_pred = list(target_sales.nlargest(N).index)

In [32]:
def get_prediction(cust_id, purch_value_limit=150):
    """Build the space-separated prediction string for one customer.

    Candidates are collected in this order, skipping duplicates, and the first
    ``N`` are returned:

    1. the customer's own articles whose summed score exceeds
       ``purch_value_limit`` (best first, at most ``N``);
    2. for each of those, its entry in ``pairs`` (if any);
    3. for the customer's remaining articles (best first, at most ``N``), the
       top-ranked entry of ``similar_article_dict``;
    4. the global fallback list ``general_pred``.

    Parameters
    ----------
    cust_id : customer ID looked up in ``purchase_dict``.
    purch_value_limit : score threshold separating steps 1 and 3.
    """
    prediction = []
    history = purchase_dict.get(cust_id)
    if history:
        scores = pd.Series(history)

        strong = scores[scores > purch_value_limit].nlargest(N).index.tolist()
        prediction.extend(strong)

        for elm in strong:
            key = int(elm)
            if key in pairs:
                # Left-pad to the 10-character ID format; unlike prepending a
                # single '0', this is right for any number of dropped zeros.
                itm = str(pairs[key]).zfill(10)
                if itm not in prediction:
                    prediction.append(itm)

        weak = scores[scores <= purch_value_limit].nlargest(N).index.tolist()
        for elm in weak:
            # .get avoids inserting empty lists into the defaultdict and lets
            # articles without any similar candidate be skipped.
            candidates = similar_article_dict.get(elm)
            if not candidates:
                continue
            itm = candidates[0][0]
            if itm not in prediction:
                prediction.append(itm)

    for elm in general_pred:
        if elm not in prediction:
            prediction.append(elm)

    return ' '.join(prediction[:N])

# Compute a prediction string for every customer_id in the submission frame,
# in parallel, and store it in the 'prediction' column.
sub['prediction'] = sub["customer_id"].parallel_apply(get_prediction)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=171498), Label(value='0 / 171498')…

In [35]:
# Write the submission file without the index column.
# NOTE(review): the trailing number is presumably the score this submission
# obtained — confirm.
sub.to_csv('../output/37.trending_add_old_similar_fix_general.csv', index=False) # 0.0237

In [45]:
def diff_sub(sub, old_sub_path):
    """Print how a submission differs from one previously saved to disk.

    The CSV at ``old_sub_path`` is inner-joined with ``sub`` on ``customer_id``.
    The function prints the number of customers whose ``prediction`` differs,
    followed by an array of the (old, new) prediction pairs for those
    customers. Nothing is returned.
    """
    merged = pd.read_csv(old_sub_path).merge(sub, on="customer_id", how="inner")
    # After the merge, prediction_x is the old file's value and prediction_y
    # is the value from `sub`.
    changed = merged.loc[merged["prediction_x"] != merged["prediction_y"]]
    print(changed.shape[0])
    print(changed[["prediction_x", "prediction_y"]].values)

### Популярные по группам 

In [48]:
# Display the customers table (pandas truncates the rendering of large frames).
customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,age_group,price_min,price_max,...,mean_article_count_on_date,Ladieswear_count,Baby/Children_count,Menswear_count,Sport_count,Divided_count,common_group,sex,has_children,price_group
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,1.0,1.0,ACTIVE,NONE,49.0,Other,45-54,0.010153,0.054220,...,2.100000,13.0,5.0,1.0,0.0,2.0,Lady,Woman,1,medium
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,1.0,1.0,ACTIVE,NONE,25.0,Other,22-29,0.006763,0.084729,...,3.739130,62.0,4.0,0.0,1.0,19.0,Lady,Woman,1,high
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,1.0,1.0,ACTIVE,NONE,24.0,Other,22-29,0.013542,0.067780,...,2.571429,12.0,0.0,4.0,1.0,1.0,Lady,Woman,0,high
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,1.0,1.0,ACTIVE,NONE,54.0,Other,45-54,0.030492,0.030492,...,2.000000,0.0,0.0,0.0,2.0,0.0,Divided,Unknown,0,medium
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,Other,45-54,0.016932,0.059305,...,2.166667,11.0,0.0,0.0,2.0,0.0,Lady,Woman,0,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,1.0,1.0,ACTIVE,NONE,24.0,Other,22-29,0.005068,0.067780,...,4.636364,46.0,0.0,1.0,4.0,0.0,Lady,Woman,0,high
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,1.0,1.0,ACTIVE,NONE,21.0,Other,16-21,0.005068,0.076254,...,4.421053,27.0,7.0,2.0,0.0,48.0,Divided,Woman,1,high
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,ACTIVE,Regularly,21.0,Other,16-21,0.004559,0.042356,...,2.368421,19.0,0.0,5.0,3.0,18.0,Divided,Woman,0,medium
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,ACTIVE,Regularly,18.0,Other,16-21,0.013542,0.050831,...,1.750000,3.0,0.0,0.0,0.0,4.0,Divided,Woman,0,medium


In [49]:
# Sum of the trend quotient per (age_group, article_id), over the purchases of
# customers that appear in the customers table.
group_art_sum = (
    pd.merge(df, customers[["customer_id", "age_group"]], on="customer_id", how="inner")
    .groupby(['age_group', 'article_id'])['quotient']
    .sum()
)

In [50]:
# age_group -> the N article IDs with the largest summed quotient in that group.
group_popular_dict = {
    age_group: (
        group_art_sum[group_art_sum.index.get_level_values("age_group") == age_group]
        .nlargest(N)
        .index.get_level_values("article_id")
        .tolist()
    )
    for age_group in tqdm(group_art_sum.index.levels[0].tolist())
}

100%|██████████| 5/5 [00:02<00:00,  2.15it/s]


In [51]:
# Display the per-age-group list of top articles.
group_popular_dict

{'16-21': ['0448509014',
  '0715624001',
  '0706016001',
  '0573085028',
  '0673677002',
  '0706016002',
  '0685814001',
  '0685814003',
  '0706016003',
  '0372860002',
  '0685813003',
  '0751471001'],
 '22-29': ['0448509014',
  '0706016001',
  '0573085028',
  '0158340001',
  '0706016003',
  '0751471001',
  '0715624001',
  '0372860001',
  '0706016002',
  '0685814001',
  '0372860002',
  '0673677002'],
 '30-44': ['0448509014',
  '0573085028',
  '0158340001',
  '0768912001',
  '0579541001',
  '0751471001',
  '0610776002',
  '0711053003',
  '0562245046',
  '0372860001',
  '0706016001',
  '0611415001'],
 '45-54': ['0751471001',
  '0673677002',
  '0573085028',
  '0579541001',
  '0448509014',
  '0372860002',
  '0678942001',
  '0706016001',
  '0715624001',
  '0568601006',
  '0706016002',
  '0372860001'],
 '54+': ['0579541001',
  '0751471001',
  '0573085028',
  '0678942001',
  '0673677002',
  '0448509014',
  '0783346001',
  '0762846006',
  '0850917001',
  '0568601006',
  '0562245046',
  '061077

In [61]:
# Attach each customer's age group to the submission frame. Any age_group
# column left over from a previous run of this cell is dropped first, so that
# re-running does not produce age_group_x / age_group_y columns.
sub = (
    sub.drop(columns="age_group", errors="ignore")
    .merge(customers[["customer_id", "age_group"]], on="customer_id", how="inner")
)

def get_prediction(line, purch_value_limit=3000):
    """Build the space-separated prediction string for one submission row.

    This definition replaces the earlier ``get_prediction(cust_id)`` of this
    notebook: it takes a whole row and falls back to the articles popular in
    the customer's age group rather than to the global list.

    Candidates are collected in this order, skipping duplicates, and the first
    ``N`` are returned:

    1. the customer's own articles whose summed score exceeds
       ``purch_value_limit`` (best first, at most ``N``);
    2. for each of those, its entry in ``pairs`` (if any);
    3. for the customer's remaining articles (best first, at most ``N``), the
       top-ranked entry of ``similar_article_dict``;
    4. ``group_popular_dict[age_group]``.

    Parameters
    ----------
    line : mapping / Series with ``customer_id`` and ``age_group``.
    purch_value_limit : score threshold separating steps 1 and 3.
    """
    cust_id = line["customer_id"]
    age_group = line["age_group"]

    prediction = []
    history = purchase_dict.get(cust_id)
    if history:
        scores = pd.Series(history)

        strong = scores[scores > purch_value_limit].nlargest(N).index.tolist()
        prediction.extend(strong)

        for elm in strong:
            key = int(elm)
            if key in pairs:
                # Left-pad to the 10-character ID format; unlike prepending a
                # single '0', this is right for any number of dropped zeros.
                itm = str(pairs[key]).zfill(10)
                if itm not in prediction:
                    prediction.append(itm)

        weak = scores[scores <= purch_value_limit].nlargest(N).index.tolist()
        for elm in weak:
            # .get avoids inserting empty lists into the defaultdict and lets
            # articles without any similar candidate be skipped.
            candidates = similar_article_dict.get(elm)
            if not candidates:
                continue
            itm = candidates[0][0]
            if itm not in prediction:
                prediction.append(itm)

    # An age group without its own list (e.g. a missing value) falls back to
    # the global top articles instead of raising KeyError.
    for elm in group_popular_dict.get(age_group, general_pred):
        if elm not in prediction:
            prediction.append(elm)

    return ' '.join(prediction[:N])

# Predict from the submission frame's own rows. Taking the rows from
# `customers` and assigning the result to `sub` pairs them up by index label,
# which is only correct when both frames happen to list customers in exactly
# the same order.
sub['prediction'] = sub[["customer_id", "age_group"]].parallel_apply(get_prediction, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=171498), Label(value='0 / 171498')…

In [62]:
# Write only the two submission columns (the helper age_group column is left
# out), without the index.
# NOTE(review): the trailing number is presumably the score this submission
# obtained — confirm.
sub[["customer_id", "prediction"]].to_csv('../output/39.trending_val_limit_3000.csv', 
                                          index=False) # ----- 0.0242

In [49]:
# Display the final submission columns for a visual check.
sub[["customer_id", "prediction"]]

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0568597006 0793012001 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0800436010 0739590027 0706016001 06...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0852643001 0852643003 0858883002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0754238023 0740922001 0751471001 0673677002 05...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0730683050 0791587015 0896152002 0818320001 09...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0804992033 0720125039 0713997002 0557599022 07...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0818754002 0762846031 0759871030 0624486088 08...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0884081001 0762846027 0794819001 0689365050 06...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0882810001 0714790020 0464297007 0448509014 08...
