In [2]:
import pandas as pd
import numpy as np
from math import sqrt
from pathlib import Path
from tqdm import tqdm
from pandarallel import pandarallel

In [3]:
tqdm.pandas()
pandarallel.initialize(progress_bar=True, nb_workers=8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
N = 12
df_trans = pd.read_csv('../input/transactions_full.csv',dtype={'article_id': str})
df_trans['t_dat'] = pd.to_datetime(df_trans['t_dat'])
df = df_trans[['t_dat', 'customer_id', 'article_id']].copy()
last_ts = df['t_dat'].max()

# последний день расчётного периода
df['ldbw'] = df['t_dat'].parallel_apply(lambda d: last_ts - (last_ts - d).floor('7D')) 

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3973541), Label(value='0 / 3973541…

In [4]:
# количество покупок товаров за расчётные даты
weekly_sales = (
    df.drop('customer_id', axis=1)
    .groupby(['ldbw', 'article_id']).count()
    .rename(columns={'t_dat': 'count'})
)

df = df.join(weekly_sales, on=['ldbw', 'article_id'])
weekly_sales = weekly_sales.reset_index().set_index('article_id')
last_day = last_ts.strftime('%Y-%m-%d')

df = df.join(
    weekly_sales.loc[weekly_sales['ldbw']==last_day, ['count']],
    on='article_id', rsuffix="_targ")

df['count_targ'].fillna(0, inplace=True)
df['quotient'] = df['count_targ'] / df['count'] #отношение продаж товара за последний месяц к текущий месяц

In [5]:
purchase_dict = {}
for i in tqdm(df.index):
    cust_id = df.at[i, 'customer_id']
    art_id = df.at[i, 'article_id']
    t_dat = df.at[i, 't_dat']

    if cust_id not in purchase_dict:
        purchase_dict[cust_id] = {}

    if art_id not in purchase_dict[cust_id]:
        purchase_dict[cust_id][art_id] = 0
    
    x = max(1, (last_ts - t_dat).days) # дней с продажи товара

    a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
    y = a / np.sqrt(x) + b * np.exp(-c*x) - d # коэфф. временного затухания

    value = df.at[i, 'quotient'] * max(0, y) # умножение коэф. проданных товаров на затухание
    purchase_dict[cust_id][art_id] += value

100%|██████████| 31788324/31788324 [08:47<00:00, 60299.86it/s]


In [6]:
target_sales = df.drop('customer_id', axis=1).groupby('article_id')['quotient'].sum()
general_pred = target_sales.nlargest(N).index.tolist()

In [7]:
pairs = np.load('../input/pairs_cudf.npy',allow_pickle=True).item()
sub = pd.read_csv('../input/sample_submission.csv')

In [8]:
def get_prediction(cust_id):
    if cust_id in purchase_dict:
        series = pd.Series(purchase_dict[cust_id])
        series = series[series > 150]
        l = series.nlargest(N).index.tolist()
        tmp_l = l.copy()
        for elm in tmp_l:
            if len(l) < N and int(elm) in pairs.keys():
                itm = pairs[int(elm)]
                if ('0' + str(itm)) not in l:
                    l.append('0' + str(itm))
        if len(l) < N:
            l = l + general_pred[:(N-len(l))]
    else:
        l = general_pred
    return ' '.join(l)

sub['prediction'] = sub["customer_id"].parallel_apply(get_prediction)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=171498), Label(value='0 / 171498')…

In [9]:
sub.to_csv('../output/37.trending_original_fix',index=False)

## Замена старых элементов на новые  - не работает 

In [4]:
articles = pd.read_csv("../input/articles_proc.csv", dtype={"article_id": str})
customers = pd.read_csv("../input/customers_proc.csv")

transactions_full = pd.read_csv("../input/transactions_full.csv", 
                                dtype={"article_id": str}, parse_dates=["t_dat"])

In [5]:
class DatasetMaker:
    def __init__(self, 
                 articles, 
                 customers):        
        self.articles_num2id = dict(enumerate(articles["article_id"].unique()))
        self.articles_id2num = {id_: num for num, id_  in self.articles_num2id.items()}

        self.customers_num2id = dict(enumerate(customers["customer_id"].unique()))
        self.customers_id2num = {id_: num for num, id_ in self.customers_num2id.items()}
        
        self.data_shape = (customers.shape[0], articles.shape[0])
    
    def get_coo_matrix(self, data):
        data_csr = coo_matrix(
            (
                np.ones(data.shape[0]), 
                (
                    data["customer_id"].map(self.customers_id2num), 
                    data["article_id"].map(self.articles_id2num)
                )
            ),
            shape=self.data_shape,
            dtype=np.uint8
        )
        return data_csr
        
    def split_data(self, data, val_days: int = 7):
        val_split_date = data['t_dat'].max() - pd.Timedelta(val_days)
        
        data_train = data[data['t_dat'] < val_split_date]
        data_val = data[data['t_dat'] >= val_split_date]
        return data_train, data_val
    
    def limit_data(self, data, min_days_ago: int = 30, max_days_ago: int = 0):
        min_split_date = data['t_dat'].max() - pd.Timedelta(days=min_days_ago)
        max_split_date = data['t_dat'].max() - pd.Timedelta(days=max_days_ago)
        
        return data[data['t_dat'].between(min_split_date, max_split_date)]

In [6]:
from scipy.sparse import csr_matrix, coo_matrix
import implicit
from implicit.evaluation import mean_average_precision_at_k

train_weeks = 9999
factors = 200
iterations = 5
regularization = 0.01
random_state = 1

dm = DatasetMaker(articles, customers)
data = dm.limit_data(transactions_full, 
                     min_days_ago=7 * (train_weeks), 
                     max_days_ago=0)
train = dm.get_coo_matrix(data).tocsr()

als = implicit.als.AlternatingLeastSquares(
    factors=factors, 
    iterations=iterations, 
    regularization=regularization,
    use_gpu=True,
    num_threads=16,
    random_state=1
)

als.fit(train, show_progress=True)

  0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
last_date = transactions_full["t_dat"].max()
article_counter_w1 = (
    transactions_full[transactions_full["t_dat"] >= last_date - pd.Timedelta(days=7)]
        .groupby("article_id").size()
).to_dict()
article_counter_w1 = dict(filter(lambda x: x[1] > 10, article_counter_w1.items()))

In [8]:
N = 12
df_trans = pd.read_csv('../input/transactions_full.csv',dtype={'article_id': str})
df_trans['t_dat'] = pd.to_datetime(df_trans['t_dat'])
df = df_trans[['t_dat', 'customer_id', 'article_id']].copy()
last_ts = df['t_dat'].max()

# последний день расчётного периода
df['ldbw'] = df['t_dat'].parallel_apply(lambda d: last_ts - (last_ts - d).floor('7D')) 

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3973541), Label(value='0 / 3973541…

In [9]:
# количество покупок товаров за расчётные даты
weekly_sales = (
    df.drop('customer_id', axis=1)
    .groupby(['ldbw', 'article_id']).count()
    .rename(columns={'t_dat': 'count'})
    .reset_index()
)

last_day = last_ts.strftime('%Y-%m-%d')
weekly_sales_targ = (
    weekly_sales[weekly_sales['ldbw']==last_day][['article_id', 'count']]
        .rename({"count": "count_targ"}, axis=1)
)
last_day_count = weekly_sales_targ.set_index("article_id")["count_targ"].to_dict()

In [10]:
from collections import defaultdict

actual_article_list = list(
    map(lambda x: dm.articles_id2num[x],
        list(article_counter_w1.keys())
       )
)

similar_article_dict = defaultdict(list)
for article_id, article_num in tqdm(dm.articles_id2num.items()):
    items, scores = als.similar_items(
        itemid=article_num, 
        N=10, 
        items=actual_article_list
    )
    for i in range(len(items)):
        item, score = items[i], scores[i]
        article_id_simular = dm.articles_num2id[item]
        similar_article_dict[article_id].append((article_id_simular, score * 
                                                 last_day_count[article_id_simular]))

for article_id in similar_article_dict:
    similar_article_dict[article_id] = sorted(similar_article_dict[article_id], key=lambda x: -x[1])

100%|██████████| 105542/105542 [00:44<00:00, 2361.98it/s]


In [11]:
df = df.merge(weekly_sales, on=['ldbw', 'article_id'])
df = df.merge(weekly_sales_targ, on='article_id', how="left")

df['count_targ'].fillna(0, inplace=True)
df['quotient'] = df['count_targ'] / df['count'] #отношение продаж товара за последний месяц к текущему месяцу
df

Unnamed: 0,t_dat,customer_id,article_id,ldbw,count,count_targ,quotient
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001,2018-09-25,40,0.0,0.0
1,2018-09-20,3681748607f3287d2c3a65e00bb5fb153de30e9becf158...,0663713001,2018-09-25,40,0.0,0.0
2,2018-09-20,4ef5967ff17bf474bffebe5b16bd54878e1d4105f7b4ed...,0663713001,2018-09-25,40,0.0,0.0
3,2018-09-20,6b7b10d2d47516c82a6f97332478dab748070f09693f09...,0663713001,2018-09-25,40,0.0,0.0
4,2018-09-20,8ac137752bbe914aa4ae6ad007a9a0c5b67a1ab2b2d474...,0663713001,2018-09-25,40,0.0,0.0
...,...,...,...,...,...,...,...
31788319,2020-09-22,fd5ce8716faf00f6a83616f609e0403ac516727d4ca4aa...,0548659003,2020-09-22,1,1.0,1.0
31788320,2020-09-22,fd5ce8716faf00f6a83616f609e0403ac516727d4ca4aa...,0807775001,2020-09-22,1,1.0,1.0
31788321,2020-09-22,fdcfd2235f001a0b4e1f480b2a196dc5f5cd93f8a4c45c...,0771844001,2020-09-22,1,1.0,1.0
31788322,2020-09-22,fef6e2e0ab0493e18c1a76f90ed286c2c6c3bc8ac4ca33...,0892354003,2020-09-22,1,1.0,1.0


In [None]:
purchase_dict = {}
for i in tqdm(df.index):
    cust_id = df.at[i, 'customer_id']
    art_id = df.at[i, 'article_id']
    t_dat = df.at[i, 't_dat']

    if cust_id not in purchase_dict:
        purchase_dict[cust_id] = {}

    if art_id not in purchase_dict[cust_id]:
        purchase_dict[cust_id][art_id] = 0
    
    x = max(1, (last_ts - t_dat).days) # дней с продажи товара

    a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
    y = a / np.sqrt(x) + b * np.exp(-c*x) - d # коэфф. временного затухания

    value = df.at[i, 'quotient'] * max(0, y) # умножение коэф. проданных товаров на затухание
    purchase_dict[cust_id][art_id] += value

 25%|██▍       | 7920015/31788324 [03:59<26:22, 15082.79it/s]

In [None]:
target_sales = df.drop('customer_id', axis=1).groupby('article_id')['quotient'].sum()
general_pred = target_sales.nlargest(N).index.tolist()

In [13]:
pairs = np.load('../input/pairs_cudf.npy',allow_pickle=True).item()
sub = pd.read_csv('../input/sample_submission.csv')

In [14]:
def get_prediction(cust_id):
    if cust_id in purchase_dict:
        series = pd.Series(purchase_dict[cust_id])
        series = series[series > 150]
        l = series.nlargest(N).index.tolist()
        
        tmp_l = l.copy()
        for elm in tmp_l:
            if int(elm) in pairs.keys():
                itm = pairs[int(elm)]
                if ('0' + str(itm)) not in l:
                    l.append('0' + str(itm))
        
        series = pd.Series(purchase_dict[cust_id])
        series = series[series < 100]
        t = series.nlargest(N).index.tolist()
        for elm in t: # НЕТ ПРОВЕРКИ НА УЖЕ НАХОЖДЕНИЕ ЭЛЕМЕНТА В СПИСКЕ !!!
            l.append(similar_article_dict[elm][0][0])
        
        l = l + general_pred[:(N-len(l))]
    else:
        l = general_pred
    return ' '.join(l[:N])

sub['prediction'] = sub["customer_id"].parallel_apply(get_prediction)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=171498), Label(value='0 / 171498')…

In [15]:
sub.to_csv('../output/37.trending_add_old_similar.csv', index=False) # 0.0235

### Популярные по группам 

In [54]:
customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,age_group,price_min,price_max,...,mean_article_count_on_date,Ladieswear_count,Baby/Children_count,Menswear_count,Sport_count,Divided_count,common_group,sex,has_children,price_group
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,1.0,1.0,ACTIVE,NONE,49.0,Other,45-54,0.010153,0.054220,...,2.100000,13.0,5.0,1.0,0.0,2.0,Lady,Woman,1,medium
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,1.0,1.0,ACTIVE,NONE,25.0,Other,22-29,0.006763,0.084729,...,3.739130,62.0,4.0,0.0,1.0,19.0,Lady,Woman,1,high
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,1.0,1.0,ACTIVE,NONE,24.0,Other,22-29,0.013542,0.067780,...,2.571429,12.0,0.0,4.0,1.0,1.0,Lady,Woman,0,high
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,1.0,1.0,ACTIVE,NONE,54.0,Other,45-54,0.030492,0.030492,...,2.000000,0.0,0.0,0.0,2.0,0.0,Divided,Unknown,0,low
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,Other,45-54,0.016932,0.059305,...,2.166667,11.0,0.0,0.0,2.0,0.0,Lady,Woman,0,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,1.0,1.0,ACTIVE,NONE,24.0,Other,22-29,0.005068,0.067780,...,4.636364,46.0,0.0,1.0,4.0,0.0,Lady,Woman,0,high
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,1.0,1.0,ACTIVE,NONE,21.0,Other,16-21,0.005068,0.076254,...,4.421053,27.0,7.0,2.0,0.0,48.0,Divided,Woman,1,high
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,ACTIVE,Regularly,21.0,Other,16-21,0.004559,0.042356,...,2.368421,19.0,0.0,5.0,3.0,18.0,Divided,Woman,0,medium
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,ACTIVE,Regularly,18.0,Other,16-21,0.013542,0.050831,...,1.750000,3.0,0.0,0.0,0.0,4.0,Divided,Woman,0,medium


In [88]:
group_art_sum = (
    df.merge(customers[["customer_id", "age_group"]], on="customer_id", how="inner")
        .groupby(['age_group', 'article_id'])['quotient'].sum()
)

In [91]:
group_popular_dict = {}
for age_group in group_art_sum.index.levels[0].tolist():
    group_popular = (
        group_art_sum[(group_art_sum.index.get_level_values("age_group") == age_group)]
            .nlargest(N)
            .index.get_level_values("article_id")
    ).tolist()
    group_popular_dict[age_group] = group_popular

In [92]:
group_popular_dict

{'16-21': ['0448509014',
  '0715624001',
  '0706016001',
  '0573085028',
  '0673677002',
  '0706016002',
  '0685814001',
  '0685814003',
  '0706016003',
  '0372860002',
  '0685813003',
  '0751471001'],
 '22-29': ['0448509014',
  '0706016001',
  '0573085028',
  '0158340001',
  '0706016003',
  '0751471001',
  '0715624001',
  '0372860001',
  '0706016002',
  '0685814001',
  '0372860002',
  '0673677002'],
 '30-44': ['0448509014',
  '0573085028',
  '0158340001',
  '0768912001',
  '0579541001',
  '0751471001',
  '0610776002',
  '0711053003',
  '0562245046',
  '0372860001',
  '0706016001',
  '0611415001'],
 '45-54': ['0751471001',
  '0673677002',
  '0573085028',
  '0579541001',
  '0448509014',
  '0372860002',
  '0678942001',
  '0706016001',
  '0715624001',
  '0568601006',
  '0706016002',
  '0372860001'],
 '54+': ['0579541001',
  '0751471001',
  '0573085028',
  '0678942001',
  '0673677002',
  '0448509014',
  '0783346001',
  '0762846006',
  '0850917001',
  '0568601006',
  '0562245046',
  '061077

In [95]:
sub = pd.read_csv('../input/sample_submission.csv')
sub = sub.merge(customers[["customer_id", "age_group"]], on="customer_id", how="inner")

def get_prediction(line):
    cust_id = line["customer_id"]
    age_group = line["age_group"]
    
    prediction = []
    if cust_id in purchase_dict:
        series = pd.Series(purchase_dict[cust_id])
        series = series[series > 150]
        t = series.nlargest(N).index.tolist()
        prediction.extend(t)
        
        for elm in t:
            if int(elm) in pairs.keys():
                itm = pairs[int(elm)]
                if ('0' + str(itm)) not in prediction:
                    prediction.append('0' + str(itm))
        
        series = pd.Series(purchase_dict[cust_id])
        series = series[series <= 150]
        t = series.nlargest(N).index.tolist()
        for elm in t:
            prediction.append(similar_article_dict[elm][0][0])
        
    prediction.extend(group_popular_dict[age_group])
    return ' '.join(prediction[:N])

sub['prediction'] = sub[["customer_id", "age_group"]].parallel_apply(get_prediction, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=171498), Label(value='0 / 171498')…

In [96]:
sub[["customer_id", "prediction"]].to_csv('../output/37.trending_add_old_similar_popular_group_age.csv', 
                                          index=False) # 0.02//

In [97]:
sub[["customer_id", "prediction"]]

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0568597006 0793012001 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0800436010 0739590027 0706016001 06...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0852643001 0852643003 0858883002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0754238023 0740922001 0751471001 0673677002 05...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0730683050 0791587015 0896152002 0818320001 09...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0804992033 0720125039 0713997002 0557599022 07...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0818754002 0762846031 0759871030 0624486088 08...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0884081001 0762846027 0794819001 0689365050 06...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0882810001 0714790020 0464297007 0448509014 08...
