In [1]:
import sys
sys.path.append("..")
from src.utils import *

import pandas as pd
import numpy as np
from math import sqrt
from pathlib import Path
from tqdm import tqdm
import swifter
tqdm.pandas()

import gc
import cv2, matplotlib.pyplot as plt
from os.path import exists

# Prepare transactions 

## Get customer group 

In [2]:
# Customer attributes whose distinct combinations define the customer groups.
cust_features = ["age_group", "price_group"]
customers = pd.read_csv("../input/customers_proc.csv")[["customer_id", *cust_features]]

# Number every distinct feature combination (drop_duplicates keeps first occurrences).
i_group_map = {
    idx: tuple(combo)
    for idx, combo in enumerate(customers[cust_features].drop_duplicates().values)
}
# Reverse lookup: feature tuple -> group index.
group_i_map = {combo: idx for idx, combo in i_group_map.items()}

def get_group(line):
    """Return the group index for one customer row.

    The key is built from the columns listed in ``cust_features`` (in that
    order) and looked up in ``group_i_map``.

    Args:
        line: mapping-like row (e.g. a DataFrame row) indexable by every name
            in ``cust_features``.

    Returns:
        The group index stored in ``group_i_map`` for this feature combination.

    Raises:
        KeyError: if the combination is not present in ``group_i_map``.
    """
    # Build the key only from cust_features so the function keeps working when
    # that list changes (no hard-coded column names).
    group = tuple(line[feature] for feature in cust_features)
    return group_i_map[group]

# Resolve the group index row by row, then keep only the id and its group.
customers = customers.assign(group=customers.apply(get_group, axis=1))[["customer_id", "group"]]
customers

Unnamed: 0,customer_id,group
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,1
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,1
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,2
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,1
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,12
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,4
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,4


In [3]:
# Only these three columns are used, so let the parser skip every other column
# in the file instead of loading it and discarding it afterwards.
tx_columns = ["t_dat", "customer_id", "article_id"]
df = (
    pd.read_csv(
        '../input/transactions.csv',
        usecols=tx_columns,
        dtype={'article_id': str},  # keep the leading zero of article ids
        parse_dates=["t_dat"],
    )
    # usecols does not guarantee column order, so select explicitly.
    [tx_columns]
)
# Inner join: keeps only transactions whose customer has a group assigned.
df = df.merge(customers, on="customer_id", how="inner")
df

Unnamed: 0,t_dat,customer_id,article_id,group
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001,1
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0541518023,1
2,2018-09-24,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001,1
3,2019-03-01,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0578020002,1
4,2020-02-03,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0723529001,1
...,...,...,...,...
31788319,2020-09-22,fee56cc5315dafb35a4490ccc6f711092cae913550c832...,0903647001,0
31788320,2020-09-22,fee56cc5315dafb35a4490ccc6f711092cae913550c832...,0903647001,0
31788321,2020-09-22,ff5b8a8b26bf93a66290e9bd1b73393ac6a58968a78519...,0913597001,4
31788322,2020-09-22,ff5b8a8b26bf93a66290e9bd1b73393ac6a58968a78519...,0903420001,4


## Get popular articles per customer group

In [4]:
def get_tr_list(transactions, weeks: int = 4):
    """Return cumulative trailing windows of ``transactions``.

    Element ``i`` (0-based) holds every row whose ``t_dat`` is on or after
    ``max(t_dat) - 7 * (i + 1)`` days, so each window contains the previous one.

    Args:
        transactions: DataFrame with a datetime-like ``t_dat`` column.
        weeks: number of windows to build; values below 1 give an empty list.

    Returns:
        List of ``weeks`` filtered DataFrames, shortest window first.
    """
    max_date = transactions["t_dat"].max()
    # pd.Timedelta avoids relying on a `timedelta` name that this notebook
    # never imports explicitly.
    return [
        transactions[transactions["t_dat"] >= max_date - pd.Timedelta(days=7 * i)]
        for i in range(1, weeks + 1)
    ]

# Cumulative windows covering the last 1..4 weeks; train_list[0] is the most recent week.
train_list = get_tr_list(df, weeks=4)

In [5]:
# For every customer group: the 12 most purchased articles in the most recent
# window (train_list[0]), most frequent first.
popular_w1_groups = {
    group_i: (
        train_list[0]
        .loc[train_list[0]["group"] == group_i, "article_id"]
        .value_counts()
        .index[:12]
        .tolist()
    )
    for group_i in group_i_map.values()
}
popular_w1_groups

{0: ['0751471001',
  '0448509014',
  '0751471038',
  '0751471043',
  '0706016001',
  '0918522001',
  '0850917001',
  '0536139068',
  '0852584001',
  '0673677002',
  '0579541001',
  '0456163086'],
 1: ['0924243001',
  '0909370001',
  '0923758001',
  '0924243002',
  '0915529005',
  '0889550002',
  '0918522001',
  '0866731001',
  '0929275001',
  '0934835001',
  '0448509014',
  '0896169005'],
 2: ['0889456001',
  '0781758003',
  '0850917003',
  '0767834002',
  '0889456002',
  '0874110016',
  '0316441001',
  '0899749005',
  '0456163086',
  '0803757015',
  '0884319005',
  '0803757004'],
 3: ['0918292001',
  '0768912001',
  '0924243001',
  '0866731001',
  '0805000001',
  '0915529003',
  '0909370001',
  '0923758001',
  '0850917001',
  '0751471001',
  '0711053003',
  '0714790020'],
 4: ['0918522001',
  '0685813003',
  '0685814001',
  '0448509014',
  '0715624001',
  '0850917001',
  '0866731001',
  '0918292001',
  '0911699002',
  '0751471001',
  '0685814003',
  '0706016001'],
 5: ['0924243001',
 

# Trending 

In [6]:
# Size of the top-N lists built below (used with nlargest and as the per-customer cap).
N = 12
# Most recent transaction date; reference point for the weekly bucketing below.
last_ts = df['t_dat'].max()

In [7]:
# 'ldbw': the end date of the 7-day bucket (counted backwards from last_ts)
# that each transaction falls into. Vectorized equivalent of applying
# `last_ts - (last_ts - d).floor('7D')` to every row.
df['ldbw'] = last_ts - (last_ts - df['t_dat']).dt.floor('7D')
df

Dask Apply:   0%|          | 0/40 [00:00<?, ?it/s]

Unnamed: 0,t_dat,customer_id,article_id,group,ldbw
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001,1,2018-09-25
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0541518023,1,2018-09-25
2,2018-09-24,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001,1,2018-09-25
3,2019-03-01,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0578020002,1,2019-03-05
4,2020-02-03,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0723529001,1,2020-02-04
...,...,...,...,...,...
31788319,2020-09-22,fee56cc5315dafb35a4490ccc6f711092cae913550c832...,0903647001,0,2020-09-22
31788320,2020-09-22,fee56cc5315dafb35a4490ccc6f711092cae913550c832...,0903647001,0,2020-09-22
31788321,2020-09-22,ff5b8a8b26bf93a66290e9bd1b73393ac6a58968a78519...,0913597001,4,2020-09-22
31788322,2020-09-22,ff5b8a8b26bf93a66290e9bd1b73393ac6a58968a78519...,0903420001,4,2020-09-22


In [8]:
# Number of transactions per (week bucket, article), stored in a 'count' column.
weekly_sales = (
    df.drop(columns='customer_id')
    .groupby(['ldbw', 'article_id'])[["t_dat"]]
    .count()
    .rename(columns={'t_dat': 'count'})
)
weekly_sales

Unnamed: 0_level_0,Unnamed: 1_level_0,count
ldbw,article_id,Unnamed: 2_level_1
2018-09-25,0108775015,224
2018-09-25,0108775044,96
2018-09-25,0108775051,31
2018-09-25,0110065001,28
2018-09-25,0110065002,14
...,...,...
2020-09-22,0952267001,10
2020-09-22,0952938001,7
2020-09-22,0953450001,5
2020-09-22,0953763001,18


In [9]:
# Attach to every transaction the sales count of its article within the same week bucket.
# NOTE(review): this reassigns `df`; re-running the cell would try to join a second
# 'count' column without a suffix — restart and run all instead.
df = df.join(weekly_sales[["count"]], on=['ldbw', 'article_id'])
df

Unnamed: 0,t_dat,customer_id,article_id,group,ldbw,count
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001,1,2018-09-25,40
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0541518023,1,2018-09-25,29
2,2018-09-24,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001,1,2018-09-25,40
3,2019-03-01,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0578020002,1,2019-03-05,15
4,2020-02-03,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0723529001,1,2020-02-04,100
...,...,...,...,...,...,...
31788319,2020-09-22,fee56cc5315dafb35a4490ccc6f711092cae913550c832...,0903647001,0,2020-09-22,121
31788320,2020-09-22,fee56cc5315dafb35a4490ccc6f711092cae913550c832...,0903647001,0,2020-09-22,121
31788321,2020-09-22,ff5b8a8b26bf93a66290e9bd1b73393ac6a58968a78519...,0913597001,4,2020-09-22,194
31788322,2020-09-22,ff5b8a8b26bf93a66290e9bd1b73393ac6a58968a78519...,0903420001,4,2020-09-22,164


In [10]:
# Re-index by article only so the next join can match on 'article_id'; 'ldbw' becomes a column.
weekly_sales = weekly_sales.reset_index().set_index('article_id')
# Date string of the latest transaction, used to select the most recent week bucket.
last_day = last_ts.strftime('%Y-%m-%d')

In [11]:
# Add 'count_targ': the article's sales count in the most recent week bucket.
# Articles without sales in that bucket get NaN here.
df = df.join(
    weekly_sales.loc[weekly_sales['ldbw']==last_day, ['count']],
    on='article_id', rsuffix="_targ")
df

Unnamed: 0,t_dat,customer_id,article_id,group,ldbw,count,count_targ
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001,1,2018-09-25,40,
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0541518023,1,2018-09-25,29,
2,2018-09-24,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001,1,2018-09-25,40,
3,2019-03-01,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0578020002,1,2019-03-05,15,
4,2020-02-03,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0723529001,1,2020-02-04,100,9.0
...,...,...,...,...,...,...,...
31788319,2020-09-22,fee56cc5315dafb35a4490ccc6f711092cae913550c832...,0903647001,0,2020-09-22,121,121.0
31788320,2020-09-22,fee56cc5315dafb35a4490ccc6f711092cae913550c832...,0903647001,0,2020-09-22,121,121.0
31788321,2020-09-22,ff5b8a8b26bf93a66290e9bd1b73393ac6a58968a78519...,0913597001,4,2020-09-22,194,194.0
31788322,2020-09-22,ff5b8a8b26bf93a66290e9bd1b73393ac6a58968a78519...,0903420001,4,2020-09-22,164,164.0


In [12]:
# Articles with no sales in the most recent week bucket count as 0.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form is deprecated in pandas 2.x and does
# not modify `df` under Copy-on-Write.
df['count_targ'] = df['count_targ'].fillna(0)
# Ratio of the article's latest-week sales to its sales in the purchase week.
df['quotient'] = df['count_targ'] / df['count']
df

Unnamed: 0,t_dat,customer_id,article_id,group,ldbw,count,count_targ,quotient
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001,1,2018-09-25,40,0.0,0.00
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0541518023,1,2018-09-25,29,0.0,0.00
2,2018-09-24,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001,1,2018-09-25,40,0.0,0.00
3,2019-03-01,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0578020002,1,2019-03-05,15,0.0,0.00
4,2020-02-03,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0723529001,1,2020-02-04,100,9.0,0.09
...,...,...,...,...,...,...,...,...
31788319,2020-09-22,fee56cc5315dafb35a4490ccc6f711092cae913550c832...,0903647001,0,2020-09-22,121,121.0,1.00
31788320,2020-09-22,fee56cc5315dafb35a4490ccc6f711092cae913550c832...,0903647001,0,2020-09-22,121,121.0,1.00
31788321,2020-09-22,ff5b8a8b26bf93a66290e9bd1b73393ac6a58968a78519...,0913597001,4,2020-09-22,194,194.0,1.00
31788322,2020-09-22,ff5b8a8b26bf93a66290e9bd1b73393ac6a58968a78519...,0903420001,4,2020-09-22,164,164.0,1.00


In [22]:
# Coefficients of the recency weight y(x) = a / sqrt(x) + b * exp(-c * x) - d,
# where x is the number of days between the purchase and last_ts.
a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3

# Compute the per-transaction score for all rows at once instead of doing
# three scalar `df.at` lookups and the formula per row.
days_ago = (last_ts - df['t_dat']).dt.days.clip(lower=1)  # at least 1 day
recency_weight = (a / np.sqrt(days_ago) + b * np.exp(-c * days_ago) - d).clip(lower=0)
values = df['quotient'] * recency_weight

# purchase_dict[customer_id][article_id] = sum of that customer's scores for the article.
# Rows are accumulated in frame order, so dict insertion order matches the
# order of first appearance in `df`.
purchase_dict = {}
for cust_id, art_id, value in tqdm(
    zip(df['customer_id'], df['article_id'], values), total=len(df)
):
    cust_scores = purchase_dict.setdefault(cust_id, {})
    cust_scores[art_id] = cust_scores.get(art_id, 0) + value

# Free the row-sized temporaries; only purchase_dict is needed afterwards.
del days_ago, recency_weight, values

# Global fallback: the N articles with the largest summed quotient.
target_sales = df.groupby('article_id')['quotient'].sum()
general_pred = target_sales.nlargest(N).index.tolist()

100%|██████████| 31788324/31788324 [13:41<00:00, 38706.37it/s]


In [23]:
# Precomputed article pairs; get_prediction looks them up with int article ids.
# SECURITY NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects —
# only load this file when it comes from a trusted source.
pairs = np.load('../input/pairs_cudf.npy', allow_pickle=True).item()

# Disabled pandas variant for building `pairs` (for each article, the article most
# often bought by the same customers); kept for reference only.
# pairs = {}
# vc = df["article_id"].value_counts()
# for article_id in tqdm(vc.index.to_list()):
#     users = df[df["article_id"] == article_id]["customer_id"].unique()
#     vc_article = df[df["customer_id"].isin(users) & 
#                     (df["article_id"] != article_id)]["article_id"].value_counts()
#     pairs[article_id] = vc_article.index[0]

# NOTE(review): `sub` is not referenced by any later cell visible here.
sub = pd.read_csv('../input/sample_submission.csv')

In [24]:
def get_prediction(customer):
    """Build the space-separated prediction string for one customer.

    Candidates are added in priority order, skipping articles already present:
      1. articles paired (via ``pairs``) with the customer's top-scoring
         purchases — only scores above 150 are considered, and pairs are only
         added when fewer than ``N`` such purchases exist;
      2. the popular articles of the customer's group (``popular_w1_groups``);
      3. the global top list (``general_pred``).

    Args:
        customer: row-like object with ``customer_id`` and ``group`` entries.

    Returns:
        At most ``N`` unique article ids joined by single spaces.
    """
    customer_id = customer["customer_id"]
    group = customer["group"]

    prediction = []
    seen = set()

    def add_unique(candidates):
        # Append candidates in order, dropping anything already predicted so
        # no submission slot is wasted on a duplicate.
        for article in candidates:
            if article not in seen:
                seen.add(article)
                prediction.append(article)

    if customer_id in purchase_dict:
        scores = pd.Series(purchase_dict[customer_id])
        scores = scores[scores > 150]
        trending = scores.nlargest(N).index.tolist()
        # NOTE(review): as in the original, `trending` itself is never added to
        # the prediction — only the paired articles are. Confirm this is intended.
        if len(trending) < N:
            add_unique(
                "0" + str(pairs[int(article)])
                for article in trending
                if int(article) in pairs
            )

    add_unique(popular_w1_groups[group])
    add_unique(general_pred)

    return " ".join(prediction[:N])

# Row-wise prediction for every customer; progress_apply is the tqdm-enabled apply.
customers["prediction"] = customers.progress_apply(get_prediction, axis=1)

100%|██████████| 1371980/1371980 [09:07<00:00, 2505.01it/s]


In [26]:
# Write the two-column file (customer_id, prediction) without the index.
customers[["customer_id", "prediction"]].to_csv("../output/35.trending_and_popular_group.csv", index=False, header=True)

In [25]:
# Display the customers frame with its new 'prediction' column.
customers

Unnamed: 0,customer_id,group,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0,0568601006 0568597006 0751471001 0448509014 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,1,0706016001 0610776001 0866383001 0924243001 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,1,0794321011 0685813001 0852643004 0858883001 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,2,0889456001 0781758003 0850917003 0767834002 08...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0,0706016001 0791587001 0896152001 0732206001 09...
...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,1,0924243001 0909370001 0923758001 0924243002 09...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,12,0918522001 0929275001 0898692006 0865799006 04...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,4,0884081002 0762846031 0794819002 0706016001 06...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,4,0464297007 0448509014 0918522001 0685813003 06...
