In [1]:
import pandas as pd
import numpy as np
import pickle
from math import sqrt
from pathlib import Path
from tqdm import tqdm
tqdm.pandas()

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def load_pkl(fname):
    with open(fname, 'rb') as infile:
        return pickle.load(infile)
def save_pkl(df, fname):
    with open(fname, 'wb') as outfile:
        pickle.dump(df, outfile)
def na_percent(df):
    return df.isna().sum()/len(df)
def map_customer_id(df):
    df['customer_id'] = df['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')
def map_article_id(df):
    df['article_id'] = df['article_id'].astype('int32')
    # when making predictions, should convert to string and add leading 0
    # e.g. train['article_id'] = '0' + train.article_id.astype('str')
path='/content/drive/MyDrive/Colab Notebooks/COMS4995/'

In [4]:
#df_articles = load_pkl(path+'df_articles.pkl')
#df_customers = load_pkl(path+'df_customers.pkl')
df_transactions_train = load_pkl(path+'df_transactions_train.pkl')

In [5]:
df_transactions_train["day_num"] = df_transactions_train["year"]*365 + df_transactions_train["month"]*30 + df_transactions_train["day"]

In [6]:
df_transactions_train["week_num"] = df_transactions_train["day_num"].apply(lambda x: x//7)

In [7]:
weekly_sales = df_transactions_train[['week_num', 'article_id', 'day']].groupby(["week_num", 'article_id']).count()
weekly_sales = weekly_sales.rename(columns={'day': 'count'})
weekly_sales

Unnamed: 0_level_0,Unnamed: 1_level_0,count
week_num,article_id,Unnamed: 2_level_1
943,108775015,115
943,108775044,57
943,108775051,21
943,110065001,13
943,110065002,11
...,...,...
1064,909080001,5
1064,909080002,2
1064,909091001,10
1064,909519003,4


In [8]:
df_transactions_train = df_transactions_train.join(weekly_sales, on=["week_num", 'article_id'])
df_transactions_train

Unnamed: 0,customer_id,article_id,price,sales_channel_id,year,month,day,day_num,week_num,count
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,18,9,20,6604,943,26
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,18,9,20,6604,943,22
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,18,9,20,6604,943,12
3,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,18,9,20,6604,943,908
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,18,9,20,6604,943,1092
...,...,...,...,...,...,...,...,...,...,...
31788319,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2,20,9,22,7336,1048,4
31788320,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2,20,9,22,7336,1048,2
31788321,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,918325001,0.043203,1,20,9,22,7336,1048,2
31788322,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1,20,9,22,7336,1048,11


In [9]:
last_week = max(df_transactions_train['week_num'])
weekly_sales = weekly_sales.reset_index().set_index('article_id')
df_transactions_train = df_transactions_train.join(weekly_sales.loc[weekly_sales['week_num']==last_week, ['count']],
    on='article_id', rsuffix="_targ")
df_transactions_train

Unnamed: 0,customer_id,article_id,price,sales_channel_id,year,month,day,day_num,week_num,count,count_targ
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,18,9,20,6604,943,26,
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,18,9,20,6604,943,22,
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,18,9,20,6604,943,12,
3,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,18,9,20,6604,943,908,
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,18,9,20,6604,943,1092,
...,...,...,...,...,...,...,...,...,...,...,...
31788319,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2,20,9,22,7336,1048,4,
31788320,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2,20,9,22,7336,1048,2,
31788321,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,918325001,0.043203,1,20,9,22,7336,1048,2,
31788322,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1,20,9,22,7336,1048,11,


In [10]:
df_transactions_train['count_targ'].fillna(0, inplace=True)
del weekly_sales
df_transactions_train['quotient'] = df_transactions_train['count_targ'] / df_transactions_train['count']
df_transactions_train

Unnamed: 0,customer_id,article_id,price,sales_channel_id,year,month,day,day_num,week_num,count,count_targ,quotient
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,18,9,20,6604,943,26,0.0,0.0
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,18,9,20,6604,943,22,0.0,0.0
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,18,9,20,6604,943,12,0.0,0.0
3,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,18,9,20,6604,943,908,0.0,0.0
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,18,9,20,6604,943,1092,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
31788319,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2,20,9,22,7336,1048,4,0.0,0.0
31788320,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2,20,9,22,7336,1048,2,0.0,0.0
31788321,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,918325001,0.043203,1,20,9,22,7336,1048,2,0.0,0.0
31788322,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1,20,9,22,7336,1048,11,0.0,0.0


In [11]:
last_day = max(df_transactions_train['day_num'])

In [12]:
df_transactions_train = df_transactions_train[df_transactions_train['year']>18]
df_transactions_train

Unnamed: 0,customer_id,article_id,price,sales_channel_id,year,month,day,day_num,week_num,count,count_targ,quotient
4411262,0005f3aab821000881d74b72fde2d9b3e4742cf8613668...,304766008,0.022017,2,19,1,1,6966,995,31,0.0,0.0
4411263,0005f3aab821000881d74b72fde2d9b3e4742cf8613668...,304766008,0.022017,2,19,1,1,6966,995,31,0.0,0.0
4411264,0005f3aab821000881d74b72fde2d9b3e4742cf8613668...,304766008,0.022017,2,19,1,1,6966,995,31,0.0,0.0
4411265,000ca60ca6d8de6d3b4689e2da829d69db5947bd89084c...,662916002,0.022017,2,19,1,1,6966,995,13,0.0,0.0
4411266,000ca60ca6d8de6d3b4689e2da829d69db5947bd89084c...,740909001,0.042356,2,19,1,1,6966,995,30,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
31788319,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2,20,9,22,7336,1048,4,0.0,0.0
31788320,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2,20,9,22,7336,1048,2,0.0,0.0
31788321,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,918325001,0.043203,1,20,9,22,7336,1048,2,0.0,0.0
31788322,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1,20,9,22,7336,1048,11,0.0,0.0


In [None]:
purchase_dict = {}

for i in tqdm(df_transactions_train.index):
    cust_id = df_transactions_train.at[i, 'customer_id']
    art_id = df_transactions_train.at[i, 'article_id']
    day_num = df_transactions_train.at[i, 'day_num']

    if cust_id not in purchase_dict:
        purchase_dict[cust_id] = {}

    if art_id not in purchase_dict[cust_id]:
        purchase_dict[cust_id][art_id] = 0
    
    x = max(1, last_day - day_num)

    a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
    y = a / np.sqrt(x) + b * np.exp(-c*x) - d

    value = df_transactions_train.at[i, 'quotient'] * max(0, y)
    purchase_dict[cust_id][art_id] += value

target_sales = df_transactions_train.drop('customer_id', axis=1).groupby('article_id')['quotient'].sum()
general_pred = target_sales.nlargest(12).index.tolist()

100%|██████████| 27377062/27377062 [33:43<00:00, 13532.48it/s]

In [None]:
import json

with open("purchase_dict.json", "w") as outfile:
    json.dump(purchase_dict, outfile)

In [None]:
with open('purchase_dict.json') as json_file:
    purchase_dict = json.load(json_file)

In [None]:
N=12
pairs = np.load(path + '/pairs_cudf.npy',allow_pickle=True).item()
sub = pd.read_csv('sample_submission.csv')

pred_list = []
for cust_id in tqdm(sub['customer_id']):
    if cust_id in purchase_dict:
        series = pd.Series(purchase_dict[cust_id])
        series = series[series > 0]
        l = series.nlargest(N).index.tolist()
        tmp_l = l.copy()
        for elm in tmp_l:
            if len(l) < N and int(elm) in pairs.keys():
                itm = pairs[int(elm)]
                l.append('0' + str(itm))
        if len(l) < N:
            l = l + general_pred[:(N-len(l))]
    else:
        l = general_pred
    pred_list.append(' '.join(l))

sub['prediction'] = pred_list
sub.to_csv(path + 'submission4.csv',index=False)