In [1]:
# Based on https://www.kaggle.com/code/cdeotte/recommend-items-purchased-together-0-021
# Changes from the base notebook:
# - Cold-start (fallback) recommendations are chosen per age group
# - One paired item per purchased article
# - New: use 4 weeks of purchase history

# MAP@12 (all): 0.026641
# MAP@12 (cold start): 0.008750

In [2]:
# Experiment identifier; it is embedded in the submission file name.
EXP = '010'
# Suffix selecting which train/pairs files are read and which submission file is written.
FOLD = '_fold1' # specify like '_fold0'; use '' when training on all data

# Recommend Items Frequently Purchased Together
This notebook demonstrates how recommending items that are frequently purchased together is effective. The current best scoring public notebook [here][1] recommends to each customer that customer's last purchases and scores public LB 0.020. In this notebook, we begin with that idea and additionally recommend items that are frequently purchased together with a customer's previous purchases. This notebook improves the LB and scores LB 0.021. This notebook's strategy is as follows:
* recommend items previously purchased [idea here][1]
* recommend items that are bought together with previous purchases [idea here][2]
* recommend popular items [idea here][1]

[1]: https://www.kaggle.com/hengzheng/time-is-our-best-friend-v2
[2]: https://www.kaggle.com/cdeotte/customers-who-bought-this-frequently-buy-this

In [3]:
import pandas as pd
import numpy as np

# Find Each Customer's Last Week of Purchases
Our final predictions will follow the row order of our dataframe. Each row of our dataframe will be a prediction. We will create the `predictionstring` later by `train.groupby('customer_id').article_id.sum()`. Since `article_id` is a string, when we groupby sum, it will concatenate all of a customer's predictions into a single string. It will also create the string in the order of the dataframe. So as we proceed in this notebook, we will order the dataframe the way we want our predictions ordered.

In [4]:
train = pd.read_parquet(f'train{FOLD}.pqt')

# Attach each customer's own most recent purchase date, then express every
# transaction as "days before that customer's last purchase".
last_purchase = (
    train.groupby('customer_id')['t_dat']
    .max()
    .rename('max_dat')
    .reset_index()
)
train = train.merge(last_purchase, on='customer_id', how='left')
train['diff_dat'] = (train['max_dat'] - train['t_dat']).dt.days

# Nested look-back windows (4, 3, 2 and 1 weeks). Each window is relative to
# the customer's last purchase date, not to the newest date in the data set.
train4 = train[train['diff_dat'] < 28].copy()
train3 = train4[train4['diff_dat'] < 21].copy()
train2 = train3[train3['diff_dat'] < 14].copy()
train1 = train2[train2['diff_dat'] < 7].copy()
print('Train1 shape:',train1.shape, 'Train2 shape:',train2.shape, 'Train3 shape:',train3.shape, 'Train4 shape:',train4.shape)

Train1 shape: (5184732, 5) Train2 shape: (5614738, 5) Train3 shape: (5972932, 5) Train4 shape: (6317958, 5)


In [5]:
# (1) Recommend Most Often Previously Purchased Items

# Number of rows per (customer, article) pair inside the 1-week window.
pair_counts = (
    train1.groupby(['customer_id','article_id'])['t_dat']
    .count()
    .rename('ct')
    .reset_index()
)

# Rank by purchase count, then recency; keep one row per (customer, article).
# The frame is re-sorted after de-duplication so it ends up ordered by
# (ct, t_dat) descending.
rank_keys = ['ct','t_dat']
train1 = (
    train1.merge(pair_counts, on=['customer_id','article_id'], how='left')
    .sort_values(rank_keys, ascending=False)
    .drop_duplicates(['customer_id','article_id'])
    .sort_values(rank_keys, ascending=False)
)
train1.head(3)

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat,ct
1164094,2019-07-16,2729025827381139556,719348003,2019-07-16,0,100
80390,2018-10-04,4485518665254175540,557247001,2018-10-04,0,86
2171280,2020-03-06,-906958334866810496,852521001,2020-03-06,0,81


In [6]:
# (2) Recommend Items Purchased Together

# Load the single-paired-item lookup for this fold.
# NOTE: allow_pickle=True unpickles arbitrary Python objects, so only load
# this file from a trusted source.
pairs_path = f'../input/hmitempairs/pairs_cudf{FOLD}_1item.npy'
# .item() unwraps the 0-d object array into the stored Python object
# (presumably a dict of article_id -> paired article_id; confirm against the file).
pairs = np.load(pairs_path, allow_pickle=True).item()

# Articles with no entry in the lookup get NaN in article_id2.
train1['article_id2'] = train1['article_id'].map(pairs)

train1.head(3)

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat,ct,article_id2
1164094,2019-07-16,2729025827381139556,719348003,2019-07-16,0,100,719348001.0
80390,2018-10-04,4485518665254175540,557247001,2018-10-04,0,86,557247003.0
2171280,2020-03-06,-906958334866810496,852521001,2020-03-06,0,81,610776002.0


In [7]:
# RECOMMENDATION OF PAIRED ITEMS
# Keep rows that have a paired article, one row per (customer, paired article),
# and expose the paired article under the regular `article_id` column name.
train1_pair = (
    train1[['customer_id','article_id2']]
    .dropna(subset=['article_id2'])
    .drop_duplicates(['customer_id','article_id2'])
    .rename(columns={'article_id2':'article_id'})
)

In [8]:
# (1) Recommend Most Often Previously Purchased Items

# Number of rows per (customer, article) pair inside the 2-week window.
pair_counts = (
    train2.groupby(['customer_id','article_id'])['t_dat']
    .count()
    .rename('ct')
    .reset_index()
)

# Rank by purchase count, then recency; keep one row per (customer, article).
# The frame is re-sorted after de-duplication so it ends up ordered by
# (ct, t_dat) descending.
rank_keys = ['ct','t_dat']
train2 = (
    train2.merge(pair_counts, on=['customer_id','article_id'], how='left')
    .sort_values(rank_keys, ascending=False)
    .drop_duplicates(['customer_id','article_id'])
    .sort_values(rank_keys, ascending=False)
)
train2.head(3)

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat,ct
1219771,2019-07-16,2729025827381139556,719348003,2019-07-16,0,100
83376,2018-10-04,4485518665254175540,557247001,2018-10-04,0,86
2282621,2020-03-06,-906958334866810496,852521001,2020-03-06,0,81


In [9]:
# (2) Recommend Items Purchased Together

# Load the single-paired-item lookup for this fold.
# NOTE: allow_pickle=True unpickles arbitrary Python objects, so only load
# this file from a trusted source.
pairs_path = f'../input/hmitempairs/pairs_cudf{FOLD}_1item.npy'
# .item() unwraps the 0-d object array into the stored Python object
# (presumably a dict of article_id -> paired article_id; confirm against the file).
pairs = np.load(pairs_path, allow_pickle=True).item()

# Articles with no entry in the lookup get NaN in article_id2.
train2['article_id2'] = train2['article_id'].map(pairs)

train2.head(3)

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat,ct,article_id2
1219771,2019-07-16,2729025827381139556,719348003,2019-07-16,0,100,719348001.0
83376,2018-10-04,4485518665254175540,557247001,2018-10-04,0,86,557247003.0
2282621,2020-03-06,-906958334866810496,852521001,2020-03-06,0,81,610776002.0


In [10]:
# RECOMMENDATION OF PAIRED ITEMS
# Keep rows that have a paired article, one row per (customer, paired article),
# and expose the paired article under the regular `article_id` column name.
train2_pair = (
    train2[['customer_id','article_id2']]
    .dropna(subset=['article_id2'])
    .drop_duplicates(['customer_id','article_id2'])
    .rename(columns={'article_id2':'article_id'})
)

In [11]:
# (1) Recommend Most Often Previously Purchased Items

# Number of rows per (customer, article) pair inside the 3-week window.
pair_counts = (
    train3.groupby(['customer_id','article_id'])['t_dat']
    .count()
    .rename('ct')
    .reset_index()
)

# Rank by purchase count, then recency; keep one row per (customer, article).
# The frame is re-sorted after de-duplication so it ends up ordered by
# (ct, t_dat) descending.
rank_keys = ['ct','t_dat']
train3 = (
    train3.merge(pair_counts, on=['customer_id','article_id'], how='left')
    .sort_values(rank_keys, ascending=False)
    .drop_duplicates(['customer_id','article_id'])
    .sort_values(rank_keys, ascending=False)
)
train3.head(3)

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat,ct
1260447,2019-07-16,2729025827381139556,719348003,2019-07-16,0,100
85514,2018-10-04,4485518665254175540,557247001,2018-10-04,0,86
2364813,2020-03-06,-906958334866810496,852521001,2020-03-06,0,81


In [12]:
# (2) Recommend Items Purchased Together

# Load the single-paired-item lookup for this fold.
# NOTE: allow_pickle=True unpickles arbitrary Python objects, so only load
# this file from a trusted source.
pairs_path = f'../input/hmitempairs/pairs_cudf{FOLD}_1item.npy'
# .item() unwraps the 0-d object array into the stored Python object
# (presumably a dict of article_id -> paired article_id; confirm against the file).
pairs = np.load(pairs_path, allow_pickle=True).item()

# Articles with no entry in the lookup get NaN in article_id2.
train3['article_id2'] = train3['article_id'].map(pairs)

train3.head(3)

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat,ct,article_id2
1260447,2019-07-16,2729025827381139556,719348003,2019-07-16,0,100,719348001.0
85514,2018-10-04,4485518665254175540,557247001,2018-10-04,0,86,557247003.0
2364813,2020-03-06,-906958334866810496,852521001,2020-03-06,0,81,610776002.0


In [13]:
# RECOMMENDATION OF PAIRED ITEMS
# Keep rows that have a paired article, one row per (customer, paired article),
# and expose the paired article under the regular `article_id` column name.
train3_pair = (
    train3[['customer_id','article_id2']]
    .dropna(subset=['article_id2'])
    .drop_duplicates(['customer_id','article_id2'])
    .rename(columns={'article_id2':'article_id'})
)

In [14]:
# (1) Recommend Most Often Previously Purchased Items

# Number of rows per (customer, article) pair inside the 4-week window.
pair_counts = (
    train4.groupby(['customer_id','article_id'])['t_dat']
    .count()
    .rename('ct')
    .reset_index()
)

# Rank by purchase count, then recency; keep one row per (customer, article).
# The frame is re-sorted after de-duplication so it ends up ordered by
# (ct, t_dat) descending.
rank_keys = ['ct','t_dat']
train4 = (
    train4.merge(pair_counts, on=['customer_id','article_id'], how='left')
    .sort_values(rank_keys, ascending=False)
    .drop_duplicates(['customer_id','article_id'])
    .sort_values(rank_keys, ascending=False)
)
train4.head(3)

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat,ct
594148,2019-02-04,-7250849952617471376,570002001,2019-02-11,7,170
853929,2019-04-15,2923286585288886018,685347001,2019-05-09,24,130
1296427,2019-07-16,2729025827381139556,719348003,2019-07-16,0,100


In [15]:
# (2) Recommend Items Purchased Together

# Load the single-paired-item lookup for this fold.
# NOTE: allow_pickle=True unpickles arbitrary Python objects, so only load
# this file from a trusted source.
pairs_path = f'../input/hmitempairs/pairs_cudf{FOLD}_1item.npy'
# .item() unwraps the 0-d object array into the stored Python object
# (presumably a dict of article_id -> paired article_id; confirm against the file).
pairs = np.load(pairs_path, allow_pickle=True).item()

# Articles with no entry in the lookup get NaN in article_id2.
train4['article_id2'] = train4['article_id'].map(pairs)

train4.head(3)

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat,ct,article_id2
594148,2019-02-04,-7250849952617471376,570002001,2019-02-11,7,170,570002002.0
853929,2019-04-15,2923286585288886018,685347001,2019-05-09,24,130,685347002.0
1296427,2019-07-16,2729025827381139556,719348003,2019-07-16,0,100,719348001.0


In [16]:
# RECOMMENDATION OF PAIRED ITEMS
# Keep rows that have a paired article, one row per (customer, paired article),
# and expose the paired article under the regular `article_id` column name.
train4_pair = (
    train4[['customer_id','article_id2']]
    .dropna(subset=['article_id2'])
    .drop_duplicates(['customer_id','article_id2'])
    .rename(columns={'article_id2':'article_id'})
)

In [17]:
# CONCATENATE PAIRED ITEM RECOMMENDATION

# From here on only the (customer, article) pairs are needed.
id_cols = ['customer_id','article_id']
train1, train2, train3, train4 = (
    frame[id_cols] for frame in (train1, train2, train3, train4)
)

# Stacking order sets priority: for every window the customer's own purchases
# come first, followed by that window's paired items; shorter windows precede
# longer ones. drop_duplicates keeps the first (highest-priority) occurrence.
candidate_frames = [
    train1, train1_pair,
    train2, train2_pair,
    train3, train3_pair,
    train4, train4_pair,
]
train = pd.concat(candidate_frames, axis=0, ignore_index=True)
# Paired ids arrive as floats (the lookup produced NaN for misses); cast back to int.
train['article_id'] = train['article_id'].astype('int32')
train = train.drop_duplicates(['customer_id','article_id'])

In [18]:
# CONVERT RECOMMENDATIONS INTO SINGLE STRING

# Build the ' 0'-prefixed tokens in a separate Series instead of overwriting
# train.article_id, so re-running this cell cannot prepend the prefix twice.
# (The '0' presumably restores the leading zero dropped by the integer id; confirm.)
article_tokens = ' 0' + train['article_id'].astype('str')

# Summing strings within a group concatenates them in row order, so each
# customer's tokens keep the order of the rows in `train`.
preds = article_tokens.groupby(train['customer_id']).sum().reset_index()
preds.columns = ['customer_id','prediction']
preds.head()

Unnamed: 0,customer_id,prediction
0,-9223352921020755230,0673396002 0812167004 0706016001 0812167002
1,-9223343869995384291,0908292002 0910601003 0903926002 0865929007 0...
2,-9223321797620987725,0580600006 0610776035 0688018003 0610776002
3,-9223319430705797669,0470985003 0562245001 0646592007 0504155001 0...
4,-9223308614576639426,0750423005 0750423001


In [19]:
# (3) Recommend Most Popular Items

train = pd.read_parquet(f'train{FOLD}.pqt')

# Restrict to the final 7 days of transactions, measured from the newest
# transaction date in the data.
train['t_dat'] = pd.to_datetime(train['t_dat'])
max_dat = train['t_dat'].max()
train['diff_dat'] = (max_dat - train['t_dat']).dt.days
train = train[train['diff_dat'] < 7]

# Overall best sellers of that week, formatted like the per-customer predictions.
top12_ids = train['article_id'].value_counts().index.astype('str')[:12]
top12 = ' 0' + ' 0'.join(top12_ids)
print("Top 12 popular items:")
print( top12 )

# Customer table: convert the hex customer id (last 16 hex digits) to int64
# and bucket ages into bins; labels=False yields integer bin codes.
customers = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/customers.csv')
customers['customer_id'] = customers['customer_id'].apply(lambda hex_id: int(hex_id[-16:],16) ).astype('int64')
age_edges = [10, 20, 30, 40, 50, 60, 70, 100]
customers['age_bin'] = pd.cut(customers['age'], bins=age_edges, labels=False)

# Best sellers of the same week, computed separately for every age bin.
train = train.merge(customers[['customer_id', 'age', 'age_bin']], how='left')
popular_items = train.groupby('age_bin')['article_id'].value_counts()
popular_items_dict = {
    age_bin: ' 0'+' 0'.join(popular_items[age_bin][:12].index.astype('str'))
    for age_bin in popular_items.index.levels[0]
}
popular_items_sr = pd.Series(popular_items_dict, name='top_12_popular_items', dtype='str')
popular_items_sr

Top 12 popular items:
 0909370001 0865799006 0918522001 0924243001 0448509014 0751471001 0809238001 0918292001 0762846027 0809238005 0673677002 0923758001


0.0     0685814003 0448509014 0918522001 0715624001 0...
1.0     0909370001 0865799006 0924243001 0809238001 0...
2.0     0909370001 0865799006 0918525001 0909371001 0...
3.0     0909370001 0751471001 0673677002 0910601003 0...
4.0     0918522001 0751471001 0751471043 0910601003 0...
5.0     0918522001 0908799002 0896152002 0924243001 0...
6.0     0736870001 0796210001 0908799002 0865799006 0...
Name: top_12_popular_items, dtype: object

# Write Submission CSV
We will merge our predictions onto `sample_submission.csv` and submit to Kaggle.

In [20]:
sub = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')
sub = sub[['customer_id']]
# Integer key matching the customer ids used in `preds` and `customers`.
sub['customer_id_2'] = sub['customer_id'].apply(lambda hex_id: int(hex_id[-16:],16) ).astype('int64')

# Personal recommendations; customers without any get an empty string.
personal_preds = preds.rename(columns={'customer_id':'customer_id_2'})
sub = sub.merge(personal_preds, on='customer_id_2', how='left').fillna('')

# Fallback items: the customer's age-bin best sellers, or the overall top 12
# when the age bin is missing or has no entry.
age_lookup = customers[['customer_id', 'age_bin']].rename(columns={'customer_id': 'customer_id_2'})
sub = sub.merge(age_lookup, on='customer_id_2', how='left')
sub['top12_popular_items'] = sub['age_bin'].map(popular_items_sr)
sub['top12_popular_items'] = sub['top12_popular_items'].fillna(top12).astype('str')

# NOTE(review): the fallback is appended without de-duplicating against the
# customer's own items, so the same article can occupy more than one slot.
sub['prediction'] = sub['prediction'] + sub['top12_popular_items']
sub = sub[['customer_id', 'prediction']]
sub['prediction'] = sub['prediction'].str.strip()
# 131 characters = 12 tokens of 10 characters plus 11 separating spaces
# (assumes every article id is 9 digits before the '0' prefix; TODO confirm).
sub['prediction'] = sub['prediction'].str[:131]
sub.to_csv(f'../submissions/{EXP}_submission{FOLD}.csv',index=False)
sub.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0909370001 0751471001 06...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0706016001 0909370001 0865799006 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0794321011 0909370001 0865799006 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0742079001 0732413001 0730683001 0372860001 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0896152002 0730683050 0927530004 0791587015 08...
