### Import required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import os
from tqdm import tqdm

### Load data

In [2]:
path = '../input/h-and-m-personalized-fashion-recommendations/'
articles = pd.read_csv(path + 'articles.csv')
customers = pd.read_csv(path + 'customers.csv')
transactions = pd.read_csv(path + 'transactions_train.csv')
sample_submission = pd.read_csv(path + 'sample_submission.csv')

### Data Pre-processing

In [3]:
# handling missing values
articles = articles.drop(columns=['detail_desc'])
customers['FN'] = customers['FN'].fillna(0)
customers['Active'] = customers['Active'].fillna(0)
customers['club_member_status'] = customers['club_member_status'].fillna('INACTIVE')
customers['age'] = customers['age'].fillna(customers['age'].median())

In [4]:
# converting data types
transactions['sales_channel_id'] = transactions['sales_channel_id'].astype('category')
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

In [5]:
# merge data
merged_df = transactions.merge(articles, on='article_id', how='left')
merged_df = merged_df.merge(customers.drop(columns=['postal_code']), on='customer_id', how='left')

merged_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,...,index_group_name,section_no,section_name,garment_group_no,garment_group_name,FN,Active,club_member_status,fashion_news_frequency,age
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,663713,Atlanta Push Body Harlow,283,Underwear body,Underwear,...,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear",0.0,0.0,ACTIVE,NONE,24.0
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,541518,Rae Push (Melbourne) 2p,306,Bra,Underwear,...,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear",0.0,0.0,ACTIVE,NONE,24.0
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,505221,Inca Jumper,252,Sweater,Garment Upper body,...,Divided,58,Divided Selected,1003,Knitwear,1.0,1.0,ACTIVE,Regularly,32.0
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,685687,W YODA KNIT OL OFFER,252,Sweater,Garment Upper body,...,Ladieswear,15,Womens Everyday Collection,1023,Special Offers,1.0,1.0,ACTIVE,Regularly,32.0
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,685687,W YODA KNIT OL OFFER,252,Sweater,Garment Upper body,...,Ladieswear,15,Womens Everyday Collection,1023,Special Offers,1.0,1.0,ACTIVE,Regularly,32.0


## Method 1: Recommend most often previously purchased items for each customer

In [6]:
# group transactions by customer and article, then count occurrences
customer_purchase_count = transactions.groupby(['customer_id', 'article_id']).size().reset_index(name='purchase_count')
customer_purchase_count.head()

Unnamed: 0,customer_id,article_id,purchase_count
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,176209023,1
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601006,2
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601043,1
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,607642008,1
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,625548001,1


In [7]:
# sort by purchase count in descending order for each customer
customer_purchase_count = customer_purchase_count.sort_values(by=['customer_id', 'purchase_count'], ascending=[True, False])
customer_purchase_count.head()

Unnamed: 0,customer_id,article_id,purchase_count
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601006,2
13,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,797065001,2
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,176209023,1
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601043,1
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,607642008,1


In [8]:
# extract most purchased articles for each customer
# get top 12 for each customer
top_purchases = customer_purchase_count.groupby('customer_id').head(12)  
top_purchases.head()

Unnamed: 0,customer_id,article_id,purchase_count
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601006,2
13,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,797065001,2
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,176209023,1
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601043,1
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,607642008,1


In [9]:
# create a dictionary customer_id, article_id with the count
top_purchases_dict={}

for i,x in enumerate(zip(top_purchases['customer_id'], top_purchases['article_id'])):
    cust_id, art_id = x
    if cust_id not in top_purchases_dict:
        top_purchases_dict[cust_id] = {}
    
    if art_id not in top_purchases_dict[cust_id]:
        top_purchases_dict[cust_id][art_id] = 0
    
    top_purchases_dict[cust_id][art_id] += 1

## Method 2: Recommend last week's most popular items 

In [10]:
# filter transactions for the last week
end_date = transactions['t_dat'].max() 
start_date = end_date - timedelta(days=7)
last_week_transactions = transactions[(transactions['t_dat'] >= start_date) & (transactions['t_dat'] <= end_date)]

In [11]:
# find purchase counts for items in the last week
last_week_purchase_counts = last_week_transactions['article_id'].value_counts()

In [15]:
# get the top N items from last week
top_items = last_week_purchase_counts.head(12).index.tolist()
top_items

[924243001,
 924243002,
 923758001,
 918522001,
 909370001,
 866731001,
 751471001,
 915529003,
 915529005,
 448509014,
 762846027,
 714790020]

## Merging the two predictions

In [16]:
# get unique list of customers
all_customers = sample_submission['customer_id'].unique()
all_customers.shape

(1371980,)

In [17]:
# define a list for final output
prediction_list = []

In [18]:
dummy_list = list(map(str, top_items))[:12]
dummy_pred = ' '.join(dummy_list)

In [19]:
# merging the two predictions
tqdm.pandas()
for i, cust_id in tqdm(enumerate(all_customers)):
    if cust_id in top_purchases_dict:
        l = sorted((top_purchases_dict[cust_id]).items(), key=lambda x: x[1], reverse=True)
        l = [y[0] for y in l]
        l = list(map(str,l))
        if len(l)>12:
            s = ' '.join(l[:12])
        else:
            s = ' '.join(l+dummy_list[:(12-len(l))])
    else:
        s = dummy_pred
    prediction_list.append(s)

1371980it [00:13, 100528.19it/s]


In [20]:
# converting the customer_id array to a dataframe
all_customers_prediction=pd.DataFrame({'customer_id':all_customers})
all_customers_prediction.head()

Unnamed: 0,customer_id
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...


In [21]:
# merging predictions with customer_id
all_customers_prediction['prediction'] = prediction_list
print(all_customers_prediction.shape)
all_customers_prediction.head()

(1371980, 2)


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601006 797065001 176209023 568601043 607642...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,811835004 351484002 689898002 723529001 583558...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,351484002 663713001 750424014 870304002 541518...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,732413001 742079001 924243001 924243002 923758...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,399061015 589440005 634249005 677049001 698286...


In [22]:
# save to a file or use it directly for submission
all_customers_prediction.to_csv('submission.csv', index=False)