In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [3]:
# Load the training transactions. article_id is explicitly read as a string
# instead of letting pandas infer its type.
train_df = pd.read_csv(
    '/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    dtype=dict(article_id=str),
)
print('Shape of training data frame: ', train_df.shape)
train_df.head()

In [11]:
# Build three "recent history" subsets of the transactions, each containing
# every row dated on or after its cutoff, and derive from each subset:
#   * a per-customer purchase-count dictionary, and
#   * the 12 most frequently purchased articles overall.


def _count_purchases(transactions_df):
    """Count how many times each customer bought each article.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Frame with 'customer_id' and 'article_id' columns.

    Returns
    -------
    collections.defaultdict
        Maps customer_id -> {article_id: number of rows for that pair}.
        Within each inner dict, articles are inserted in the order they
        first appear in ``transactions_df``.
    """
    purchase_counts = defaultdict(dict)
    pairs = zip(transactions_df['customer_id'], transactions_df['article_id'])
    for cust_id, art_id in pairs:
        cust_counts = purchase_counts[cust_id]
        cust_counts[art_id] = cust_counts.get(art_id, 0) + 1
    return purchase_counts


def _top_articles(transactions_df, n=12):
    """Return the ``n`` most frequent article_ids in ``transactions_df``.

    The list is ordered from most to least frequent, as produced by
    ``Series.value_counts()``.
    """
    return list(transactions_df['article_id'].value_counts().index)[:n]


# Convert the date column to datetime so it can be compared against cutoffs.
train_df['t_dat'] = pd.to_datetime(train_df['t_dat'])

# NOTE(review): the cutoffs step by 7 days (08-31 -> 09-07) and then by
# 8 days (09-07 -> 09-15); confirm that '2020-09-15' is the intended start
# of the "latest 1 week" window.
train_df_3w = train_df[train_df['t_dat'] >= pd.to_datetime('2020-08-31')].copy()
train_df_2w = train_df[train_df['t_dat'] >= pd.to_datetime('2020-09-07')].copy()
train_df_1w = train_df[train_df['t_dat'] >= pd.to_datetime('2020-09-15')].copy()

# One dictionary per window: customer_id -> {article_id: purchase count}
transactions_3w = _count_purchases(train_df_3w)
transactions_2w = _count_purchases(train_df_2w)
transactions_1w = _count_purchases(train_df_1w)

print('Number of customers in latest 3 weeks of data ', len(transactions_3w))
print('Number of customers in latest 2 weeks of data ', len(transactions_2w))
print('Number of customers in latest 1 week of data ', len(transactions_1w))

# Overall best sellers in each window
top12_articles_3w = _top_articles(train_df_3w)
top12_articles_2w = _top_articles(train_df_2w)
top12_articles_1w = _top_articles(train_df_1w)

In [6]:
# Load the sample submission file and preview it
sample_sub_df = pd.read_csv(
    '/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv'
)
print(sample_sub_df.shape)
sample_sub_df.head()

In [7]:
def _predict_for_customer(purchase_counts, fallback_articles, k):
    """Build the space-separated prediction string for one customer.

    Parameters
    ----------
    purchase_counts : dict
        Maps article_id -> number of purchases by this customer.
    fallback_articles : list of str
        Articles used to pad the prediction when the customer has fewer
        than ``k`` distinct articles.
    k : int
        Maximum number of articles in the prediction.

    Returns
    -------
    str
        Up to ``k`` article_ids joined by single spaces: the customer's own
        articles ordered by descending purchase count (ties keep dictionary
        order because the sort is stable), then fallback articles in their
        given order. Fallback articles already chosen are skipped so that
        no article is listed twice.
    """
    ranked = sorted(purchase_counts.items(), key=lambda item: item[1], reverse=True)
    picks = [article for article, _ in ranked][:k]
    chosen = set(picks)
    for article in fallback_articles:
        if len(picks) >= k:
            break
        if article not in chosen:
            picks.append(article)
            chosen.add(article)
    return ' '.join(picks)


# .copy() makes output_df an independent frame, so adding the 'prediction'
# column below does not write into a slice of sample_sub_df.
output_df = sample_sub_df[['customer_id']].copy()
prediction_list = []
num_predictions = 12

top12_articles_1w_str = ' '.join(top12_articles_1w)

# For each customer, predict their next purchases from the purchases they made
# in the latest k weeks, padded with that window's overall top articles.
# Check k=1 (1 week) first; if the customer has no data there, check k=2,
# then k=3. Customers absent from every window get the 1-week top articles.
history_windows = [
    (transactions_1w, top12_articles_1w),
    (transactions_2w, top12_articles_2w),
    (transactions_3w, top12_articles_3w),
]

for cust_id in sample_sub_df['customer_id']:
    for window_transactions, window_top_articles in history_windows:
        if cust_id in window_transactions:
            cust_pred = _predict_for_customer(
                window_transactions[cust_id], window_top_articles, num_predictions
            )
            break
    else:
        # The loop finished without a break: no history in any window.
        cust_pred = top12_articles_1w_str
    prediction_list.append(cust_pred)

output_df['prediction'] = prediction_list
print(output_df.shape)
output_df.head()

In [10]:
# Write the predictions to a CSV file without the DataFrame index
output_df.to_csv(path_or_buf='output2.csv', index=False)