In [1]:
# Installing latest implicit library for ALS

!pip install --upgrade implicit

In [2]:
# Importing required libraries 

import os
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
import implicit
from implicit.evaluation import mean_average_precision_at_k
import glob
#import reco
from tqdm import tqdm
import datetime

# Time Decaying Popularity

This is a heuristic-based model. Other competitors work show that popularity and repetition is the major trend of transactions.
So we decide to use the following two heuristics to guide our model:
1. Recommend most bought items from last 4 weeks.
1. Recommend popular items from last 2 weeks weighted down by time.This is a heuristic-based model. Other competitors work show that popularity and repetition is the major trend of transactions.

## Data Preprocessing

In [3]:
# read in the transaction data again
df_transactions = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv', dtype={'article_id': str}, parse_dates=['t_dat'])
df_transactions["t_dat"] = pd.to_datetime(df_transactions["t_dat"])

Because of the large volume of data, we will only use the last five weeks of data. Among those five weeks, the first four weeks will be used as four training set and the last week will be our validation set

In [8]:
# function to generate four training sets each with one week of transaction data
def generate_train_sets(df, dates=[(9,16), (9,8), (9,1), (8,23), (8,15)]):
    train_sets = []
    for i in range(len(dates)-1):
        m1, d1 = dates[i]
        m2, d2 = dates[i+1]
        df_train = df.loc[(df['t_dat']>= datetime.datetime(2020, m2, d2)) & (df['t_dat'] < datetime.datetime(2020, m1, d1))]
        train_sets.append(df_train)
    return train_sets

In [9]:
train_sets = generate_train_sets(df_transactions)
# use the last week of transaction data as our validation data
val_set = df_transactions.loc[df_transactions["t_dat"] >= datetime.datetime(2020,9,16)]

In [10]:
# function to generate X_train, y_train for each training set (each week of data)
# X_train: each entry is a customer_id
# y_train: each entry is a list of all purchased items per user (has repetitions)
def generate_purchase_lists(train_sets):
    purchase_lists = []
    for df_train in train_sets:
        df_purchase = df_train.groupby(['customer_id']).agg({'article_id':lambda x:list(x)})
        purchase_lists.append(df_purchase)
    return purchase_lists

In [11]:
purchase_lists = generate_purchase_lists(train_sets)

### Time decay based popularity for items

Simply speaking, items bought more recently having more weight in the popularity list. In simple words, an item A bought 5 times on the first day of the train period is inferior than an item B bought 4 times on the last day of the train period.

$\text{pop_factor} = \frac{1}{\text{# days to 2020-9-16}}$

find most popular items in the last two weeks ranked by their pop_factor

In [12]:
# Calculate popularity factor for each row
df_train = pd.concat(train_sets[:2], axis=0)
df_train['pop_factor'] = df_train['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,16)-x).days)
# df_train[['t_dat', 'pop_factor']].drop_duplicates()

# sort article_id by the sum of its pop_factor
popular_items = df_train.groupby(['article_id']).agg({'pop_factor':'sum'}).reset_index().sort_values('pop_factor', ascending=False)['article_id'].to_list()

In [13]:
# generate list of all purchases per user in validation set 
df_purchases_val = val_set.groupby(['customer_id']).agg({'article_id':lambda x:list(x)}).reset_index()
# generate X_val and y_val
X_val = df_purchases_val['customer_id'].tolist()
y_val = df_purchases_val['article_id'].tolist()

In [14]:
# The function to calculate our average precision @ K
def apk(actual, predicted, k=12):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

# Develop on the previous function and calculate MAP@K
def mapk(actual, predicted, k=12):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

Validate our model on the validation set with metric MAP@12

In [15]:
from collections import Counter
def make_prediction(X):
    predictions = []
    for u in tqdm(X):
        pred = []
        for i in range(len(purchase_lists)):
            if u in purchase_lists[i].index:
                pred += [a_id for a_id, count in Counter(purchase_lists[i].loc[u].tolist()[0]).most_common()][:12]

        pred += popular_items[:12-len(pred)]
        predictions.append(pred)
    return predictions

In [16]:
ypred = make_prediction(X_val)
print(mapk(y_val, ypred))

## Predict on the test set and generate submission csv

In [17]:
submission = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv")
submission.head()

# repeat the preprocessing process
train_sets = generate_train_sets(df_transactions, dates=[(9,23), (9,16), (9,8), (8,31), (8,23)])
purchase_lists = generate_purchase_lists(train_sets)
df_train = pd.concat(train_sets[:2], axis=0)
df_train['pop_factor'] = df_train['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,23)-x).days)
popular_items = df_train.groupby(['article_id']).agg({'pop_factor':'sum'}).reset_index().sort_values('pop_factor', ascending=False)['article_id'].to_list()

In [18]:
ypred = make_prediction(submission['customer_id'].tolist())
str_ypred = []
for pred in ypred:
    str_ypred.append(" ".join([str(i) for i in pred]))
submission['prediction'] = str_ypred
submission.to_csv("submission.csv", index=False)
submission.head()

The test score given by kaggle of our model is 0.0216 (MAP@12 score), ranked 1118 out of 2395 teams.

In [21]:
del df_transactions, train_sets, val_set, purchase_lists, popular_items, df_train, df_purchases_val, X_val, y_val, ypred, str_ypred

# Implicit ALS model

In [22]:
# Importing data

df_transactions = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv', dtype={'article_id': str}, parse_dates=['t_dat'])
sample_submission = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')
df_customers = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/customers.csv')
df_articles = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/articles.csv', dtype={'article_id': str})

In [23]:
def preprocessing(df_t, df_c, df_a):
    # Due to the large volume of data, we are only using the data after 2020-09-14
    df_t = df_t[df_t['t_dat'] > '2020-09-14']
    
    # create index for customer_id and article_id and create corresponding mapping between id and index
    customer_map = dict(list(enumerate(df_c['customer_id'].unique().tolist())))
    customer_map_inv = {customer_id: idx for idx, customer_id in customer_map.items()}
    article_map = dict(list(enumerate(df_a['article_id'].unique().tolist())))
    article_map_inv = {article_id: idx for idx, article_id in article_map.items()}
    
    # replace customer_id and article_id with index to save memory
    df_t['customer_id'] = df_t['customer_id'].apply(lambda x:customer_map_inv[x])
    df_t['article_id'] = df_t['article_id'].apply(lambda x:article_map_inv[x])
    
    return df_t, customer_map, article_map

In [24]:
df_transactions, customer_map, article_map = preprocessing(df_transactions, df_customers, df_articles)

In [25]:
del df_customers, df_articles

In [28]:
# function to generate coo_matrix (customer x article) from transaction dataframe
def generate_coo_matrix(df_t):
    row = df_t['customer_id'].values
    col = df_t['article_id'].values
    data = np.ones(df_t.shape[0])
    return coo_matrix((data, (row, col)), shape=(len(customer_map), len(article_map)))

In [29]:
train_coo = generate_coo_matrix(df_transactions)

## Tuning Hyperparameter factors & iterations

In [30]:
# Function to Split a dataframe into training (3 weeks) and validation data (1 week)
def split_data(df_t, num_days_val=7):
    threshold = df_t['t_dat'].max() - pd.Timedelta(num_days_val)

    df_train = df_t[df_t['t_dat'] < threshold]
    df_val = df_t[df_t['t_dat'] >= threshold]
    return df_train, df_val

# Function to split data into training and validation and generate corresponding matrices
# coo_train: training data in COO sparse format and as (customers x articles)
# csr_train: training data in CSR sparse format and as (customers x articles)
# csr_val:  validation data in CSR sparse format and as (customers x articles)
def generate_val_data(df_t, num_days_val=7):
    df_train, df_val = split_data(df_t, num_days_val)
    coo_train = generate_coo_matrix(df_train)
    coo_val = generate_coo_matrix(df_val)

    csr_train = coo_train.tocsr()
    csr_val = coo_val.tocsr()
    
    return coo_train, csr_train, csr_val

# Train an ALS model over matrices and validate with the evaluation matrix MAP@12
def validate(data, embed_dims=200, iters=20, alpha=0.01, show_progress=True):
    coo_train, csr_train, csr_val = data
    
    model = implicit.als.AlternatingLeastSquares(factors=embed_dims, 
                                                 iterations=iters, 
                                                 regularization=alpha, 
                                                 random_state=2022,
                                                 use_gpu=True)
    model.fit(coo_train, show_progress=show_progress)
    
    # The MAPK by implicit doesn't allow to calculate allowing repeated articles, which is the case.
    score = mean_average_precision_at_k(model, csr_train, csr_val, K=12, show_progress=show_progress, num_threads=4)
    if show_progress:
        print("Embedding_dimensions: {}, Iterations: {}, Regularization: {}".format(embed_dims, iters, alpha))
        print("MAP@12: ", score)
    return score

In [31]:
val_data = generate_val_data(df_transactions)

In [32]:
params_dict = {
    'embed_dims': [40, 50, 60, 100, 200, 500, 1000],
    'iters': [3, 5, 7, 9, 11],
    'alpha': [0.1, 0.05, 0.01]
}

In [33]:
# GridSearch
def gridsearch(params_dict, val_data):
    best_score = 0
    for embed_dims in params_dict['embed_dims']:
        for iters in params_dict['iters']:
            for alpha in params_dict['alpha']:
                score = validate(val_data, embed_dims, iters, alpha, show_progress=False)
                if score > best_score:
                    best_score = score
                    best_params = {'factors': embed_dims, 'iterations': iters, 'regularization': alpha}
                    print("Best MAP@12 found. Updating: "+str(best_params))
    return best_params

In [34]:
# The best performing parameters are as follow
best_params = gridsearch(params_dict, val_data)

In [35]:
del val_data

## Train over the full dataset and generate submission csv

In [36]:
# generate coo_matrix and csr_matrix using all transactions data

coo_train = generate_coo_matrix(df_transactions)
csr_train = coo_train.tocsr()

In [37]:
# train model
best_params['random_state'] = 2022
best_params['use_gpu'] = True
model_als = implicit.als.AlternatingLeastSquares(**best_params)
model_als.fit(coo_train, show_progress=True)

In [60]:
def generate_submission(model_als, csr_train, unique_customers, df_sup):
    # make top12 recommendation on all users in training data
    preds, _ = model_als.recommend(unique_customers, csr_train[unique_customers], N=12, filter_already_liked_items=False)
    # map indices to the original customer id
    customer_id = [customer_map[i] for i in unique_customers]
    # map indices to the original article id and join the 12 ids into a single string as required by the submission format
    prediction = []
    for pred in preds:
        prediction.append(" ".join([article_map[i] for i in pred]))
    # merge with our supplement predictions to fill in predictions for users unseen in training
    df_pred_als = pd.DataFrame({'customer_id': customer_id, 'prediction': prediction})
    df_final_pred = pd.merge(df_sup, df_pred_als, on='customer_id', how='left', suffixes=('_sup', '_als'))
    final_predictions = []
    for i in range(len(df_final_pred)):
        if df_final_pred.iloc[i, :]['prediction_als'] is np.nan:
            final_predictions.append(df_final_pred.iloc[i, :]['prediction_sup'])
        else:
            final_predictions.append(df_final_pred.iloc[i, :]['prediction_als'])
    df_final_pred['prediction'] = final_predictions
    df_final_pred = df_final_pred[['customer_id', 'prediction']]
    # save to csv
    df_final_pred.to_csv("submission.csv", index=False)
    
    return df_final_pred

In [62]:
# use predictions from our Time Decaying Popularity model as our supplements
df_sup = submission
unique_customers = df_transactions['customer_id'].unique().tolist()
df_final_pred = generate_submission(model_als, csr_train, unique_customers, df_sup)

In [63]:
df_final_pred.head()

The test score given by kaggle of our model is 0.0183 (MAP@12 score), ranked 1630 out of 2395 teams.