# Implicit ALS model

In [1]:
# Installing latest implicit library for ALS

!pip install --upgrade implicit

In [22]:
# Importing required libraries 

import os
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
import implicit
from implicit.evaluation import mean_average_precision_at_k
import glob
#import reco
from tqdm import tqdm
import datetime

In [3]:
# Load the competition tables.
# `article_id` is read as a string (not inferred as a number) because the ids are later
# joined verbatim into the space-separated prediction strings.

transactions = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    dtype={'article_id': str},
    parse_dates=['t_dat'],
)
sample_submission = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv'
)
customers = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/customers.csv'
)
articles = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/articles.csv',
    dtype={'article_id': str},
)

In [4]:
# Due to the large volume of data, keep only transactions dated strictly after 2020-09-14.

transactions = transactions.loc[lambda frame: frame['t_dat'] > '2020-09-14']
transactions.shape

In [5]:
# Assign consecutive integer indices (0..n-1) to customers and articles so they can be
# used as row / column positions of a sparse matrix.

all_customers = customers['customer_id'].unique().tolist()
all_articles = articles['article_id'].unique().tolist()

# index -> original id; used later to translate model output back into the original ids
customer_ids = dict(enumerate(all_customers))
article_ids = dict(enumerate(all_articles))

# `transactions` is a boolean-filtered slice of the frame read from disk. Writing columns
# into such a slice raises SettingWithCopyWarning, so build a new frame with `assign`.
transactions = transactions.assign(
    customer_id=transactions['customer_id'].map({u: uidx for uidx, u in customer_ids.items()}),
    article_id=transactions['article_id'].map({i: iidx for iidx, i in article_ids.items()}),
)

# An id missing from customers.csv / articles.csv would map to NaN and corrupt the matrix indices.
assert transactions[['customer_id', 'article_id']].notna().all().all(), \
    "transactions reference a customer or article that has no index"

# delete the original dataframes to save memory
del customers, articles

In [6]:
# Build the (customer x article) interaction matrix in COO format:
# one entry with value 1.0 per transaction row.

row = transactions['customer_id'].to_numpy()
col = transactions['article_id'].to_numpy()
data = np.ones(len(transactions))
coo_train = coo_matrix(
    (data, (row, col)),
    shape=(len(all_customers), len(all_articles)),
)
coo_train

In [11]:
# Smoke test: fit a small ALS model (10 factors, 2 iterations) on the training matrix

model = implicit.als.AlternatingLeastSquares(
    factors=10,
    iterations=2,
    use_gpu=True,
    calculate_training_loss=True,
    random_state=2022,
)
model.fit(coo_train)

## Tuning Hyperparameter factors & iterations

In [12]:
def to_customer_article_coo(transactions):
    """Turn a transactions frame into a sparse COO matrix of shape (customers x articles).

    Each row of `transactions` contributes one entry of 1.0 at position
    (customer_id, article_id); both columns are expected to already hold the integer
    indices assigned earlier in the notebook. The matrix shape comes from the
    module-level `all_customers` and `all_articles` lists.
    """
    ones = np.ones(len(transactions))
    customer_idx = transactions['customer_id'].values
    article_idx = transactions['article_id'].values
    matrix_shape = (len(all_customers), len(all_articles))
    return coo_matrix((ones, (customer_idx, article_idx)), shape=matrix_shape)

def split_data(transactions, validation_days=7):
    """Split transactions chronologically into a training and a validation frame.

    The validation frame holds the rows from the final `validation_days` days of the
    data, i.e. every row later than ``max(t_dat) - validation_days days``. All earlier
    rows form the training frame, so the input must span more than `validation_days`
    days for the training frame to be non-empty.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Must contain a datetime column `t_dat`.
    validation_days : int, default 7
        Length of the validation window in days.

    Returns
    -------
    tuple of (df_train, df_val)
    """
    # The unit must be explicit: a bare integer passed to pd.Timedelta is interpreted as
    # nanoseconds, which would shrink the validation window to the very last timestamp.
    validation_cut = transactions['t_dat'].max() - pd.Timedelta(days=validation_days)

    # Rows exactly at the cut-off belong to training, so for day-granular dates the
    # validation frame covers exactly `validation_days` distinct days.
    df_train = transactions[transactions['t_dat'] <= validation_cut]
    df_val = transactions[transactions['t_dat'] > validation_cut]
    return df_train, df_val

def get_val_matrices(transactions, validation_days=7):
    """Split `transactions` by time and build the sparse matrices needed for validation.

    Returns a dict with three (customers x articles) matrices:
      - 'coo_train': training interactions in COO format
      - 'csr_train': the same training interactions in CSR format
      - 'csr_val':   validation interactions in CSR format
    """
    df_train, df_val = split_data(transactions, validation_days=validation_days)

    train_coo = to_customer_article_coo(df_train)
    val_coo = to_customer_article_coo(df_val)

    return {
        'coo_train': train_coo,
        'csr_train': train_coo.tocsr(),
        'csr_val': val_coo.tocsr(),
    }

def validate(matrices, factors=200, iterations=20, regularization=0.01, show_progress=True):
    """Fit an ALS model on the training matrix and report MAP@12 on the validation matrix.

    `matrices` is the dict produced by get_val_matrices ('coo_train', 'csr_train',
    'csr_val'). `factors` is the embedding dimension; `iterations` and `regularization`
    are passed straight to the ALS model. Prints one summary line and returns the score.
    """
    coo_train, csr_train, csr_val = (
        matrices[key] for key in ('coo_train', 'csr_train', 'csr_val')
    )

    als = implicit.als.AlternatingLeastSquares(
        factors=factors,
        iterations=iterations,
        regularization=regularization,
        random_state=7,
        use_gpu=True,
    )
    als.fit(coo_train, show_progress=show_progress)

    # implicit's MAP@K cannot be computed with repeated articles allowed in the
    # predictions, although repeats do occur in this data.
    # TODO: change MAP@12 to a library that allows repeated articles in prediction
    map12 = mean_average_precision_at_k(
        als,
        csr_train,
        csr_val,
        K=12,
        show_progress=show_progress,
        num_threads=4,
    )
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> MAP@12: {map12:6.5f}")
    return map12

In [13]:
# Build the train / validation matrices once (default validation window) so that every
# grid-search run below is scored on the same split.
matrices = get_val_matrices(transactions)

In [14]:
# Grid search over the ALS hyper-parameters; keeps the configuration with the highest
# validation MAP@12.
# The running best starts below any attainable score so that the first configuration
# always initialises `best_params`. Starting at 0 left `best_params` undefined (NameError
# in the following cells) whenever no run scored strictly above 0.
best_map12 = float('-inf')
best_params = None
for factors in [40, 50, 60, 100, 200, 500, 1000]:
    for iterations in [3, 12, 14, 15, 20]:
        for regularization in [0.01]:
            map12 = validate(matrices, factors, iterations, regularization, show_progress=False)
            if map12 > best_map12:
                best_map12 = map12
                best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
                print(f"Best MAP@12 found. Updating: {best_params}")

# delete the matrices to save memory
del matrices

In [15]:
# The best performing parameters found by the grid search are as follows
best_params

## Train over the full dataset and generate submission csv

In [16]:
# Training over the full dataset: rebuild the interaction matrix from all loaded
# transactions (no validation hold-out). The CSR copy is what submit() slices per
# customer batch when asking the model for recommendations.

coo_train = to_customer_article_coo(transactions)
csr_train = coo_train.tocsr()

In [17]:
def train(coo_train, factors=200, iterations=15, regularization=0.01, show_progress=True):
    """Fit an ALS model on `coo_train` and return the fitted model.

    `factors`, `iterations` and `regularization` are forwarded to the ALS constructor;
    the random seed (7) and GPU usage are fixed. `show_progress` toggles the progress
    bar displayed while fitting.
    """
    hyper_params = dict(
        factors=factors,
        iterations=iterations,
        regularization=regularization,
        random_state=7,
        use_gpu=True,
    )
    als = implicit.als.AlternatingLeastSquares(**hyper_params)
    als.fit(coo_train, show_progress=show_progress)
    return als

# Display the hyper-parameters selected by the grid search (they are unpacked into train() next)
best_params

In [18]:
# Fit the final model on the full interaction matrix with the selected hyper-parameters
model = train(coo_train, **best_params)

In [19]:
# Submission

# Baseline predictions published by another competitor. submit() left-joins the ALS
# predictions onto this frame, so customers without an ALS prediction keep the baseline one.
heng_df = pd.read_csv('../input/heng-zhengs-time-is-our-best-friend-v2-submission/not_so_fancy_but_fast_benchmark.csv')


def submit(model, csr_train, custs, heng_df, submission_name="submission.csv"):
    """Generate 12 recommendations per customer and write the submission CSV.

    Parameters
    ----------
    model : fitted model exposing `recommend(userids, user_items, N=..., ...)`
    csr_train : CSR interaction matrix, indexed by customer index
    custs : list of customer indices to generate ALS recommendations for
    heng_df : baseline frame with `customer_id` and `prediction` columns; it defines the
        rows of the output and fills in customers that have no ALS prediction
    submission_name : path of the CSV file to write

    Returns
    -------
    DataFrame with columns `customer_id` and `prediction` (also written to disk and
    previewed with `display`).
    """
    batch_size = 2000
    preds = []

    # Ask the model for recommendations in batches of customers
    for startidx in range(0, len(custs), batch_size):
        batch = custs[startidx : startidx + batch_size]
        ids, scores = model.recommend(batch, csr_train[batch], N=12, filter_already_liked_items=False)

        # Translate integer indices back to the original customer / article ids
        preds.extend(
            (customer_ids[cust_idx], ' '.join(article_ids[art_idx] for art_idx in ids[pos]))
            for pos, cust_idx in enumerate(batch)
        )

    als_preds = pd.DataFrame(preds, columns=['customer_id', 'prediction'])

    # For cold start / unseen customers fall back to Heng Zheng's baseline predictions
    merged = pd.merge(heng_df, als_preds, how='left', on='customer_id', suffixes=('_fill', '_als'))
    merged['prediction'] = merged['prediction_als'].fillna(merged['prediction_fill'])
    result = merged[['customer_id', 'prediction']]
    result.to_csv(submission_name, index=False)

    display(result.head())
    print(result.shape)

    return result

In [20]:
# Distinct customer indices that appear in the loaded transactions (order of first appearance)
transactions_customers = transactions['customer_id'].drop_duplicates().tolist()

len(transactions_customers)

In [21]:
# Generate the submission csv: ALS recommendations for customers with transactions,
# baseline predictions from heng_df for everyone else
df_preds = submit(model, csr_train, transactions_customers, heng_df)

The Kaggle test score of our model is 0.0183 (MAP@12), which ranks 1630th out of 2395 teams.

# Time Decaying Popularity

This is a heuristic-based model. Other competitors' work shows that popularity and repeat purchases are the dominant patterns in the transactions.
We therefore use the following two heuristics to guide our model:
1. Recommend each customer's most-bought items from the last 4 weeks.
2. Recommend items that were popular over the last 2 weeks, with older purchases weighted down by time.

## Data Preprocessing

In [24]:
# Read in the transaction data again: the frame used for the ALS model was filtered to
# recent dates and had its customer / article ids replaced by integer indices.
transactions = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv', dtype={'article_id': str}, parse_dates=['t_dat'])

Because of the large volume of data, we only use roughly the last five weeks of data. The first four (approximately weekly) periods are used as four separate training sets, and the last week is our validation set.

In [26]:
transactions["t_dat"] = pd.to_datetime(transactions["t_dat"])


def _window(start, end):
    """Return the transactions with start <= t_dat < end."""
    dates = transactions["t_dat"]
    return transactions.loc[(dates >= start) & (dates < end)]


# Training windows, newest first. Note that they are not all exactly 7 days long:
#   train1: 2020-09-08 .. 2020-09-15 (8 days)
#   train2: 2020-09-01 .. 2020-09-07 (7 days)
#   train3: 2020-08-23 .. 2020-08-31 (9 days)
#   train4: 2020-08-15 .. 2020-08-22 (8 days)
train1 = _window(datetime.datetime(2020, 9, 8), datetime.datetime(2020, 9, 16))
train2 = _window(datetime.datetime(2020, 9, 1), datetime.datetime(2020, 9, 8))
train3 = _window(datetime.datetime(2020, 8, 23), datetime.datetime(2020, 9, 1))
train4 = _window(datetime.datetime(2020, 8, 15), datetime.datetime(2020, 8, 23))

# Validation window: everything from 2020-09-16 onwards
val = transactions.loc[transactions["t_dat"] >= datetime.datetime(2020, 9, 16)]

In [27]:
# Purchase history per training window: customer_id -> list of purchased article_ids
# (an article bought several times appears several times)
(
    positive_items_per_user1,
    positive_items_per_user2,
    positive_items_per_user3,
    positive_items_per_user4,
) = (
    window.groupby(['customer_id'])['article_id'].apply(list)
    for window in (train1, train2, train3, train4)
)

### Time decay based popularity for items

Simply put, items bought more recently carry more weight in the popularity list. For example, an item A bought 5 times on the first day of the training period ranks below an item B bought 4 times on the last day of the training period.

In [28]:
# Time-decayed popularity over the two most recent training windows.
# The frame is called `recent_train` (not `train`) so that it does not overwrite the
# train() function defined for the ALS model earlier in the notebook.
recent_train = pd.concat([train1, train2], axis=0)

# Weight of a purchase = 1 / (days between the purchase and 2020-09-16, the first
# validation day). Computed on the whole column at once instead of row by row.
days_before_cutoff = (pd.Timestamp(2020, 9, 16) - recent_train['t_dat']).dt.days
recent_train['pop_factor'] = 1 / days_before_cutoff
popular_items_group = recent_train.groupby(['article_id'])['pop_factor'].sum()

# Articles ordered from highest to lowest summed weight (ties broken by article_id, descending)
_, popular_items = zip(*sorted(zip(popular_items_group, popular_items_group.keys()), reverse=True))

recent_train['pop_factor'].describe()

In [29]:
def apk(actual, predicted, k=12):
    """Average precision at k for a single customer.

    Parameters
    ----------
    actual : list of items the customer really bought
    predicted : ranked list of recommended items (best first)
    k : number of leading predictions to evaluate

    Returns
    -------
    float: sum of precision@rank over the ranks holding a first-time hit, divided by
    min(len(actual), k). Returns 0.0 when `actual` is empty.
    """
    top_k = predicted[:k]

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(top_k, start=1):
        # Only the first occurrence of a correct item counts as a hit
        is_repeat = item in top_k[:rank - 1]
        if item in actual and not is_repeat:
            hits += 1.0
            precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """Mean average precision at k: the mean of apk over paired (actual, predicted) lists."""
    per_user_scores = [
        apk(truth, ranked, k)
        for truth, ranked in zip(actual, predicted)
    ]
    return np.mean(per_user_scores)

In [31]:
# Validation ground truth: customer_id -> list of every article bought in the
# validation window (repeated purchases are kept)
positive_items_val = val.groupby(['customer_id'])['article_id'].apply(list)

In [32]:
# Create the validation targets in the same shape as the test task:
# `val_users` lists the customers and `val_items` the purchased articles, in the same order.
val_users = positive_items_val.keys()

# The Series is indexed by customer_id (unique after the groupby), so its values are
# already aligned with `val_users`; no per-customer lookup loop is needed.
val_items = positive_items_val.tolist()

print("Total users in validation:", len(val_users))

Validate our model on the validation set using the MAP@12 metric.

In [34]:
from collections import Counter

popular_items = list(popular_items)

# Purchase histories, newest training window first
weekly_purchases = (
    positive_items_per_user1,
    positive_items_per_user2,
    positive_items_per_user3,
    positive_items_per_user4,
)

# Build at most 12 recommendations per validation customer: the customer's own most
# frequently bought articles, window by window, topped up with globally popular articles.
outputs = []
for user in tqdm(val_users):
    user_output = []
    for purchases in weekly_purchases:
        if user in purchases.keys():
            ranked = [article for article, _ in Counter(purchases[user]).most_common()]
            # Only fill the remaining slots. Without this cap the list could exceed 12
            # and the popularity slice below would get a negative bound, appending almost
            # the whole popularity list. The score is unaffected because apk() only
            # evaluates the first 12 entries.
            user_output += ranked[:12 - len(user_output)]

    user_output += list(popular_items[:12 - len(user_output)])
    outputs.append(user_output)

print("MAP@12 Score on Validation set:", mapk(val_items, outputs))

## Predict on the test set and generate submission csv

In [35]:
# Load the sample submission; its customer_id column is iterated below to produce one
# prediction per row, and the predictions are written back into this frame.
submission = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv")
submission.head()

In [36]:
from collections import Counter

# Purchase histories, newest training window first
weekly_purchases = (
    positive_items_per_user1,
    positive_items_per_user2,
    positive_items_per_user3,
    positive_items_per_user4,
)

# Build exactly the same kind of recommendation list as in validation, for every customer
# in the submission file: own most frequently bought articles first (newest window
# first), then globally popular articles until 12 slots are filled.
outputs = []
for user in tqdm(submission['customer_id']):
    user_output = []
    for purchases in weekly_purchases:
        if user in purchases.keys():
            ranked = [article for article, _ in Counter(purchases[user]).most_common()]
            user_output += ranked[:12 - len(user_output)]

    user_output += list(popular_items[:12 - len(user_output)])
    outputs.append(user_output)

# One space-separated string of article ids per customer
str_outputs = [" ".join(str(x) for x in output) for output in outputs]

In [37]:
# Attach the prediction strings (generated in the row order of submission['customer_id'])
# and write the submission file
submission['prediction'] = str_outputs
submission.to_csv("submission.csv", index=False)
submission.head()

The Kaggle test score of our model is 0.0216 (MAP@12), which ranks 1118th out of 2395 teams.