# H&M Personalized Fashion Recommendations: 3. Implicit 

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import seaborn as sns

from scipy.sparse import csr_matrix, coo_matrix

import torch
import implicit
from implicit.evaluation import mean_average_precision_at_k

from tqdm.notebook import tqdm

import logging
logging.basicConfig(level=logging.INFO)
logging.info("test")

import sys
sys.path.append("..")
from src.utils import *

SEED = 1

INFO:root:test


## 1. Download data 

In [2]:
articles = pd.read_csv("../input/articles_proc.csv", dtype={"article_id": str})
customers = pd.read_csv("../input/customers_proc.csv")
transactions_full = pd.read_csv("../input/transactions_full.csv", 
                                dtype={"article_id": str}, parse_dates=["t_dat"])
transactions_train = pd.read_csv("../input/transactions_train.csv", 
                                 dtype={"article_id": str}, parse_dates=["t_dat"])
transactions_test = pd.read_csv("../input/transactions_test.csv", 
                                dtype={"article_id": str}, parse_dates=["t_dat"])
sample_subm = pd.read_csv("../input/sample_submission.csv")

## 2. Make dataset 

In [2]:
class DatasetMaker:
    def __init__(self, 
                 articles, 
                 customers):        
        self.articles_num2id = dict(enumerate(articles["article_id"].unique()))
        self.articles_id2num = {id_: num for num, id_  in self.articles_num2id.items()}

        self.customers_num2id = dict(enumerate(customers["customer_id"].unique()))
        self.customers_id2num = {id_: num for num, id_ in self.customers_num2id.items()}
        
        self.data_shape = (customers.shape[0], articles.shape[0])
    
    def get_coo_matrix(self, data):
        data_csr = coo_matrix(
            (
                np.ones(data.shape[0]), 
                (
                    data["customer_id"].map(self.customers_id2num), 
                    data["article_id"].map(self.articles_id2num)
                )
            ),
            shape=self.data_shape,
            dtype=np.uint8
        )
        return data_csr
        
    def split_data(self, data, val_days: int = 7):
        val_split_date = data['t_dat'].max() - pd.Timedelta(val_days)
        
        data_train = data[data['t_dat'] < val_split_date]
        data_val = data[data['t_dat'] >= val_split_date]
        return data_train, data_val
    
    def limit_data(self, data, min_days_ago: int = 30, max_days_ago: int = 0):
        min_split_date = data['t_dat'].max() - pd.Timedelta(days=min_days_ago)
        max_split_date = data['t_dat'].max() - pd.Timedelta(days=max_days_ago)
        
        return data[data['t_dat'].between(min_split_date, max_split_date)]

## 3. Search best params for models

In [3]:
def find_best_params(Model = implicit.als.AlternatingLeastSquares, 
                     train_weeks_list = [1, 2, 3, 4, 6, 8, 12, 16, 20], 
                     factor_list = [3, 30, 300], 
                     iteration_list = [3, 30, 300], 
                     regularization_list = [0.01, 0.1, 1.0],
                     random_state: int = SEED,
                     use_gpu: bool = True,
                     print_best: bool = True,
                     print_all: bool = False):

    best_map_score = 0
    for train_weeks in train_weeks_list:
        dm = DatasetMaker(articles, customers)
        data = dm.limit_data(transactions_full, 
                             min_days_ago=7 * (1 + train_weeks), 
                             max_days_ago=0)
        train_data, val_data = dm.split_data(data, 
                                             val_days=7)
        train = dm.get_coo_matrix(train_data).tocsr()
        val = dm.get_coo_matrix(val_data).tocsr()
        for factors in factor_list:
            for iterations in iteration_list:
                for regularization in regularization_list:
                    model = Model(factors=factors, 
                                  iterations=iterations, 
                                  regularization=regularization,
                                  use_gpu=use_gpu,
                                  num_threads=16,
                                  random_state=random_state)
                    
                    params = {"train_weeks": train_weeks, "factors": factors, 
                              "iterations": iterations, "regularization": regularization, 
                              "random_state": random_state}
                    try:
                        model.fit(train, show_progress=False)
                    except:
                        print("ERROR: ", params)
                        torch.cuda.empty_cache()
                        del model
                        continue
                        
                    map_score = mean_average_precision_at_k(model, 
                                                            train, 
                                                            val, 
                                                            K=12, 
                                                            show_progress=False, 
                                                            num_threads=16)

                    
                    if print_all:
                        print(round(map_score, 4), params)
                    
                    if map_score > best_map_score:
                        best_map_score = map_score
                        best_params = params
                        if print_best:
                            print("### ", round(best_map_score, 4), best_params)
                    del model
                    
    return best_map_score, best_params

In [10]:
%%time
models = {
    "als": {
        "model": implicit.als.AlternatingLeastSquares,
        "params": {"use_gpu": True, "regularization_list": [0.01]}
    },
    "bpr": {
        "model": implicit.bpr.BayesianPersonalizedRanking,
        "params": {"use_gpu": True, "regularization_list": [0.01]}
    },
    "lmf": {
        "model": implicit.lmf.LogisticMatrixFactorization,
        "params": {"use_gpu": False, "regularization_list": [1.0]}
    }
}

find_result = {}
for model_name, model_data in models.items():
    print("########## ", model_name)
    best_map_score, best_params = find_best_params(model_data["model"], **model_data["params"],
                                                   print_best=True, print_all=False)
    find_result[model_name] = {"best_map_score": best_map_score, "best_params": best_params}
    print("######### BEST: ", best_map_score, best_params)
    print()

##########  als
###  0.001 {'train_weeks': 1, 'factors': 3, 'iterations': 3, 'regularization': 0.01, 'random_state': 1}
###  0.0013 {'train_weeks': 1, 'factors': 3, 'iterations': 30, 'regularization': 0.01, 'random_state': 1}
###  0.0013 {'train_weeks': 1, 'factors': 3, 'iterations': 300, 'regularization': 0.01, 'random_state': 1}
###  0.0018 {'train_weeks': 1, 'factors': 30, 'iterations': 3, 'regularization': 0.01, 'random_state': 1}
###  0.0021 {'train_weeks': 1, 'factors': 30, 'iterations': 30, 'regularization': 0.01, 'random_state': 1}
###  0.0022 {'train_weeks': 1, 'factors': 30, 'iterations': 300, 'regularization': 0.01, 'random_state': 1}
###  0.0025 {'train_weeks': 1, 'factors': 300, 'iterations': 3, 'regularization': 0.01, 'random_state': 1}
###  0.0026 {'train_weeks': 2, 'factors': 30, 'iterations': 3, 'regularization': 0.01, 'random_state': 1}
###  0.0035 {'train_weeks': 2, 'factors': 30, 'iterations': 30, 'regularization': 0.01, 'random_state': 1}
###  0.004 {'train_weeks':

### 3. Fine tuning 

In [5]:
def predict(model, train, dm, batch_size = 100000, cold_start: bool = True):
    # predict 
    user_list = list(range(0, dm.data_shape[0]))
    items, scores = [], []
    for start_user_num in tqdm(range(0, len(user_list), batch_size)):
        user_list_curr = user_list[start_user_num: start_user_num + batch_size]
        items_curr, scores_curr = model.recommend(
            user_list_curr, 
            train[user_list_curr], 
            N=12
        )
        items.extend(items_curr.tolist())
        scores.extend(scores_curr.tolist())

    # get popular items for cold start
    scores_sum_list = list(map(sum, scores))
    popular_items = list(
         map(
            lambda x: x[0],
            sorted(
                enumerate(
                    train.sum(axis=0).tolist()[0])
                ,key=lambda x: -x[1]
            )
        )
    )[:12]
    
    # create predict dataframe
    customer_id_list, articles_pred_list = [], []
    for customer_num, articles_num_list in tqdm(enumerate(items)):
        if cold_start and (scores_sum_list[customer_num] == 0):
            articles_num_list = popular_items
        articles_id_list = [dm.articles_num2id[article_num] for article_num in articles_num_list]
        articles_pred = " ".join(articles_id_list)

        customer_id_list.append(dm.customers_num2id[customer_num])
        articles_pred_list.append(articles_pred)
    
    predict_df = pd.DataFrame(data={"customer_id": customer_id_list, 
                                    "prediction": articles_pred_list})
    return predict_df

### Als

In [5]:
%%time
find_best_params(
    implicit.als.AlternatingLeastSquares, 
    regularization_list=[0.01], 
    train_weeks_list=[7, 8, 9],
    factor_list=[150, 200, 250],
    iteration_list=[4, 5, 6],
    use_gpu=True,
    print_best=True, print_all=False)

###  0.0052 {'train_weeks': 7, 'factors': 150, 'iterations': 4, 'regularization': 0.01, 'random_state': 1}
###  0.0054 {'train_weeks': 7, 'factors': 150, 'iterations': 5, 'regularization': 0.01, 'random_state': 1}
###  0.0054 {'train_weeks': 7, 'factors': 150, 'iterations': 6, 'regularization': 0.01, 'random_state': 1}
###  0.0055 {'train_weeks': 7, 'factors': 200, 'iterations': 4, 'regularization': 0.01, 'random_state': 1}
###  0.0055 {'train_weeks': 7, 'factors': 200, 'iterations': 5, 'regularization': 0.01, 'random_state': 1}
###  0.0055 {'train_weeks': 7, 'factors': 200, 'iterations': 6, 'regularization': 0.01, 'random_state': 1}
###  0.0056 {'train_weeks': 8, 'factors': 200, 'iterations': 5, 'regularization': 0.01, 'random_state': 1}
ERROR:  {'train_weeks': 9, 'factors': 250, 'iterations': 6, 'regularization': 0.01, 'random_state': 1}
CPU times: user 32.4 s, sys: 992 ms, total: 33.4 s
Wall time: 33.7 s


(0.0056160049381765815,
 {'train_weeks': 8,
  'factors': 200,
  'iterations': 5,
  'regularization': 0.01,
  'random_state': 1})

In [67]:
train_weeks = 8
factors = 200
iterations = 5
regularization = 0.01
random_state = 1

dm = DatasetMaker(articles, customers)
data = dm.limit_data(transactions_full, 
                     min_days_ago=7 * (train_weeks), 
                     max_days_ago=0)
train = dm.get_coo_matrix(data).tocsr()

best_als = implicit.als.AlternatingLeastSquares(
    factors=factors, 
    iterations=iterations, 
    regularization=regularization,
    use_gpu=True,
    num_threads=16,
    random_state=1
)

best_als.fit(train, show_progress=True)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [68]:
als_df = predict(best_als, train, dm, cold_start=True)
als_df.to_csv("../output/13.als_best_cs.csv", index=False, header=True)

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




### Bpr 

In [None]:
%%time
find_best_params(
    implicit.bpr.BayesianPersonalizedRanking, 
    regularization_list=[0.01], 
    train_weeks_list=[8],
    factor_list=[400, 500, 600],
    iteration_list=[2000, 2500, 3000],
    use_gpu=True,
    print_best=True, print_all=False)

In [6]:
train_weeks = 8
factors = 500
iterations = 2500
regularization = 0.01
random_state = 1

dm = DatasetMaker(articles, customers)
data = dm.limit_data(transactions_full, 
                     min_days_ago=7 * train_weeks, 
                     max_days_ago=0)
train = dm.get_coo_matrix(data).tocsr()

best_bpr = implicit.bpr.BayesianPersonalizedRanking(
    factors=factors, 
    iterations=iterations, 
    regularization=regularization,
    use_gpu=True,
    num_threads=16,
    random_state=1
)

best_bpr.fit(train, show_progress=True)

HBox(children=(FloatProgress(value=0.0, max=2500.0), HTML(value='')))




In [8]:
bpr_df = predict(best_bpr, train, dm, cold_start=True)
bpr_df.to_csv("../output/14.bpr_best_cs.csv", index=False, header=True)

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




### Lmf 

In [None]:
%%time
find_best_params(
    implicit.lmf.LogisticMatrixFactorization, 
    regularization_list=[1.0], 
    train_weeks_list=[1],
    factor_list=[2, 3, 4],
    iteration_list=[300, 350, 400],
    use_gpu=False,
    print_best=True, print_all=False)

In [9]:
train_weeks = 1
factors = 3
iterations = 350
regularization = 1.0
random_state = 1

dm = DatasetMaker(articles, customers)
data = dm.limit_data(transactions_full, 
                     min_days_ago=7 * (train_weeks), 
                     max_days_ago=0)
train = dm.get_coo_matrix(data).tocsr()

best_lmf = implicit.lmf.LogisticMatrixFactorization(
    factors=factors, 
    iterations=iterations, 
    regularization=regularization,
    use_gpu=False,
    num_threads=16,
    random_state=1
)

best_lmf.fit(train, show_progress=True)

HBox(children=(FloatProgress(value=0.0, max=350.0), HTML(value='')))




In [10]:
lmf_df = predict(best_lmf, train, dm, cold_start=True)
lmf_df.to_csv("../output/15.lmf_best_cs.csv", index=False, header=True)

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## Ансамбль

In [4]:
pop_df = pd.read_csv("../output/4.top_1w.csv")
als_df = pd.read_csv("../output/13.als_best_cs.csv")
bpr_df = pd.read_csv("../output/14.bpr_best_cs.csv")
lmf_df = pd.read_csv("../output/15.lmf_best_cs.csv")

In [5]:
sub = (
    pop_df.rename({"prediction": "popular_prediction"}, axis=1)
        .merge(als_df.rename({"prediction": "als_prediction"}, axis=1), on="customer_id", how="inner")
        .merge(bpr_df.rename({"prediction": "bpr_prediction"}, axis=1), on="customer_id", how="inner")
        .merge(lmf_df.rename({"prediction": "lmf_prediction"}, axis=1), on="customer_id", how="inner")
)
sub

Unnamed: 0,customer_id,popular_prediction,als_prediction,bpr_prediction,lmf_prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0924243001 0924243002 0918522001 0923758001 08...,0762846031 0568601006 0568601044 0568601007 05...,0858856005 0779781015 0716670009 0810591002 05...,0924243001 0918522001 0924243002 0909370001 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0924243002 0918522001 0923758001 08...,0751471001 0706016001 0918292001 0448509014 09...,0890116001 0804472003 0855098001 0858082001 07...,0924243001 0918522001 0924243002 0909370001 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0924243001 0924243002 0918522001 0923758001 08...,0805000001 0794321011 0791587001 0791587015 07...,0794321011 0224521007 0805000001 0805000007 08...,0924243001 0918522001 0866731001 0924243002 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0924243002 0918522001 0923758001 08...,0751471001 0706016001 0918292001 0448509014 09...,0890116001 0804472003 0855098001 0858082001 07...,0924243001 0918522001 0924243002 0909370001 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243001 0924243002 0918522001 0923758001 08...,0791587018 0791587001 0809961007 0791587009 07...,0791587009 0791587018 0791587001 0927530006 08...,0924243001 0918522001 0924243002 0909370001 09...
...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0924243001 0924243002 0918522001 0923758001 08...,0822311001 0860797001 0887681003 0858147001 08...,0822311001 0822344009 0822344012 0791587010 07...,0924243001 0918522001 0924243002 0909370001 09...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0924243001 0924243002 0918522001 0923758001 08...,0762846026 0889460003 0762846027 0759871002 07...,0759871011 0759871037 0759871034 0877777002 04...,0924243001 0918522001 0924243002 0909370001 09...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0924243001 0924243002 0918522001 0923758001 08...,0762846026 0762846031 0719655001 0715624010 07...,0508691005 0635346001 0762846031 0629551003 06...,0924243001 0918522001 0924243002 0909370001 09...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0924243001 0924243002 0918522001 0923758001 08...,0751471001 0706016001 0918292001 0448509014 09...,0890116001 0804472003 0855098001 0858082001 07...,0924243001 0918522001 0924243002 0909370001 09...


In [6]:
from collections import Counter

def merge_predictins(sub, weight: dict = None):
    all_counter = None
    sub = sub.set_index("customer_id")
    for col in tqdm(sub.columns):
        model_counter = sub[col].str.split(" ").apply(Counter)
        model_name = col.replace("_prediction", "")

        def mult_counter(counter, w):
            counter = counter.copy()
            for key in counter.keys():
                counter[key] *= w
            return counter
        
        if weight is None:
            w = 1
        else:
            w = weight[model_name]
        model_counter = model_counter.apply(lambda x: mult_counter(x, w))

        all_counter = all_counter + model_counter if all_counter is not None else model_counter
    top_list = all_counter.apply(lambda x: list(dict(sorted(x.items(), key=lambda x: -x[1])).keys())[:12])
    top_str = top_list.apply(lambda x: " ".join(x))
    sub_merged = top_str.reset_index()
    sub_merged.columns = ['customer_id', "prediction"]
    return sub_merged

In [7]:
sub_merged = merge_predictins(sub, weight = {"popular": 0.0071, "als": 0.0072, "bpr": 0.0051, "lmf": 0.0069})
sub_merged.to_csv("../output/16.pop_als_bpr_lmf_cs_weighed.csv", index=False, header=True)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [42]:
sub_merged = merge_predictins(sub, weight = {"popular": 1, "als": 1, "bpr": 1, "lmf": 1})
sub_merged.to_csv("../output/12.pop_als_bpr_lmf.csv", index=False, header=True)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


