# SAR 

SAR is a fast, scalable, adaptive algorithm for personalized recommendations based on user transaction history. It works by measuring the similarity between items and recommending items similar to those for which a user already has an affinity.

In [1]:
# imports: standard library, third-party packages and the Recommenders library
import sys

import itertools
import logging
import os
import time

import numpy as np
import pandas as pd
import papermill as pm
import datetime
from functools import wraps

from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split, python_chrono_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k, diversity, novelty, serendipity, distributional_coverage, catalog_coverage 
from recommenders.models.sar.sar_singlenode import SARSingleNode

# Record the interpreter and pandas versions used for this run
print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")

  from pyarrow import HadoopFileSystem


System version: 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:45:05) 
[Clang 11.1.0 ]
Pandas version: 1.1.5


In [2]:
#utils

# Results table: one row per evaluated run, columns in display order
cols = [
    "Data",
    "Algo",
    "K",
    "Train time (s)",
    "Predicting time (s)",
    "RMSE",
    "MAE",
    "R2",
    "Explained Variance",
    "Recommending time (s)",
    "MAP",
    "nDCG@k",
    "Precision@k",
    "Recall@k",
    "Diversity",
    "Novelty",
    "Distributional coverage",
    "Catalog coverage",
]
df_results = pd.DataFrame(columns=cols)

def generate_summary(data, algo, k, train_time, rating_time, rating_metrics, ranking_time, ranking_metrics, diversity_metrics):
    """Build one results-table row as a flat dictionary.

    Args:
        data: Label of the dataset the run used.
        algo: Label of the algorithm.
        k: Number of recommended items per user.
        train_time: Training duration, stored under "Train time (s)".
        rating_time: Stored under "Predicting time (s)".
        rating_metrics (dict or None): Rating metrics; ``None`` is replaced by
            NaN placeholders for RMSE, MAE, R2 and Explained Variance.
        ranking_time: Stored under "Recommending time (s)".
        ranking_metrics (dict or None): Ranking metrics; ``None`` is replaced by
            NaN placeholders for MAP, nDCG@k, Precision@k and Recall@k.
        diversity_metrics (dict or None): Diversity metrics; ``None`` is replaced
            by NaN placeholders for Diversity, Novelty, Distributional coverage
            and Catalog coverage.

    Returns:
        dict: The run description followed by the diversity, rating and ranking
        metric groups, merged in that order.
    """

    def _or_nan(metrics, names):
        # Only ``None`` counts as "not computed"; any dict is passed through untouched.
        return {name: np.nan for name in names} if metrics is None else metrics

    rating_metrics = _or_nan(rating_metrics, ("RMSE", "MAE", "R2", "Explained Variance"))
    ranking_metrics = _or_nan(ranking_metrics, ("MAP", "nDCG@k", "Precision@k", "Recall@k"))
    diversity_metrics = _or_nan(
        diversity_metrics,
        ("Diversity", "Novelty", "Distributional coverage", "Catalog coverage"),
    )

    # Later groups win on a key clash, exactly as successive dict.update calls would.
    return {
        "Data": data,
        "Algo": algo,
        "K": k,
        "Train time (s)": train_time,
        "Predicting time (s)": rating_time,
        "Recommending time (s)": ranking_time,
        **diversity_metrics,
        **rating_metrics,
        **ranking_metrics,
    }


def convert_timestamp(datetime):
    """Convert an ``MM/DD/YYYY`` date into a POSIX timestamp.

    Args:
        datetime: Value whose ``str()`` form is a date written as ``%m/%d/%Y``.
            The parameter name hides the ``datetime`` module inside this
            function, which is why the module is imported locally under an alias.

    Returns:
        float: Seconds since the Unix epoch for midnight of that date. The
        parsed value is a naive datetime, so it is interpreted in local time.

    Raises:
        ValueError: If the string does not match ``%m/%d/%Y``.
    """
    # The alias keeps the module reachable despite the shadowing parameter name.
    import datetime as _dt

    date = _dt.datetime.strptime(str(datetime), "%m/%d/%Y")
    return date.timestamp()

def preprocess_data(df):
    """Prepare a transaction frame for SAR: float32 ratings, epoch-second timestamps.

    The frame is modified in place and also returned. Column names come from
    the notebook-level ``header`` mapping (``col_rating`` and ``col_timestamp``).

    Args:
        df (pandas.DataFrame): Transactions containing the rating and timestamp
            columns named in ``header``.

    Returns:
        pandas.DataFrame: The same ``df`` object, after conversion.
    """
    rating_col = header["col_rating"]
    time_col = header["col_timestamp"]

    # Downcast to 32-bit floats to reduce memory. The column is replaced outright:
    # writing through ``df.loc[:, col]`` sets values in place on newer pandas and
    # can keep the previous dtype, which would silently undo the downcast.
    df[rating_col] = df[rating_col].astype(np.float32)

    # A numeric timestamp column is taken to be epoch seconds already (for example
    # from an earlier call on the same frame), which makes re-running safe.
    if not pd.api.types.is_numeric_dtype(df[time_col]):
        stamps = pd.to_datetime(df[time_col])
        if stamps.dt.tz is not None:
            # tz-aware values: express in UTC and drop the timezone before subtracting
            stamps = stamps.dt.tz_convert(None)
        # Vectorised conversion to whole seconds since the Unix epoch. Naive values
        # are read as UTC; ``strftime('%s')`` is a platform extension that is not
        # available everywhere and uses the local timezone.
        df[time_col] = (stamps - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)

    return df

def timing(f):
    """Decorator that prints how long each call to ``f`` takes.

    Args:
        f (callable): Function to wrap.

    Returns:
        callable: Wrapper with the metadata of ``f`` (via ``functools.wraps``)
        that forwards all arguments, prints the elapsed time in seconds and
        returns the result of ``f`` unchanged.
    """
    @wraps(f)
    def wrap(*args, **kw):
        # ``time`` is imported as a module in this notebook, so the clock has to be
        # reached through an attribute; calling ``time()`` itself raises TypeError.
        ts = time.perf_counter()
        result = f(*args, **kw)
        te = time.perf_counter()
        print('func:%r  took: %2.4f sec' % (f.__name__, te - ts))
        return result
    return wrap

# 0. Config params

In [3]:
# Placeholders for the results table; later cells overwrite the ones they compute
algo = "sar"
ranking_metrics = rating_metrics = diversity_metrics = None
train_time = rating_time = ranking_time = np.nan

# Column names of the transaction data
header = dict(
    col_user="customer_id",
    col_item="variant_id",
    col_rating="quantity",
    col_timestamp="order_date",
    col_prediction="Prediction",
)

# Number of items to recommend per user
TOP_K = 10

################ TO MODIFY ################

# data size, one of three choices: "100k", "1M" or "all"
data_size = "100k"
# load the previously stored train/test split instead of splitting again
load_splitted_data = True

################ TO MODIFY ################

# 1. Data

## 1.1 Load Data

In [4]:
########### TO MODIFY ###########
def load_data(data_size):
    """Load the un-split transaction data for the requested dataset size.

    Args:
        data_size (str): One of "100k", "1M" or "all".

    Returns:
        The object unpickled from the matching file under ``../../data``
        (presumably a pandas DataFrame of transactions; verify against the
        stored files).

    Raises:
        ValueError: If ``data_size`` is not one of the supported sizes. Failing
            here avoids handing ``None`` to later cells, where the error would
            be much harder to trace back.
    """
    paths = {
        "100k": '../../data/transaction_100k_df.pkl',
        "1M": '../../data/transaction_1M_df.pkl',
        "all": '../../data/transaction_all_df.pkl',
    }
    # isinstance guard: an unhashable argument must not fail on the dict lookup
    path = paths.get(data_size) if isinstance(data_size, str) else None
    if path is None:
        raise ValueError("Please choose between 100k, 1M and all")

    # NOTE(review): unpickling can execute code stored in the file; only load
    # pickles that come from a trusted source.
    return pd.read_pickle(path)
########### TO MODIFY ###########

In [5]:
# Two ways to obtain the data
if not load_splitted_data:
    # raw, un-split transactions
    data = load_data(data_size)
else:
    # or reuse a stored train/test split, which is faster
    train = pd.read_pickle(f"../../data/train_{data_size}_df.pkl")
    test = pd.read_pickle(f"../../data/test_{data_size}_df.pkl")
    # A bare expression nested inside if/else is not echoed by the notebook,
    # so the row counts are printed explicitly.
    print(train.shape[0], test.shape[0])

## 1.2 Split the data (skipped when `load_splitted_data` is True)

In [6]:
# Chronological split; it is really slow (more than 1h to split the 8M-row data)
if not load_splitted_data:
    train, test = python_chrono_split(
        data,
        ratio=0.75,
        col_user=header["col_user"],
        col_item=header["col_item"],
        col_timestamp=header["col_timestamp"],
    )
    # Store both parts so that later runs can load them instead of splitting again
    train.to_pickle(f"../../data/train_{data_size}_df.pkl")
    test.to_pickle(f"../../data/test_{data_size}_df.pkl")
    # A bare expression nested inside an if block is not echoed by the notebook,
    # so the row counts are printed explicitly.
    print(train.shape[0], test.shape[0])

In [7]:
# Size of each split: number of rows, distinct users and distinct items.
# The placeholder names (train_total, test_users, ...) are generated as
# "<split>_<statistic>" so both splits go through the same expressions.
print("""
Train:
Total Ratings: {train_total}
Unique Users: {train_users}
Unique Items: {train_items}

Test:
Total Ratings: {test_total}
Unique Users: {test_users}
Unique Items: {test_items}
""".format(**{
    f"{split_name}_{stat}": value
    for split_name, split in (("train", train), ("test", test))
    for stat, value in (
        ("total", len(split)),
        ("users", len(split[header["col_user"]].unique())),
        ("items", len(split[header["col_item"]].unique())),
    )
}))


Train:
Total Ratings: 74944
Unique Users: 720
Unique Items: 5348

Test:
Total Ratings: 24983
Unique Users: 720
Unique Items: 4090



## 1.3 Process data

In [8]:
# Run both splits through preprocess_data, which modifies the frame it is given
# and returns that same frame.
train = preprocess_data(train)
test = preprocess_data(test)

# 2. Model

In [9]:
# SAR model parameters
# item-item similarity function, one of "jaccard", "lift" or "counts"
similarity_type = "jaccard"
# time-decay coefficient T (presumably the half-life in days; confirm against the SARSingleNode documentation)
time_decay_coefficient = 15
# whether the time decay is applied
timedecay_formula = True

## 2.1 Define model

In [10]:
# Configure logging at DEBUG level (which also lets the INFO progress messages
# through), with a timestamp and the level name on every record
logging.basicConfig(level=logging.DEBUG, 
                    format='%(asctime)s %(levelname)-8s %(message)s')

# Build the SAR model from the parameters chosen above; **header passes the
# column-name settings (col_user, col_item, col_rating, col_timestamp,
# col_prediction) as keyword arguments
model = SARSingleNode(
    similarity_type=similarity_type, 
    time_decay_coefficient=time_decay_coefficient, 
    timedecay_formula=timedecay_formula, 
    **header,
)

## 2.2 Train model

In [11]:
# Fit the model on the training split and keep the elapsed wall-clock time
# (seconds) for the results table
start = time.time()
model.fit(train)
train_time = time.time()-start

2021-10-25 15:34:55,507 INFO     Collecting user affinity matrix
2021-10-25 15:34:55,514 INFO     Calculating time-decayed affinities
2021-10-25 15:34:55,580 INFO     Creating index columns
2021-10-25 15:34:55,648 INFO     Building user affinity sparse matrix
2021-10-25 15:34:55,654 INFO     Calculating item co-occurrence
2021-10-25 15:34:56,101 INFO     Calculating item similarity
2021-10-25 15:34:56,102 INFO     Using jaccard based similarity
2021-10-25 15:34:57,865 INFO     Done training


## 2.3 Recommend k items

In [12]:
# Recommend the top k items, scoring the test set in chunks of n rows
start = time.time()
n = 100000
tests = []
for i in range(0, len(test), n):
    print(i)
    # A slice that runs past the end is truncated, so the last chunk needs no special case
    test_sample = test.iloc[i:i + n]
    top_k = model.recommend_k_items(test_sample, top_k=TOP_K, remove_seen=True)
    tests.append(top_k)
ranking_time = time.time() - start

# Chunks are cut by row, not by user: a user whose rows straddle a chunk boundary
# is scored in both chunks and would appear twice after concatenation. Keep one
# copy of each (user, item) recommendation so the metrics are not skewed.
top_k = pd.concat(tests).drop_duplicates(subset=[header["col_user"], header["col_item"]])
top_k = top_k.sort_values(by=[header["col_prediction"]], ascending=False)
top_k

2021-10-25 15:34:58,000 INFO     Calculating recommendation scores


0


2021-10-25 15:34:58,384 INFO     Removing seen items


Unnamed: 0,customer_id,variant_id,Prediction
1690,US051502171641244282,438626,3.784013e-01
1691,US051502171641244282,432868,3.570986e-01
1692,US051502171641244282,434314,3.520696e-01
1693,US051502171641244282,429811,3.495227e-01
7170,US621111115825890,436289,3.232004e-01
...,...,...,...
5895,US621111114281349,425669,2.064447e-08
5896,US621111114281349,426232,1.990251e-08
5897,US621111114281349,422911,1.965823e-08
5898,US621111114281349,427391,1.956576e-08


# 3. Evaluate

## 3.1 Ranking metrics 

In [13]:
# Positional and keyword arguments shared by the four ranking metrics:
# the test split, the recommendations, the column names and the cut-off k
args = [test, top_k]

kwargs = {
    "col_user": header["col_user"],
    "col_item": header["col_item"],
    "col_rating": header["col_rating"],
    "col_prediction": header["col_prediction"],
    "relevancy_method": 'top_k',
    "k": TOP_K,
}

eval_map = map_at_k(*args, **kwargs)
eval_ndcg = ndcg_at_k(*args, **kwargs)
eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)

# Keyed by the matching column names of the results table
ranking_metrics = dict(zip(
    ("MAP", "nDCG@k", "Precision@k", "Recall@k"),
    (eval_map, eval_ndcg, eval_precision, eval_recall),
))

# One metric per line
print("\n".join([
    f"Model:",
    f"Top K:\t\t {TOP_K}",
    f"MAP:\t\t {eval_map:f}",
    f"NDCG:\t\t {eval_ndcg:f}",
    f"Precision@K:\t {eval_precision:f}",
    f"Recall@K:\t {eval_recall:f}",
]))

Model:
Top K:		 10
MAP:		 0.015832
NDCG:		 0.120409
Precision@K:	 0.111250
Recall@K:	 0.031310


## 3.2 Diversity metrics

In [14]:
# Positional and keyword arguments shared by the diversity metrics:
# the training split, the recommendations and the user/item column names
args = [train, top_k]

kwargs = {
    "col_user": header["col_user"],
    "col_item": header["col_item"],
}

eval_diversity = diversity(*args, **kwargs)
eval_novelty = novelty(*args, **kwargs)
eval_distributional_coverage = distributional_coverage(*args, **kwargs)
eval_catalog_coverage = catalog_coverage(*args, **kwargs)

# Keyed by the matching column names of the results table
diversity_metrics = dict(zip(
    ("Diversity", "Novelty", "Distributional coverage", "Catalog coverage"),
    (eval_diversity, eval_novelty, eval_distributional_coverage, eval_catalog_coverage),
))

# One metric per line; the catalog coverage is stored above but not printed here
print("\n".join([
    f"Model:",
    f"Diversity :\t {eval_diversity}",
    f"Novelty:\t {eval_novelty:f}",
    f"Coverage:\t {eval_distributional_coverage:f}",
]))

Model:
Diversity :	 0.7602995530883522
Novelty:	 11.058183
Coverage:	 8.978610


# 4. Results

In [30]:
# Collect the timings and metrics of this run into one row, append it to the
# results table under the next 1-based label, and display the table
summary = generate_summary(
    data=data_size,
    algo=algo,
    k=TOP_K,
    train_time=train_time,
    rating_time=rating_time,
    rating_metrics=rating_metrics,
    ranking_time=ranking_time,
    ranking_metrics=ranking_metrics,
    diversity_metrics=diversity_metrics,
)
df_results.loc[len(df_results) + 1] = summary
df_results

Unnamed: 0,Data,Algo,K,Train time (s),Predicting time (s),RMSE,MAE,R2,Explained Variance,Recommending time (s),MAP,nDCG@k,Precision@k,Recall@k,Diversity,Novelty,Distributional coverage,Catalog coverage
1,all,sar,10,18.09361,,,,,,211.60146,0.100136,0.138245,0.034234,0.21693,0.957637,11.336318,11.040913,0.957958
2,100k,sar,10,1.867308,,,,,,1.340116,0.015832,0.120409,0.11125,0.03131,0.7603,11.058183,8.97861,0.274308
