# SVD 

In [44]:
import sys
import os
import surprise
import papermill as pm
import scrapbook as sb
import pandas as pd
import numpy as np
from functools import wraps
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split, python_chrono_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var,
                                                       map_at_k, ndcg_at_k, precision_at_k, recall_at_k, get_top_k_items,
                                                       diversity, novelty, distributional_coverage, catalog_coverage )
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions

print("System version: {}".format(sys.version))
print("Surprise version: {}".format(surprise.__version__))

System version: 3.6.13 | packaged by conda-forge | (default, Sep 23 2021, 07:55:15) 
[GCC Clang 11.1.0]
Surprise version: 1.1.1


In [45]:
#utils

# results table
cols = ["Data", "Algo", "K", "Train time (s)","Predicting time (s)", "RMSE", "MAE", "R2", "Explained Variance", "Recommending time (s)", "MAP", "nDCG@k", "Precision@k", "Recall@k","Diversity","Novelty","Distributional coverage","Catalog coverage"]
df_results = pd.DataFrame(columns=cols)

def generate_summary(data, algo, k, train_time, rating_time, rating_metrics, ranking_time, ranking_metrics, diversity_metrics):
    summary = {"Data": data, "Algo": algo, "K": k, "Train time (s)": train_time, "Predicting time (s)": rating_time, "Recommending time (s)":ranking_time}
    if rating_metrics is None:
        rating_metrics = {
            "RMSE": np.nan,
            "MAE": np.nan,
            "R2": np.nan,
            "Explained Variance": np.nan,
        }
    if ranking_metrics is None:
        ranking_metrics = {
            "MAP": np.nan,
            "nDCG@k": np.nan,
            "Precision@k": np.nan,
            "Recall@k": np.nan,
        }
    if diversity_metrics is None:
        diversity_metrics = {
        "Diversity": np.nan,
        "Novelty": np.nan,
        "Distributional coverage": np.nan,
        "Catalog coverage": np.nan,
    }
    summary.update(diversity_metrics)
    summary.update(rating_metrics)
    summary.update(ranking_metrics)

    return summary


def convert_timestamp(datetime):
    date_string = str(datetime)
    date = datetime.datetime.strptime(date_string, "%m/%d/%Y")
    timestamp = datetime.datetime.timestamp(date)
    return(timestamp)

def preprocess_data(df):
    # Convert the float precision to 32-bit in order to reduce memory consumption 
    df.loc[:, header["col_rating"]] = df[header["col_rating"]].astype(np.float32)
    df = df[[header["col_user"],header["col_item"],header["col_rating"]]]
    return df 

def timing(f):
    @wraps(f)
    def wrap(*args, **kw):
        ts = time()
        result = f(*args, **kw)
        te = time()
        arg = args[0] if len(args)>=1 else "" 
        print('func:%r  took: %2.4f sec' % \
          (f.__name__, te-ts))
        return result
    return wrap

def predict_and_calculate_metrics(svd,test):
    predictions = predict(svd, test, usercol=header["col_user"], itemcol=header["col_item"])
    eval_rmse = rmse(test, predictions, 
                     col_user = header["col_user"],
                     col_item = header["col_item"],
                     col_rating = header["col_rating"],
                     col_prediction = header["col_prediction"]
                    )
    return(eval_rmse)
    

# 0. Config params

In [81]:
# table results 
algo = "svd"
ranking_metrics = None
rating_metrics = None
diversity_metrics = None
train_time = np.nan
rating_time = np.nan
ranking_time = np.nan

# column name 
header = {
    "col_user": "customer_id",
    "col_item": "variant_id",
    "col_rating": "quantity",
    "col_timestamp": "order_date",
    "col_prediction": "prediction",
}

# top k
TOP_K = 10

################ TO MODIFY ################

# date size with 3 choices : "100k","1M" and "all"
data_size = "all"
# load splitted data 
load_splitted_data = True 

################ TO MODIFY ################

# 1. Data

# 1.1 Load data 

In [82]:
########### TO MODIFY ###########
def load_data(data_size):
    path = ""
    if data_size=="100k":
        path = '../../data/transaction_100k_df.pkl'
    elif data_size=="1M":
        path = '../../data/transaction_1M_df.pkl'
    elif data_size=="all":
        path = '../../data/transaction_all_df.pkl'
    
    if path != "":
        return pd.read_pickle(path)
    else :
        print("Please choose between 100k, 1M and all")
########### TO MODIFY ###########

In [83]:
# 2 ways to load the data
if not load_splitted_data : 
    # data not splitted 
    data = load_data(data_size)
else :
    # or  use stored splitted data to make it faster
    train = pd.read_pickle(f"../../data/train_{data_size}_df.pkl")
    test = pd.read_pickle(f"../../data/test_{data_size}_df.pkl")
    train.shape[0], test.shape[0]

## 1.2 Split the data ( skip if load_splitted_data )

In [84]:
# chrono split but it is really slow ( +1h to split 8M data ) 
if not load_splitted_data :
    train, test = python_chrono_split(data,
                                      ratio=0.75,
                                      col_user=header["col_user"],
                                      col_item=header["col_item"],
                                      col_timestamp = header["col_timestamp"]
                                     )
    train.to_pickle(f"../../data/train_{data_size}_df.pkl")
    test.to_pickle(f"../../data/test_{data_size}_df.pkl")
    train.shape[0], test.shape[0]

In [85]:
print("""
Train:
Total Ratings: {train_total}
Unique Users: {train_users}
Unique Items: {train_items}

Test:
Total Ratings: {test_total}
Unique Users: {test_users}
Unique Items: {test_items}
""".format(
    train_total=len(train),
    train_users=len(train[header["col_user"]].unique()),
    train_items=len(train[header["col_item"]].unique()),
    test_total=len(test),
    test_users=len(test[header["col_user"]].unique()),
    test_items=len(test[header["col_item"]].unique()),
))


Train:
Total Ratings: 6041296
Unique Users: 1497612
Unique Items: 7659

Test:
Total Ratings: 1799883
Unique Users: 869943
Unique Items: 7022



## 1.3 Process data

In [86]:
train = preprocess_data(train)
test = preprocess_data(test)

# 'reader' is being used to get rating scale (for MovieLens, the scale is [1, 5]).
# 'rating_scale' parameter can be used instead for the later version of surprise lib:
# https://github.com/NicolasHug/Surprise/blob/master/surprise/dataset.py
train_set = surprise.Dataset.load_from_df(train, reader=surprise.Reader(rating_scale=(0,max(train["quantity"])))).build_full_trainset()

# 2. Model

## 2.1 Define model

##### lr=0.001 for 100k and 1M but 0.0001 for all

In [96]:
svd = surprise.SVD(random_state=0, n_factors=3, n_epochs=700,biased=False, lr_all=0.0001,verbose=False)

## 2.2 Train model

In [98]:
with Timer() as train_time:
    svd.fit(train_set)
    
# train and plot the rmse plot for training and testing data 
def _train():
    RMSE_train = []
    RMSE_test = []
    nb_epoch = 500
    with Timer() as train_time:
        for i in tqdm(range(1,nb_epoch+1,50)):
            svd = surprise.SVD(random_state=0, n_factors=3, n_epochs=i,biased=False,init_std_dev=0.1,lr_all=0.0001,reg_all = 0.02, verbose=False)
            svd.fit(train_set)
            val = predict_and_calculate_metrics(svd,train)
            RMSE_train.append(val)
            val = predict_and_calculate_metrics(svd,test)
            RMSE_test.append(val)
    plt.plot(RMSE_train,label='train')
    plt.plot(RMSE_test,label='test')
    plt.legend()
    plt.show()
    print("Took {} seconds for training.".format(train_time.interval))

## 2.3 Predict test raiting

In [99]:
with Timer() as rating_time:
    predictions = predict(svd, test, usercol=header["col_user"], itemcol=header["col_item"])
predictions.head()

Unnamed: 0,customer_id,variant_id,prediction
0,0,433036,0.955443
1,10,433025,2.654011
2,10,422990,7.062648
3,14,433255,0.410683
4,37,422808,1.138263


## 2.4  recommend k items

In [100]:
# recommend items by batch ( bc it causes kernel failure ) 
train["group"] = train.groupby("customer_id").ngroup()
nb_user = len(train["customer_id"].unique())
top_k_list=[]

In [101]:
with Timer() as ranking_time:
    for i in tqdm(range(0,nb_user,100000)):
        train_subset = train[(train["group"]>=i) & (train["group"]<i+10000)]
        all_predictions = compute_ranking_predictions(svd, train_subset, usercol=header["col_user"], itemcol=header["col_item"], remove_seen=True)
        top_k_subset = get_top_k_items(all_predictions, col_user=header["col_user"], col_rating=header["col_prediction"], k=TOP_K)
        top_k_list.append(top_k_subset)
top_k = pd.concat(top_k_list)
top_k.head()

  0%|          | 0/15 [00:00<?, ?it/s]

Unnamed: 0,customer_id,variant_id,prediction,rank
0,0,422990,3.502466,1
1,0,433028,3.241559,2
2,0,431622,2.735629,3
3,0,425941,2.575728,4
4,0,425911,2.561913,5


# 3. Evaluate

## 3.1 Raiting metrics

In [102]:
args = [test, predictions]

kwargs = dict(col_user = header["col_user"],
              col_item = header["col_item"],
              col_rating = header["col_rating"],
              col_prediction = header["col_prediction"]
             )

eval_rmse = rmse(*args, **kwargs)
eval_mae = mae(*args, **kwargs)
eval_rsquared = rsquared(*args, **kwargs)
eval_exp_var = exp_var(*args, **kwargs)

rating_metrics = {
    "RMSE": eval_rmse,
    "MAE": eval_mae,
    "R2": eval_rsquared,
    "Explained Variance": eval_exp_var,
}

print("RMSE:\t\t%f" % eval_rmse,
      "MAE:\t\t%f" % eval_mae,
      "rsquared:\t%f" % eval_rsquared,
      "exp var:\t%f" % eval_exp_var, sep='\n')

RMSE:		2.107780
MAE:		0.736401
rsquared:	-0.058925
exp var:	-0.033717


## 3.2 Ranking metrics 

In [103]:
args = [test, top_k]

kwargs = dict(col_user = header["col_user"],
              col_item = header["col_item"],
              col_rating = header["col_rating"],
              col_prediction = header["col_prediction"],
              relevancy_method = 'top_k', 
              k = TOP_K)

eval_map = map_at_k(*args, **kwargs)
eval_ndcg = ndcg_at_k(*args, **kwargs)
eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)

ranking_metrics = {
    "MAP": eval_map,
    "nDCG@k": eval_ndcg,
    "Precision@k": eval_precision,
    "Recall@k": eval_recall
}

print(f"Model:",
      f"Top K:\t\t {TOP_K}",
      f"MAP:\t\t {eval_map:f}",
      f"NDCG:\t\t {eval_ndcg:f}",
      f"Precision@K:\t {eval_precision:f}",
      f"Recall@K:\t {eval_recall:f}",sep='\n')

Model:
Top K:		 10
MAP:		 0.007453
NDCG:		 0.010208
Precision@K:	 0.002405
Recall@K:	 0.014684


## 3.3 Diversity metrics

In [104]:
args = [train, top_k]

kwargs = dict(col_user = header["col_user"],
              col_item = header["col_item"],
             )

eval_diversity = diversity(*args, **kwargs)
eval_novelty = novelty(*args, **kwargs)
eval_distributional_coverage = distributional_coverage(*args, **kwargs)
eval_catalog_coverage = catalog_coverage(*args,**kwargs)

diversity_metrics = {
    "Diversity": eval_diversity,
    "Novelty": eval_novelty,
    "Distributional coverage": eval_distributional_coverage,
    "Catalog coverage": eval_catalog_coverage,
}
        
print(f"Model:",
      f"Diversity :\t\t\t {eval_diversity}",
      f"Novelty:\t\t\t {eval_novelty:f}",
      f"Catalog coverage:\t\t {eval_catalog_coverage:f}",
      f"Distributional coverage:\t {eval_distributional_coverage:f}",sep='\n')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reco_df[col_relevance] = 1.0


Model:
Diversity :			 0.9889497845639011
Novelty:			 11.154896
Catalog coverage:		 0.031075
Distributional coverage:	 5.524555


##### 4 Summary

In [105]:
summary = generate_summary(data_size,
                           algo,
                           TOP_K,
                           train_time, 
                           rating_time,
                           rating_metrics,
                           ranking_time,
                           ranking_metrics,
                           diversity_metrics)
df_results.loc[df_results.shape[0] + 1] = summary
df_results

Unnamed: 0,Data,Algo,K,Train time (s),Predicting time (s),RMSE,MAE,R2,Explained Variance,Recommending time (s),MAP,nDCG@k,Precision@k,Recall@k,Diversity,Novelty,Distributional coverage,Catalog coverage
1,all,svd,10,320.4409,15.2747,1933.422339,1933.379917,-890979.313,-38.101434,5.6938,0.003718,0.007425,0.002993,0.014763,0.987356,11.203618,9.916785,0.183836
2,all,svd,10,320.4409,15.2747,1933.422339,1933.379917,-890979.313,-38.101434,41811.3658,0.004625,0.007775,0.002376,0.014803,0.987503,11.206919,9.727227,0.1871
3,all,svd,10,2205.582,20.6284,2.10778,0.736401,-0.058925,-0.033717,5173.3098,0.007453,0.010208,0.002405,0.014684,0.98895,11.154896,5.524555,0.031075
