# Popular

In [6]:
# set the environment path to find Recommenders
import sys

import itertools
import logging
import os
import time
import random

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import papermill as pm
import datetime
from functools import wraps

from recommenders.datasets.python_splitters import python_stratified_split, python_chrono_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k, diversity, novelty, serendipity, distributional_coverage, catalog_coverage 
from recommenders.utils.timer import Timer

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))

System version: 3.6.13 | packaged by conda-forge | (default, Sep 23 2021, 07:55:15) 
[GCC Clang 11.1.0]
Pandas version: 1.1.5


In [7]:
#utils

# results table
cols = ["Data", "Algo", "K", "Train time (s)","Predicting time (s)", "RMSE", "MAE", "R2", "Explained Variance", "Recommending time (s)", "MAP", "nDCG@k", "Precision@k", "Recall@k","Diversity","Novelty","Distributional coverage","Catalog coverage"]
df_results = pd.DataFrame(columns=cols)

def generate_summary(data, algo, k, train_time, rating_time, rating_metrics, ranking_time, ranking_metrics, diversity_metrics):
    summary = {"Data": data, "Algo": algo, "K": k, "Train time (s)": train_time, "Predicting time (s)": rating_time, "Recommending time (s)":ranking_time}
    if rating_metrics is None:
        rating_metrics = {
            "RMSE": np.nan,
            "MAE": np.nan,
            "R2": np.nan,
            "Explained Variance": np.nan,
        }
    if ranking_metrics is None:
        ranking_metrics = {
            "MAP": np.nan,
            "nDCG@k": np.nan,
            "Precision@k": np.nan,
            "Recall@k": np.nan,
        }
    if diversity_metrics is None:
        diversity_metrics = {
        "Diversity": np.nan,
        "Novelty": np.nan,
        "Distributional coverage": np.nan,
        "Catalog coverage": np.nan,
    }
    summary.update(diversity_metrics)
    summary.update(rating_metrics)
    summary.update(ranking_metrics)

    return summary


def convert_timestamp(datetime):
    date_string = str(datetime)
    date = datetime.datetime.strptime(date_string, "%m/%d/%Y")
    timestamp = datetime.datetime.timestamp(date)
    return(timestamp)

def preprocess_data(df):
    # Convert the float precision to 32-bit in order to reduce memory consumption 
    df.loc[:, header["col_rating"]] = df[header["col_rating"]].astype(np.float32)
    df = df[[header["col_user"],header["col_item"],header["col_rating"]]]
    return df 

def timing(f):
    @wraps(f)
    def wrap(*args, **kw):
        ts = time()
        result = f(*args, **kw)
        te = time()
        arg = args[0] if len(args)>=1 else "" 
        print('func:%r  took: %2.4f sec' % \
          (f.__name__, te-ts))
        return result
    return wrap

# 0. Config params

In [31]:
# table results 
algo = "popular"
ranking_metrics = None
rating_metrics = None
diversity_metrics = None
train_time = np.nan
rating_time = np.nan
ranking_time = np.nan

# column name 
header = {
    "col_user": "customer_id",
    "col_item": "variant_id",
    "col_rating": "quantity",
    "col_timestamp": "order_date",
    "col_prediction": "prediction",
}

# top k
TOP_K = 10

################ TO MODIFY ################

# date size with 3 choices : "100k","1M" and "all"
data_size = "all"
# load splitted data 
load_splitted_data = True 

################ TO MODIFY ################

# 1. Data

## 1.1 Load Data

In [32]:
########### TO MODIFY ###########
def load_data(data_size):
    path = ""
    if data_size=="100k":
        path = '../../data/transaction_100k_df.pkl'
    elif data_size=="1M":
        path = '../../data/transaction_1M_df.pkl'
    elif data_size=="all":
        path = '../../data/transaction_all_df.pkl'
    
    if path != "":
        return pd.read_pickle(path)
    else :
        print("Please choose between 100k, 1M and all")
########### TO MODIFY ###########

In [33]:
# 2 ways to load the data
if not load_splitted_data : 
    # data not splitted 
    data = load_data(data_size)
else :
    # or  use stored splitted data to make it faster
    train = pd.read_pickle(f"../../data/train_{data_size}_df.pkl")
    test = pd.read_pickle(f"../../data/test_{data_size}_df.pkl")
    train.shape[0], test.shape[0]

## 1.2 Split the data ( skip if load_splitted_data )

In [34]:
# chrono split but it is really slow ( +1h to split 8M data ) 
if not load_splitted_data :
    train, test = python_chrono_split(data,
                                      ratio=0.75,
                                      col_user=header["col_user"],
                                      col_item=header["col_item"],
                                      col_timestamp = header["col_timestamp"]
                                     )
    train.to_pickle(f"../../data/train_{data_size}_df.pkl")
    test.to_pickle(f"../../data/test_{data_size}_df.pkl")
    train.shape[0], test.shape[0]

In [35]:
print("""
Train:
Total Ratings: {train_total}
Unique Users: {train_users}
Unique Items: {train_items}

Test:
Total Ratings: {test_total}
Unique Users: {test_users}
Unique Items: {test_items}
""".format(
    train_total=len(train),
    train_users=len(train[header["col_user"]].unique()),
    train_items=len(train[header["col_item"]].unique()),
    test_total=len(test),
    test_users=len(test[header["col_user"]].unique()),
    test_items=len(test[header["col_item"]].unique()),
))


Train:
Total Ratings: 6041296
Unique Users: 1497612
Unique Items: 7659

Test:
Total Ratings: 1799883
Unique Users: 869943
Unique Items: 7022



## 1.3 Process data

In [36]:
train = preprocess_data(train)
test = preprocess_data(test)

# 2. Model

## 2.3  recommend k items

In [37]:
with Timer() as ranking_time:
    # get the 100 most popular items 
    top = train.groupby(header["col_item"]).agg({header["col_rating"]:"sum"}).sort_values(by=[header["col_rating"]],ascending=False).reset_index()
    top = top.head(100)[header["col_item"]].tolist()

    users = []
    items = []
    list_users = list(train[header["col_user"]].unique())

    for user in tqdm(list_users):
        users += [user]*100
        items += top

    top_all = pd.DataFrame({header["col_user"]:users,header["col_item"]:items})

    # remove seen items and 
    top_k = pd.merge(train, top_all, on = [header["col_user"],header["col_item"]],how="outer",indicator=True)
    top_k = top_k[top_k['_merge']=='right_only']

    # select the 10 most popular items
    top_k = top_k.groupby(header["col_user"]).head(TOP_K)
    top_k[header["col_prediction"]] = top_k.groupby(header["col_user"], sort=False).cumcount() + 1
    top_k = top_k.drop(header["col_rating"],axis=1)

  0%|          | 0/1497612 [00:00<?, ?it/s]

# 3. Evaluate

## 3.1 Ranking metrics 

In [38]:
args = [test, top_k]

kwargs = dict(col_user = header["col_user"],
              col_item = header["col_item"],
              col_rating= header["col_rating"],
              col_prediction= header["col_prediction"],
              relevancy_method='top_k', 
              k=TOP_K)

eval_map = map_at_k(*args, **kwargs)
eval_ndcg = ndcg_at_k(*args, **kwargs)
eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)

ranking_metrics = {
    "MAP": eval_map,
    "nDCG@k": eval_ndcg,
    "Precision@k": eval_precision,
    "Recall@k": eval_recall,
}

print(f"Model:",
      f"Top K:\t\t {TOP_K}",
      f"MAP:\t\t {eval_map:f}",
      f"NDCG:\t\t {eval_ndcg:f}",
      f"Precision@K:\t {eval_precision:f}",
      f"Recall@K:\t {eval_recall:f}",sep='\n')

Model:
Top K:		 10
MAP:		 0.009730
NDCG:		 0.020413
Precision@K:	 0.007956
Recall@K:	 0.047232


## 3.2 Diversity metrics

In [39]:
args = [train, top_k]

kwargs = dict(col_user = header["col_user"],
              col_item = header["col_item"],
             )

eval_diversity = diversity(*args, **kwargs)
eval_novelty = novelty(*args, **kwargs)
eval_distributional_coverage = distributional_coverage(*args, **kwargs)
eval_catalog_coverage = catalog_coverage(*args,**kwargs)

diversity_metrics = {
    "Diversity": eval_diversity,
    "Novelty": eval_novelty,
    "Distributional coverage": eval_distributional_coverage,
    "Catalog coverage": eval_catalog_coverage,
}
        
print(f"Model:",
      f"Diversity :\t {eval_diversity}",
      f"Novelty:\t {eval_novelty:f}",
      f"Distributional Coverage:\t {eval_distributional_coverage:f}",
      f"Catalog Coverage:\t {eval_catalog_coverage:f}", sep='\n')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reco_df[col_relevance] = 1.0


Model:
Diversity :	 0.9687655323330641
Novelty:	 7.961962
Distributional Coverage:	 3.409857
Catalog Coverage:	 0.002481


# 4. Results

In [40]:
summary = generate_summary(data_size,
                           algo,
                           TOP_K,
                           train_time, 
                           rating_time,
                           rating_metrics,
                           ranking_time,
                           ranking_metrics,
                           diversity_metrics)
df_results.loc[df_results.shape[0] + 1] = summary
df_results

Unnamed: 0,Data,Algo,K,Train time (s),Predicting time (s),RMSE,MAE,R2,Explained Variance,Recommending time (s),MAP,nDCG@k,Precision@k,Recall@k,Diversity,Novelty,Distributional coverage,Catalog coverage
1,100k,popular,10,,,,,,,0.2397,0.004115,0.045793,0.049583,0.013647,0.725257,8.942318,3.98002,0.004488
2,1M,popular,10,,,,,,,2.3132,0.003371,0.016208,0.019308,0.018529,0.877848,8.577696,3.61935,0.003416
3,all,popular,10,,,,,,,322.5469,0.00973,0.020413,0.007956,0.047232,0.968766,7.961962,3.409857,0.002481
