# T_SVD 

In [2]:
import sys

from sklearn.decomposition import TruncatedSVD
import sklearn
import pandas as pd
import numpy as np
from functools import wraps
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split, python_chrono_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var,
                                                       map_at_k, ndcg_at_k, precision_at_k, recall_at_k,
                                                       diversity, novelty, distributional_coverage, catalog_coverage )

# Record the interpreter version this notebook was executed with.
print(f"System version: {sys.version}")

System version: 3.6.13 | packaged by conda-forge | (default, Sep 23 2021, 07:55:15) 
[GCC Clang 11.1.0]


In [3]:
#utils

# Results table: one row per benchmark run, columns grouped by metric family.
cols = [
    # run identification
    "Data",
    "Algo",
    "K",
    # rating prediction
    "Train time (s)",
    "Predicting time (s)",
    "RMSE",
    "MAE",
    "R2",
    "Explained Variance",
    # top-k ranking
    "Recommending time (s)",
    "MAP",
    "nDCG@k",
    "Precision@k",
    "Recall@k",
    # diversity
    "Diversity",
    "Novelty",
    "Distributional coverage",
    "Catalog coverage",
]
df_results = pd.DataFrame(columns=cols)

def generate_summary(data, algo, k, train_time, rating_time, rating_metrics, ranking_time, ranking_metrics, diversity_metrics):
    """Assemble one results row as a dict.

    Args:
        data: Value stored under "Data".
        algo: Value stored under "Algo".
        k: Value stored under "K".
        train_time: Value stored under "Train time (s)".
        rating_time: Value stored under "Predicting time (s)".
        rating_metrics: Metrics merged into the row, or None to fill
            RMSE / MAE / R2 / Explained Variance with NaN.
        ranking_time: Value stored under "Recommending time (s)".
        ranking_metrics: Metrics merged into the row, or None to fill
            MAP / nDCG@k / Precision@k / Recall@k with NaN.
        diversity_metrics: Metrics merged into the row, or None to fill
            Diversity / Novelty / Distributional coverage / Catalog coverage
            with NaN.

    Returns:
        dict: The run description followed by the diversity, rating and
        ranking metrics, merged in that order.
    """
    summary = {
        "Data": data,
        "Algo": algo,
        "K": k,
        "Train time (s)": train_time,
        "Predicting time (s)": rating_time,
        "Recommending time (s)": ranking_time,
    }

    # (supplied metrics, names to fill with NaN when nothing was supplied),
    # listed in the order in which they are merged into the row.
    metric_groups = (
        (diversity_metrics, ("Diversity", "Novelty", "Distributional coverage", "Catalog coverage")),
        (rating_metrics, ("RMSE", "MAE", "R2", "Explained Variance")),
        (ranking_metrics, ("MAP", "nDCG@k", "Precision@k", "Recall@k")),
    )
    for supplied, placeholder_names in metric_groups:
        # Only None triggers the placeholders; an empty mapping is merged as-is.
        summary.update(dict.fromkeys(placeholder_names, np.nan) if supplied is None else supplied)

    return summary


def convert_timestamp(datetime):
    """Convert a ``%m/%d/%Y`` date into a POSIX timestamp.

    Args:
        datetime: Any object whose ``str()`` is a date formatted as
            month/day/year, e.g. ``"01/31/2020"``.

    Returns:
        float: Seconds since the epoch for midnight of that date. The parsed
        date is naive, so it is interpreted in the local timezone.

    Raises:
        ValueError: If the string does not match ``%m/%d/%Y``.
    """
    # The parameter name shadows the stdlib module, so import the module under
    # an alias here; the previous code looked up `.datetime` on the argument.
    import datetime as _datetime

    parsed = _datetime.datetime.strptime(str(datetime), "%m/%d/%Y")
    return parsed.timestamp()

def preprocess_data(df):
    """Keep the user, item and rating columns and store the rating as float32.

    The column names are read from the module-level ``header`` mapping
    (``col_user``, ``col_item``, ``col_rating``).

    Args:
        df: Interaction frame containing at least those three columns.

    Returns:
        pandas.DataFrame: A new frame with the three columns in the order
        user, item, rating. The input frame is left untouched.
    """
    rating_col = header["col_rating"]
    wanted = [header["col_user"], header["col_item"], rating_col]
    # Cast on the selected copy instead of assigning through `df.loc[:, col]`:
    # that assignment modified the caller's frame and, on recent pandas
    # versions, writes into the existing column so the 32-bit cast is lost.
    return df[wanted].astype({rating_col: np.float32})

def timing(f):
    """Decorator that prints how long each call to ``f`` took.

    Args:
        f: The callable to wrap.

    Returns:
        A wrapper with the same signature and metadata as ``f`` that prints
        ``func:<name>  took: <seconds> sec`` after every call and returns
        whatever ``f`` returned.
    """
    # Imported here because the file has no module-level import of `time`;
    # the previous body called an undefined `time()`.
    from time import perf_counter

    @wraps(f)
    def wrap(*args, **kw):
        started = perf_counter()
        result = f(*args, **kw)
        elapsed = perf_counter() - started
        print('func:%r  took: %2.4f sec' % (f.__name__, elapsed))
        return result
    return wrap

def _list_similar_products(product_id, utility_matrix, correlation_matrix,k,all_items_bought=None):
    """Return the ``k`` products most correlated with ``product_id``.

    Args:
        product_id: Label of the reference product; must be present in
            ``utility_matrix.index``.
        utility_matrix: Object whose ``index`` lists the product labels in the
            same order as the rows of ``correlation_matrix``.
        correlation_matrix: Indexable by row position; row ``i`` holds the
            scores of product ``i`` against every product.
        k: Maximum number of results.
        all_items_bought: Optional container of product labels to exclude.
            ``None`` (the default) excludes nothing.

    Returns:
        list[tuple]: ``(product_label, score)`` pairs sorted by descending
        score, at most ``k`` long. The reference product itself is included
        unless it is excluded through ``all_items_bought``.

    Raises:
        ValueError: If ``product_id`` is not in ``utility_matrix.index``.
    """
    # A fresh set per call instead of a shared mutable default argument.
    excluded = set() if all_items_bought is None else all_items_bought
    product_names = list(utility_matrix.index)
    scores = correlation_matrix[product_names.index(product_id)]
    candidates = [
        (name, score)
        for name, score in zip(product_names, scores)
        if name not in excluded
    ]
    # list.sort is stable, so ties keep the order of utility_matrix.index.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:k]

def get_top_k_items_t_svd(train,utility_matrix,correlation_matrix,col_user,col_item,col_rating,col_prediction,k,remove_seen=True):
    """Recommend up to ``k`` items per user from item-to-item correlations.

    For every user in ``train`` the row with the highest ``col_rating`` is
    taken as the anchor item (the last such row when several tie), and the
    items most correlated with that anchor are recommended.

    Args:
        train: Interaction frame with ``col_user``, ``col_item`` and
            ``col_rating`` columns.
        utility_matrix: Passed to ``_list_similar_products``; its index lists
            the item labels.
        correlation_matrix: Passed to ``_list_similar_products``; item-to-item
            scores aligned with ``utility_matrix.index``.
        col_user: Name of the user column.
        col_item: Name of the item column.
        col_rating: Name of the rating column.
        col_prediction: Name of the score column in the result.
        k: Maximum number of recommendations per user.
        remove_seen: When True, items the user already has in ``train`` are
            not recommended.

    Returns:
        pandas.DataFrame: One row per recommendation with the columns
        ``col_user``, ``col_item`` and ``col_prediction``.
    """
    # Anchor item per user: a row holding that user's maximum rating.
    is_user_max = train.groupby(col_user)[col_rating].transform("max") == train[col_rating]
    anchors = train[is_user_max].drop_duplicates(subset=col_user, keep="last")

    # Collect each user's items in a single pass over train, instead of
    # filtering the whole frame once per user inside the loop below.
    seen_by_user = {}
    if remove_seen:
        for user, item in zip(train[col_user], train[col_item]):
            seen_by_user.setdefault(user, set()).add(item)

    users = []
    items = []
    scores = []
    user_anchor_pairs = zip(anchors[col_user], anchors[col_item])
    for customer, top_product in tqdm(user_anchor_pairs, total=len(anchors), desc='Customer'):
        all_items_bought = seen_by_user.get(customer, set())
        similar = _list_similar_products(top_product, utility_matrix, correlation_matrix, k, all_items_bought)
        for item, score in similar:
            users.append(customer)
            items.append(item)
            scores.append(score)

    return pd.DataFrame({col_user: users, col_item: items, col_prediction: scores})

# 0. Config params

In [4]:
# Defaults for the results row; they are reported as-is unless a later cell
# rebinds them.
algo = "t_svd"
ranking_metrics = rating_metrics = diversity_metrics = None
train_time = rating_time = ranking_time = np.nan

# Names of the columns in the transaction data and in the recommendations.
header = dict(
    col_user="customer_id",
    col_item="variant_id",
    col_rating="quantity",
    col_timestamp="order_date",
    col_prediction="prediction",
)

# Number of recommendations per user.
TOP_K = 10

################ TO MODIFY ################

# Data size, one of three choices: "100k", "1M" or "all".
data_size = "all"
# When True, load the already-split train/test pickles instead of splitting.
load_splitted_data = True

################ TO MODIFY ################

# 1. Data

# 1.1 Load data 

In [5]:
########### TO MODIFY ###########
def load_data(data_size):
    """Load the pickled transaction frame for the requested data size.

    Args:
        data_size: One of "100k", "1M" or "all".

    Returns:
        The object stored in the matching pickle. For any other value a hint
        is printed and None is returned.
    """
    known_files = (
        ("100k", '../../data/transaction_100k_df.pkl'),
        ("1M", '../../data/transaction_1M_df.pkl'),
        ("all", '../../data/transaction_all_df.pkl'),
    )
    for size_label, pickle_path in known_files:
        if data_size == size_label:
            return pd.read_pickle(pickle_path)
    print("Please choose between 100k, 1M and all")
########### TO MODIFY ###########

In [6]:
# Two ways to get the data: reuse the stored train/test split (faster), or
# load the unsplit transactions so that the split cell can process them.
if load_splitted_data:
    train_pickle = f"../../data/train_{data_size}_df.pkl"
    test_pickle = f"../../data/test_{data_size}_df.pkl"
    train = pd.read_pickle(train_pickle)
    test = pd.read_pickle(test_pickle)
    train.shape[0], test.shape[0]
else:
    data = load_data(data_size)

## 1.2 Split the data ( skip if load_splitted_data )

In [7]:
# Chronological split; it is really slow (more than 1h to split 8M rows), so
# the result is pickled for later runs.
if not load_splitted_data:
    split_options = {
        "ratio": 0.75,
        "col_user": header["col_user"],
        "col_item": header["col_item"],
        "col_timestamp": header["col_timestamp"],
    }
    train, test = python_chrono_split(data, **split_options)

    train_pickle = f"../../data/train_{data_size}_df.pkl"
    test_pickle = f"../../data/test_{data_size}_df.pkl"
    train.to_pickle(train_pickle)
    test.to_pickle(test_pickle)
    train.shape[0], test.shape[0]

In [8]:
# Row count plus number of distinct users and items for each split.
split_stats = {}
for split_name, frame in (("train", train), ("test", test)):
    split_stats[split_name + "_total"] = len(frame)
    split_stats[split_name + "_users"] = len(frame[header["col_user"]].unique())
    split_stats[split_name + "_items"] = len(frame[header["col_item"]].unique())

print("""
Train:
Total Ratings: {train_total}
Unique Users: {train_users}
Unique Items: {train_items}

Test:
Total Ratings: {test_total}
Unique Users: {test_users}
Unique Items: {test_items}
""".format(**split_stats))


Train:
Total Ratings: 6041296
Unique Users: 1497612
Unique Items: 7659

Test:
Total Ratings: 1799883
Unique Users: 869943
Unique Items: 7022



## 1.3 Process data

In [9]:
# Apply the same preprocessing to both splits (train first, then test).
train, test = preprocess_data(train), preprocess_data(test)

# 2. Model

## 2.1 Define model

In [10]:
# Build the user x item pivot chunk by chunk in order to prevent the int32
# overflow hit when pivoting the whole frame at once (pivot_table does not
# cope with it).
# NOTE(review): the result is still a dense frame of users x items; for the
# largest data size this may not fit in memory and a sparse representation may
# be needed - confirm before running on "all".
chunk_size = 5000
chunks = list(range(0, train.shape[0], chunk_size))

partial_pivots = []
for start in tqdm(chunks):
    # iloc slices are end-exclusive, so [start, start + chunk_size) covers
    # every row exactly once, including the last, possibly shorter, chunk.
    chunk_df = train.iloc[start:start + chunk_size]
    interactions = (
        chunk_df.groupby([header["col_user"], header["col_item"]])[header["col_rating"]]
        .sum()
        .unstack()
    )
    partial_pivots.append(interactions)

if partial_pivots:
    # One concat instead of appending inside the loop. A user can show up in
    # several chunks and every chunk only has columns for its own items, so
    # summing per user merges the duplicate rows and turns the NaN introduced
    # by the column alignment into 0.
    ratings_utility_matrix = (
        pd.concat(partial_pivots, sort=False)
        .groupby(level=0)
        .sum()
    )
else:
    ratings_utility_matrix = pd.DataFrame()

#ratings_utility_matrix = train.pivot_table(values=header["col_rating"], index= header["col_user"],
#                                                         columns=header["col_item"], fill_value=0)
# Items as rows, users as columns.
utility_matrix = ratings_utility_matrix.T
t_svd = TruncatedSVD(n_components=3)

  0%|          | 0/1208 [00:00<?, ?it/s]

KeyboardInterrupt: 

## 2.2 Train model

In [None]:
# Fit the truncated SVD on utility_matrix and keep the transformed matrix.
# The Timer context manager wrapping the call is bound to train_time.
with Timer() as train_time:
    decomposed_matrix = t_svd.fit_transform(utility_matrix)

## 2.4  recommend k items

In [None]:
# Time both the correlation step and the per-user top-k lookup.
with Timer() as ranking_time:
    # np.corrcoef correlates the rows of decomposed_matrix with each other.
    correlation_matrix = np.corrcoef(decomposed_matrix)
    print("correlation matrix computed")

    recommend_options = {
        "col_user": header["col_user"],
        "col_item": header["col_item"],
        "col_rating": header["col_rating"],
        "col_prediction": header["col_prediction"],
        "k": TOP_K,
        "remove_seen": True,
    }
    top_k = get_top_k_items_t_svd(train, utility_matrix, correlation_matrix, **recommend_options)

# 3. Evaluate

## 3.1 Ranking metrics 

In [None]:
# Every ranking metric is called with the same arguments: the test split and
# the recommendations, plus the column names and the cut-off.
args = [test, top_k]

kwargs = {
    "col_user": header["col_user"],
    "col_item": header["col_item"],
    "col_rating": header["col_rating"],
    "relevancy_method": 'top_k',
    "k": TOP_K,
}

eval_map = map_at_k(*args, **kwargs)
eval_ndcg = ndcg_at_k(*args, **kwargs)
eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)

# Keys match the ranking columns of the results table.
ranking_metrics = dict(
    zip(
        ("MAP", "nDCG@k", "Precision@k", "Recall@k"),
        (eval_map, eval_ndcg, eval_precision, eval_recall),
    )
)

report_lines = [
    "Model:",
    f"Top K:\t\t {TOP_K}",
    f"MAP:\t\t {eval_map:f}",
    f"NDCG:\t\t {eval_ndcg:f}",
    f"Precision@K:\t {eval_precision:f}",
    f"Recall@K:\t {eval_recall:f}",
]
print("\n".join(report_lines))

## 3.2 Diversity metrics

In [None]:
# Every diversity metric is called with the same arguments: the train split
# and the recommendations, plus the user and item column names.
args = [train, top_k]

kwargs = {
    "col_user": header["col_user"],
    "col_item": header["col_item"],
}

eval_diversity = diversity(*args, **kwargs)
eval_novelty = novelty(*args, **kwargs)
eval_distributional_coverage = distributional_coverage(*args, **kwargs)
eval_catalog_coverage = catalog_coverage(*args, **kwargs)

# Keys match the diversity columns of the results table.
diversity_metrics = dict(
    zip(
        ("Diversity", "Novelty", "Distributional coverage", "Catalog coverage"),
        (eval_diversity, eval_novelty, eval_distributional_coverage, eval_catalog_coverage),
    )
)

report_lines = [
    "Model:",
    f"Diversity :\t\t\t {eval_diversity}",
    f"Novelty:\t\t\t {eval_novelty:f}",
    f"Catalog coverage:\t\t {eval_catalog_coverage:f}",
    f"Distributional coverage:\t {eval_distributional_coverage:f}",
]
print("\n".join(report_lines))

# 4 Summary

In [None]:
# train_time / rating_time / ranking_time are either the NaN placeholders or
# the Timer objects bound by the `with Timer() as ...` cells. Store the
# elapsed seconds (Timer.interval) so the "(s)" columns hold numbers rather
# than Timer objects; values without an `interval` attribute pass through.
summary = generate_summary(
    data_size,
    algo,
    TOP_K,
    getattr(train_time, "interval", train_time),
    getattr(rating_time, "interval", rating_time),
    rating_metrics,
    getattr(ranking_time, "interval", ranking_time),
    ranking_metrics,
    diversity_metrics,
)
# Append as a new row; row labels start at 1.
df_results.loc[df_results.shape[0] + 1] = summary
df_results