In [2]:
from surprise import Reader, Dataset, SVD, accuracy, KNNBasic, BaselineOnly, NormalPredictor, KNNWithMeans, SVDpp
from surprise.model_selection import cross_validate, train_test_split
from surprise.accuracy import rmse, mae, mse
from surprise.similarities import cosine
from surprise.prediction_algorithms.predictions import Prediction
from collections import namedtuple, defaultdict
from scipy.stats import gaussian_kde
from scipy.integrate import quad
import pandas as pd
import itertools
import numpy as np
from matplotlib import pyplot as plt
import os
import seaborn as sns
import time

In [3]:
# Fit the SVD recommender on the full ratings frame and report how long it took.
# NOTE(review): `values` is not defined in the visible cells -- presumably a
# (user, item, rating) DataFrame built earlier; confirm the column order that
# Dataset.load_from_df expects.
start = time.perf_counter()  # monotonic clock: immune to system clock adjustments
reader = Reader(rating_scale=(values.rating.min(), values.rating.max()))
data = Dataset.load_from_df(values, reader)
train_set = data.build_full_trainset()
# A fixed seed makes the random factor initialisation -- and therefore every
# downstream recommendation -- reproducible after Restart & Run All.
algo = SVD(n_factors=200, n_epochs=40, random_state=42)
algo.fit(train_set)
end = time.perf_counter()
print(end - start)

19.80806303024292


In [3]:
# Cross-validate on the first 100,000 rows only, to keep the five fits cheap.
# A fresh SVD with the same hyper-parameters is evaluated instead of `algo`:
# cross_validate() calls fit() on the estimator it receives (in-process with the
# default n_jobs=1), so passing `algo` would leave it trained on one fold of
# this subset rather than on the full training set the recommendation helpers
# rely on.
cv_algo = SVD(n_factors=algo.n_factors, n_epochs=algo.n_epochs)
data = Dataset.load_from_df(values[:100000], reader)
y = cross_validate(cv_algo, data, verbose=True, measures=['rmse'])

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    2.5800  2.6034  2.5399  2.6034  2.6973  2.6048  0.0518  
Fit time          3.02    2.81    2.64    3.96    2.92    3.07    0.46    
Test time         0.12    0.21    0.11    0.18    0.11    0.15    0.04    


In [4]:
def get_top_n(predictions, n=50):
    """Return the top-N recommendations for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm. Each entry must
            unpack into five fields: (uid, iid, true rating, estimate, details).
        n(int): The number of recommendations to output for each user. Default
            is 50.

    Returns:
    A defaultdict(list) where keys are user (raw) ids and values are lists of
        tuples [(raw item id, rating estimation), ...] holding at most n
        entries, ordered by estimation from highest to lowest. Items with equal
        estimations keep the order in which they appeared in ``predictions``.
        Because it is a defaultdict, looking up an unknown user yields [].
    """

    # First map the predictions to each user. Only the ids and the estimate
    # are needed; the true rating and the details are ignored.
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        top_n[uid].append((iid, est))

    # Then rank each user's items and keep the n best. Re-assigning an existing
    # key does not change the dict's size, so it is safe while iterating.
    for uid, user_ratings in top_n.items():
        top_n[uid] = sorted(user_ratings, key=lambda pair: pair[1], reverse=True)[:n]

    return top_n

def get_user_top_n(uid, n=50):
    """Return the n items with the highest estimated rating for user ``uid``.

    Every column label of the module-level ``matrix`` is scored with the
    module-level ``algo`` (expected to be fitted already); nothing is filtered
    out, so items the user has already interacted with can be returned too.

    Args:
        uid: Raw id of the user to recommend for.
        n(int): Maximum number of items to return. Default is 50.

    Returns:
        numpy.ndarray with the raw item ids, best estimate first.
    """
    predictions = [algo.predict(uid, iid) for iid in matrix.columns]
    # get_top_n groups by user; all predictions here belong to `uid`.
    ranked = get_top_n(predictions, n=n)[uid]
    return np.array([iid for iid, _est in ranked])

def get_user_top_n_2(uid, n=50):
    """Return the n best items for user ``uid``, preferring known values.

    For every column label of the module-level ``matrix``: when the cell
    ``matrix.loc[uid, iid]`` is NaN the score is the estimate from the
    module-level ``algo``; otherwise the stored value itself is used as the
    score (it is placed in both the true-rating and estimate fields).

    Args:
        uid: Raw id of the user; must be a row label of ``matrix``.
        n(int): Maximum number of items to return. Default is 50.

    Returns:
        numpy.ndarray with the raw item ids, highest score first.
    """
    predictions = []
    for iid in matrix.columns:
        # Read the cell once; the original looked it up as many as three times.
        known = matrix.loc[uid, iid]
        if np.isnan(known):
            prediction = algo.predict(uid, iid)
        else:
            prediction = Prediction(uid, iid, known, known, {})
        predictions.append(prediction)
    ranked = get_top_n(predictions, n=n)[uid]
    return np.array([iid for iid, _score in ranked])

def get_user_top_n_raw(uid, n=50):
    """Baseline recommender: product ids of the user's first n rows in ``train``.

    Rows are taken in the order they are stored in the module-level ``train``
    frame and are not de-duplicated, so the result may contain repeated ids.

    Args:
        uid: Value matched against ``train.customer_id``.
        n(int): Maximum number of rows to take. Default is 50.

    Returns:
        numpy.ndarray of ``product_id`` values (empty when the user has no rows).
    """
    user_products = train.loc[train.customer_id == uid, 'product_id']
    # .iloc makes the positional slice explicit regardless of the index labels.
    return np.array(user_products.iloc[:n])

def get_recall(comprados, recomendados):
    """Return the fraction of distinct purchased items that were recommended.

    Args:
        comprados: Array-like of purchased item ids (duplicates are ignored).
        recomendados: Array-like of recommended item ids (duplicates are ignored).

    Returns:
        float in [0, 1]. By convention 1.0 is returned when there are no
        purchases, since there was nothing to miss.
    """
    # De-duplicate the purchases so the denominator counts the same thing as
    # the numerator: np.intersect1d already returns unique values, so repeated
    # purchased ids would otherwise deflate the recall.
    purchased = np.unique(np.asarray(comprados))
    if purchased.size == 0:
        return 1.
    hits = np.intersect1d(purchased, recomendados)
    return hits.size / purchased.size

def get_user_comprados(uid, dataset):
    """
    Return the distinct products that user ``uid`` bought in ``dataset``
    (for example the train or the test frame).
    """
    belongs_to_user = dataset.customer_id == uid
    return dataset.loc[belongs_to_user, 'product_id'].unique()

def get_user_recall_submission(uid, comprados):
    """Return the recall of the submitted recommendations for user ``uid``.

    The row ``submission.loc[uid]`` is split on whitespace and entry 0 of the
    result is converted to an integer array of recommended ids, which is then
    scored against ``comprados`` with ``get_recall``.
    """
    # NOTE(review): presumably `submission` has one text column holding
    # space-separated product ids per customer -- confirm; `[0]` selects the
    # first entry of the split row.
    id_tokens = submission.loc[uid].str.split()[0]
    recomendados_submission = np.array(id_tokens).astype('int')
    return get_recall(comprados, recomendados_submission)

In [None]:
# Compare the recall of every recommender on the most active test customers.
start = time.perf_counter()  # monotonic clock for elapsed-time measurement
n = 20
recall_sum = 0
recall_sum_2 = 0
recall_sum_raw = 0
recall_submission_sum = 0
recall_kde_sum = 0
# Evaluate the (up to) n customers with the most rows in `test`.
# Earlier variants drew the uids from `submission.index` instead (a random
# sample, or simply the first n).
uids = test.groupby(['customer_id']).count().sort_values('fecha_compra').tail(n).index
for uid in uids:
    comprados = get_user_comprados(uid, test)
    recomendados_kde = get_kde_recomendados(uid, test_fecha_min, test_fecha_max)
    recomendados = get_user_top_n(uid)
    recomendados_2 = get_user_top_n_2(uid)
    recomendados_raw = get_user_top_n_raw(uid)
    recall = get_recall(comprados, recomendados)
    recall_2 = get_recall(comprados, recomendados_2)
    recall_raw = get_recall(comprados, recomendados_raw)
    recall_submission = get_user_recall_submission(uid, comprados)
    recall_kde = get_recall(comprados, recomendados_kde)
    recall_sum += recall
    recall_sum_raw += recall_raw
    recall_sum_2 += recall_2
    recall_submission_sum += recall_submission
    recall_kde_sum += recall_kde
# Average over the users actually evaluated: tail(n) returns fewer than n rows
# when `test` has fewer than n customers, and dividing by n would then deflate
# every mean. max(..., 1) keeps the empty case printing zeros instead of raising.
n_users = max(len(uids), 1)
print(f'average: {recall_sum/n_users}, average_2: {recall_sum_2/n_users}, average_raw: {recall_sum_raw/n_users}, average_submission: {recall_submission_sum/n_users}, avg kde: {recall_kde_sum/n_users}')
end = time.perf_counter()
print(end - start)

In [11]:
# Sanity check: np.intersect1d returns the sorted, de-duplicated common values,
# so ids repeated in either input are counted only once (this is the call
# get_recall uses for its numerator).
np.intersect1d([1,2,3,4],[1,2,2,2])

array([1, 2])

In [15]:
# Display the `train` DataFrame to inspect its columns and values.
train

Unnamed: 0,fecha_compra,customer_id,product_id,type_id,business_id,channel_id,cantidad_venta,monto_venta_transf,peso_venta_transf,date_value,cantidad_venta_normalizado,rating
2106567,2023-01-26,1,743,BO,21,A,8.0,1.948278,0.926373,0.993902,0.999089,0.992997
2106931,2023-01-26,1,285,BO,21,A,8.0,2.376726,0.521085,0.993902,0.999089,0.992997
2106932,2023-01-26,1,294,BO,21,A,8.0,2.376726,0.521085,0.993902,0.999089,0.992997
2106933,2023-01-26,1,343,BO,21,A,8.0,2.376726,0.521085,0.993902,0.999089,0.992997
2082638,2023-01-21,1,99,BO,21,V,12.0,14.005601,0.486346,0.990515,0.999983,0.990498
...,...,...,...,...,...,...,...,...,...,...,...,...
1954417,2022-12-31,15312,672,GA,51,B,1.0,13.076333,3.223488,0.976287,0.500000,0.488144
1954523,2022-12-31,15312,149,GA,51,B,1.0,13.840591,1.100068,0.976287,0.500000,0.488144
1954591,2022-12-31,15312,188,GA,51,B,1.0,14.573005,2.894916,0.976287,0.500000,0.488144
1955304,2022-12-31,15312,34,GA,51,B,1.0,53.602256,5.300590,0.976287,0.500000,0.488144
