In [2]:
from surprise import Reader, Dataset, SVD, accuracy, KNNBasic, BaselineOnly, NormalPredictor, KNNWithMeans, SVDpp, Trainset
from surprise.model_selection import cross_validate, train_test_split, KFold
from surprise.accuracy import rmse, mae, mse
from surprise.similarities import cosine
from surprise.prediction_algorithms.predictions import Prediction
from collections import namedtuple, defaultdict
from scipy.stats import gaussian_kde, norm
from scipy.integrate import quad
import pandas as pd
import itertools
import datetime
import numpy as np
from matplotlib import pyplot as plt
import os
import seaborn as sns
import time
import csv
import heapq

In [3]:
pd.set_option('display.precision', 15)
submit = True

# Preprocessing

In [119]:
def logistic_function(x, k, x_0, L):
    """
    k: factor de crecimiento
    x_0: Valor de x donde el resultado sera L/2
    L: valor maximo de la funcion
    """
    return L / (1 + np.exp(-k * (x - x_0)))

train = pd.read_csv('train.csv', sep='|', parse_dates=['fecha_compra'])
train['date_value'] = (train.fecha_compra - train.fecha_compra.min()) / (train.fecha_compra.max() - train.fecha_compra.min()) / 4 + 0.75
# df_max = train.groupby('customer_id').agg(_max=('cantidad_venta', 'max'))
# train = pd.merge(train, df_max, how='left', left_on='customer_id', right_index=True)
# train.cantidad_venta = (train.cantidad_venta) / (train['_max')
train['cantidad_venta_normalizado'] = train.cantidad_venta.apply(lambda x: logistic_function(x, k=2, x_0=0.3, L=1))
train['rating'] = train.cantidad_venta_normalizado * train.date_value
if submit:
    test_fecha_min, test_fecha_max = pd.to_datetime('2023-02-06'), pd.to_datetime('2023-02-12')
else:
    test = train[train.fecha_compra > '2023-01-27']
    test_fecha_min, test_fecha_max = test.fecha_compra.min(), test.fecha_compra.max()
    train = train[train.fecha_compra <= '2023-01-27']
train.sort_values(by=['customer_id', 'product_id', 'fecha_compra'], ascending=[True, True, True], inplace=True)
# train['day'] = train.fecha_compra.dt.day
# train['month'] = train.fecha_compra.dt.month
# train['year'] = train.fecha_compra.dt.year
product = pd.read_csv('RentabilidadProduct.csv', sep='|')
# train = train[(train.fecha_compra > '2022-01-31') & (train.fecha_compra < '2023-02-01')]
submission = pd.read_csv('submit_example.csv', index_col='customer_id')
values = train.groupby(['customer_id', 'product_id']).agg(rating = ('rating', 'sum'), ultima_fecha = ('fecha_compra', 'max'))
if not submit:
    values_test = test.groupby(['customer_id', 'product_id']).agg(rating = ('rating', 'sum'), ultima_fecha = ('fecha_compra', 'max'))
matrix = pd.pivot_table(values, index='customer_id', columns='product_id', values='rating')
std = train.groupby(['customer_id', 'product_id'])['fecha_compra'].diff()
product = pd.merge(product, train.drop_duplicates(subset=['product_id', 'business_id'])[['product_id','business_id']], how='right', on='product_id')
product['std'] = product.product_id.apply(lambda iid: std.loc[train[train.product_id == iid].index].std().days)

propenso = train.groupby(['customer_id']).product_id.agg(['nunique','count'])
propenso['propenso'] = propenso['nunique'] / propenso['count']
propenso = propenso.drop(['nunique','count'], axis=1)

customers = train.drop_duplicates('customer_id')[['customer_id','type_id']].copy().reset_index(drop=True)
customers.set_index('customer_id', inplace=True)
customers['propenso'] = 1 - propenso

def get_product_similarity(iid_1, iid_2, by):
    similarity = product.loc[product.product_id == iid_1, by].iloc[0] == product.loc[product.product_id == iid_2, by].iloc[0]
    return int(similarity)

# Test

In [12]:
def get_recall(comprados, recomendados):
    if comprados.shape[0] == 0:
        return 1.
    recall = np.intersect1d(comprados, recomendados, assume_unique=True).shape[0] / comprados.shape[0]
    return recall

def get_user_comprados(uid, dataset):
    """
    Retornar los productos que un usuario ha comprado en el dataset train o test
    """
    comprados = dataset[dataset.customer_id == uid].product_id.unique()
    return comprados

def get_user_recall_submission(uid, comprados):
    recomendados_submission = np.array(submission.loc[uid].str.split()[0][:30]).astype('int')
    recall = get_recall(comprados, recomendados_submission)
    return recall

# KDE

In [13]:
bw_method = 0.75
def plot_fechas_customer_product(uid, iid):
    fechas_compra = train[(train.customer_id == uid) & (train.product_id == iid)].fecha_compra
    # Crear una serie temporal con las fechas de compra como índice (esto puede ser útil para análisis posteriores)
    serie_temporal = pd.Series(data=[1] * len(fechas_compra), index=fechas_compra)
    # Visualizar la serie temporal
    plt.figure(figsize=(10, 4))
    plt.plot(serie_temporal.index, serie_temporal.values, marker='o')
    plt.xlabel('Fecha')
    plt.ylabel('Compras')
    plt.title('Compras por fecha')
    plt.grid(True)
    plt.show()
    
def plot_kde_diferencias_fecha_customer_product(uid, iid):
    fechas_compra = train[(train.customer_id == uid) & (train.product_id == iid)].fecha_compra
    diferencias = fechas_compra.diff()[1:].dt.days
    kde = gaussian_kde(diferencias, bw_method=bw_method)
    x = np.linspace(0, diferencias.max()*2, 100)
    print(diferencias.values)
    sns.lineplot(x=x, y=kde(x))

def get_probability_customer_product(uid, iid, min_n, max_n):
    """
    Calcula la probabilidad de que un cliente uid compre cierto producto iid entre los (min_n, max_n) dias siguientes
    """
    #print(f'uid={uid}, iid={iid}')
    fechas = train[(train.customer_id == uid) & (train.product_id == iid)].fecha_compra
    if fechas.count() == 0:
        return 0
    if fechas.count() == 1:
        return 0.00000000000001
    diferencias = fechas.diff()[1:].dt.days
    if diferencias.shape[0] <= 1 or diferencias.nunique() == 1 :
        distribucion_normal = norm(loc=diferencias.mean(), scale=100/diferencias.shape[0])
        probability_less_than_n = distribucion_normal.cdf(max_n + 0.5) - distribucion_normal.cdf(min_n - 0.5)
    else:
        kde = gaussian_kde(diferencias, bw_method=bw_method)
        probability_less_than_n = quad(kde, min_n-0.5, max_n+0.5)[0]
        if probability_less_than_n == 0:
            return 0.000000000001
    return probability_less_than_n * values.loc[(uid, iid), 'rating']

def get_probability_customer_product_fechas(uid, iid, fecha_min, fecha_max):
    """
    Calcula la probabilidad de que un cliente uid compre un producto iid en las fechas dadas
    Estas fechas tienen que ser despues de su ultima compra
    """
    try:
        ultima_fecha = values.loc[(uid, iid), 'ultima_fecha']
    except KeyError:
        return 0
    n_min = (fecha_min - ultima_fecha).days
    n_max = (fecha_max- ultima_fecha).days
    
    return get_probability_customer_product(uid, iid, n_min, n_max)

def get_kde_recomendados(uid, fecha_min, fecha_max):
    """
    Retorna los recomendados para un cliente uid en las fechas dadas
    """
    probabilidades = [[iid, get_probability_customer_product_fechas(uid, iid, fecha_min, fecha_max)] for iid in matrix.columns]
    recomendados = np.array(list(map(lambda x:x[0], sorted(probabilidades, key=lambda x:x[1], reverse=True)[:30])))
    return recomendados

def get_kde_recomendados_with_probabilities(uid, fecha_min, fecha_max):
    """
    Retorna los recomendados para un cliente uid en las fechas dadas
    """
    probabilidades = [[iid, get_probability_customer_product_fechas(uid, iid, fecha_min, fecha_max)] for iid in matrix.columns]
    recomendados = np.array(sorted(probabilidades, key=lambda x:x[1], reverse=True)[:30])
    iid_probability = pd.DataFrame(recomendados, columns=['iid', 'probability'])
    iid_probability.iid = iid_probability.iid.astype(int)
    return iid_probability

def get_union_recomendados(uid, fecha_min, fecha_max, kde_weight, cf_weight):

    df_cf = pd.DataFrame(get_user_top_n_2_with_rating(uid), columns=['iid', 'probability'])
    cf_min, cf_max = df_cf.probability.min(), df_cf.probability.max()
    df_cf.probability = (df_cf.probability - cf_min) / cf_max

    df_kde = get_kde_recomendados_with_probabilities(uid, fecha_min, fecha_max)
    kde_min, kde_max = df_kde.probability.min(), df_kde.probability.max()
    df_kde.probability = (df_kde.probability - kde_min) / kde_max

    df_kde.probability = df_kde.probability * kde_weight
    df_cf.probability = df_cf.probability * cf_weight

    recomendados = pd.concat([df_kde, df_cf], ignore_index=True).groupby('iid').sum().sort_values('probability', ascending=False).index[:30]
    return recomendados

# COLLABORATIVE FILTERING

In [6]:
values_reset = values.reset_index().drop('ultima_fecha', axis=1)
reader = Reader(line_format='user item rating timestamp', rating_scale=(0, values_reset.rating.max()))
data = Dataset.load_from_df(values_reset, reader)
train_set = data.build_full_trainset()
if not submit:
    test_set = values_test.reset_index().drop('ultima_fecha', axis=1).values.tolist()
algo = SVD(n_factors=200, n_epochs=40, init_mean=0, init_std_dev=0.1)
algo.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x27085564340>

In [7]:
data = Dataset.load_from_df(values_reset[:100000], reader)
y = cross_validate(algo, data, verbose=True, measures=['rmse'])

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.0343  3.2118  3.0118  3.1153  3.0415  3.0830  0.0732  
Fit time          3.03    3.26    3.33    3.05    3.16    3.16    0.12    
Test time         0.16    0.33    0.15    0.13    0.15    0.18    0.07    


In [8]:
predictions = algo.test(values.reset_index().drop('ultima_fecha', axis=1).values.tolist())
rmse(predictions)

RMSE: 3.3840


3.3839997128898065

In [9]:
def get_top_n(predictions, n=30):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

def get_user_top_n_2(uid):
    """
    Retornar los 50 productos con mayor probabilidad para un usuario uid
    """
    predictions = list()
    for iid in matrix.columns:
        if np.isnan(matrix.loc[uid, iid]):
            prediction = algo.predict(uid, iid)
        else:
            prediction = Prediction(uid, iid, matrix.loc[uid,iid], matrix.loc[uid,iid], dict())
        predictions.append(prediction)
    top_n = get_top_n(predictions, n=30)
    top_n_iid = [iid for (iid, rating) in top_n[uid]]
    recomendados = np.array(top_n_iid)
    return recomendados

def get_user_top_n_2_with_rating(uid):
    """
    Retornar los 50 productos con mayor probabilidad para un usuario uid
    """
    predictions = list()
    for iid in matrix.columns:
        if np.isnan(matrix.loc[uid, iid]):
            prediction = algo.predict(uid, iid)
        else:
            prediction = Prediction(uid, iid, matrix.loc[uid,iid], matrix.loc[uid,iid], dict())
        predictions.append(prediction)
    top_n = get_top_n(predictions, n=30)
    return top_n[uid]

# TESTING

In [10]:
start = time.time()
recall_kde_sum = 0
recall_CF_sum = 0
recall_unidos_sum = 0
recall_submission_sum = 0
#for uid in matrix.index[:n]:\
n = 5
#for uid in customers[customers.index.isin(test.customer_id.unique())].sort_values('propenso')[:n].index:
for uid in np.random.choice(matrix.index, size=n):
#for uid in test.groupby('customer_id').product_id.count().sort_values()[-n:].index:
    comprados = get_user_comprados(uid, test)
    recomendados = get_kde_recomendados(uid, test_fecha_min, test_fecha_max)
    recomendados_CF = get_user_top_n_2(uid)
    recomendados_unidos = get_union_recomendados(uid, test_fecha_min, test_fecha_max, kde_weight=10, cf_weight=0.00000000000001)
    recall_kde_sum += get_recall(comprados, recomendados)
    recall_CF_sum += get_recall(comprados, recomendados_CF)
    recall_unidos_sum += get_recall(comprados, recomendados_unidos)
    recall_submission_sum += get_user_recall_submission(uid, comprados)
print(f'avg kde: {recall_kde_sum/n}, avg CF: {recall_CF_sum/n}, avg unidos: {recall_unidos_sum/n}, avg submission: {recall_submission_sum/n}')
end = time.time()
print(f'time: {end - start}')
print(f'comprados: {comprados}')
print(f'recomendados: {recomendados}')
print(f'recomendados_CF: {recomendados_CF}')
print(f'recomendados_CF y KDE: {np.intersect1d(recomendados, recomendados_CF)}')
print(f'comprados y recomendados: {np.intersect1d(comprados, recomendados)}')
print(f'comprados y recomendados_CF: {np.intersect1d(comprados, recomendados_CF)}')
print(f'comprados y no recomendados: {np.setdiff1d(comprados, recomendados)}')
print(f'comprados y no recomendados_CF: {np.setdiff1d(comprados, recomendados_CF)}')
print(f'recomendados y comprados por CF y no KDE: {np.setdiff1d(np.intersect1d(recomendados_CF, comprados), recomendados)}')
print(f'nuevos comprados: {np.setdiff1d(comprados, get_user_comprados(uid, train))}')
print(f'cantidad de productos unicos comprados: {get_user_comprados(uid, train).shape[0]}')
print(f'propenso: {customers.loc[uid, "propenso"]}')

NameError: name 'test' is not defined

In [121]:
submission.drop(index=[1,2,3,4,5], inplace=True)

In [123]:
for uid in customers.index.astype('str').sort_values().astype('int64')[:5]:
    recomendados = get_kde_recomendados(uid, test_fecha_min, test_fecha_max)
    submission.at[uid, 'product_id'] = ' '.join(map(str, recomendados))

In [125]:
submission.to_csv('submit.csv')