In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF

In [2]:
# Cargar datos
amazon = pd.read_csv('amazon.tsv', sep='\t')
num_rows = len(amazon)
midpoint = num_rows // 4
amazon = amazon.iloc[:midpoint]

# Eliminar duplicados y valores nulos
amazon = amazon.drop_duplicates()
amazon = amazon.dropna()
amazon['review_date'] = pd.to_datetime(amazon['review_date'])
amazon = amazon.drop(columns=['marketplace', 'product_category', 'review_id', 'product_parent', 'vine', 'verified_purchase', 'review_headline', 'review_date'])

In [3]:
# Asegurarse de que las columnas estén en el tipo de datos correcto
amazon['customer_id'] = amazon['customer_id'].astype(int)
amazon['product_id'] = amazon['product_id'].astype(str)
amazon['star_rating'] = amazon['star_rating'].astype(float)

# Tratamiento de valores atípicos
def detectar_y_tratar_valores_atipicos(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    limite_inferior = Q1 - 1.5 * IQR
    limite_superior = Q3 + 1.5 * IQR

    df[col] = np.where(df[col] < limite_inferior, limite_inferior, df[col])
    df[col] = np.where(df[col] > limite_superior, limite_superior, df[col])

    return df

amazon = detectar_y_tratar_valores_atipicos(amazon, 'helpful_votes')
amazon = detectar_y_tratar_valores_atipicos(amazon, 'total_votes')
amazon = detectar_y_tratar_valores_atipicos(amazon, 'star_rating')

In [4]:
# Verificar que no haya valores nulos en las columnas necesarias
if (amazon['customer_id'].isnull().sum() > 0 or
    amazon['product_id'].isnull().sum() > 0 or
    amazon['star_rating'].isnull().sum() > 0):
    raise ValueError("Hay valores nulos en las columnas necesarias para la creación de la matriz de usuario-producto")


In [5]:
# Reducir el número de usuarios y productos para manejar mejor la memoria y evitar dimensiones negativas
# Seleccionar los 1000 usuarios y productos más activos
top_users = amazon['customer_id'].value_counts().nlargest(1000).index
top_products = amazon['product_id'].value_counts().nlargest(1000).index
amazon_reduced = amazon[amazon['customer_id'].isin(top_users) & amazon['product_id'].isin(top_products)]


In [6]:
# Crear la matriz de usuario-producto
user_product_matrix = amazon_reduced.pivot_table(index='customer_id', columns='product_id', values='star_rating').fillna(0)


In [7]:
# Verificar dimensiones de la matriz antes de la factorización
if user_product_matrix.shape[0] > 0 and user_product_matrix.shape[1] > 0:
    # Algoritmo de Matrix Factorization de Parte 3
    def matrix_factorization(user_product_matrix):
        nmf = NMF(n_components=10, init='random', random_state=0)
        W = nmf.fit_transform(user_product_matrix)
        H = nmf.components_
        return W, H

    # Integración del algoritmo
    W, H = matrix_factorization(user_product_matrix)
    reconstructed_matrix = np.dot(W, H)

    # Función para recomendar productos usando Matrix Factorization
    def recommend_products(user_id, num_recommendations=5):
        user_index = user_product_matrix.index.get_loc(user_id)
        user_ratings = reconstructed_matrix[user_index]
        user_rated_products = user_product_matrix.loc[user_id][user_product_matrix.loc[user_id] > 0].index
        recommendations = pd.Series(user_ratings, index=user_product_matrix.columns).drop(user_rated_products).sort_values(ascending=False)
        
        return recommendations.head(num_recommendations)

    # Calcular métricas de evaluación
    def hit_rate(recommended_items, relevant_items):
        return int(len(set(recommended_items) & set(relevant_items)) > 0)

    def coverage(recommended_items, all_items):
        return len(set(recommended_items)) / float(len(all_items))

    def mean_reciprocal_rank(recommended_items, relevant_items):
        for i, item in enumerate(recommended_items):
            if item in relevant_items:
                return 1.0 / (i + 1)
        return 0.0

    # Mostrar algunos customer_id disponibles
    print("Algunos customer_id disponibles:", user_product_matrix.index[:10])

    # Seleccionar un customer_id válido de los disponibles
    valid_customer_id = user_product_matrix.index[0]

    # Ejemplo de uso con un customer_id válido
    recommended_products = recommend_products(valid_customer_id, 5)
    recommended_products_array = recommended_products.index.tolist()
    df = pd.read_csv('amazon.tsv', sep='\t', nrows=50_000)
    filtered_data = df[df['product_id'].isin(recommended_products_array)]
    unique_recommended_titles = filtered_data.drop_duplicates(subset='product_id')[['product_id', 'product_title']]
    print(unique_recommended_titles)

    # Calcular métricas de ejemplo con diferentes valores de k
    relevant_items = amazon_reduced[amazon_reduced['customer_id'] == valid_customer_id]['product_id'].tolist()
    all_items = user_product_matrix.columns.tolist()
    for k in [5, 10, 20]:
        recommended_products = recommend_products(valid_customer_id, k)
        recommended_products_array = recommended_products.index.tolist()

        hr = hit_rate(recommended_products_array, relevant_items)
        cov = coverage(recommended_products_array, all_items)
        mrr = mean_reciprocal_rank(recommended_products_array, relevant_items)

        print(f'Hit Rate@{k}: {hr}')
        print(f'Coverage@{k}: {cov}')
        print(f'MRR@{k}: {mrr}')
else:
    print("Error: La matriz de usuario-producto tiene dimensiones no válidas.")

Algunos customer_id disponibles: Index([12081595, 12083294, 12128218, 12134476, 12173619, 12185304, 12202815,
       12221367, 12227112, 12230154],
      dtype='int64', name='customer_id')
       product_id                                product_title
295    0451526341                   Animal farm: A Fairy Story
575    0316011770                                The Historian
645    0743226712                                         1776
6243   0553212583          Wuthering Heights (Bantam Classics)
21283  055327449X  The Illustrated Man (Grand Master Editions)
Hit Rate@5: 0
Coverage@5: 0.008710801393728223
MRR@5: 0.0
Hit Rate@10: 0
Coverage@10: 0.017421602787456445
MRR@10: 0.0
Hit Rate@20: 0
Coverage@20: 0.03484320557491289
MRR@20: 0.0
