In [5]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold

In [None]:
# 1. Load the ratings dataset
# Later cells read the 'userId', 'itemId' and 'rating' columns of this frame.
path = '../Datasets/'
ratings = pd.read_csv(path + 'ml-small.csv')

In [7]:
def predict_rating(user_index, item_index, data, clusters, user_means):
    """Predict one rating from the mean-centred votes of same-cluster users.

    Every other member of the target user's cluster that has a non-zero entry
    for the item contributes its deviation from its own mean rating. Votes are
    weighted by the inverse squared Euclidean distance between the two users'
    full rating rows (1e-5 is added to the squared distance so identical rows
    do not divide by zero).

    Args:
        user_index: Row position of the target user in ``data``.
        item_index: Column position of the target item in ``data``.
        data: 2-D array of ratings (rows = users, columns = items); an entry
            equal to 0 is treated as "not rated".
        clusters: 1-D array holding the cluster label of every row of ``data``.
        user_means: 1-D array holding the mean rating of every row of ``data``.

    Returns:
        The target user's mean (returned as is, without clipping) when no
        cluster neighbour rated the item; otherwise the mean plus the weighted
        average deviation, clipped to the range [0.5, 5.0].
    """
    target_row = data[user_index]
    baseline = user_means[user_index]
    same_cluster = np.where(clusters == clusters[user_index])[0]

    # Parallel lists: one deviation and one weight per usable neighbour.
    deviations = []
    weights = []
    for other in same_cluster:
        rating = data[other, item_index]
        # Skip the target user and anyone without a rating for this item.
        if other == user_index or rating == 0:
            continue
        gap = np.linalg.norm(target_row - data[other])
        deviations.append(rating - user_means[other])
        weights.append(1 / (gap**2 + 1e-5))

    if len(weights) == 0:
        return baseline  # No votes: fall back to the user's own mean

    # Weighted average of the neighbours' deviations
    weighted_total = sum(d * w for d, w in zip(deviations, weights))
    weight_total = sum(w for w in weights)

    estimate = baseline + (weighted_total / weight_total)
    return np.clip(estimate, 0.5, 5.0)  # Keep the prediction inside the valid rating range

In [None]:
# 2. Build the user-item matrix
# Pivot table: rows = users, columns = items, values = rating.
# Missing (user, item) pairs are filled with 0, so 0 stands for "not rated".
ratings_matrix = ratings.pivot(index='userId', columns='itemId', values='rating').fillna(0)

In [9]:
# Parameters for KMeans and 5-fold cross-validation
k = 5  # number of clusters passed to KMeans
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_scores = []  # one MAE value per fold, filled by the evaluation loop

# Row labels of the ratings frame as an array; this is what gets split into folds
ratings_indices = ratings.index.values

In [None]:
# Evaluate the cluster-based predictor with K-fold cross-validation.
# NOTE: this cell needs the five-argument predict_rating defined earlier in the
# notebook; later cells redefine that name with a different signature, so run
# this cell before them (or after a fresh "Restart & Run All").

# Start from an empty list so re-running the cell does not accumulate the
# scores of a previous execution.
mae_scores = []

for train_indices, test_indices in kf.split(ratings_indices):

    # Train / test partitions (the splitter yields positional row indices)
    train_ratings = ratings.iloc[train_indices]
    test_ratings = ratings.iloc[test_indices]

    # User-item matrix built from the training ratings only; 0 = not rated
    train_matrix = train_ratings.pivot(index='userId', columns='itemId', values='rating').fillna(0)

    # Per-user mean over rated items only (zeros are masked as NaN so that
    # they are ignored). np.nan is used because the np.NaN alias was removed
    # in NumPy 2.0.
    user_means = train_matrix.replace(0, np.nan).mean(axis=1).values

    # Cluster the users on their training rating rows
    kmeans = KMeans(n_clusters=k, random_state=42)
    clusters_train = kmeans.fit_predict(train_matrix)

    # Accumulators for predictions and ground truth of this fold
    predictions = []
    true_values = []

    usuarios = list(train_matrix.index)
    peliculas = list(train_matrix.columns)
    train_data = train_matrix.values

    # id -> row/column position. Dict lookups are O(1); the previous
    # `x in list` + `list.index(x)` pair was O(n) for every test rating.
    user_pos = {user_id: pos for pos, user_id in enumerate(usuarios)}
    movie_pos = {movie_id: pos for pos, movie_id in enumerate(peliculas)}

    # itertuples keeps each column's own dtype (iterrows upcasts the ids to
    # float when they share a row with the float rating) and is much faster.
    test_triples = test_ratings[['userId', 'itemId', 'rating']].itertuples(index=False, name=None)
    for user, movie, true_rating in test_triples:
        user_idx = user_pos.get(user)
        movie_idx = movie_pos.get(movie)

        # Users or items unseen in the training fold cannot be predicted
        if user_idx is None or movie_idx is None:
            continue

        pred = predict_rating(user_idx, movie_idx, train_data, clusters_train, user_means)
        predictions.append(pred)
        true_values.append(true_rating)

    # MAE of the current fold, only when at least one prediction was made
    if predictions:
        mae_fold = np.mean(np.abs(np.array(predictions) - np.array(true_values)))
        mae_scores.append(mae_fold)
        print(f"Fold MAE: {mae_fold:.4f}")

# Report the average MAE and NMAE over all folds
if mae_scores:
    mae_promedio = np.mean(mae_scores)
    nmae_promedio = mae_promedio / 4.5  # Width of the rating range, matching the 0.5-5.0 clip in predict_rating
    print(f"\nMAE promedio en 5-fold cross validation: {mae_promedio:.4f}")
    print(f"NMAE promedio en 5-fold cross validation: {nmae_promedio:.4f}")
else:
    print("No se realizaron predicciones en ninguno de los folds.")

Fold MAE: 0.7109
Fold MAE: 0.6987
Fold MAE: 0.7068
Fold MAE: 0.7104
Fold MAE: 0.7056

MAE promedio en 5-fold cross validation: 0.7065
NMAE promedio en 5-fold cross validation: 0.1570


Hard clustering with a dense user-item matrix

In [9]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import time

# Configuration
dataset_path = '../Datasets/'
ratings = pd.read_csv(dataset_path + 'ml-small.csv')

# Determine rating range dynamically
# NOTE(review): MIN_RATING / MAX_RATING are not read by any function visible in
# this section; the model dict stores its own min/max taken from the training fold.
MIN_RATING = ratings['rating'].min()
MAX_RATING = ratings['rating'].max()

# Global parameters (read as module-level names by the functions of this section)
n_clusters = 100    # Number of KMeans clusters
seed       = 42   # Random seed, for reproducibility
eps        = 1e-5 # Small constant added to the similarities used as weights

def build_train_model(train_df):
    """Fit the dense cluster-based model on one training fold.

    Pivots ``train_df`` into a dense user x item array (0 = not rated),
    clusters its rows with KMeans using the module-level ``n_clusters`` and
    ``seed`` (printing how long the clustering took), and precomputes the
    lookups used at prediction time.

    Args:
        train_df: DataFrame with 'userId', 'itemId' and 'rating' columns.

    Returns:
        dict with the keys 'data' (dense rating array), 'user_means',
        'clusters' (label per user row), 'cluster_members' (label -> row
        positions), 'cos_sim' (user x user cosine similarity), 'item_users'
        (column position -> row positions with a non-zero rating),
        'user_idx_map' / 'movie_idx_map' (id -> position) and
        'min_rating' / 'max_rating' (taken from ``train_df``).
    """
    # 1) Dense user x item matrix; unrated cells are filled with 0
    pivoted = train_df.pivot(index='userId', columns='itemId',
                             values='rating').fillna(0)
    data = pivoted.values

    # 2) Per-user mean over rated cells only (zeros masked as NaN);
    #    a NaN mean is replaced by 0
    masked_means = pivoted.replace(0, np.nan).mean(axis=1).values
    user_means = np.nan_to_num(masked_means)

    # 3) Clustering, timed
    t0 = time.time()
    clusters = KMeans(n_clusters=n_clusters, random_state=seed).fit_predict(data)
    elapsed = time.time() - t0
    print(f"Clustering took {elapsed:.2f} seconds")

    # 4) Row positions of the members of every cluster
    cluster_members = {}
    for cid in range(n_clusters):
        cluster_members[cid] = np.where(clusters == cid)[0]

    # 5) Cosine similarity between all user rows; all-zero rows get a norm
    #    of 1 so the division is safe
    row_norms = np.linalg.norm(data, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1.0
    unit_rows = data / row_norms
    cos_sim = unit_rows @ unit_rows.T

    # 6) For every item column, the rows (users) holding a non-zero rating
    item_users = {}
    for col in range(data.shape[1]):
        item_users[col] = data[:, col].nonzero()[0]

    # 7) id -> position maps for fast lookup
    user_idx_map = {}
    for pos, uid in enumerate(pivoted.index.to_list()):
        user_idx_map[uid] = pos
    movie_idx_map = {}
    for pos, mid in enumerate(pivoted.columns.to_list()):
        movie_idx_map[mid] = pos

    # The rating range of the training fold travels with the model
    rating_column = train_df['rating']
    model = {
        'data': data,
        'user_means': user_means,
        'clusters': clusters,
        'cluster_members': cluster_members,
        'cos_sim': cos_sim,
        'item_users': item_users,
        'user_idx_map': user_idx_map,
        'movie_idx_map': movie_idx_map,
        'min_rating': rating_column.min(),
        'max_rating': rating_column.max()
    }
    return model

def predict_rating(u, m, model):
    """Predict the rating of user row ``u`` for item column ``m``.

    Neighbours are the users of ``u``'s cluster that rated ``m`` and whose
    precomputed cosine similarity to ``u`` is strictly positive. The
    prediction is ``u``'s mean plus the similarity-weighted average of the
    neighbours' deviations from their own means (the module-level ``eps`` is
    added to every weight).

    Args:
        u: Row position of the user in ``model['data']``.
        m: Column position of the item in ``model['data']``.
        model: dict providing 'user_means', 'cos_sim', 'item_users',
            'cluster_members', 'clusters', 'data', 'min_rating' and
            'max_rating'.

    Returns:
        ``u``'s mean rating (not clipped) when there is no usable neighbour;
        otherwise the prediction clipped to the model's rating range.
    """
    user_means, cos_sim, item_users, cluster_members, clusters, data = (
        model[key] for key in ('user_means', 'cos_sim', 'item_users',
                               'cluster_members', 'clusters', 'data')
    )

    # Users of u's cluster that rated m, u itself excluded
    same_cluster = cluster_members[clusters[u]]
    candidates = np.intersect1d(item_users[m], same_cluster, assume_unique=True)
    candidates = candidates[candidates != u]
    if candidates.size == 0:
        return user_means[u]

    # Keep strictly positive similarities only
    similarity = cos_sim[u, candidates]
    keep = similarity > 0
    if not np.any(keep):
        return user_means[u]

    weights = similarity[keep] + eps
    neighbors = candidates[keep]
    offsets = data[neighbors, m] - user_means[neighbors]

    estimate = user_means[u] + weights.dot(offsets) / weights.sum()

    # Clip with the rating range stored in the model
    return np.clip(estimate, model['min_rating'], model['max_rating'])

def predict_random(u, m, model):
    """Baseline predictor: draw a rating uniformly from the model's rating range.

    ``u`` and ``m`` are accepted for signature parity with the other
    predictors and are not used. The draw comes from NumPy's global random
    state.
    """
    low, high = model['min_rating'], model['max_rating']
    return np.random.uniform(low, high)

def predict_user_mean(u, m, model):
    """Baseline predictor: return the stored mean rating of user row ``u``.

    ``m`` is accepted for signature parity with the other predictors and is
    not used.
    """
    means = model['user_means']
    return means[u]

def cross_validate_selective(ratings_df, n_splits=5):
    """Run shuffled K-fold cross-validation and print MAE / RMSE per fold.

    For every fold a model is built from the training rows with
    ``build_train_model`` and each test row whose user and item both appear
    in the training fold is scored with ``predict_rating``; other test rows
    are skipped. The KFold shuffle uses the module-level ``seed``.

    Args:
        ratings_df: DataFrame with 'userId', 'itemId' and 'rating' columns.
        n_splits: Number of folds.

    Returns:
        None. Per-fold and average metrics are printed.
    """
    kf   = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    maes = []
    rmses = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(ratings_df), 1):
        train_df = ratings_df.iloc[train_idx]
        test_df  = ratings_df.iloc[test_idx]

        model = build_train_model(train_df)
        user_idx_map  = model['user_idx_map']
        movie_idx_map = model['movie_idx_map']

        y_true, y_pred = [], []
        # itertuples keeps each column's own dtype (iterrows upcasts the ids
        # to float when they share a row with the float rating) and avoids
        # building a Series per row.
        triples = test_df[['userId', 'itemId', 'rating']].itertuples(index=False, name=None)
        for u_id, m_id, true_r in triples:
            u_idx = user_idx_map.get(u_id)
            m_idx = movie_idx_map.get(m_id)
            # Users or items unseen in the training fold cannot be predicted
            if u_idx is None or m_idx is None:
                continue

            y_pred.append(predict_rating(u_idx, m_idx, model))
            y_true.append(true_r)

        # The sklearn metrics raise on empty input; skip such a fold instead
        if not y_true:
            print(f"Fold {fold}: no predictable test ratings, fold skipped")
            continue

        mae = mean_absolute_error(y_true, y_pred)
        # RMSE is derived from MSE for compatibility with older sklearn versions
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        print(f"Fold {fold}: MAE = {mae:.4f}, RMSE = {rmse:.4f}")
        maes.append(mae)
        rmses.append(rmse)

    if maes:
        print(f"Avg MAE: {np.mean(maes):.4f}, Avg RMSE: {np.mean(rmses):.4f}")
    else:
        print("No predictions were made in any fold.")

# Entry point: run the cross-validation on the full ratings table
if __name__ == '__main__':
    cross_validate_selective(ratings)


Clustering took 0.98 seconds
Fold 1: MAE = 0.7481, RMSE = 0.9674
Clustering took 0.61 seconds
Fold 2: MAE = 0.7325, RMSE = 0.9435
Clustering took 0.75 seconds
Fold 3: MAE = 0.7418, RMSE = 0.9583
Clustering took 0.74 seconds
Fold 4: MAE = 0.7432, RMSE = 0.9608
Clustering took 0.93 seconds
Fold 5: MAE = 0.7407, RMSE = 0.9581
Avg MAE: 0.7413, Avg RMSE: 0.9576


Hard clustering with a sparse user-item matrix

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, diags
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# Configuration
dataset_path = '../Datasets/'
ratings = pd.read_csv(dataset_path + 'ml-small.csv')

# Determine rating range dynamically
# NOTE(review): MIN_RATING / MAX_RATING are not read by any function visible in
# this section; the model dict stores its own min/max taken from the training fold.
MIN_RATING = ratings['rating'].min()
MAX_RATING = ratings['rating'].max()

# Global parameters (read as module-level names by the functions of this section)
n_clusters = 10    # Number of KMeans clusters
seed       = 42    # Random seed, for reproducibility
eps        = 1e-5  # Small constant added to the similarities used as weights


def build_train_model(train_df):
    """Fit the sparse cluster-based model on one training fold.

    Builds a CSR user x item rating matrix, per-user means, a row-normalised
    copy of the matrix (for cosine similarity), KMeans cluster labels (using
    the module-level ``n_clusters`` and ``seed``) and the lookups used at
    prediction time.

    Args:
        train_df: DataFrame with 'userId', 'itemId' and 'rating' columns.

    Returns:
        dict with the keys 'data' (CSR rating matrix), 'data_norm' (CSR,
        rows scaled to unit L2 norm), 'user_means', 'clusters' (label per
        user row), 'cluster_members' (label -> row positions), 'item_users'
        (column position -> row positions with a non-zero rating),
        'user_idx_map' / 'movie_idx_map' (id -> position) and
        'min_rating' / 'max_rating' (taken from ``train_df``).
    """
    # 1) id -> position maps and sparse matrix construction
    user_list  = train_df['userId'].unique()
    movie_list = train_df['itemId'].unique()
    user_idx   = {u: i for i, u in enumerate(user_list)}
    movie_idx  = {m: j for j, m in enumerate(movie_list)}

    rows = train_df['userId'].map(user_idx).to_numpy()
    cols = train_df['itemId'].map(movie_idx).to_numpy()
    # Cast to float: with an integer rating column the row sums would be
    # integers and the in-place true division below would fail with a
    # casting error.
    data = train_df['rating'].to_numpy(dtype=float)
    data_sparse = csr_matrix((data, (rows, cols)),
                             shape=(len(user_list), len(movie_list)))

    # 2) Per-user mean over stored entries (0 for a row with no entries)
    sums   = np.asarray(data_sparse.sum(axis=1)).ravel()
    counts = np.diff(data_sparse.indptr)
    user_means = np.divide(sums, counts, out=np.zeros_like(sums), where=counts != 0)

    # 3) Row L2 norms and row normalisation (for cosine similarity);
    #    empty rows get a norm of 1 so the division is safe
    squared_sums = np.asarray(data_sparse.multiply(data_sparse).sum(axis=1)).ravel()
    norms = np.sqrt(squared_sums)
    norms[norms == 0] = 1.0
    # Keep the result in CSR so rows can be sliced at prediction time
    data_norm = diags(1.0 / norms).dot(data_sparse).tocsr()

    # 4) Clustering on the sparse rating matrix
    kmeans   = KMeans(n_clusters=n_clusters, random_state=seed)
    clusters = kmeans.fit_predict(data_sparse)
    cluster_members = {cid: np.where(clusters == cid)[0]
                       for cid in range(n_clusters)}

    # 5) Users per item. One conversion to CSC gives every column's row
    #    indices as a slice of `indices`, instead of slicing a CSR column
    #    once per item. Explicit zeros are dropped first so only non-zero
    #    ratings are listed.
    by_item = data_sparse.tocsc()
    by_item.eliminate_zeros()
    item_users = {m: by_item.indices[by_item.indptr[m]:by_item.indptr[m + 1]]
                  for m in range(by_item.shape[1])}

    return {
        'data': data_sparse,
        'data_norm': data_norm,
        'user_means': user_means,
        'clusters': clusters,
        'cluster_members': cluster_members,
        'item_users': item_users,
        'user_idx_map': user_idx,
        'movie_idx_map': movie_idx,
        'min_rating': train_df['rating'].min(),
        'max_rating': train_df['rating'].max()
    }


def predict_rating(u, m, model):
    """Predict the rating of user row ``u`` for item column ``m`` (sparse model).

    Neighbours are the users of ``u``'s cluster that rated ``m`` and whose
    cosine similarity to ``u`` (dot product of the normalised rows) is
    strictly positive. The prediction is ``u``'s mean plus the
    similarity-weighted average of the neighbours' deviations from their own
    means (the module-level ``eps`` is added to every weight).

    Args:
        u: Row position of the user in ``model['data']``.
        m: Column position of the item in ``model['data']``.
        model: dict providing 'user_means', 'clusters', 'cluster_members',
            'item_users', 'data', 'data_norm', 'min_rating' and 'max_rating'.

    Returns:
        ``u``'s mean rating (not clipped) when there is no usable neighbour;
        otherwise the prediction clipped to the model's rating range.
    """
    (user_means, clusters, cluster_members,
     item_users, data_sparse, data_norm) = (
        model[key] for key in ('user_means', 'clusters', 'cluster_members',
                               'item_users', 'data', 'data_norm')
    )

    # Users of u's cluster that rated m, u itself excluded
    in_cluster = cluster_members[clusters[u]]
    candidates = np.intersect1d(item_users[m], in_cluster, assume_unique=True)
    candidates = candidates[candidates != u]
    if candidates.size == 0:
        return user_means[u]

    # Similarities are computed on demand, against the candidates only
    target_row = data_norm.getrow(u)
    candidate_rows = data_norm[candidates]  # sparse, one row per candidate
    similarity = target_row.dot(candidate_rows.T).toarray().ravel()

    keep = similarity > 0
    if not keep.any():
        return user_means[u]

    weights = similarity[keep] + eps
    neighbors = candidates[keep]
    # Ratings the kept neighbours gave to item m
    neighbor_ratings = np.array(data_sparse[neighbors, m].toarray()).ravel()
    offsets = neighbor_ratings - user_means[neighbors]

    estimate = user_means[u] + weights.dot(offsets) / weights.sum()
    return np.clip(estimate, model['min_rating'], model['max_rating'])

def predict_random(u, m, model):
    """Baseline predictor: draw a rating uniformly from the model's rating range.

    ``u`` and ``m`` are accepted for signature parity with the other
    predictors and are not used. The draw comes from NumPy's global random
    state.
    """
    lower = model['min_rating']
    upper = model['max_rating']
    return np.random.uniform(lower, upper)

def predict_user_mean(u, m, model):
    """Baseline predictor: return the stored mean rating of user row ``u``.

    ``m`` is accepted for signature parity with the other predictors and is
    not used.
    """
    per_user_mean = model['user_means']
    return per_user_mean[u]


def cross_validate_selective(ratings_df, n_splits=5):
    """Run shuffled K-fold cross-validation and print the MAE of every fold.

    For every fold a model is built from the training rows with
    ``build_train_model`` and each test row whose user and item both appear
    in the training fold is scored with ``predict_rating``; other test rows
    are skipped. The KFold shuffle uses the module-level ``seed``.

    Args:
        ratings_df: DataFrame with 'userId', 'itemId' and 'rating' columns.
        n_splits: Number of folds.

    Returns:
        None. Per-fold and average MAE are printed.
    """
    kf   = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    maes = []
    for fold, (train_idx, test_idx) in enumerate(kf.split(ratings_df), 1):
        train_df = ratings_df.iloc[train_idx]
        test_df  = ratings_df.iloc[test_idx]
        model = build_train_model(train_df)
        user_idx_map  = model['user_idx_map']
        movie_idx_map = model['movie_idx_map']

        y_true, y_pred = [], []
        # itertuples keeps each column's own dtype (iterrows upcasts the ids
        # to float when they share a row with the float rating) and avoids
        # building a Series per row.
        triples = test_df[['userId', 'itemId', 'rating']].itertuples(index=False, name=None)
        for u_id, m_id, true_r in triples:
            u_idx = user_idx_map.get(u_id)
            m_idx = movie_idx_map.get(m_id)
            # Users or items unseen in the training fold cannot be predicted
            if u_idx is None or m_idx is None:
                continue
            y_pred.append(predict_rating(u_idx, m_idx, model))
            y_true.append(true_r)

        # mean_absolute_error raises on empty input; skip such a fold instead
        if not y_true:
            print(f"Fold {fold}: no predictable test ratings, fold skipped")
            continue

        mae = mean_absolute_error(y_true, y_pred)
        print(f"Fold {fold}: MAE = {mae:.4f}")
        maes.append(mae)

    if maes:
        print(f"Avg MAE: {np.mean(maes):.4f}")
    else:
        print("No predictions were made in any fold.")

# Entry point: run the cross-validation on the full ratings table
if __name__ == '__main__':
    cross_validate_selective(ratings)


Fold 1: MAE = 0.7096
Fold 2: MAE = 0.7038
Fold 3: MAE = 0.7133
Fold 4: MAE = 0.7140
Fold 5: MAE = 0.7146
Avg MAE: 0.7111
