In [164]:
import pandas as pd
import numpy as np
import skfuzzy as fuzz
from sklearn.preprocessing import StandardScaler

In [None]:
# Directory holding the rating CSV files, relative to this notebook.
path = '../Datasets/'
# Load the ratings table; its userId / itemId / rating columns are consumed
# by the pivot in the next cell.
ratings = pd.read_csv(filepath_or_buffer=path + 'ml-small.csv')

In [None]:
# Pivot the long ratings table into a (users x items) matrix of ratings.
# User/item pairs without a rating come out as NaN.
user_item = pd.pivot_table(
    ratings, values='rating', index='userId', columns='itemId'
)

# Remember which cells had no rating originally (True = missing).
missing_mask = user_item.isna()


# Missing cells are imputed with the user's own mean rating rather than 0,
# so that a user's relative preferences are preserved.
def fill_with_user_mean(row):
    """Return ``row`` with its NaN entries replaced by the mean of its non-NaN entries."""
    row_mean = row.mean()
    return row.fillna(row_mean)


# axis=1 hands each row (one user) to the helper.
user_item_filled = user_item.apply(fill_with_user_mean, axis=1)

0

In [167]:
# Standardize the imputed matrix. StandardScaler operates column-wise, so
# every item column is centred and scaled with its statistics across users.
scaler = StandardScaler()
scaler.fit(user_item_filled)
user_item_scaled = scaler.transform(user_item_filled)

# Flip to (items x users): rows are features (movies), columns are the
# samples (users) passed to the clustering step.
data = np.transpose(user_item_scaled)

In [168]:
# Hyper-parameters for fuzzy c-means
n_clusters = 5    # how many clusters to fit
m = 2.0           # fuzzifier exponent
error = 0.005     # convergence tolerance (stopping criterion)
maxiter = 1000    # upper bound on iterations
seed = 42         # fixed seed so the random initialisation is repeatable

# Fit fuzzy c-means on the (items x users) matrix.
# Per the scikit-fuzzy docs the result is, in order: cluster centres, final
# membership matrix `u` (n_clusters x n_users), initial membership guess,
# final distance matrix, objective-function history, iterations run, and the
# fuzzy partition coefficient.
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
    data, n_clusters, m, error, maxiter, init=None, seed=seed
)

In [169]:
# ---------------------------
# Defuzzification by centre of gravity (COG)
# ---------------------------

# 1. Column vector with the cluster labels 1..n_clusters
cluster_labels = np.arange(1, n_clusters + 1).reshape(-1, 1)  # shape (n_clusters, 1)

# 2. Centre of gravity for every user j:
#       cog[j] = sum_i ( label_i * u[i, j] )
#    The memberships of a user sum to 1 over the clusters, so no
#    normalisation is required.
cog = np.sum(cluster_labels * u, axis=0)  # shape (n_users,)

# 3. The nearest integer label becomes the hard cluster assignment.
# NOTE(review): this averages label numbers, so a user split evenly between
# clusters 1 and 5 lands in cluster 3 — confirm COG (rather than the argmax
# variant kept below) is the intended defuzzification.
assigned_clusters = np.rint(cog).astype(int)
#assigned_clusters_argmax = np.argmax(u, axis=0) + 1

# 4. Final assignment table: one row per user
df_assignments = pd.DataFrame(
    data={'userId': user_item_filled.index, 'cluster_cog': assigned_clusters}
)

'''
hard_labels = np.argmax(u, axis=0) + 1
df_assignments['cluster_argmax'] = hard_labels
'''

"\nhard_labels = np.argmax(u, axis=0) + 1\ndf_assignments['cluster_argmax'] = hard_labels\n"

In [None]:
# --- Prediction step: Pearson-based user-based CF, neighbours limited to the same cluster ---
# 1. Per-user mean of the imputed matrix, and the ratings centred on it
user_means = user_item_filled.mean(axis=1)                           # Series (n_users,)
R_centered = user_item_filled.sub(user_means, axis=0)                # DataFrame (n_users x n_items)

# 2. Pearson correlation between users. `corr` works column-wise, hence the
#    transpose. Undefined correlations (NaN) are treated as "no similarity".
user_similarity = R_centered.T.corr(method='pearson').fillna(0)      # DataFrame (n_users x n_users)

# 3. Zero out similarities between users that ended up in different clusters
cluster_series = df_assignments.set_index('userId')['cluster_cog']
# Put both objects in the row order of the rating matrix
user_similarity = user_similarity.loc[user_item_filled.index, user_item_filled.index]
cluster_series = cluster_series.loc[user_item_filled.index]
# same_cluster_mask[a, b] is True when users a and b share a cluster
same_cluster_mask = np.equal.outer(cluster_series.values, cluster_series.values)
user_similarity_filtered = user_similarity * same_cluster_mask

# 4. Keep only the top_k most similar neighbours of every user
top_k = 10
# Weight matrix W: row = target user, column = neighbour, value = similarity
W = pd.DataFrame(
    np.zeros_like(user_similarity_filtered.values),
    index=user_similarity_filtered.index,
    columns=user_similarity_filtered.columns
)
for uid in user_similarity_filtered.index:
    sim_row = user_similarity_filtered.loc[uid].copy()
    sim_row[uid] = np.nan  # NaN is skipped by nlargest, which excludes the user itself
    top_neighbors = sim_row.nlargest(top_k).index
    W.loc[uid, top_neighbors] = sim_row.loc[top_neighbors]

# 5. Predicted deviations from the user mean, via one matrix product
#    numerator: similarity-weighted sum of the neighbours' centred ratings
numerator = W.to_numpy() @ R_centered.to_numpy()                     # shape (n_users x n_items)
#    denominator: sum of absolute similarities per user. Computed on a fresh
#    NumPy array: the array behind `Series.values` can be read-only (pandas
#    Copy-on-Write), so it must not be patched in place.
denominator = np.abs(W.to_numpy()).sum(axis=1)                       # shape (n_users,)
#    users without any usable neighbour get a tiny denominator instead of 0,
#    which leaves their predicted deviation at 0
denominator = np.where(denominator == 0, 1e-9, denominator)
delta_pred = numerator / denominator[:, None]                        # shape (n_users x n_items)

# 6. Add the user means back to obtain the full predicted rating matrix
predicted_matrix = delta_pred + user_means.to_numpy()[:, None]
predicted_df = pd.DataFrame(
    predicted_matrix,
    index=user_item_filled.index,
    columns=user_item_filled.columns
)

# 7. Keep predictions only where the rating was missing; everywhere else
#    use the rating that was actually observed
predicted_df = predicted_df.where(missing_mask, user_item)

0

In [None]:
import pandas as pd
import numpy as np
import skfuzzy as fuzz
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# Configuration: where the data lives and which ratings file to evaluate
path = '../Datasets/'
ratings = pd.read_csv(filepath_or_buffer=path + 'ml-1m.csv')

# Global parameters (read as module-level names by the functions defined below)
n_clusters = 10    # how many fuzzy clusters to fit
m = 2              # fuzzifier exponent
error = 0.005      # convergence tolerance for fuzzy C-Means
maxiter = 1000     # iteration cap for fuzzy C-Means
top_k = 10         # neighbours kept per user
seed = 42          # fixed seed for repeatable runs

# Smallest and largest rating in the data; their difference normalises MAE into NMAE
a_min = ratings['rating'].min()
a_max = ratings['rating'].max()

def build_completed_matrix(train_df):
    """Build a dense user x item matrix of known and predicted ratings.

    Steps: pivot the training ratings, impute missing cells with each user's
    mean, cluster users with fuzzy c-means, compute Pearson-style similarities
    between users, keep at most ``top_k`` same-cluster neighbours per user and
    predict every cell as user mean + similarity-weighted neighbour deviation.
    Cells that were rated in ``train_df`` keep their original rating.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Long-format ratings with ``userId``, ``itemId`` and ``rating``
        columns. Each (userId, itemId) pair may appear only once, otherwise
        ``DataFrame.pivot`` raises.

    Returns
    -------
    pandas.DataFrame
        Ratings indexed by the user ids (rows) and item ids (columns) found in
        ``train_df``.

    Notes
    -----
    Reads the module-level settings ``n_clusters``, ``m``, ``error``,
    ``maxiter``, ``seed`` and ``top_k``.
    """
    # --- User-item matrix; pandas is only used to get the row/column labels ---
    user_item = train_df.pivot(index='userId', columns='itemId', values='rating')
    users = user_item.index.values
    items = user_item.columns.values

    ratings_arr = user_item.to_numpy(dtype=float, copy=True)

    # True where a rating was observed in the training data
    mask = ~np.isnan(ratings_arr)

    # Impute missing cells with the mean of the user's observed ratings
    user_means = np.nanmean(ratings_arr, axis=1, keepdims=True)   # shape (n_users, 1)
    filled = np.where(mask, ratings_arr, user_means)

    # StandardScaler standardises each column, i.e. every item across users
    scaled = StandardScaler().fit_transform(filled)
    data = scaled.T  # cmeans expects features x samples (items x users)

    # Fuzzy C-means clustering; only the membership matrix u is needed
    _, u, _, _, _, _, _ = fuzz.cluster.cmeans(
        data, c=n_clusters, m=m, error=error, maxiter=maxiter, init=None, seed=seed
    )

    # Defuzzify via centre of gravity: membership-weighted mean of the labels
    # 1..n_clusters, rounded to the nearest integer label
    labels = np.arange(1, n_clusters + 1)[:, None]
    cog = (labels * u).sum(axis=0)
    cluster_assign = np.rint(cog).astype(int)

    # Centre ratings on the user means (imputed cells become exactly 0)
    R_centered = filled - user_means

    # Cosine similarity of the centred rows == Pearson correlation of the rows
    norms = np.linalg.norm(R_centered, axis=1, keepdims=True)
    norms[norms == 0] = 1e-9  # users with constant ratings: avoid 0/0
    X_norm = R_centered / norms
    sim = np.nan_to_num(X_norm @ X_norm.T)

    # Drop similarities across clusters and every user's self-similarity
    same_cl = cluster_assign[:, None] == cluster_assign[None, :]
    sim *= same_cl
    np.fill_diagonal(sim, 0)

    # Top-k neighbour selection. k is clamped to n - 1 because argpartition
    # rejects a partition index >= the row length (fewer users than top_k + 1).
    n = sim.shape[0]
    k = min(top_k, n - 1)
    W = np.zeros_like(sim)
    if k > 0:
        neighbor_idxs = np.argpartition(-sim, k, axis=1)[:, :k]
        rows = np.repeat(np.arange(n), k)
        cols = neighbor_idxs.ravel()
        W[rows, cols] = sim[rows, cols]

    # Predicted deviation = similarity-weighted neighbour deviations divided
    # by the total absolute weight; users without neighbours get deviation 0
    denom = np.abs(W).sum(axis=1, keepdims=True)
    denom[denom == 0] = 1e-9
    preds = (W @ R_centered) / denom + user_means

    # Restore the known ratings on the NumPy array itself. Writing through
    # `DataFrame.values` is unreliable: it may be a copy or a read-only view.
    preds[mask] = filled[mask]
    return pd.DataFrame(preds, index=users, columns=items)

# K-fold cross-validation (5 folds by default)
def cross_validate(ratings_df, n_splits=5):
    """Evaluate ``build_completed_matrix`` with shuffled K-fold cross-validation.

    For every fold the completed matrix is built from the training ratings and
    compared with the held-out ratings. Held-out ratings whose user or item
    does not occur in the training fold cannot be predicted and are skipped.
    Per-fold and averaged MAE / NMAE are printed.

    Parameters
    ----------
    ratings_df : pandas.DataFrame
        Long-format ratings with ``userId``, ``itemId`` and ``rating`` columns.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    tuple of float
        ``(average MAE, average NMAE)`` over the folds.

    Raises
    ------
    ValueError
        If a fold has no held-out rating whose user and item both occur in
        its training part.

    Notes
    -----
    Reads the module-level names ``seed``, ``a_min`` and ``a_max``.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    maes, nmaes = [], []
    for train_idx, test_idx in kf.split(ratings_df):
        train, test = ratings_df.iloc[train_idx], ratings_df.iloc[test_idx]
        comp = build_completed_matrix(train)

        # Vectorised lookup instead of a Python loop over every test rating:
        # get_indexer returns the positional row/column, or -1 if unseen.
        row_pos = comp.index.get_indexer(test['userId'])
        col_pos = comp.columns.get_indexer(test['itemId'])
        seen = (row_pos >= 0) & (col_pos >= 0)
        if not seen.any():
            raise ValueError(
                "No test rating in this fold has both its user and its item "
                "in the training data; cannot compute MAE."
            )
        y_t = test['rating'].to_numpy()[seen]
        y_p = comp.to_numpy()[row_pos[seen], col_pos[seen]]

        mae = mean_absolute_error(y_t, y_p)
        nmae = mae / (a_max - a_min)  # MAE normalised by the rating range
        maes.append(mae)
        nmaes.append(nmae)
        print(f"MAE={mae:.4f}, NMAE={nmae:.4f}")

    avg_mae, avg_nmae = np.mean(maes), np.mean(nmaes)
    print(f"Avg MAE={avg_mae:.4f}, Avg NMAE={avg_nmae:.4f}")
    return avg_mae, avg_nmae

# Entry point: run the cross-validation (default 5 folds) on the loaded ratings.
if __name__ == '__main__':
    cross_validate(ratings_df=ratings)


MAE=0.7732, NMAE=0.1933


KeyboardInterrupt: 