In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from implicit.als import AlternatingLeastSquares

In [2]:
df_original = pd.read_csv("/mnt/data/public/bgg/bgg-19m-reviews.csv")

# EDA

In [4]:
df = df_original.copy()
df = df.iloc[:,1:]

In [5]:
df.isna().sum()

user             66
rating            0
comment    15596189
ID                0
name              0
dtype: int64

In [6]:
df.dropna(subset = ["user"], inplace = True)

In [7]:
duplicates = df[df.duplicated(subset=["user", "name"], keep=False)]
duplicates.sort_values(by="user").tail(2)

Unnamed: 0,user,rating,comment,ID,name
1916530,zzzabiss,8.0,Va muy bien como un filler introductorio. He r...,129622,Love Letter
12089270,zzzabiss,8.0,,277085,Love Letter


In [8]:
df_description = pd.read_csv("/mnt/data/public/bgg/games_detailed_info.csv")

  df_description = pd.read_csv("/mnt/data/public/bgg/games_detailed_info.csv")


In [9]:
print(df_description[df_description["id"] == 129622]['alternate'])
print(df_description[df_description["id"] == 277085]['alternate'])

17    ['Letters to Santa', 'List Miłosny', 'Lista Sk...
Name: alternate, dtype: object
1057    ['List miłosny (Edycja Premium)', 'Love Letter...
Name: alternate, dtype: object


they just save the common names under names, but same name have different editions (example, the two rows below are the regular vs premium edition)

based on my opinion, i suggest to sort by item ID since premium editions may come at different price points/qualities

In [10]:
duplicates = df[df.duplicated(subset=["user", "ID"], keep=False)]

if we sort by ID though, no duplicates

# Make Utility Matrix

In [11]:
df = df[["user","rating","ID"]]
df = df.rename(columns={"user": "user_name", 
                        "ID": "item_id"})

In [14]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [15]:
def create_csr_matrix(df_flat, 
                      user_col = 'user_name',
                      item_col = 'item_id', 
                      rating_col = 'rating'):
    """Create csr matrix from flatenned DataFrame"""
    # map users to unique interger row indices
    user_mapping = pd.Categorical(df_flat[user_col])
    user_indices = user_mapping.codes
    user_dict = dict(enumerate(user_mapping.categories))
    
    # map items to unique interger col indices
    item_mapping = pd.Categorical(df_flat[item_col])
    item_indices = item_mapping.codes
    item_dict = dict(enumerate(item_mapping.categories))
    
    # build sparse matrix
    df_utility = csr_matrix((df_flat[rating_col], (user_indices, item_indices)))

    return df_utility

In [16]:
df_train_utility = create_csr_matrix(df_train)

# ALS

In [35]:
def get_rsme_batched(model, test_matrix, 
                    user_col = 'user_name',
                    item_col = 'item_id', 
                    rating_col = 'rating',
                    batch_size=2_000_000):
    """
    Compute RMSE on nonzero entries of test_matrix in a memory-safe way.

    Args:
      model: trained implicit.als model (has user_factors, item_factors as numpy arrays)
      test_matrix: scipy.sparse matrix (CSR or any) with explicit ratings in .data
      batch_size: number of rating entries to evaluate at once

    Returns:
      RMSE (float)
    """
    if not isinstance(test_matrix, pd.DataFrame):
        test_matrix = pd.DataFrame(test_matrix, columns=[user_col, item_col, rating_col])

    users = test_matrix[user_col]
    items = test_matrix[item_col]
    true_ratings = test_matrix[rating_col].astype(np.float32)

    n = len(true_ratings)
    if n == 0:
        raise ValueError("No ratings in test_matrix")

    # Ensure model factors are float32 for speed/memory
    user_factors = model.user_factors.astype(np.float32) 
    item_factors = model.item_factors.astype(np.float32)

    mse = 0.0
    for start in range(0, n, batch_size):
        end = min(start + batch_size, n) #get end per batch
        u = users[start:end]
        i = items[start:end]
        y_true = true_ratings[start:end]

        # Dot product
        y_preds = np.sum(user_factors[u] * item_factors[i], axis=1)

        # accumulate squared error
        err = y_true - y_preds
        mse += np.sum(err ** 2)

    rmse = np.sqrt(mse_sum / n)
    return rmse


In [39]:
# ogs
def rsme_batched(model, test_matrix, batch_size=2_000_000):
    """
    Compute RMSE on nonzero entries of test_matrix in a memory-safe way.

    Args:
      model: trained implicit.als model (has user_factors, item_factors as numpy arrays)
      test_matrix: scipy.sparse matrix (CSR or any) with explicit ratings in .data
      batch_size: number of rating entries to evaluate at once

    Returns:
      RMSE (float)
    """
    # Convert to COO for easy row/col/data access
    coo = test_matrix.tocoo()
    users = coo.row.astype(np.float32)
    items = coo.col.astype(np.float32)
    true_ratings = coo.data.astype(np.float32)

    n = len(true_ratings)
    if n == 0:
        raise ValueError("No ratings in test_matrix")

    # Ensure model factors are float32 for speed/memory
    user_factors = model.user_factors.astype(np.float32) 
    item_factors = model.item_factors.astype(np.float32)

    mse_sum = 0.0
    for start in range(0, n, batch_size):
        end = min(start + batch_size, n)
        u_batch = users[start:end]
        i_batch = items[start:end]
        y_true = true_ratings[start:end].astype(np.float32)

        # Vectorized dot: elementwise mult then sum across factors
        preds = np.sum(user_factors[u_batch] * item_factors[i_batch], axis=1)

        # accumulate squared error
        err = y_true - preds
        mse_sum += np.sum(err * err)

    rmse = np.sqrt(mse_sum / n)
    return rmse


In [18]:
# use default parameters for factors, regulatizaiton, iterations
baseline_als_model = AlternatingLeastSquares(random_state=42, 
                                             calculate_training_loss=True)
baseline_als_model.fit(df_train_utility, show_progress=True)

  check_blas_config()
  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

NameError: name 'model' is not defined

In [22]:
import joblib

joblib.dump(baseline_als_model, "ALS_implicit_cpu.joblib")

['ALS_implicit_cpu.joblib']

In [26]:
df_test_utility = create_csr_matrix(df_test)
rmse_als = get_rsme_batched(baseline_als_model, df_test_utility)

In [27]:
rmse_als 

7.2248049095856315

In [40]:
df_test_utility = create_csr_matrix(df_test)
rmse_als = get_rsme_batched(baseline_als_model, df_test)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
best_model = None
best_score = -float('inf')
best_params = None

for params in grid:
    model = AlternatingLeastSquares(
        factors=params['factors'],
        regularization=params['regularization'],
        iterations=params['iterations'],
        random_state=42,
        calculate_training_loss=True,
        use_gpu=False
    )
    
    model.fit(user_item_csr)
    user_factors, item_factors = model.user_factors, model.item_factors
    score = evaluate_model(user_factors, item_factors, test_data)
    
    if score > best_score:
        best_score = score
        best_model = model
        best_params = params

print(f"Best Parameters: {best_params}, Best Score (MSE): {-best_score}")

In [23]:
def als(M_arr, d, tol):
    M_csr = M_arr.copy()
    M_csc = M_arr.tocsc()
    

    n, k = M_arr.shape

    # Initialize U and V matrices with ones
    U = np.ones((n, d))
    V = np.ones((k, d))

    while True:
        # Fix U and solve for V (each item independently)
        print("Fix U and solve V")
        for j in range(k):
            users = M_csc[:, j].indices
            if users.size > 0:
                A = U[users]
                b = M_csc[users, j].toarray().flatten()
                try:
                    V[j, :] = np.linalg.solve(A.T @ A, A.T @ b)
                except np.linalg.LinAlgError:
                    V[j, :] = np.linalg.lstsq(A, b, rcond=None)[0]

        print("Fix V and solve U")
        # Fix V and solve for U (each user independently)
        for i in range(n):
            items = M_csr[i, :].indices
            if items.size > 0:
                A = V[items]
                b = M_csr[i, items].toarray().flatten()
                try:
                    U[i, :] = np.linalg.solve(A.T @ A, A.T @ b)
                except np.linalg.LinAlgError:
                    U[i, :] = np.linalg.lstsq(A, b, rcond=None)[0]

        # Calculate current predictions
        pred = U @ V.T

        # Calculate RMSE for observed ratings (use NumPy arrays)
        rmse = np.sqrt(np.mean((M_arr[mask] - pred[mask]) ** 2))

        # Check for convergence
        if rmse < tol:
            break
        else:
            break

    return U, Vzz

In [24]:
f_user, f_item = als(df_utility, 50, 2)

Fix U and solve V
Fix V and solve U


KeyboardInterrupt: 