In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

In [4]:
# Read the BGG reviews CSV; the raw frame is kept under `df_original`
# and later cells derive their working frame from it.
df_original = pd.read_csv("/mnt/data/public/bgg/bgg-19m-reviews.csv")

In [5]:
# Number of distinct values in each column (missing values are not counted).
df_original.nunique()

Unnamed: 0    18964807
user            412815
rating           10759
comment        3046149
ID               21839
name             21440
dtype: int64

# EDA

In [50]:
# Work on an independent frame without the first column ("Unnamed: 0",
# presumably the row index saved with the CSV -- confirm). Slicing before
# copying avoids duplicating the dropped column and leaves `df` owning its
# data instead of being a slice of another frame.
df = df_original.iloc[:, 1:].copy()

In [51]:
# Count of missing values in each column.
df.isna().sum()

user             66
rating            0
comment    15596189
ID                0
name              0
dtype: int64

In [52]:
# Drop the reviews that have no user name. Rebinding `df` (rather than
# inplace=True) avoids mutating a frame that was produced by slicing.
df = df.dropna(subset=["user"])

In [60]:
# Every review whose (user, name) pair occurs more than once;
# keep=False flags all occurrences, not only the repeats.
duplicates = df.loc[df.duplicated(subset=["user", "name"], keep=False)]
duplicates.sort_values("user").iloc[-2:]

Unnamed: 0,user,rating,comment,ID,name
1916530,zzzabiss,8.0,Va muy bien como un filler introductorio. He r...,129622,Love Letter
12089270,zzzabiss,8.0,,277085,Love Letter


In [56]:
# Per-game details table, used below to look up the alternate names of a game ID.
df_description = pd.read_csv("/mnt/data/public/bgg/games_detailed_info.csv")

  df_description = pd.read_csv("/mnt/data/public/bgg/games_detailed_info.csv")


In [58]:
# Alternate names of the two IDs that both appear as "Love Letter" in the reviews.
print(df_description.loc[df_description["id"] == 129622, 'alternate'])
print(df_description.loc[df_description["id"] == 277085, 'alternate'])

17    ['Letters to Santa', 'List Miłosny', 'Lista Sk...
Name: alternate, dtype: object
1057    ['List miłosny (Edycja Premium)', 'Love Letter...
Name: alternate, dtype: object


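The `name` column stores only the game's common name, so different editions of the same game share one name. For example, the two `Love Letter` rows above have different IDs (129622 and 277085): one is the regular edition and the other is the premium edition.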

In my opinion, we should identify games by item `ID` rather than by `name`, since premium editions may come at different price points and quality levels.

In [61]:
# Repeat the duplicate check, this time keyed on the game ID instead of the name.
duplicates = df[df.duplicated(subset=["user", "ID"], keep=False)]

# Each (user, ID) pair must be unique: scipy sums the values of repeated
# coordinates when a sparse matrix is built, which would corrupt ratings.
assert duplicates.empty, f"{len(duplicates)} reviews share a (user, ID) pair"

When duplicates are checked on (`user`, `ID`) instead of (`user`, `name`), there are none.

# Make Utility Matrix

In [66]:
# Keep only the three columns needed for the utility matrix and give the
# user and item columns explicit names.
df = (
    df.loc[:, ["user", "rating", "ID"]]
      .rename(columns={"user": "user_name", "ID": "item_id"})
)

In [67]:
# map users to unique integer row indices
# (user_dict translates a row index back to the user name)
user_mapping = pd.Categorical(df["user_name"])
user_indices = user_mapping.codes
user_dict = {code: label for code, label in enumerate(user_mapping.categories)}

# map items to unique integer col indices
# (item_dict translates a column index back to the item id)
item_mapping = pd.Categorical(df["item_id"])
item_indices = item_mapping.codes
item_dict = {code: label for code, label in enumerate(item_mapping.categories)}

# build sparse matrix: one row per user, one column per item, rating as value
# (despite the df_ prefix this is a scipy CSR matrix, not a DataFrame)
df_utility = csr_matrix(
    (df["rating"], (user_indices, item_indices))
)

In [68]:
# Show the sparse matrix summary (dtype, number of stored ratings, shape).
df_utility

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 18964741 stored elements and shape (412815, 21839)>

# Neighbor-based

In [None]:
def user_complete(df_utility, k):
    """Fill in the missing ratings of a dense utility matrix (user-based).

    Every user's ratings are centred on that user's own mean rating. The
    similarity of two users is the cosine of their centred ratings,
    restricted to the items both of them rated. A missing rating is then
    predicted as the user's mean plus the similarity-weighted average of
    the centred ratings that the user's ``k`` most similar users gave to
    that item.

    The similarity step loops over every pair of users, so the cost grows
    quadratically with the number of rows.

    Args:
        df_utility (pandas.DataFrame): utility matrix with one row per
            user and one column per item; unrated cells are NaN. It must
            be a DataFrame (``.index`` / ``.columns`` are used), not a
            scipy sparse matrix. The input frame is not modified.
        k (int): number of most similar users kept as neighbours. Only
            users with a strictly positive similarity qualify; ties are
            broken by ascending user label.

    Returns:
        pandas.DataFrame: copy of ``df_utility`` where each missing cell
        that at least one neighbour has rated is replaced by the
        prediction, expressed on the original rating scale. Cells with no
        usable neighbour rating stay NaN.
    """
    users = df_utility.index
    n_users = len(users)

    # Original ratings are preserved; only missing cells are written below.
    result = df_utility.copy()

    # Centre every row on its own mean (NaN cells are ignored by mean()).
    mean = df_utility.mean(axis=1)
    centered = df_utility.sub(mean, axis=0).to_numpy(dtype=float)
    rated = ~np.isnan(centered)

    # Pairwise similarity. The matrix is symmetric, so each pair is computed
    # once. Entries stay 0 for the diagonal, for pairs without a common item
    # and for pairs where either norm over the common items is zero.
    sim = np.zeros((n_users, n_users))
    for i in range(n_users):
        for j in range(i + 1, n_users):
            common = rated[i] & rated[j]
            a = centered[i, common]
            b = centered[j, common]
            norm_a = np.sqrt(np.sum(a ** 2))
            norm_b = np.sqrt(np.sum(b ** 2))
            if norm_a == 0 or norm_b == 0:
                continue
            sim[i, j] = sim[j, i] = a.dot(b) / (norm_a * norm_b)

    # Rank of every label in ascending label order, used as the tie-breaker.
    # Working with positions (instead of a reset_index() column called
    # "index") keeps this correct when the index carries a name.
    label_rank = np.empty(n_users, dtype=int)
    label_rank[users.argsort()] = np.arange(n_users)

    for pos in range(n_users):
        missing = np.flatnonzero(~rated[pos])
        candidates = np.flatnonzero(sim[pos] > 0)
        if missing.size == 0 or candidates.size == 0:
            continue

        # lexsort treats its LAST key as primary: similarity descending,
        # then label ascending.
        order = np.lexsort((label_rank[candidates], -sim[pos, candidates]))
        neighbours = candidates[order][:k]
        if neighbours.size == 0:
            continue

        weights = sim[pos, neighbours][:, np.newaxis]
        neighbour_rates = centered[np.ix_(neighbours, missing)]
        observed = ~np.isnan(neighbour_rates)

        # Per missing item, only neighbours who rated it contribute.
        weight_sum = (weights * observed).sum(axis=0)
        weighted_rates = (
            weights * np.where(observed, neighbour_rates, 0.0)
        ).sum(axis=0)

        predictable = weight_sum != 0
        predictions = (
            mean.iloc[pos]
            + weighted_rates[predictable] / weight_sum[predictable]
        )
        for col_pos, value in zip(missing[predictable], predictions):
            result.iat[pos, int(col_pos)] = float(value)

    return result