In [9]:
# ! pip install cupy --quiet

In [10]:
# ! pip install numpy==1.16.5 --quiet

In [11]:
import pandas as pd
import cupy as cp

In [35]:
# Load your DataFrame
# Replace 'path/to/your/data.csv' with the actual path to your CSV file
df = pd.read_csv('/kaggle/input/board-games-database-from-boardgamegeek/user_ratings.csv')

df = df.groupby('Username', group_keys=False).apply(lambda x: x.sample(frac=0.01, random_state = 42))

df = df.rename(columns={
    'BGGId':'item_id',
    'Rating':'rating',
    'Username':'user_id'
})

In [36]:
df = df.drop_duplicates(subset = ['item_id', 'user_id'])

In [37]:
min_rating_count = 10

user_rating_count = df.groupby('user_id').count()
user_rating_count = user_rating_count[user_rating_count['rating'] >= min_rating_count]
user_list_filtered = list(user_rating_count.index)
df = df[df['user_id'].isin(user_list_filtered)]

In [38]:
df

Unnamed: 0,item_id,rating,user_id
13123513,271055,8.0,549sd
6474052,161970,8.0,549sd
9168070,2093,7.0,549sd
18515363,223151,7.0,549sd
578057,250643,6.0,549sd
...,...,...,...
13298939,311193,7.0,zodball
18924470,143701,5.0,zodball
17297519,155122,6.0,zodball
14624436,223049,7.0,zodball


In [39]:
# Drop any unnecessary columns
df = df[['user_id', 'item_id', 'rating']]

# Assuming you have unique user and item IDs, create mappings to indices
user_to_index = {user_id: idx for idx, user_id in enumerate(df['user_id'].unique())}
item_to_index = {item_id: idx for idx, item_id in enumerate(df['item_id'].unique())}

# Replace original user and item IDs with indices
df['user_index'] = df['user_id'].map(user_to_index)
df['item_index'] = df['item_id'].map(item_to_index)

# Drop the original user and item ID columns
df.drop(['user_id', 'item_id'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['user_index'] = df['user_id'].map(user_to_index)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['item_index'] = df['item_id'].map(item_to_index)


In [40]:
df

Unnamed: 0,rating,user_index,item_index
13123513,8.0,0,0
6474052,8.0,0,1
9168070,7.0,0,2
18515363,7.0,0,3
578057,6.0,0,4
...,...,...,...
13298939,7.0,816,2217
18924470,5.0,816,5619
17297519,6.0,816,2576
14624436,7.0,816,5620


In [41]:
# Get the number of users and items
num_users = len(user_to_index)
num_items = len(item_to_index)

# Initialize the implicit feedback matrix on the GPU
implicit_feedback_matrix = cp.zeros((num_users, num_items), dtype=cp.float32)

In [42]:
# Fill the implicit feedback matrix based on user-item interactions
for _, row in df.iterrows():
    implicit_feedback_matrix[row['user_index'], row['item_index']] = 1

print("Implicit feedback matrix shape:", implicit_feedback_matrix.shape)

Implicit feedback matrix shape: (817, 5622)


In [46]:
# Perform matrix factorization using SVD on the GPU
k = 10  # Number of latent features

# Transfer data to GPU
implicit_feedback_matrix_gpu = cp.asarray(implicit_feedback_matrix)

# SVD using CuPy
U_gpu, sigma_gpu, Vt_gpu = cp.linalg.svd(implicit_feedback_matrix_gpu, full_matrices=False)

In [51]:
sigma_gpu

array([8.175991 , 8.030272 , 7.93411  , 7.654975 , 7.45176  , 7.293922 ,
       7.157311 , 6.9922447, 6.715144 , 6.569649 , 6.5014925, 6.4562435,
       6.291327 , 6.281843 , 6.2084546, 6.095835 , 5.9165144, 5.7976446,
       5.7803183, 5.7693825, 5.728677 , 5.7191296, 5.650006 , 5.6044264,
       5.575679 , 5.5687804, 5.5435286, 5.5248632, 5.516504 , 5.4837866,
       5.472847 , 5.4575577, 5.441901 , 5.4111686, 5.3950524, 5.3865933,
       5.3723426, 5.368865 , 5.3501706, 5.3292513, 5.316676 , 5.3031664,
       5.287431 , 5.2794547, 5.273196 , 5.2522583, 5.2362866, 5.225046 ,
       5.2173743, 5.1988215, 5.1884747, 5.1681867, 5.161925 , 5.149006 ,
       5.1399374, 5.138224 , 5.1275654, 5.116384 , 5.1110225, 5.1035585,
       5.0874114, 5.0753393, 5.065539 , 5.048074 , 5.0411944, 5.032419 ,
       5.0256824, 5.0221663, 5.002572 , 4.994042 , 4.9808855, 4.9786115,
       4.967901 , 4.9570165, 4.942455 , 4.9384117, 4.9368906, 4.9224143,
       4.9089384, 4.9051795, 4.899931 , 4.8896813, 

In [53]:
U_gpu = U_gpu[:, :k]
sigma_gpu = sigma_gpu[:k]
Vt_gpu = Vt_gpu[:k, :]

In [55]:
# Convert sigma to a diagonal matrix on the GPU
sigma_gpu = cp.diag(sigma_gpu)

# Compute the user-feature matrix (U * sigma) on the GPU
user_feature_matrix_gpu = cp.dot(U_gpu, sigma_gpu)

# Compute the item-feature matrix (sigma * Vt) on the GPU
item_feature_matrix_gpu = cp.dot(sigma_gpu, Vt_gpu)

# Transfer results back to CPU
user_feature_matrix = cp.asnumpy(user_feature_matrix_gpu)
item_feature_matrix = cp.asnumpy(item_feature_matrix_gpu)

print("User-feature matrix shape:", user_feature_matrix.shape)
print("Item-feature matrix shape:", item_feature_matrix.shape)


User-feature matrix shape: (817, 10)
Item-feature matrix shape: (10, 5622)


In [56]:
index_to_user = {index: user_id for user_id, index in user_to_index.items()}

In [63]:
user_to_user_feature = {}

In [64]:
for index, user_id in index_to_user.items():
    user_to_user_feature[user_id] = user_feature_matrix[index]

In [67]:
import pickle

# Save to a pickle file
with open('/kaggle/working/user_features_dic.pkl', 'wb') as pickle_file:
    pickle.dump(user_to_user_feature, pickle_file)