#  1. Recommendation - Loading

This notebook performs the following actions:
1. Keep only the users who made at least K recommendations
2. If enabled, randomly select n of those users
3. Perform some data manipulations (e.g. converting the recommendations from boolean to binary values)
4. Build the user-game matrix which is *sparse*


In [2]:
import os
import zipfile
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from scipy.sparse import csc_matrix
from scipy.sparse import vstack
import pickle
from pathlib import Path

In [3]:
# Fixed seed so that the random user sampling further down is reproducible.
RANDOM_SEED = 42
# Seeds NumPy's legacy global RNG, which is the one np.random.choice draws from.
np.random.seed(RANDOM_SEED)


In [4]:
# Show floats with a single decimal place whenever pandas renders them.
pd.set_option('display.float_format', '{:.1f}'.format)

## Loading the Recommendation dataset

In [5]:
# Minimum number of (positive) recommendations a user needs to be eligible for sampling.
K_REC = 20
# Number of eligible users drawn at random for the sample.
N_USERS = 2

In [6]:
# Run the companion notebook before anything else in this section.
# NOTE(review): presumably this is what defines `recommendations_df` used below — confirm.
%run "0.Splitting_Reading_Recommendation_File.ipynb"

In [7]:
# Preview of the recommendations frame before any filtering is applied.
recommendations_df

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,975370,0,0,2022-12-12,True,36.3,51580,0
1,304390,4,0,2017-02-17,False,11.5,2586,1
2,1085660,2,0,2019-11-17,True,336.5,253880,2
3,703080,0,0,2022-09-23,True,27.4,259432,3
4,526870,0,0,2021-01-10,True,7.9,23869,4
...,...,...,...,...,...,...,...,...
41154789,633230,0,0,2021-02-15,True,41.0,1606890,41154789
41154790,758870,8,0,2019-07-18,False,8.0,1786254,41154790
41154791,696170,3,10,2018-03-26,False,2.0,6370324,41154791
41154792,696170,0,0,2018-06-11,True,4.0,1044289,41154792


In [8]:
def get_df_pos_recs(df):
    """Return only the rows of `df` flagged as positive recommendations."""
    is_positive = df["is_recommended"] == True
    return df.loc[is_positive]

# From this point on `recommendations_df` holds positive recommendations only.
recommendations_df = get_df_pos_recs(recommendations_df)

In [9]:
# Size of the full dataset after dropping negative recommendations.
print("################  Recommendations (all data, positive recommendations only) ################")
print("Users:", format(len(recommendations_df["user_id"].unique()), ",.0f"))
print("Games:", format(len(recommendations_df["app_id"].unique()), ",.0f"))
print("Number of reviews:", format(recommendations_df.shape[0], ",.0f"))

################  Recommendations (all data, positive recommendations only) ################
Users: 12,663,134
Games: 37,419
Number of reviews: 35,304,398


In [10]:
def users_at_least_k_recs(df, K, n_usr):
    """
    Return randomly selected users who made at least K recommendations.

    Args:
        df (dataframe): recommendation dataset; must contain a "user_id" column
        K (integer): minimum number of recommendations for a user to be eligible
        n_usr (integer or None): number of users to draw at random, without
            replacement. If None, every eligible user is returned (no sampling).
    Return:
        users (numpy array): ids of the selected users
    Raises:
        ValueError: if n_usr is larger than the number of eligible users
    """
    recs_per_user = df.groupby("user_id").size()
    # The threshold is the K argument, not a notebook-level constant, so the
    # function gives the right answer whatever value the caller passes.
    eligible = recs_per_user[recs_per_user >= K].index.to_numpy()

    if n_usr is None:
        return eligible

    if n_usr > len(eligible):
        raise ValueError(
            "Requested {} users but only {} have at least {} recommendations".format(
                n_usr, len(eligible), K
            )
        )

    # replace=False: a user must not be picked more than once.
    return np.random.choice(eligible, size=n_usr, replace=False)

# Draw the sample of users that the rest of the notebook works with.
users_to_keep = users_at_least_k_recs(df=recommendations_df, K=K_REC, n_usr=N_USERS)
print("Number of users", format(len(users_to_keep), ",.0f"))
print("First five users id", users_to_keep[:5])


Number of users 2
First five users id [8305133   48950]


In [11]:
def loading_recommendations(df, sample=True, users=None):
    """
    Build a cleaned copy of the recommendation dataframe, optionally restricted to a subset of users.

    Args:
        df (dataframe): the recommendation dataframe; must contain the columns
            "user_id", "app_id", "is_recommended" and "date". It is not modified.
        sample (boolean): whether to keep only the rows of the selected users
        users (array-like or None): user ids to keep when sample is True. If None,
            the notebook-level `users_to_keep` is used.

    Returns:
        df (dataframe): a new dataframe, sorted by "user_id_categorical", with
            "is_recommended" as 0/1 integers, "date" as datetimes, and two extra
            columns "user_id_categorical" and "app_id_categorical" holding
            0-based integer codes for the users and games present in the result.
    """
    if sample:
        if users is None:
            # Fall back to the selection made earlier in the notebook.
            users = users_to_keep
        df = df[df["user_id"].isin(users)]
    # Copy after filtering: the caller's frame stays untouched and only the
    # rows that are kept get duplicated in memory.
    df = df.copy()

    # Convert boolean values to binary values, 1: recommended, 0: anything else.
    # Replacing the whole column (instead of writing through .loc[:, ...]) gives
    # an integer dtype regardless of the column's original dtype.
    df["is_recommended"] = df["is_recommended"].eq(True).astype(int)
    # Pandas category codes start from 0 and are compact integers.
    df["user_id_categorical"] = pd.Categorical(df["user_id"]).codes
    df["app_id_categorical"] = pd.Categorical(df["app_id"]).codes
    df = df.sort_values(by="user_id_categorical")
    # Convert the review date to pandas datetime.
    df["date"] = pd.to_datetime(df["date"])
    return df

In [12]:
# Clean the positive recommendations, keeping only the sampled users.
recommendations = loading_recommendations(recommendations_df, sample=True)
recommendations

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id,user_id_categorical,app_id_categorical
2942315,431960,0,0,2021-07-29,1,82.9,48950,2942315,0,16
39458992,1805930,2,0,2021-12-20,1,2.0,48950,39458992,0,48
39574170,637850,0,0,2021-12-20,1,1.0,48950,39574170,0,25
39767929,1279700,0,0,2022-02-16,1,0.0,48950,39767929,0,39
38301470,1120360,0,0,2022-01-31,1,1.4,48950,38301470,0,35
37749216,964350,0,0,2021-07-29,1,0.8,48950,37749216,0,31
37282212,1419370,0,0,2021-07-29,1,1.7,48950,37282212,0,42
37205959,390290,0,0,2022-10-09,1,1.8,48950,37205959,0,13
36582203,497640,0,0,2021-07-29,1,2.8,48950,36582203,0,19
35874402,939400,2,0,2022-01-13,1,2.2,48950,35874402,0,30


In [13]:
# Size of the sampled dataset, next to the settings that produced it.
print("################  Recommendations (Sample) ################")
print("Minimum number of recommendations to be included in the sample:", K_REC)
print("Number of randomly selected users:", N_USERS)
print("Users:", format(len(recommendations["user_id"].unique()), ",.0f"))
print("Games:", format(len(recommendations["app_id"].unique()), ",.0f"))
print("Number of reviews:", format(recommendations.shape[0], ",.0f"))

################  Recommendations (Sample) ################
Minimum number of recommendations to be included in the sample: 20
Number of randomly selected users: 2
Users: 2
Games: 49
Number of reviews: 49


## Building the User-Game Matrix

In [14]:
# Matrix dimensions: one row per distinct user code, one column per distinct game code.
num_users = recommendations["user_id_categorical"].nunique()
num_games = recommendations["app_id_categorical"].nunique()

In [24]:
# Distinct (user, game) index pairs, ordered by user and then by game.
(
    recommendations[["user_id_categorical", "app_id_categorical"]]
    .drop_duplicates()
    .sort_values(by=["user_id_categorical", "app_id_categorical"])
)

Unnamed: 0,user_id_categorical,app_id_categorical
7734647,0,3
30427782,0,12
37205959,0,13
2942315,0,16
19780802,0,18
36582203,0,19
39574170,0,25
30389464,0,26
32093773,0,28
28874329,0,29


In [22]:
def build_user_game_matrix(num_usr, num_gms, recs=None):
    """
    Build the sparse binary user-game matrix.

    Args:
        num_usr (integer): number of rows (users) of the matrix
        num_gms (integer): number of columns (games) of the matrix
        recs (dataframe or None): frame with the integer index columns
            "user_id_categorical" and "app_id_categorical". If None, the
            notebook-level `recommendations` frame is used.

    Returns:
        mat (scipy sparse matrix, DOK format, dtype int8): entry [u, g] is 1
            when the pair (u, g) appears in `recs`; nothing is stored otherwise.
    """
    if recs is None:
        # Fall back to the frame prepared earlier in the notebook.
        recs = recommendations

    # Each pair is kept once: COO construction adds up repeated coordinates,
    # and a repeated pair must still yield 1, not a count.
    pairs = recs[["user_id_categorical", "app_id_categorical"]].drop_duplicates()
    rows = pairs["user_id_categorical"].to_numpy(dtype=np.int64)
    cols = pairs["app_id_categorical"].to_numpy(dtype=np.int64)
    ones = np.ones(len(pairs), dtype='int8')

    # Build all entries in one vectorised step instead of assigning them one by
    # one from a Python loop, then convert to the Dictionary-Of-Keys format.
    mat = sp.sparse.coo_matrix((ones, (rows, cols)), shape=(num_usr, num_gms), dtype='int8')
    return mat.todok()

# Assemble the matrix with the dimensions computed above and show its summary repr.
user_game_matrix = build_user_game_matrix(num_usr=num_users, num_gms=num_games)
user_game_matrix

<2x49 sparse matrix of type '<class 'numpy.int8'>'
	with 49 stored elements in Dictionary Of Keys format>

In [17]:
# Dense view of the matrix; readable here only because the sample is tiny (2 x 49).
user_game_matrix.toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,
        0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
        0, 1, 1, 0, 1],
       [1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
        1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 1, 0]], dtype=int8)

In [26]:
# Spot check of a single entry: row 0 (user index), column 13 (game index).
user_game_matrix[0,13]

1

In [18]:
# Summary of the sparse matrix: its shape and how many entries are actually stored.
print("################ User-Game Matrix ################")
print("Number of Rows:", format(user_game_matrix.shape[0], ",.0f"))
# Label typo fixed ("Numbeer" -> "Number").
print("Number of Columns:", format(user_game_matrix.shape[1], ",.0f"))
print("Number of stored values:", format(user_game_matrix.size, ",.0f"))


################ User-Game Matrix ################
Number of Rows: 2
Numbeer of Columns: 49
Number of stored values: 49


## Save Users and Games Mapping

In [27]:
# Mapping between the original app ids and their matrix column indices.
games_idx = recommendations[["app_id","app_id_categorical"]].drop_duplicates()
# Create the output folder first: to_csv does not create missing directories.
Path("matrix").mkdir(parents=True, exist_ok=True)
games_idx.to_csv("matrix/games_idx.csv", index=False)

In [28]:
# Mapping between the original user ids and their matrix row indices.
user_idx = recommendations.loc[:, ["user_id", "user_id_categorical"]].drop_duplicates()
user_idx.to_csv("matrix/users_idx.csv", index=False)

In [29]:
# Distinct (row index, column index) pairs, i.e. the positions stored in the matrix.
user_to_game_idx = recommendations.loc[:, ["user_id_categorical", "app_id_categorical"]].drop_duplicates()
user_to_game_idx.to_csv("matrix/users2games_idx.csv", index=False)

In [1]:
# Read the exported pairs back as a check on the file that was just written.
# NOTE(review): the recorded NameError presumably comes from running this cell on a
# fresh kernel, before the imports cell — confirm by re-running the notebook top to bottom.
(
    pd.read_csv("matrix/users2games_idx.csv")
    .sort_values(by=["user_id_categorical", "app_id_categorical"])
)

NameError: name 'pd' is not defined