# User-Game Matrix (*recommendation dataset*)

In [19]:
import os
import zipfile
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csc_matrix
from scipy.sparse import vstack
import pickle
from pathlib import Path

In [2]:
pd.set_option('display.float_format', lambda x: '%.1f' % x)

In [3]:
def loading_recommendations(path):
    # df = dd.read_csv(path)
    df = pd.read_csv(path)
    # keep only users that do recommend, eliminate those who does not recommend
    df = df[df["is_recommended"]==True]
    df["is_recommended"] = df["is_recommended"].apply(lambda x: 1 if x == True else 0)
    df['user_id_categorical'] = pd.Categorical(df['user_id']).codes
    ## App_ID starts from 1. I leave the 0 to the user_id column. See later
    df['app_id_categorical'] = pd.Categorical(df['app_id']).codes + 1
    df = df.sort_values(by="user_id_categorical")
    #df = df.iloc[:20000]
    return df

In [4]:
recommendations = loading_recommendations("resources/recommendations.csv")
recommendations

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id,user_id_categorical,app_id_categorical
30478477,235540,28,2,2015-10-17,1,21.9,0,30478477,0,1698
24437527,49520,35,16,2015-08-12,1,2.1,0,24437527,0,895
34510797,627690,8,0,2022-03-06,1,103.3,0,34510797,0,12333
22036352,1454400,6,2,2022-02-18,1,762.6,0,22036352,0,28682
36809055,317400,15,0,2015-08-01,1,18.0,0,36809055,0,3676
...,...,...,...,...,...,...,...,...,...,...
5228835,397540,0,0,2020-11-25,1,160.9,14306059,5228835,12663129,6218
39735230,1112830,0,0,2020-10-10,1,1.0,14306060,39735230,12663130,22680
23151356,1407200,0,0,2022-09-02,1,171.0,14306061,23151356,12663131,27909
25486974,1987080,0,0,2022-08-28,1,15.5,14306062,25486974,12663132,35826


In [5]:
recommendations.shape

(35304398, 10)

In [6]:
NUMBER_OF_UNIQUE_USERS = recommendations["user_id_categorical"].nunique()
NUMBER_OF_UNIQUE_GAMES = recommendations["app_id_categorical"].nunique()

In [7]:
print("There are", '{0:,.0f}'.format(NUMBER_OF_UNIQUE_USERS) , "users")
print("There are", '{0:,.0f}'.format(NUMBER_OF_UNIQUE_GAMES) , "games")

There are 12,663,134 users
There are 37,419 games


## Data Exploration

**Average number of reviews per user**

In [8]:
review_user = recommendations[["user_id","review_id"]].groupby("user_id").count()
review_user = review_user.reset_index()
review_user.columns = [["user_id", "#_reviews"]]
review_user.head()

Unnamed: 0,user_id,#_reviews
0,0,28
1,1,1
2,2,4
3,3,2
4,4,1


In [9]:
review_user["#_reviews"].describe()

Unnamed: 0,#_reviews
count,13781059.0
mean,3.0
std,8.1
min,1.0
25%,1.0
50%,1.0
75%,3.0
max,6045.0


In [10]:
# For instance, let's take the reviews of user_id == 0
# recommendations[recommendations["user_id"]==0]

**Average number of reviews per game**

In [11]:
review_game = recommendations[["app_id","review_id"]].groupby("app_id").count()
review_game = review_game.reset_index()
review_game.columns = [["app_id", "#_reviews"]]
review_game.head()

Unnamed: 0,app_id,#_reviews
0,10,41043
1,20,4284
2,30,4432
3,40,1610
4,50,9721


In [12]:
review_game["#_reviews"].describe()

Unnamed: 0,#_reviews
count,37610.0
mean,1094.3
std,7689.3
min,1.0
25%,13.0
50%,39.0
75%,179.8
max,319492.0


In [13]:
# For instance, let's take the reviews of app_id == 10
recommendations[recommendations["app_id"]==10]

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
6175295,10,0,0,2021-12-06,1,22.6,2524579,6175295
6175299,10,0,0,2021-11-25,1,3.5,7761084,6175299
6175300,10,0,0,2021-11-25,1,5.4,9377644,6175300
6175304,10,0,0,2021-10-16,1,211.4,2713079,6175304
6175307,10,0,0,2021-07-26,1,359.6,10214996,6175307
...,...,...,...,...,...,...,...,...
38989335,10,2,0,2020-01-11,1,698.0,8010358,38989335
38989815,10,0,0,2021-07-17,1,13.0,6258273,38989815
38990023,10,2,0,2021-11-15,1,194.0,166934,38990023
38991478,10,0,0,2020-12-18,1,30.0,13780121,38991478


## User-game Matrix

In [8]:
# All games in columns
apps_id = sorted(recommendations["app_id_categorical"].unique())
apps_id_df = pd.DataFrame(index = apps_id).T
# Adding the 0 column which represents the user_id
apps_id_df[0] = np.NaN
apps_id_df = apps_id_df.reindex(sorted(apps_id_df.columns), axis=1)
apps_id_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37410,37411,37412,37413,37414,37415,37416,37417,37418,37419


In [9]:
# List of games
print(len(apps_id))
apps_id[:5]
# apps_id starts from 1. 0 is the user_id column

37419


[1, 2, 3, 4, 5]

In [10]:
# All users, they are already sorted
users_list = recommendations["user_id_categorical"].unique()
print("Number of unique users", '{0:,.0f}'.format(len(users_list)))
users_list[525130: 525145]

Number of unique users 12,663,134


array([525130, 525131, 525132, 525133, 525134, 525135, 525136, 525137,
       525138, 525139, 525140, 525141, 525142, 525143, 525144])

In [11]:
def get_user_game_sparse_matrix(df, start_index, end_index):
    """
    Args::
        df::dataframe, like recommendations, approximately 41 milion of records
        start_index:: integer, the start of user_id index for slicing the df
        end_index:: integer, the end of user_id index for slicing the df
    Returns::
        sparse user-matrix to save space from a pivot table with users as index and games as columns
        
    First, this function reduces the size of the data using the indexes. 
    Second, it reads data in chunks to efficiently build a pivot table with users as index and games as columns.
    Third, a pivot table is saved into a csv file
    """
    # Slicing the data frame by groups of users, +1 to consider also the second extreme of the range    
    df = df[df["user_id_categorical"].between(start_index,end_index, inclusive="left")]
    unique_users = df["user_id_categorical"].nunique()
    print("The length of dataframe is", '{0:,.0f}'.format(len(df)))
    print("The users in this dataframe are", '{0:,.0f}'.format(unique_users), 
                "out of", '{0:,.0f}'.format(NUMBER_OF_UNIQUE_USERS))

    # Pivot the chunked DataFrame
    pivoted_chunk = pd.pivot_table(df, values='is_recommended', index='user_id_categorical',
                                     columns='app_id_categorical')\
                            .fillna(0)\
                            .astype("int32")
        
    # resetting index and renaming to allow the sorting of integer columns
    pivoted_chunk = pivoted_chunk.reset_index().rename({'user_id_categorical':0}, axis=1)

    # Sorting columns
    pivoted_chunk = pivoted_chunk.reindex(sorted(pivoted_chunk.columns), axis=1)

    # Having all app_id
    pivoted_chunk = pd.concat([apps_id_df, pivoted_chunk], ignore_index=False, axis=0)\
                            .fillna(0)\
                            .astype("int32")
    
    # Creating a sparse column-based matrix
    sparse = csc_matrix(pivoted_chunk)
    return sparse

In [27]:
# users_games_matrix_csc = get_user_game_sparse_matrix(df=recommendations, start_index=0, end_index=10000)
# users_games_matrix_csc

In [15]:
def save_matrices(df, df_subset_size):
    """
    Args:
        df::dataframe, like recommendations, approximately 41 milion of records
        df_subset_size:: integer, the size to slice the data
                
    This function creates several user-games matrix calling another function to create a single
    csv file
    """
    for end in range(0, len(df)+df_subset_size, df_subset_size):
        print(end, end+df_subset_size)
        matrix = get_user_game_sparse_matrix(df, end, end+df_subset_size)
        with open('assets/sparse_matrix_'+str(end)+'-'+str(end+df_subset_size)+'.pkl', 'wb') as file:
            pickle.dump(matrix, file)
    return None

In [16]:
save_matrices(recommendations, 10000)

0 10000
The length of dataframe is 27,220
The users in this dataframe are 10,000 out of 12,663,134
10000 20000
The length of dataframe is 27,693
The users in this dataframe are 10,000 out of 12,663,134
20000 30000
The length of dataframe is 26,100
The users in this dataframe are 10,000 out of 12,663,134
30000 40000
The length of dataframe is 25,868
The users in this dataframe are 10,000 out of 12,663,134
40000 50000
The length of dataframe is 26,917
The users in this dataframe are 10,000 out of 12,663,134
50000 60000
The length of dataframe is 26,638
The users in this dataframe are 10,000 out of 12,663,134
60000 70000
The length of dataframe is 26,334
The users in this dataframe are 10,000 out of 12,663,134
70000 80000
The length of dataframe is 24,737
The users in this dataframe are 10,000 out of 12,663,134
80000 90000
The length of dataframe is 26,959
The users in this dataframe are 10,000 out of 12,663,134
90000 100000
The length of dataframe is 25,703
The users in this dataframe ar

In [17]:
def read_files(path):
    """
    read each picke files and create one unique sparse matrix
    """
    pickles = []
    # sorting the path
    paths = sorted(Path(path).iterdir(), key=os.path.getmtime)
    for f in paths:
        with open(f, 'rb') as file:
            loaded_matrix = pickle.load(file)
            #loaded_csc = loaded_matrix.tocsc()
        pickles.append(loaded_matrix)
        
    print("Number of files:", len(pickles))
    m = vstack(pickles, format="csc")
    return m

In [20]:
get_user_game_sparse_matrix_csc = read_files("assets")
get_user_game_sparse_matrix_csc

Number of files: 3532


<12663134x37420 sparse matrix of type '<class 'numpy.intc'>'
	with 47967516 stored elements in Compressed Sparse Column format>

In [24]:
get_user_game_sparse_matrix_csc.shape

(12663134, 37420)

In [22]:
# Data Check
def check_between_sparse_and_dataframe(USER_ID_TO_CHECK):
    """
    ok if array indices match app_id_categorical value
    """
    sample = get_user_game_sparse_matrix_csc[USER_ID_TO_CHECK,:].toarray()
    indices = np.where(sample == 1)
    user_rec = recommendations[(recommendations.user_id_categorical == USER_ID_TO_CHECK)].sort_values(by="app_id_categorical")
    return (indices, user_rec)

In [23]:
check_between_sparse_and_dataframe(9900010)

((array([0, 0], dtype=int64), array([1506, 2333], dtype=int64)),
           app_id  helpful  funny        date  is_recommended  hours   user_id  \
 16606518  222480        0      0  2013-05-29               1   68.2  11209562   
 19798307  265550       21      0  2014-10-31               1   44.5  11209562   
 
           review_id  user_id_categorical  app_id_categorical  
 16606518   16606518              9900010                1506  
 19798307   19798307              9900010                2333  )

In [33]:
def save_user_game_matrix(path, sparse):
    """
    save the final matrix in a pickle file
    """
    with open(os.path.join(path,'user_game_matrix.pkl'), 'wb') as file:
        pickle.dump(sparse, file)
    return None

In [34]:
save_user_game_matrix("matrix", get_user_game_sparse_matrix_csc)