# User-Game Matrix - Building

In [None]:
%run "Recommendation - Loading.ipynb"

In [None]:
def mapping(df, cols, sort, path, file_name, file_type):
    """
    Extract the distinct rows of `cols` (user or game identifiers, depending on
    `cols`), order them by `sort` and store them as `file_name` inside `path`.

    `file_type` selects the format: 'csv' (written without the index) or
    'pickle' (zip-compressed). Any other value writes nothing.
    The ordered mapping is returned in every case.
    """
    ordered_pairs = df[cols].drop_duplicates().sort_values(by=sort)

    # One writer per supported format; each receives the destination path
    writers = {
        'csv': lambda target: ordered_pairs.to_csv(target, index=False),
        'pickle': lambda target: ordered_pairs.to_pickle(target, compression="zip"),
    }
    write = writers.get(file_type)
    if write is not None:
        write(os.path.join(path, file_name))
    return ordered_pairs


In [None]:
# Game mapping (app_id <-> app_id_categorical), stored as CSV in the "matrix" folder
games_idx = mapping(
    df=recommendations,
    cols=["app_id", "app_id_categorical"],
    sort="app_id_categorical",
    path="matrix",
    file_name="games_idx.csv",
    file_type="csv",
)
games_idx.head()

Unnamed: 0,app_id,app_id_categorical
21319078,10,1
31910508,20,2
11845672,30,3
31147398,40,4
35331086,50,5


In [None]:
# User mapping (user_id <-> user_id_categorical), stored as a zip-compressed pickle
users_idx = mapping(
    df=recommendations,
    cols=["user_id", "user_id_categorical"],
    sort="user_id_categorical",
    path="matrix",
    file_name="users_idx.pkl",
    file_type="pickle",
)
users_idx.head()

Unnamed: 0,user_id,user_id_categorical
30478477,0,0
4715883,2,1
14376678,3,2
26659877,4,3
11296061,5,4


In [None]:
# All games in columns: a 0-row frame whose column labels are the sorted game ids.
# It is used as a template so that every chunk of users gets the same set of columns.
apps_id = sorted(recommendations["app_id_categorical"].unique())
apps_id_df = pd.DataFrame(index = apps_id).T
# Adding the 0 column which represents the user_id
# (np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0)
apps_id_df[0] = np.nan
apps_id_df = apps_id_df.reindex(sorted(apps_id_df.columns), axis=1)
apps_id_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37410,37411,37412,37413,37414,37415,37416,37417,37418,37419


In [None]:
# List of games: how many there are, then the first few ids.
# apps_id starts from 1 because label 0 is reserved for the user_id column.
print(len(apps_id))
apps_id[:5]

37419


[1, 2, 3, 4, 5]

In [None]:
# Distinct users in order of appearance (expected to be already sorted - see the sample below)
users_list = recommendations["user_id_categorical"].unique()
print("Number of unique users", f"{len(users_list):,.0f}")
users_list[525130: 525145]

Number of unique users 12,663,134


array([525130, 525131, 525132, 525133, 525134, 525135, 525136, 525137,
       525138, 525139, 525140, 525141, 525142, 525143, 525144])

In [None]:
def get_user_game_sparse_matrix(df, start_index, end_index):
    """
    Build the sparse user-game matrix for one contiguous range of users.

    Args::
        df::dataframe, like recommendations; the columns user_id_categorical,
            app_id_categorical and is_recommended are read
        start_index:: integer, first user_id_categorical of the slice (included)
        end_index:: integer, upper bound of the slice (excluded)
    Returns::
        CSC sparse matrix with one row per user found in the slice. Column 0 carries
        the user_id_categorical, every other column is an app_id_categorical.

    Steps: keep only the rows of the requested users, pivot them into a dense
    users x games table of is_recommended values (pairs without a review become 0),
    align the columns with the notebook-level apps_id_df so that every slice has the
    same columns, and convert the result to a sparse matrix.
    Nothing is written to disk here. Also reads the global NUMBER_OF_UNIQUE_USERS
    for the progress message.
    """
    # Half-open range of users: start_index <= user < end_index
    in_slice = df["user_id_categorical"].between(start_index, end_index, inclusive="left")
    users_slice = df[in_slice]
    n_users = users_slice["user_id_categorical"].nunique()
    print("The length of dataframe is", f"{len(users_slice):,.0f}")
    print("The users in this dataframe are", f"{n_users:,.0f}",
          "out of", f"{NUMBER_OF_UNIQUE_USERS:,.0f}")

    # Dense users x games table; user/game pairs without a review are filled with 0
    dense = pd.pivot_table(users_slice, values='is_recommended',
                           index='user_id_categorical', columns='app_id_categorical')
    dense = dense.fillna(0).astype("int32")

    # Expose the user id as column 0 so that all the column labels are sortable integers
    dense = dense.reset_index().rename(columns={'user_id_categorical': 0})
    dense = dense.reindex(columns=sorted(dense.columns))

    # Stacking below the apps_id_df template adds the games that are missing from this slice
    full_width = pd.concat([apps_id_df, dense], ignore_index=False, axis=0)
    full_width = full_width.fillna(0).astype("int32")

    # Column-compressed sparse representation of the dense table
    return csc_matrix(full_width)

In [None]:
# users_games_matrix_csc = get_user_game_sparse_matrix(df=recommendations, start_index=0, end_index=10000)
# users_games_matrix_csc

In [None]:
def save_matrices(df, df_subset_size):
    """
    Args:
        df::dataframe, like recommendations, approximately 41 milion of records
        df_subset_size:: integer, number of consecutive user ids handled per slice

    Splits the users into consecutive id ranges of df_subset_size ids, builds the
    sparse user-game matrix of each range with get_user_game_sparse_matrix and pickles
    it to assets/sparse_matrix_<start>-<stop>.pkl (the assets folder must already exist).
    Nothing is written when df has no rows.
    """
    if len(df) == 0:
        return None

    # The slices are ranges of user ids, so the loop is bounded by the highest user id.
    # Bounding it by len(df) (the number of reviews) produced a long tail of empty chunks.
    user_id_stop = int(df["user_id_categorical"].max()) + 1
    for start in range(0, user_id_stop, df_subset_size):
        stop = start + df_subset_size
        print(start, stop)
        matrix = get_user_game_sparse_matrix(df, start, stop)
        with open('assets/sparse_matrix_'+str(start)+'-'+str(stop)+'.pkl', 'wb') as file:
            pickle.dump(matrix, file)
    return None

In [None]:
# One pickle per slice of 10,000 user ids
save_matrices(df=recommendations, df_subset_size=10000)

0 10000
The length of dataframe is 27,220
The users in this dataframe are 10,000 out of 12,663,134
10000 20000
The length of dataframe is 27,693
The users in this dataframe are 10,000 out of 12,663,134
20000 30000
The length of dataframe is 26,100
The users in this dataframe are 10,000 out of 12,663,134
30000 40000
The length of dataframe is 25,868
The users in this dataframe are 10,000 out of 12,663,134
40000 50000
The length of dataframe is 26,917
The users in this dataframe are 10,000 out of 12,663,134
50000 60000
The length of dataframe is 26,638
The users in this dataframe are 10,000 out of 12,663,134
60000 70000
The length of dataframe is 26,334
The users in this dataframe are 10,000 out of 12,663,134
70000 80000
The length of dataframe is 24,737
The users in this dataframe are 10,000 out of 12,663,134
80000 90000
The length of dataframe is 26,959
The users in this dataframe are 10,000 out of 12,663,134
90000 100000
The length of dataframe is 25,703
The users in this dataframe ar

In [None]:
def read_files(path):
    """
    Read every pickled chunk stored in `path` and stack them into one sparse matrix.

    Args:
        path:: directory that contains the pickled sparse matrices and nothing else
    Returns:
        the chunks stacked vertically, in CSC format

    The row order of the result is the order in which the files are stacked.
    When every file is named sparse_matrix_<start>-<stop>.pkl (the names written by
    save_matrices) the files are ordered by their numeric <start>, so the result does
    not depend on timestamps, which change when files are copied or re-created.
    With any other naming the files are ordered by modification time.
    """
    import re  # local import: only needed to parse the chunk file names

    chunk_name = re.compile(r"sparse_matrix_(\d+)-(\d+)\.pkl")

    def _chunk_bounds(entry):
        # (start, stop) parsed from the file name, or None for a foreign name
        found = chunk_name.fullmatch(entry.name)
        return None if found is None else (int(found.group(1)), int(found.group(2)))

    entries = list(Path(path).iterdir())
    bounds = {entry: _chunk_bounds(entry) for entry in entries}
    if all(value is not None for value in bounds.values()):
        # Numeric order: a plain name sort would put 100000 before 20000
        paths = sorted(entries, key=bounds.get)
    else:
        paths = sorted(entries, key=os.path.getmtime)

    pickles = []
    for f in paths:
        # NOTE: unpickling can execute arbitrary code - only read files you created yourself
        with open(f, 'rb') as file:
            loaded_matrix = pickle.load(file)
        pickles.append(loaded_matrix)

    print("Number of files:", len(pickles))
    m = vstack(pickles, format="csc")
    return m

In [None]:
# Stack all the chunks of the "assets" folder into the complete user-game matrix
get_user_game_sparse_matrix_csc = read_files(path="assets")
get_user_game_sparse_matrix_csc

Number of files: 3532


<12663134x37420 sparse matrix of type '<class 'numpy.intc'>'
	with 47967516 stored elements in Compressed Sparse Column format>

In [None]:
# Expected shape: (number of users, number of games + 1 column for the user id)
get_user_game_sparse_matrix_csc.shape

(12663134, 37420)

In [None]:
# Data Check
def check_between_sparse_and_dataframe(USER_ID_TO_CHECK):
    """
    Spot-check one user by putting the matrix row next to the source reviews.

    Args:
        USER_ID_TO_CHECK:: integer, user_id_categorical, used as row position in
            the global get_user_game_sparse_matrix_csc
    Returns:
        tuple (indices, user_rec). indices is the np.where result (row positions,
        column positions) of the game columns equal to 1 in the user's row;
        user_rec holds the user's rows of the global recommendations frame,
        sorted by app_id_categorical.

    ok if the column positions match the app_id_categorical values of the
    reviews with is_recommended equal to 1
    """
    sample = get_user_game_sparse_matrix_csc[USER_ID_TO_CHECK,:].toarray()
    recommended = sample == 1
    # Column 0 stores the user id, not a game: without this mask user 1 would
    # report a spurious hit at column 0
    recommended[:, :1] = False
    indices = np.where(recommended)
    user_rec = recommendations[(recommendations.user_id_categorical == USER_ID_TO_CHECK)].sort_values(by="app_id_categorical")
    return (indices, user_rec)

In [None]:
# Spot check on a single user: the column positions must equal the app_id_categorical values
check_between_sparse_and_dataframe(USER_ID_TO_CHECK=9900010)

((array([0, 0], dtype=int64), array([1506, 2333], dtype=int64)),
           app_id  helpful  funny        date  is_recommended  hours   user_id  \
 16606518  222480        0      0  2013-05-29               1   68.2  11209562   
 19798307  265550       21      0  2014-10-31               1   44.5  11209562   
 
           review_id  user_id_categorical  app_id_categorical  
 16606518   16606518              9900010                1506  
 19798307   19798307              9900010                2333  )

In [None]:
def save_user_game_matrix(path, sparse):
    """
    Pickle the final matrix to <path>/user_game_matrix.pkl

    Args:
        path:: directory that receives the file (it must already exist)
        sparse:: object to serialise, e.g. the stacked sparse matrix
    """
    target = os.path.join(path, 'user_game_matrix.pkl')
    with open(target, 'wb') as handle:
        pickle.dump(sparse, handle)

In [None]:
# Persist the complete matrix next to the id mappings in the "matrix" folder
save_user_game_matrix(path="matrix", sparse=get_user_game_sparse_matrix_csc)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=78c133f5-defd-458d-ba8f-cbdc9ae58cfb' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>