#  1. Recommendation - Loading

In [1]:
import os
import zipfile
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from scipy.sparse import csc_matrix
from scipy.sparse import vstack
import pickle
from pathlib import Path

In [2]:
### All random states are assigned to 42 for reproducible results
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


In [3]:
pd.set_option('display.float_format', lambda x: '%.1f' % x)

## Loading the Recommendation dataset

In [4]:
K_REC = 20
N_USERS = 10000

In [5]:
%run "0.Splitting_Reading_Recommendation_File.ipynb"

In [6]:
recommendations_df

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,236430,0,0,2014-06-19,True,82.9,434717,16300000
1,976730,0,0,2021-10-24,True,14.6,12122913,16300001
2,413150,0,0,2021-01-12,True,103.6,6920371,16300002
3,1551360,0,0,2022-09-04,True,748.5,10888033,16300003
4,250320,0,0,2022-04-25,True,9.6,13391688,16300004
...,...,...,...,...,...,...,...,...
41154789,740130,0,0,2021-09-18,True,93.6,12470978,9699994
41154790,740130,0,0,2021-10-01,True,166.0,799025,9699995
41154791,740130,0,0,2021-09-14,True,21.3,6697072,9699996
41154792,740130,0,0,2021-09-14,False,41.9,220653,9699997


In [7]:
def get_df_pos_recs(df):
    """keep only positive recommendations, eliminate negative recommendations"""
    df = df[df["is_recommended"]==True]
    return df

recommendations_df = get_df_pos_recs(recommendations_df)

In [8]:
print("################  Recommendations (all data, positive recommendations only) ################")
print("Users:", '{0:,.0f}'.format(len(recommendations_df.user_id.unique())))
print("Games:", '{0:,.0f}'.format(len(recommendations_df.app_id.unique())))

################  Recommendations (all data, positive recommendations only) ################
Users: 12,636,209
Games: 37,419


In [9]:
def users_at_least_k_recs(df, K, n_usr):
    """
    Return a list of users who perform at least K recommendations. Also, filtering for n_users randomly selected
    Args::
        df: dataframe, recommendation dataset
        K: integer, representing the minimum number of recommendations for a user to be included in the final list
        n_usr: integer, number of randomly selected users
    Return:
        users: a list of randomly selected users who perform at least K recommendations
    """
    # df = df[df["is_recommended"]==True]
    series = df.groupby("user_id")["user_id"].count()>=K
    series1 = series[series]
    users = list(series1.index)
    usr_to_keep = np.random.choice(users, size = n_usr)
    return usr_to_keep

users_to_keep = users_at_least_k_recs(recommendations_df, K_REC, N_USERS)
print("Number of users",  '{0:,.0f}'.format(len(users_to_keep)))
print("First five users id", users_to_keep[:5])


Number of users 10,000
First five users id [11203022 12827342 11719330  9553563 11051045]


In [10]:
def loading_recommendations(df, sample=True):
    
    if sample==True:
        df = df[df["user_id"].isin(users_to_keep)]
                
    df1 = df.copy()
    # df1 = df1[df1["is_recommended"]==True]
    df1["is_recommended"] = df1["is_recommended"].apply(lambda x: 1 if x == True else 0)
    df1['user_id_categorical'] = pd.Categorical(df1['user_id']).codes
    ## App_ID starts from 1. I leave the 0 to the user_id column. See later
    df1['app_id_categorical'] = pd.Categorical(df1['app_id']).codes
    df1 = df1.sort_values(by="user_id_categorical")
    df1["date"] = pd.to_datetime(df1["date"])
    return df1

In [11]:
recommendations = loading_recommendations(recommendations_df)
recommendations

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id,user_id_categorical,app_id_categorical
27008251,292030,0,0,2019-06-24,1,309.5,2881,36353456,0,2488
22390813,319630,0,0,2021-02-22,1,176.1,2881,20736018,0,3067
3614243,38410,0,0,2022-05-19,1,139.3,2881,35414243,0,583
31371044,244450,0,0,2021-03-19,1,248.2,2881,25716249,0,1577
37593695,1461680,4,0,2021-08-24,1,45.0,2881,39738900,0,17120
...,...,...,...,...,...,...,...,...,...,...
32250651,674940,0,0,2020-06-29,1,26.7,14304542,21395856,9687,9160
5642530,356570,0,0,2020-06-29,1,10.6,14304542,30242530,9687,3918
29383695,588430,0,0,2021-11-16,1,1.9,14304542,22328900,9687,7989
4915235,360430,2,0,2022-02-04,1,39.9,14304542,31315235,9687,4011


In [12]:
print("################  Recommendations (Sample) ################")
print("Minimum number of recommendations to be included in the sample:", K_REC)
print("Number of randomly selected users:", N_USERS)
print("Users:", '{0:,.0f}'.format(len(recommendations.user_id.unique())))
print("Games:", '{0:,.0f}'.format(len(recommendations.app_id.unique())))

################  Recommendations (Sample) ################
Minimum number of recommendations to included in the sample: 20
Number of randomly selected users: 10000
Users: 9,688
Games: 20,615


In [13]:
recommendations.shape

(365754, 10)

## Building the User-Game Matrix

In [14]:
num_users = len(recommendations["user_id_categorical"].unique())
num_games = len(recommendations["app_id_categorical"].unique())

In [15]:
recommendations[["user_id_categorical","app_id_categorical"]].drop_duplicates()

Unnamed: 0,user_id_categorical,app_id_categorical
27008251,0,2488
22390813,0,3067
3614243,0,583
31371044,0,1577
37593695,0,17120
...,...,...
32250651,9687,9160
5642530,9687,3918
29383695,9687,7989
4915235,9687,4011


In [16]:
def build_user_game_matrix(num_usr, num_gms):
    mat = sp.sparse.dok_matrix((num_usr+1, num_gms+1), dtype='int8')
    for user_game in recommendations[["user_id_categorical","app_id_categorical"]].values:
        mat[user_game[0], user_game[1]] = 1
    return mat

user_game_matrix = build_user_game_matrix(num_users, num_games)
user_game_matrix

<9689x20616 sparse matrix of type '<class 'numpy.int8'>'
	with 365753 stored elements in Dictionary Of Keys format>

In [17]:
# user_game_matrix[0, :].keys()

In [18]:
print("################ User-Game Matrix ################")
print("Number of Rows:", '{0:,.0f}'.format(user_game_matrix.shape[0]))
print("Numbeer of Columns:", '{0:,.0f}'.format(user_game_matrix.shape[1]))
print("Number of stored values:", '{0:,.0f}'.format(user_game_matrix.size))


################ User-Game Matrix ################
Number of Rows: 9,689
Numbeer of Columns: 20,616
Number of stored values: 365,753


## Save Users and Games Mapping

In [22]:
games_idx = recommendations[["app_id","app_id_categorical"]].drop_duplicates()
games_idx.to_csv("matrix/games_idx.csv", index=False)