# 6. Recommendations Sample

## Introduction

The purpose of this notebook is to reduce the size of the recommendations dataset that we can work on without incurring in memory issues

The filtering is done via **random sampling**

In [1]:
import pandas as pd
import numpy as np
import pickle
from scipy import sparse
import gzip


In [2]:
### All random states are assigned to 42 for reproducible results
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


In [3]:
%run "1.Recommendation_Loading.ipynb"

## Filtering For Users and Games

In [4]:
def users_at_least_k_recs(df, K, n_users):
    """
    Return a list of users who perform at least K recommendations. Also, filtering for n_users randomly selected
    Args::
        df: dataframe, recommendation dataset
        K: integer, representing the minimum number of recommendations for a user to be included in the final list
        n_users: integer, number of randomly selected users
    Return:
        users: a list of randomly selected users who perform at least K recommendations
    """
    series = df.groupby("user_id_categorical")["user_id_categorical"].count()>=K
    series1 = series[series]
    users = list(series1.index)
    return np.random.choice(users, size = n_users)

users_to_keep = users_at_least_k_recs(recommendations, 20, 10000)
print("Number of users",  '{0:,.0f}'.format(len(users_to_keep)))
print("First five users id", users_to_keep[:5])


Number of users 10,000
First five users id [ 9880850 11308871 10339131  8414394  9734353]


## Loading the User-Game matrix and Apply Filtering

In [5]:
def decompress_pickle(input_file):
    with gzip.open(input_file, 'rb') as f:
        data = pickle.load(f)
    return data

user_game_matrix = decompress_pickle('matrix/user_game_matrix.pkl.gz')
user_game_matrix


  data = pickle.load(f)


<12663134x37420 sparse matrix of type '<class 'numpy.int32'>'
	with 47967516 stored elements in Compressed Sparse Row format>

In [6]:
# Convert this array/matrix to Dictionary Of Keys format
# user_game_matrix = user_game_matrix.todok()
print("Number of Rows:", '{0:,.0f}'.format(user_game_matrix.shape[0]))
print("Numbeer of Columns:", '{0:,.0f}'.format(user_game_matrix.shape[1]))
print("Number of stored values:", '{0:,.0f}'.format(user_game_matrix.size))


Number of Rows: 12,663,134
Numbeer of Columns: 37,420
Number of stored values: 47,967,516


In [7]:
# FILTERING the MATRIX for USERS to KEEP
# Matrix with only the subset of users who did at least k recs
mask = np.isin(np.array(user_game_matrix[:,0].todense()).reshape(-1), users_to_keep)
user_game_matrix_k_rec = user_game_matrix[mask]
print("Number of Rows:", '{0:,.0f}'.format(user_game_matrix_k_rec.shape[0]))
print("Numbeer of Columns:", '{0:,.0f}'.format(user_game_matrix_k_rec.shape[1]))
print("Number of stored values:", '{0:,.0f}'.format(user_game_matrix_k_rec.size))
user_game_matrix_k_rec

Number of Rows: 9,690
Numbeer of Columns: 37,420
Number of stored values: 378,793


<9690x37420 sparse matrix of type '<class 'numpy.int32'>'
	with 378793 stored elements in Compressed Sparse Row format>

In [8]:
# INSERTING a GAME ID ROW to the user-game matrix
games_id = np.arange(0, user_game_matrix_k_rec.shape[1]).reshape(1,-1)
games_id = sparse.csc_matrix(games_id)
user_game_matrix_k_rec = sparse.vstack((games_id, user_game_matrix_k_rec))
user_game_matrix_k_rec = user_game_matrix_k_rec.tocsc()
user_game_matrix_k_rec.todense()


matrix([[       0,        1,        2, ...,    37417,    37418,    37419],
        [    2595,        0,        0, ...,        0,        0,        0],
        [    3376,        0,        0, ...,        0,        0,        0],
        ...,
        [12660465,        0,        0, ...,        0,        0,        0],
        [12661759,        0,        0, ...,        0,        0,        0],
        [12662371,        0,        0, ...,        0,        0,        0]],
       dtype=int32)

In [9]:
# dropping games with no recommendations
mask = list(np.array(np.sum(user_game_matrix_k_rec[1:,1:], axis=0)>=1).reshape(-1))
mask.insert(0,True)
user_game_matrix_k_rec = user_game_matrix_k_rec.T[mask].T
print("Number of Rows:", '{0:,.0f}'.format(user_game_matrix_k_rec.shape[0]))
print("Numbeer of Columns:", '{0:,.0f}'.format(user_game_matrix_k_rec.shape[1]))
print("Number of stored values:", '{0:,.0f}'.format(user_game_matrix_k_rec.size))
user_game_matrix_k_rec.todense()

Number of Rows: 9,691
Numbeer of Columns: 21,488
Number of stored values: 400,280


matrix([[       0,        1,        2, ...,    37399,    37401,    37414],
        [    2595,        0,        0, ...,        0,        0,        0],
        [    3376,        0,        0, ...,        0,        0,        0],
        ...,
        [12660465,        0,        0, ...,        0,        0,        0],
        [12661759,        0,        0, ...,        0,        0,        0],
        [12662371,        0,        0, ...,        0,        0,        0]],
       dtype=int32)

In [10]:
# GAMES MAPPING 
# keys: integer, starting from 1 which are the columns of the filtered matrix
# values: original app_id_categorical
all_games = np.array(user_game_matrix_k_rec[0,1:].todense()).reshape(-1)
games_mapping = {k:i for k,i in enumerate(all_games)}
swapped_games_dict = {v: k for k, v in games_mapping.items()}
# swapped_games_dict[3481]

In [11]:
# USERS mapping
users_to_keep = np.array(user_game_matrix_k_rec[1:,0].todense()).reshape(-1)
users_mapping = {k:i for k,i in enumerate(users_to_keep)}
swapped_users_dict = {v: k for k, v in users_mapping.items()}


In [12]:
def build_recommendations_sample(df):
    """
    Filtering the recommendations dataset for the random sampled users 
    """
    recommendations_sample = df[(df["user_id_categorical"].isin(users_to_keep))]
    return recommendations_sample

recommendations_sample = build_recommendations_sample(recommendations)
recommendations_sample.head()


Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id,user_id_categorical,app_id_categorical
26924891,20920,2,0,2019-11-26,1,102.2,2881,26924891,2595,455
30777575,233130,4,0,2021-08-24,1,32.0,2881,30777575,2595,1650
24970114,445980,0,0,2022-05-19,1,42.5,2881,24970114,2595,7544
30211735,1237980,0,3,2022-05-19,1,31.8,2881,30211735,2595,25073
37812169,33120,0,0,2021-04-05,1,103.5,2881,37812169,2595,622
