# 9. Collaborative Filtering - k-NN

In [1]:
%run "1.Recommendation_Loading.ipynb"

  chunk_df = pd.read_csv(file_path, header=None, skiprows=skiprow)


In [2]:
import scipy as sp

In [3]:
recommendations["date"] = pd.to_datetime(recommendations["date"])

## Data Preparation

### Filtering For Users and Games

In [4]:
def users_at_least_k_recs(df, K, n_users):
    """
    Return a list of users who perform at least K recommendations. Also, filtering for n_users randomòy selected
    Args::
        df: dataframe, recommendation dataset
        K: integer, representing the minimum number of recommendations for a user to be included in the final list
        n_users: integer, number of randomly selected users
    Return:
        users: a list of randomly selected users who perform at least K recommendations
    """
    series = df.groupby("user_id_categorical")["user_id_categorical"].count()>=K
    series1 = series[series]
    users = list(series1.index)
    return np.random.choice(users, size = n_users)

users_to_keep = users_at_least_k_recs(recommendations, 20, 5000)
print("Number of users",  '{0:,.0f}'.format(len(users_to_keep)))
print("First five users id", users_to_keep[:5])


Number of users 5,000
First five users id [ 8425725  5039263 11252314 10079175   718140]


### Loading the matrix and Apply Filtering

In [5]:
# LOADING THE MATRIX
# Due to the large memory requirements, the user-game matrix has been created in the notebook "User-Games Matrix - Building"
import gzip
import pickle

with open("matrix/user_game_matrix.pkl", 'rb') as file:
    user_game_matrix = pickle.load(file)

# Convert this array/matrix to Dictionary Of Keys format
# user_game_matrix = user_game_matrix.todok()
print("Number of Rows:", '{0:,.0f}'.format(user_game_matrix.shape[0]))
print("Numbeer of Columns:", '{0:,.0f}'.format(user_game_matrix.shape[1]))
print("Number of stored values:", '{0:,.0f}'.format(user_game_matrix.size))


  user_game_matrix = pickle.load(file)


Number of Rows: 12,663,134
Numbeer of Columns: 37,420
Number of stored values: 47,967,516


In [6]:
# FILTERING the MATRIX for USERS to KEEP
# Matrix with only the subset of users who did at least k recs
mask = np.isin(np.array(user_game_matrix[:,0].todense()).reshape(-1), users_to_keep)
user_game_matrix_k_rec = user_game_matrix[mask]
print("Number of Rows:", '{0:,.0f}'.format(user_game_matrix_k_rec.shape[0]))
print("Numbeer of Columns:", '{0:,.0f}'.format(user_game_matrix_k_rec.shape[1]))
print("Number of stored values:", '{0:,.0f}'.format(user_game_matrix_k_rec.size))
user_game_matrix_k_rec

Number of Rows: 4,927
Numbeer of Columns: 37,420
Number of stored values: 20,124


<4927x37420 sparse matrix of type '<class 'numpy.int32'>'
	with 20124 stored elements in Compressed Sparse Column format>

In [7]:
# INSERTING a GAME ID ROW to the user-game matrix
games_id = np.arange(0, user_game_matrix_k_rec.shape[1]).reshape(1,-1)
games_id = sp.sparse.csc_matrix(games_id)
user_game_matrix_k_rec = sp.sparse.vstack((games_id, user_game_matrix_k_rec))
user_game_matrix_k_rec.todense()


matrix([[       0,        1,        2, ...,    37417,    37418,    37419],
        [    1358,        0,        0, ...,        0,        0,        0],
        [    3400,        0,        0, ...,        0,        0,        0],
        ...,
        [12629139,        0,        0, ...,        0,        0,        0],
        [12633117,        0,        0, ...,        0,        0,        0],
        [12643129,        0,        0, ...,        0,        0,        0]])

In [8]:
# dropping games with no recommendations
mask = list(np.array(np.sum(user_game_matrix_k_rec[1:,1:], axis=0)>=1).reshape(-1))
mask.insert(0,True)
user_game_matrix_k_rec = user_game_matrix_k_rec.T[mask].T
print("Number of Rows:", '{0:,.0f}'.format(user_game_matrix_k_rec.shape[0]))
print("Numbeer of Columns:", '{0:,.0f}'.format(user_game_matrix_k_rec.shape[1]))
print("Number of stored values:", '{0:,.0f}'.format(user_game_matrix_k_rec.size))
user_game_matrix_k_rec.todense()

Number of Rows: 4,928
Numbeer of Columns: 3,884
Number of stored values: 24,007


matrix([[       0,        1,        3, ...,    37127,    37324,    37365],
        [    1358,        0,        0, ...,        0,        0,        0],
        [    3400,        0,        0, ...,        0,        0,        0],
        ...,
        [12629139,        0,        0, ...,        0,        0,        0],
        [12633117,        0,        0, ...,        0,        0,        0],
        [12643129,        0,        0, ...,        0,        0,        0]])

In [9]:
# GAMES MAPPING 
# keys: integer, starting from 1 which are the columns of the filtered matrix
# values: original app_id_categorical
all_games = np.array(user_game_matrix_k_rec[0,1:].todense()).reshape(-1)
games_mapping = {k:i for k,i in enumerate(all_games)}
swapped_games_dict = {v: k for k, v in games_mapping.items()}
# swapped_games_dict[3481]

In [10]:
# USERS mapping
users_to_keep = np.array(user_game_matrix_k_rec[1:,0].todense()).reshape(-1)
users_mapping = {k:i for k,i in enumerate(users_to_keep)}
swapped_users_dict = {v: k for k, v in users_mapping.items()}


In [11]:
def build_recommendations_sample(df):
    # recommendations_sample = df[(df["user_id_categorical"].isin(users_to_keep)) & (df["app_id_categorical"].isin(games_to_keep))]
    recommendations_sample = df[(df["user_id_categorical"].isin(users_to_keep))]
    return recommendations_sample

recommendations_sample = build_recommendations_sample(recommendations)
# recommendations_sample[recommendations_sample["user_id_categorical"]==731702]


In [12]:
from sklearn.metrics.pairwise import cosine_similarity
similarities = cosine_similarity(user_game_matrix_k_rec)

In [19]:
from sklearn.neighbors import NearestNeighbors

# Create an instance of the NearestNeighbors model
knn_model = NearestNeighbors(n_neighbors=10, metric='cosine')

# Fit the model with the cosine similarity matrix
knn_model.fit(similarities)

# Example: Query the nearest neighbors for a specific user
user_index = 0  # Index of the user in the similarity matrix
query_user = similarities[user_index].reshape(1, -1)  # Reshape to match the input format

# Find the k nearest neighbors for the query user
distances, neighbor_indices = knn_model.kneighbors(query_user)

# Print the indices of the nearest neighbors and their corresponding distances
print("Nearest neighbors indices:", neighbor_indices)
print("Distances:", distances)

Nearest neighbors indices: [[  0   1   3   2   4   8  11  16 101 501]]
Distances: [[0.         0.99999729 0.99999774 0.99999791 0.99999796 0.99999796
  0.99999797 0.99999797 0.99999798 0.99999799]]


In [20]:
# Create an empty table to store the nearest neighbors' indices and distances for all users
neighbor_table = []

# Iterate over each user in the similarity matrix
for user_index in range(similarities.shape[0]):
    # Query the nearest neighbors for the current user
    query_user = similarities[user_index].reshape(1, -1)  # Reshape to match the input format
    distances, neighbor_indices = knn_model.kneighbors(query_user)
    
    # Append the user's nearest neighbors' indices and distances to the table
    neighbor_table.append({"User": user_index, "Nearest Neighbors": neighbor_indices[0], "Distances": distances[0]})

# Convert the table to a pandas DataFrame for easier visualization
import pandas as pd
neighbor_table_df = pd.DataFrame(neighbor_table)

# Print the table
print(neighbor_table_df)

      User                                  Nearest Neighbors  \
0        0               [0, 1, 3, 2, 4, 8, 11, 16, 101, 501]   
1        1            [1, 3, 2, 4, 12, 630, 16, 53, 3813, 67]   
2        2  [1936, 313, 309, 2512, 307, 634, 316, 1327, 16...   
3        3          [84, 4, 12, 3, 16, 630, 74, 38, 384, 115]   
4        4  [3224, 3221, 3231, 3230, 3229, 3226, 3225, 323...   
...    ...                                                ...   
4923  4923  [257, 542, 3972, 221, 1957, 1357, 241, 243, 60...   
4924  4924  [721, 719, 725, 724, 723, 722, 727, 720, 715, ...   
4925  4925  [483, 484, 479, 480, 482, 476, 493, 492, 469, ...   
4926  4926  [696, 183, 159, 749, 730, 720, 173, 795, 634, ...   
4927  4927  [313, 308, 1389, 330, 1434, 1445, 1450, 316, 1...   

                                              Distances  
0     [0.0, 0.9999972938756093, 0.9999977433281201, ...  
1     [4.440892098500626e-15, 1.1213252548714081e-13...  
2     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0