# Loading Data and Overview

In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Load clean data
interactions = pd.read_csv('../data/clean_interactions.csv')
metadata = pd.read_csv('../data/game_metadata.csv')

print(f"Interactions shape: {interactions.shape}")
print(f"Expected: {interactions.shape} interactions with user_id, app_id, rating columns")


Interactions shape: (17398025, 3)
Expected: (17398025, 3) interactions with user_id, app_id, rating columns


# Create sparse item-user matrix

In [3]:
# Encode IDs to compress matrix indices (memory optimization)
user_encoder = LabelEncoder()
game_encoder = LabelEncoder()

users_encoded = user_encoder.fit_transform(interactions['user_id'])
games_encoded = game_encoder.fit_transform(interactions['app_id'])

print(f"Encoded users: {len(user_encoder.classes_):,}")
print(f"Encoded games: {len(game_encoder.classes_):,}")

# Create Item-User matrix (games as rows, users as columns)
item_user_matrix = coo_matrix(
    (interactions['rating'].values, (games_encoded, users_encoded)),
    shape=(len(game_encoder.classes_), len(user_encoder.classes_))
).tocsr()

# Verify memory efficiency
print(f"Matrix shape: {item_user_matrix.shape}")  # Should be ~20K x 1.6M
print(f"Memory usage: {item_user_matrix.data.nbytes / 1024**2:.1f} MB")  # Should be ~199MB
print(f"Sparsity: {(1 - item_user_matrix.nnz / np.prod(item_user_matrix.shape)) * 100:.2f}%")
print(f"Non-zero elements: {item_user_matrix.nnz:,}")

# Save encoders and matrix
import pickle
pickle.dump((item_user_matrix, game_encoder, user_encoder), 
           open('../data/sparse_matrix_data.pkl', 'wb'))

Encoded users: 1,579,456
Encoded games: 20,156
Matrix shape: (20156, 1579456)
Memory usage: 132.7 MB
Sparsity: 99.95%
Non-zero elements: 17,398,011
