In [1]:
import numpy as np 
import pandas as pd

In [2]:
def reduce_memory(df):
    for col in df.columns:
        if df[col].dtype == 'float64':
            df[col] = df[col].astype('float32')
        if df[col].dtype == 'int64':
            df[col] = df[col].astype('int32')
    return df

In [3]:
games = reduce_memory(pd.read_csv('./games.csv',usecols=["app_id","title"]))
recommendations = reduce_memory(pd.read_csv('recommendations.csv',usecols = ["app_id","hours","user_id"]))
users = reduce_memory(pd.read_csv('users.csv'))

In [4]:
print(users.shape)
print(list(users.columns.values))
print(games.shape)
print(list(games.columns.values))
print(recommendations.shape)
print(list(recommendations.columns.values))

(14306064, 3)
['user_id', 'products', 'reviews']
(50872, 2)
['app_id', 'title']
(41154794, 3)
['app_id', 'hours', 'user_id']


In [5]:
recommend_with_users = recommendations.merge(games,on='app_id')

In [6]:
recommend_with_users

Unnamed: 0,app_id,hours,user_id,title
0,975370,36.299999,51580,Dwarf Fortress
1,304390,11.500000,2586,FOR HONOR™
2,1085660,336.500000,253880,Destiny 2
3,703080,27.400000,259432,Planet Zoo
4,526870,7.900000,23869,Satisfactory
...,...,...,...,...
41154789,633230,41.000000,1606890,NARUTO TO BORUTO: SHINOBI STRIKER
41154790,758870,8.000000,1786254,Kynseed
41154791,696170,2.000000,6370324,SENRAN KAGURA Peach Beach Splash
41154792,696170,4.000000,1044289,SENRAN KAGURA Peach Beach Splash


In [7]:
no_of_ppl_played_df = recommend_with_users.groupby('app_id').count()['user_id'].reset_index()
no_of_ppl_played_df.rename(columns={'user_id':'no_of_ppl'},inplace=True)
no_of_ppl_played_df

Unnamed: 0,app_id,no_of_ppl
0,10,41043
1,20,4284
2,30,4432
3,40,1610
4,50,9721
...,...,...
37605,2245890,8
37606,2246290,5
37607,2248870,1
37608,2251240,3


In [8]:
no_of_ppl_played_df['no_of_ppl'].mean()

np.float64(1094.2513693166711)

In [9]:
no_of_ppl_played_df.shape

(37610, 2)

In [10]:
print(no_of_ppl_played_df['no_of_ppl'].min())
print(no_of_ppl_played_df['no_of_ppl'].max())

1
319492


In [11]:
hours_played_df = recommend_with_users.groupby('app_id')["hours"].mean().reset_index()
hours_played_df.rename(columns={'hours':'mean_hours'},inplace=True)
hours_played_df.head(5)

Unnamed: 0,app_id,mean_hours
0,10,245.776794
1,20,35.180767
2,30,78.345192
3,40,29.885468
4,50,18.470209


In [12]:
from sklearn.metrics.pairwise import cosine_similarity
import dask.dataframe as dd
from dask_ml.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix, coo_matrix

In [13]:
ddf = dd.from_pandas(pd.DataFrame(recommend_with_users), npartitions=4)

In [14]:
ddf

Unnamed: 0_level_0,app_id,hours,user_id,title
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,int32,float32,int32,string
10288699,...,...,...,...
20577398,...,...,...,...
30866096,...,...,...,...
41154793,...,...,...,...


# Incremental Recommendation Approach
---

In [15]:
import dask
# Normalize hours within each chunk and compute similarity matrix
def compute_similarity(chunk):
    min_hours = chunk['hours'].min()
    max_hours = chunk['hours'].max()
    chunk['normalized_hours'] = (chunk['hours'] - min_hours) / (max_hours - min_hours)
    
    rows = chunk['app_id'].values
    cols = chunk['user_id'].values
    data = chunk['normalized_hours'].values
    
    # Create a sparse matrix with shape based on max app_id and user_id values in the chunk
    sparse_matrix = coo_matrix((data, (rows, cols)))
    
    
    # # Create game-user interaction matrix
    # user_game_matrix = chunk.pivot(index='app_id', columns='user_id', values='normalized_hours').fillna(0)
    # sparse_matrix = coo_matrix(user_game_matrix.values)
    
    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(sparse_matrix)
    return similarity_matrix, user_game_matrix.index.tolist()


In [16]:
# Compute similarity matrices for each partition
delayed_results = [dask.delayed(compute_similarity)(chunk) for chunk in ddf.to_delayed()]

In [17]:
delayed_results

[Delayed('compute_similarity-bcecba08-0e25-42ee-b667-f8e7f12292f6'),
 Delayed('compute_similarity-493bf89e-2988-4b5b-a13e-c7c00e924350'),
 Delayed('compute_similarity-f49d09e9-1af3-44b8-bbb4-c20636b17ff1'),
 Delayed('compute_similarity-6c8f0e79-c4a9-47cd-9e86-0a79e01b5ecd')]

In [18]:
# Combine results from each partition
def aggregate_similarity(results):
    combined_similarity = None
    combined_game_ids = []

    for similarity_matrix, game_ids in results:
        combined_game_ids.extend(game_ids)
        if combined_similarity is None:
            combined_similarity = similarity_matrix
        else:
            # Expand combined_similarity to fit the new game similarities
            new_combined = np.zeros((combined_similarity.shape[0] + similarity_matrix.shape[0],
                                     combined_similarity.shape[1] + similarity_matrix.shape[1]))
            new_combined[:combined_similarity.shape[0], :combined_similarity.shape[1]] = combined_similarity
            new_combined[combined_similarity.shape[0]:, combined_similarity.shape[1]:] = similarity_matrix
            combined_similarity = new_combined

    return combined_similarity, combined_game_ids


In [19]:
# Compute aggregated similarity matrix
results = dask.compute(*delayed_results)
combined_similarity_matrix, combined_game_ids = aggregate_similarity(results)

MemoryError: Unable to allocate 17.8 TiB for an array with shape (2208921, 2208921) and data type float32

In [20]:
# Step 1: Compute min and max hours to normalize
min_hours = df['hours'].min().compute()
max_hours = df['hours'].max().compute()

# Step 2: Normalize hours using the min-max scaling formula
df['normalized_hours'] = (df['hours'] - min_hours) / (max_hours - min_hours)

In [22]:
df.head(5)

Unnamed: 0,app_id,hours,user_id,title,normalized_hours
0,975370,36.299999,51580,Dwarf Fortress,0.0363
1,304390,11.5,2586,FOR HONOR™,0.0115
2,1085660,336.5,253880,Destiny 2,0.3365
3,703080,27.4,259432,Planet Zoo,0.0274
4,526870,7.9,23869,Satisfactory,0.0079


In [25]:
# Step 2: Create a game-user interaction matrix
rows = df['app_id'].to_dask_array(lengths=True)
cols = df['user_id'].to_dask_array(lengths=True)
data = df['normalized_hours'].to_dask_array(lengths=True)

In [28]:
# Create a sparse matrix in COO format
game_user_sparse = coo_matrix((data, (rows, cols)))

In [29]:
# Step 3: Convert the COO matrix to CSR format for efficient arithmetic and matrix-vector operations
game_user_sparse = game_user_sparse.tocsr()

In [30]:
# Step 4: Calculate cosine similarity between games based on user hours played
game_similarity = cosine_similarity(game_user_sparse)

MemoryError: Unable to allocate 18.5 TiB for an array with shape (2253291, 2253291) and data type float32

In [20]:
scaler = MinMaxScaler()
recommend_with_users['normalized_hours'] = scaler.fit_transform(recommend_with_users[['hours']])
recommend_with_users.head(5)

Unnamed: 0,app_id,hours,user_id,title,normalized_hours
0,975370,36.3,51580,Dwarf Fortress,0.0363
1,304390,11.5,2586,FOR HONOR™,0.0115
2,1085660,336.5,253880,Destiny 2,0.3365
3,703080,27.4,259432,Planet Zoo,0.0274
4,526870,7.9,23869,Satisfactory,0.0079


In [49]:
# Step 1: Randomly sample a fraction of users or games
# You can sample either users or games based on your use case.
sample_size = 100000  # Number of samples you want to take
sampled_df = recommend_with_users.sample(n=sample_size, random_state=42)
sampled_df.shape

(100000, 5)

In [50]:
cols = sampled_df['user_id'].values
rows = sampled_df['app_id'].values
data = sampled_df['normalized_hours'].values

In [52]:
# Create a sparse matrix in COO format
user_game_sparse = coo_matrix((data, (rows, cols)))




In [53]:
# Step 3: Convert COO matrix to CSR format for efficient arithmetic and matrix-vector operations
user_game_sparse = user_game_sparse.tocsr()

In [54]:
# Step 4: Calculate cosine similarity between users based on normalized hours played
user_similarity = cosine_similarity(user_game_sparse)

MemoryError: Unable to allocate 36.1 TiB for an array with shape (2228891, 2228891) and data type float64

In [None]:
# Create a DataFrame for easier lookup
user_similarity_df = pd.DataFrame(user_similarity, index=df['user_id'].unique(), columns=df['user_id'].unique())