In [1]:
import numpy as np 
import pandas as pd

In [2]:
def reduce_memory(df):
    """Downcast 64-bit numeric columns of ``df`` to 32-bit to save memory.

    ``float64`` columns become ``float32``. ``int64`` columns become ``int32``
    only when every value fits in the int32 range; otherwise the column is
    left as ``int64``, because a plain ``astype('int32')`` would silently wrap
    out-of-range values (e.g. large ids) into unrelated numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call chaining.
    """
    int32_limits = np.iinfo(np.int32)
    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype == 'float64':
            df[col] = df[col].astype('float32')
        elif col_dtype == 'int64':
            values = df[col]
            # An empty column has no min/max to check and is trivially safe.
            fits_int32 = values.empty or (
                values.min() >= int32_limits.min
                and values.max() <= int32_limits.max
            )
            if fits_int32:
                df[col] = values.astype('int32')
    return df

In [3]:
# Read only the columns this notebook uses, then downcast numeric dtypes.
games = pd.read_csv('./games.csv', usecols=["app_id", "title"])
games = reduce_memory(games)

recommendations = pd.read_csv(
    'recommendations.csv',
    usecols=["app_id", "hours", "user_id"],
)
recommendations = reduce_memory(recommendations)
# users = reduce_memory(pd.read_csv('users.csv'))

In [4]:
# Attach each game's title to its recommendation rows (inner join on app_id).
recommend_with_users = pd.merge(recommendations, games, on='app_id')

In [5]:
# Per game: number of rows with a non-null user_id. A user who appears on
# several rows for the same game is counted once per row.
no_of_ppl_played_df = (
    recommend_with_users
    .groupby('app_id')['user_id']
    .count()
    .reset_index(name='no_of_ppl')
)
no_of_ppl_played_df

Unnamed: 0,app_id,no_of_ppl
0,10,41043
1,20,4284
2,30,4432
3,40,1610
4,50,9721
...,...,...
37605,2245890,8
37606,2246290,5
37607,2248870,1
37608,2251240,3


In [6]:
# Per game: mean of the `hours` column across all of its rows.
hours_played_df = (
    recommend_with_users
    .groupby('app_id')['hours']
    .mean()
    .reset_index(name='mean_hours')
)
hours_played_df.head(5)

Unnamed: 0,app_id,mean_hours
0,10,245.776794
1,20,35.180767
2,30,78.345192
3,40,29.885468
4,50,18.470209


In [8]:
# Combine the per-game row count and mean hours into one table.
most_played_game_df = pd.merge(no_of_ppl_played_df, hours_played_df, on='app_id')

# Top 50 games by mean hours among those with at least 1000 rows.
popular_games_df = (
    most_played_game_df
    .loc[lambda frame: frame['no_of_ppl'] >= 1000]
    .sort_values("mean_hours", ascending=False)
    .head(50)
)

# Shown with titles for inspection only; the merged frame is not stored.
# NOTE(review): popular_games_df itself has no `title` column - confirm that
# whatever consumes it does not expect one.
popular_games_df.merge(games, on="app_id")

Unnamed: 0,app_id,no_of_ppl,mean_hours,title
0,570,216914,429.076874,Dota 2
1,730,219737,428.96875,Counter-Strike: Global Offensive
2,236850,44974,396.720093,Europa Universalis IV
3,39210,52488,373.633942,FINAL FANTASY XIV Online
4,394360,143187,370.750366,Hearts of Iron IV
5,1009850,1204,364.681976,OVR Advanced Settings
6,230410,127812,363.425781,Warframe
7,359550,189603,360.995483,Tom Clancy's Rainbow Six® Siege
8,1147690,4857,356.644287,NGU IDLE
9,381210,80582,346.990204,Dead by Daylight


In [9]:
# Persist the top-50 table (app_id, no_of_ppl, mean_hours) for reuse.
popular_games_df.to_pickle(path='popular_games_df.pkl')

# Discarding games with a small player base (fewer than 10,000 unique players)
---

In [10]:
# Number of distinct games (app_id) each user appears with.
user_game_counts = (
    recommend_with_users
    .groupby('user_id')['app_id']
    .nunique()
)

In [11]:
# Users must have at least this many distinct games to count as "active".
user_threshold = 10
is_active = user_game_counts.ge(user_threshold)
active_users = user_game_counts[is_active].index

# Keep only the rows that belong to active users.
filtered_df_games_played = recommend_with_users[
    recommend_with_users['user_id'].isin(active_users)
]

In [12]:
# Preview the rows that remain after keeping only active users.
filtered_df_games_played

Unnamed: 0,app_id,hours,user_id,title
22,534380,40.599998,22793,Dying Light 2 Stay Human
23,518790,10.000000,271318,theHunter: Call of the Wild™
27,42700,5.900000,433335,Call of Duty®: Black Ops
32,438100,8.100000,912612,VRChat
41,359550,166.600006,763450,Tom Clancy's Rainbow Six® Siege
...,...,...,...,...
41154788,391220,18.000000,9958247,Rise of the Tomb Raider™
41154790,758870,8.000000,1786254,Kynseed
41154791,696170,2.000000,6370324,SENRAN KAGURA Peach Beach Splash
41154792,696170,4.000000,1044289,SENRAN KAGURA Peach Beach Splash


In [13]:
# Distinct active users per game, computed on the active-user subset.
player_counts = (
    filtered_df_games_played
    .groupby('app_id')['user_id']
    .nunique()
)
player_counts

app_id
10         9537
20         2845
30         1961
40         1252
50         6957
           ... 
2245840      11
2245890       5
2246290       3
2248870       1
2253290       3
Name: user_id, Length: 37304, dtype: int64

In [14]:
# Keep only games with at least `threshold` (10,000) unique active players.
threshold = 10000
is_popular = player_counts.ge(threshold)
popular_games = player_counts[is_popular].index

# Restrict the active-user rows to those popular games.
filtered_df_player_count = filtered_df_games_played[
    filtered_df_games_played['app_id'].isin(popular_games)
]

In [15]:
# Preview the rows that remain after keeping only popular games.
filtered_df_player_count


Unnamed: 0,app_id,hours,user_id,title
22,534380,40.599998,22793,Dying Light 2 Stay Human
23,518790,10.000000,271318,theHunter: Call of the Wild™
32,438100,8.100000,912612,VRChat
41,359550,166.600006,763450,Tom Clancy's Rainbow Six® Siege
51,730,21.400000,461080,Counter-Strike: Global Offensive
...,...,...,...,...
41154756,400,102.000000,12416904,Portal
41154777,298110,5.000000,13661350,Far Cry® 4
41154782,220,21.000000,5863193,Half-Life 2
41154785,391220,12.000000,12406164,Rise of the Tomb Raider™


In [16]:
# Collapse repeated (app_id, user_id) rows into one row holding their mean
# hours, so each pair is unique (the later pivot needs unique pairs).
filtered_df_player_count = (
    filtered_df_player_count
    .groupby(['app_id', 'user_id'])['hours']
    .mean()
    .reset_index()
)
filtered_df_player_count

Unnamed: 0,app_id,user_id,hours
0,70,397,172.399994
1,70,1298,1.400000
2,70,1878,24.000000
3,70,1959,4.900000
4,70,1999,10.200000
...,...,...,...
5786937,1938090,14302648,11.000000
5786938,1938090,14304223,94.300003
5786939,1938090,14304237,137.500000
5786940,1938090,14305005,7.600000


In [17]:
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Rescale `hours` with one MinMaxScaler fitted on the whole column, so the
# scaling is global rather than per game or per user.
scaler = MinMaxScaler()
filtered_df_player_count.loc[:, 'normalized_hours'] = scaler.fit_transform(
    filtered_df_player_count[['hours']]
)

# Coordinate arrays for the sparse construction that is disabled below.
rows = filtered_df_player_count['app_id'].to_numpy()  # game IDs
cols = filtered_df_player_count['user_id'].to_numpy()  # user IDs
data = filtered_df_player_count['normalized_hours'].to_numpy()  # normalized hours played

# # Create a sparse game-user interaction matrix
# sparse_matrix = coo_matrix((data, (rows, cols)), dtype=float)

# Dense matrix: one row per game (app_id), one column per user (user_id);
# pairs with no data are filled with 0.
user_game_matrix = (
    filtered_df_player_count
    .pivot(index='app_id', columns='user_id', values='normalized_hours')
    .fillna(0)
)

In [18]:
# Preview the game-by-user matrix (rows: app_id, columns: user_id).
user_game_matrix

user_id,0,36,99,181,191,198,200,212,232,245,...,14305785,14305787,14305792,14305813,14305845,14305852,14305954,14305966,14306002,14306011
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
70,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0000,0.0,0.0,0.0,0.000,0.0,0.0,0.0
220,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.02,0.0,0.0000,0.0,0.0,0.0,0.000,0.0,0.0,0.0
240,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0000,0.0,0.0,0.0,0.000,0.0,0.0,0.0
380,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0000,0.0,0.0,0.0,0.000,0.0,0.0,0.0
400,0.0,0.0,0.0,0.0,0.0,0.0306,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0054,0.0,0.0,0.0,0.004,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1625450,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0000,0.0,0.0,0.0,0.000,0.0,0.0,0.0
1782210,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0000,0.0,0.0,0.0,0.000,0.0,0.0,0.0
1794680,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0000,0.0,0.0,0.0,0.000,0.0,0.0,0.0
1817070,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0000,0.0,0.0,0.0,0.000,0.0,0.0,0.0


In [19]:
# Step 3: Cosine similarity between games. Each row of user_game_matrix is
# one game's vector of normalized hours across users, so the result is a
# square game-by-game array.
game_similarity = cosine_similarity(X=user_game_matrix)
game_similarity

array([[0.99999934, 0.277027  , 0.08404861, ..., 0.00575599, 0.01573243,
        0.00964335],
       [0.277027  , 1.0000002 , 0.11053436, ..., 0.00934695, 0.01078856,
        0.01283742],
       [0.08404861, 0.11053436, 1.000003  , ..., 0.01048653, 0.01214236,
        0.0169387 ],
       ...,
       [0.00575599, 0.00934695, 0.01048653, ..., 0.9999985 , 0.01680115,
        0.01940616],
       [0.01573243, 0.01078856, 0.01214236, ..., 0.01680115, 0.9999994 ,
        0.04407598],
       [0.00964335, 0.01283742, 0.0169387 , ..., 0.01940616, 0.04407598,
        1.0000018 ]], dtype=float32)

In [20]:
# Label both axes of the similarity array with app_id for lookup by game.
game_ids = user_game_matrix.index
game_similarity_df = pd.DataFrame(
    data=game_similarity,
    index=game_ids,
    columns=game_ids,
)
game_similarity_df.to_csv('game_similarities.csv')

In [24]:
# Build an app_id -> title lookup for exactly the games in the similarity
# matrix. The left merge with `games` keeps every app_id, including any
# without a matching title.
mapped_df = (
    pd.DataFrame({'app_id': game_similarity_df.index})
    .merge(games, on='app_id', how='left')
)
mapped_df

Unnamed: 0,app_id,title
0,70,Half-Life
1,220,Half-Life 2
2,240,Counter-Strike: Source
3,380,Half-Life 2: Episode One
4,400,Portal
...,...,...
297,1625450,Muck
298,1782210,Crab Game
299,1794680,Vampire Survivors
300,1817070,Marvel’s Spider-Man Remastered


In [28]:

def get_game_recommendations(game_name, n, game_similarity_df, mapped_df):
    """Return the titles of the ``n`` games most similar to ``game_name``.

    Parameters
    ----------
    game_name : str
        Title to look up in ``mapped_df['title']`` (exact match).
    n : int
        Maximum number of recommendations. Values <= 0 yield an empty list.
    game_similarity_df : pandas.DataFrame
        Square similarity table whose index and columns are app_ids.
    mapped_df : pandas.DataFrame
        Lookup table with ``app_id`` and ``title`` columns.

    Returns
    -------
    list of str or str
        Titles ordered from most to least similar. The string
        ``"No recommendations"`` is returned when the title is unknown or its
        app_id is missing from ``game_similarity_df``.
    """
    # Resolve the title to an app_id; the first match wins on duplicates.
    matching_ids = mapped_df.loc[mapped_df['title'] == game_name, 'app_id']
    if matching_ids.empty:
        return "No recommendations"
    app_id = matching_ids.iloc[0]

    if app_id not in game_similarity_df.index:
        return "No recommendations"

    if n <= 0:
        return []

    # Remove the query game by label instead of assuming it sorts first:
    # float rounding or ties can put another game ahead of the diagonal.
    similarity_scores = (
        game_similarity_df.loc[app_id]
        .drop(labels=app_id, errors='ignore')
        .sort_values(ascending=False)
    )
    top_app_ids = similarity_scores.head(n).index

    # Look titles up in ranked order so the result is sorted by similarity,
    # not by the row order of mapped_df. Ids without a title are skipped.
    title_by_app_id = (
        mapped_df.drop_duplicates('app_id').set_index('app_id')['title']
    )
    return title_by_app_id.reindex(top_app_ids).dropna().tolist()

# Example: print up to five recommended titles for 'Dota 2'.
print(
    get_game_recommendations(
        game_name='Dota 2',
        n=5,
        game_similarity_df=game_similarity_df,
        mapped_df=mapped_df,
    )
)

['Counter-Strike: Global Offensive', 'Path of Exile', 'The Witcher® 3: Wild Hunt', 'PUBG: BATTLEGROUNDS', 'Dota Underlords']


In [29]:
# Persist the title lookup and the game-by-game similarity table.
mapped_df.to_pickle(path='mapped_df.pkl')
game_similarity_df.to_pickle(path='game_similarity_df.pkl')
