In [35]:
# Making a Steam game recommender with Collaborative filtering

#Collaborative Filtering is a technique used for recommender systems. Collaborative filtering infers the preferences for new items given the known preferences from users.

#One of the ways to implement collaborative filtering is Matrix Factorization

#To explain it simple, think of it as a matrix multiplication problem

In [4]:
import numpy as np 
import pandas as pd 
import random
from collections import Counter
from sklearn.metrics import roc_curve, auc, average_precision_score
import tensorflow as tf
tf.__version__

'1.14.0'

In [None]:
#Data cleaning

#The data was corrupted with DLCs and expansion packs.

# I used whatever domain knowledge I have to cleanse the data.

#Also, there were non-video game contents such as the Source Film Maker, which had to be filtered out.

In [5]:
path = 'Resources/steam-200k.csv'
#path = 'steam-200k.csv'
df = pd.read_csv(path, header = None,
                 names = ['UserID', 'Game', 'Action', 'Hours', 'Other'])
df.head()

df['Hours_Played'] = df['Hours'].astype('float32')

df.loc[(df['Action'] == 'purchase') & (df['Hours'] == 1.0), 'Hours_Played'] = 0

In [6]:
df.UserID = df.UserID.astype('int')
df = df.sort_values(['UserID', 'Game', 'Hours_Played'])

clean_df = df.drop_duplicates(['UserID', 'Game'], keep = 'last').drop(['Action', 'Hours', 'Other'], axis = 1)

# every transaction is represented by only one record now
clean_df.head()

Unnamed: 0,UserID,Game,Hours_Played
65430,5250,Alien Swarm,4.9
65424,5250,Cities Skylines,144.0
65435,5250,Counter-Strike,0.0
65436,5250,Counter-Strike Source,0.0
65437,5250,Day of Defeat,0.0


In [7]:
n_users = len(clean_df.UserID.unique())
n_games = len(clean_df.Game.unique())

print('There are {0} users and {1} games in the data'.format(n_users, n_games))

There are 12393 users and 5155 games in the data


In [8]:
sparsity = clean_df.shape[0] / float(n_users * n_games)
print('{:.2%} of the user-item matrix is filled'.format(sparsity))

0.20% of the user-item matrix is filled


In [9]:
user_counter = Counter()
for user in clean_df.UserID.tolist():
    user_counter[user] +=1

game_counter = Counter()
for game in clean_df.Game.tolist():
    game_counter[game] += 1

user2idx = {user: i for i, user in enumerate(clean_df.UserID.unique())}
idx2user = {i: user for user, i in user2idx.items()}

game2idx = {game: i for i, game in enumerate(clean_df.Game.unique())}
idx2game = {i: game for game, i in game2idx.items()}

In [10]:
user_idx = clean_df['UserID'].apply(lambda x: user2idx[x]).values
game_idx = clean_df['gameIdx'] = clean_df['Game'].apply(lambda x: game2idx[x]).values
hours = clean_df['Hours_Played'].values

In [None]:
#User X game matrix formation
#We use two matrices here

#Preference matrix : shows whether the user has actually purchased the game (1) or not (0)

#Confidence matrix : The matrix that shows the confidence measure, or how long the user played / enjoyed the specific game.

In [11]:
zero_matrix = np.zeros(shape = (n_users, n_games)) # Create a zero matrix
user_game_pref = zero_matrix.copy()
user_game_pref[user_idx, game_idx] = 1 # Fill preference matrix

user_game_interactions = zero_matrix.copy()
# Confidence matrix
user_game_interactions[user_idx, game_idx] = hours + 1 

In [12]:
#verification

k = 5

# Count the number of purchases for each user
purchase_counts = np.apply_along_axis(np.bincount, 1, user_game_pref.astype(int))
buyers_idx = np.where(purchase_counts[:, 1] >= 2 * k)[0] #find the users who purchase 2 * k games
print('{0} users bought {1} or more games'.format(len(buyers_idx), 2 * k))

2189 users bought 10 or more games


In [13]:
test_frac = 0.2 # Let's save 10% of the data for validation and 10% for testing.
test_users_idx = np.random.choice(buyers_idx,
                                  size = int(np.ceil(len(buyers_idx) * test_frac)),
                                  replace = False)

In [14]:
val_users_idx = test_users_idx[:int(len(test_users_idx) / 2)]
test_users_idx = test_users_idx[int(len(test_users_idx) / 2):]

In [15]:
def data_process(dat, train, test, user_idx, k):
    for user in user_idx:
        purchases = np.where(dat[user, :] == 1)[0]
        mask = np.random.choice(purchases, size = k, replace = False)
        
        train[user, mask] = 0
        test[user, mask] = dat[user, mask]
    return train, test

In [16]:
train_matrix = user_game_pref.copy()
test_matrix = zero_matrix.copy()
val_matrix = zero_matrix.copy()

# Mask the train matrix and create the validation and test matrices
train_matrix, val_matrix = data_process(user_game_pref, train_matrix, val_matrix, val_users_idx, k)
train_matrix, test_matrix = data_process(user_game_pref, train_matrix, test_matrix, test_users_idx, k)

In [17]:
test_matrix[test_users_idx[0], test_matrix[test_users_idx[0], :].nonzero()[0]]

array([1., 1., 1., 1., 1.])

In [18]:
train_matrix[test_users_idx[0], test_matrix[test_users_idx[0], :].nonzero()[0]]

array([0., 0., 0., 0., 0.])

In [None]:
tf.reset_default_graph() # Create a new graphs

pref = tf.placeholder(tf.float32, (n_users, n_games))  # Here's the preference matrix
interactions = tf.placeholder(tf.float32, (n_users, n_games)) # Here's the hours played matrix
users_idx = tf.placeholder(tf.int32, (None))

In [None]:
n_features = 30 # number of features extracted

# The X matrix represents the user latent preferences with a shape of user x latent features
X = tf.Variable(tf.truncated_normal([n_users, n_features], mean = 0, stddev = 0.05))

# The Y matrix represents the game latent features with a shape of game x latent features
Y = tf.Variable(tf.truncated_normal([n_games, n_features], mean = 0, stddev = 0.05))

# Here's the initilization of the confidence parameter
conf_alpha = tf.Variable(tf.random_uniform([1], 0, 1))

In [None]:
#Adding biases
#There could be differences with users and games. 

#Some games like DOTA 2 just don't have an end. While on the other side, there could be users that tend to speedrun through games thanks to their experience.

#We want to implement a statistical model to take care of such biases.

In [21]:
n_features = 30 # number of features extracted

# The X matrix represents the user latent preferences with a shape of user x latent features
X = tf.Variable(tf.truncated_normal([n_users, n_features], mean = 0, stddev = 0.05))

# The Y matrix represents the game latent features with a shape of game x latent features
Y = tf.Variable(tf.truncated_normal([n_games, n_features], mean = 0, stddev = 0.05))

# Here's the initilization of the confidence parameter
conf_alpha = tf.Variable(tf.random_uniform([1], 0, 1))

In [22]:
#user bias
user_bias = tf.Variable(tf.truncated_normal([n_users, 1], stddev = 0.2))

# Concatenate the vector to the user matrix
X_plus_bias = tf.concat([X, 
                         #tf.convert_to_tensor(user_bias, dtype = tf.float32),
                         user_bias,
                         tf.ones((n_users, 1), dtype = tf.float32)], axis = 1)

In [23]:
# game bias
item_bias = tf.Variable(tf.truncated_normal([n_games, 1], stddev = 0.2))

# Cocatenate the vector to the game matrix
Y_plus_bias = tf.concat([Y, 
                         tf.ones((n_games, 1), dtype = tf.float32),
                         item_bias],
                         axis = 1)

In [24]:
pred_pref = tf.matmul(X_plus_bias, Y_plus_bias, transpose_b=True)

# Construct the confidence matrix with the hours played and alpha paramter
conf = 1 + conf_alpha * interactions

In [25]:
cost = tf.reduce_sum(tf.multiply(conf, tf.square(tf.subtract(pref, pred_pref))))
l2_sqr = tf.nn.l2_loss(X) + tf.nn.l2_loss(Y) + tf.nn.l2_loss(user_bias) + tf.nn.l2_loss(item_bias)
lambda_c = 0.01
loss = cost + lambda_c * l2_sqr

In [26]:
lr = 0.05
optimize = tf.train.AdagradOptimizer(learning_rate = lr).minimize(loss)


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [27]:
# This is a function that helps to calculate the top k precision 
def top_k_precision(pred, mat, k, user_idx):
    precisions = []
    
    for user in user_idx:
        rec = np.argsort(-pred[user, :]) # Found the top recommendation from the predictions
        
        top_k = rec[:k]
        labels = mat[user, :].nonzero()[0]
        
        precision = len(set(top_k) & set(labels)) / float(k) # Calculate the precisions from actual labels
        precisions.append(precision)
    return np.mean(precisions) 

In [28]:
iterations = 100
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for i in range(iterations):
        sess.run(optimize, feed_dict = {pref: train_matrix,
                                        interactions: user_game_interactions})
        
        if i % 10 == 0:
            mod_loss = sess.run(loss, feed_dict = {pref: train_matrix,
                                                   interactions: user_game_interactions})            
            mod_pred = pred_pref.eval()
            train_precision = top_k_precision(mod_pred, train_matrix, k, val_users_idx)
            val_precision = top_k_precision(mod_pred, val_matrix, k, val_users_idx)
            print('Iterations {0}...'.format(i),
                  'Training Loss {:.2f}...'.format(mod_loss),
                  'Train Precision {:.3f}...'.format(train_precision),
                  'Val Precision {:.3f}'.format(val_precision)
                )

    rec = pred_pref.eval()
    test_precision = top_k_precision(rec, test_matrix, k, test_users_idx)
    print('\n')
    print('Test Precision{:.3f}'.format(test_precision))


Iterations 0... Training Loss 4273684.50... Train Precision 0.103... Val Precision 0.012
Iterations 10... Training Loss 416310.59... Train Precision 0.407... Val Precision 0.029
Iterations 20... Training Loss 323009.59... Train Precision 0.476... Val Precision 0.035
Iterations 30... Training Loss 288652.19... Train Precision 0.521... Val Precision 0.037
Iterations 40... Training Loss 267139.28... Train Precision 0.544... Val Precision 0.038
Iterations 50... Training Loss 251879.92... Train Precision 0.570... Val Precision 0.039
Iterations 60... Training Loss 240117.73... Train Precision 0.584... Val Precision 0.037
Iterations 70... Training Loss 230639.25... Train Precision 0.590... Val Precision 0.039
Iterations 80... Training Loss 222648.48... Train Precision 0.601... Val Precision 0.039
Iterations 90... Training Loss 215683.17... Train Precision 0.611... Val Precision 0.040


Test Precision0.035


In [31]:
# Test

#The overall precision score comes out really low. However, I think there should be a separate way to tell whether this precision is correct or not. (more domain-knowledge based like genre savvy-ness or the relations of the games etc)

#After all, the recommender is not doing its job if it recommends the games the user already bought.

In [29]:
n_examples = 10
users = np.random.choice(test_users_idx, size = n_examples, replace = False)
rec_games = np.argsort(-rec)

In [30]:
for user in users:
    print('User #{0} recommendations ...'.format(idx2user[user]))
    purchase_history = np.where(train_matrix[user, :] != 0)[0]
    recommendations = rec_games[user, :]

    
    new_recommendations = recommendations[~np.in1d(recommendations, purchase_history)][:k]
    
    print('Recommendations')
    print(', '.join([idx2game[game] for game in new_recommendations]))
    print('\n')
    print('Actual purchases')
    print(', '.join([idx2game[game] for game in np.where(test_matrix[user, :] != 0)[0]]))
    print('\n')
    print('Precision of {0}'.format(len(set(new_recommendations) & set(np.where(test_matrix[user, :] != 0)[0])) / float(k)))
    print('--------------------------------------')
    print('\n')

User #262474788 recommendations ...
Recommendations
Dota 2, Heroes & Generals, Dirty Bomb, Defiance, Call of Duty Modern Warfare 2 - Multiplayer


Actual purchases
Unturned, Loadout, Blacklight Retribution, Neverwinter, Toribash


Precision of 0.0
--------------------------------------


User #190191843 recommendations ...
Recommendations
The Elder Scrolls V Skyrim, Dota 2, Euro Truck Simulator 2, DayZ, Saints Row IV


Actual purchases
BattleBlock Theater, PlanetSide 2, Metro Last Light Redux, Five Nights at Freddy's, oO


Precision of 0.0
--------------------------------------


User #125912054 recommendations ...
Recommendations
Garry's Mod, Unturned, Team Fortress 2, Warframe, War Thunder


Actual purchases
The Elder Scrolls V Skyrim, Bulletstorm, The Witcher 3 Wild Hunt, BioShock 2, Gothic II Gold Edition


Precision of 0.0
--------------------------------------


User #1971801 recommendations ...
Recommendations
Counter-Strike, Team Fortress 2, Dota 2, Counter-Strike Source, Half-

In [33]:
#Result evaluation
#Some users were very genre-based and predictable. User #22689481 got the following results :

#Recommendations Counter-Strike Condition Zero Deleted Scenes, Counter-Strike Condition Zero, Deathmatch Classic, Ricochet, Left 4 Dead 2

#Actual purchases Deathmatch Classic, Half-Life 2 Lost Coast, Ricochet, Counter-Strike Condition Zero, Counter-Strike Condition Zero Deleted Scenes

#...which meant he is a very hardcore FPS gamer.

#However, some users such as User #129391396 got some strange-looking results such as :

#Recommendations Counter-Strike Global Offensive, Dota 2, Team Fortress 2, The Elder Scrolls V Skyrim, Portal

#Actual purchases Team Fortress 2, Call of Duty Modern Warfare 3, Call of Duty Modern Warfare 3 - Multiplayer, Rust, Stranded Deep

#Maybe he was affiliated with games that were popular (Team Fortress 2, Call of Duty, Rust), so he got recommended other commercially popular games recommended to him such as CS:GO(understandable as it is an FPS), Dota 2 and Skyrim.