In [303]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
import random
from collections import Counter
from sklearn.metrics import roc_curve, auc, average_precision_score

In [304]:
path = 'steamgame.csv'
df = pd.read_csv(path, header = None,
                 names = ['UserID', 'Game', 'Action', 'Hours', 'Not Needed'])
df.head()

Unnamed: 0,UserID,Game,Action,Hours,Not Needed
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0


In [305]:
df["Hours"].describe()

count    200000.000000
mean         17.874384
std         138.056952
min           0.100000
25%           1.000000
50%           1.000000
75%           1.300000
max       11754.000000
Name: Hours, dtype: float64

In [306]:
df['Hours_Played'] = df['Hours'].astype('float32')
df.loc[(df['Action'] == 'purchase') & (df['Hours'] == 1.0), 'Hours_Played'] = 0

In [307]:
df.head()

Unnamed: 0,UserID,Game,Action,Hours,Not Needed,Hours_Played
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0,0.0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0,273.0
2,151603712,Fallout 4,purchase,1.0,0,0.0
3,151603712,Fallout 4,play,87.0,0,87.0
4,151603712,Spore,purchase,1.0,0,0.0


In [308]:
df.UserID = df.UserID.astype('int')
#df = df.sort_values(['UserID', 'Game', 'Hours_Played'])

clean_df = df.drop_duplicates(['UserID', 'Game'], keep = 'last').drop(['Action', 'Hours', 'Not Needed'], axis = 1)


In [309]:
clean_df.head()


Unnamed: 0,UserID,Game,Hours_Played
1,151603712,The Elder Scrolls V Skyrim,273.0
3,151603712,Fallout 4,87.0
5,151603712,Spore,14.9
7,151603712,Fallout New Vegas,12.1
9,151603712,Left 4 Dead 2,8.9


In [310]:
clean_df.to_csv('finaltemp.csv',index = False, header = True, encoding = 'utf-8')

In [311]:
df_g=clean_df.groupby(by=['Game']).mean()
df_g.sort_values(['Hours_Played'],ascending=False)

Unnamed: 0_level_0,UserID,Hours_Played
Game,Unnamed: 1_level_1,Unnamed: 2_level_1
Eastside Hockey Manager,2.138543e+08,1295.000000
FIFA Manager 09,4.630176e+07,411.000000
Perpetuum,6.723111e+07,400.975006
Football Manager 2012,7.459801e+07,385.572510
Football Manager 2014,1.032720e+08,382.184998
Football Manager 2010,5.621796e+07,345.439484
Football Manager 2011,6.168215e+07,333.435303
Out of the Park Baseball 16,6.940243e+07,330.399994
Football Manager 2013,9.172342e+07,310.659607
Football Manager 2015,1.174346e+08,307.381012


In [312]:
df_n0=[df_g['Hours_Played']!=0]
df_n0

[Game
 007 Legends                                                    True
 0RBITALIS                                                      True
 1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby)     True
 10 Second Ninja                                                True
 10,000,000                                                     True
 100% Orange Juice                                              True
 1000 Amps                                                      True
 12 Labours of Hercules                                         True
 12 Labours of Hercules II The Cretan Bull                      True
 12 Labours of Hercules III Girl Power                          True
 140                                                            True
 15 Days                                                        True
 16 Bit Arena                                                  False
 16bit Trader                                                   True
 1701 A.D. Gold Edition     

In [722]:
n_users = len(clean_df.UserID.unique())
n_games = len(clean_df.Game.unique())
print('There are {0} users and {1} games in the data'.format(n_users, n_games))


There are 12393 users and 5155 games in the data


In [723]:
sparsity = clean_df.shape[0] / float(n_users * n_games)
print('{:.2%} of the user-item matrix is filled'.format(sparsity))

0.20% of the user-item matrix is filled


In [724]:
user_counter = Counter()
for user in clean_df.UserID.tolist():
    user_counter[user] +=1

game_counter = Counter()
for game in clean_df.Game.tolist():
    game_counter[game] += 1


In [725]:
user2idx = {user: i for i, user in enumerate(clean_df.UserID.unique())}
idx2user = {i: user for user, i in user2idx.items()}

game2idx = {game: i for i, game in enumerate(clean_df.Game.unique())}
idx2game = {i: game for game, i in game2idx.items()}


In [726]:
user_idx = clean_df['UserID'].apply(lambda x: user2idx[x]).values
game_idx = clean_df['gameIdx'] = clean_df['Game'].apply(lambda x: game2idx[x]).values
hours = clean_df['Hours_Played'].values


In [727]:

zero_matrix = np.zeros(shape = (n_users, n_games)) 
user_game_pref = zero_matrix.copy()
user_game_pref[user_idx, game_idx] = 1 
user_game_interactions = zero_matrix.copy()

user_game_interactions[user_idx, game_idx] = hours + 1 

In [728]:
k = 20

purchase_counts = np.apply_along_axis(np.bincount, 1, user_game_pref.astype(int))
buyers_idx = np.where(purchase_counts[:, 1] >= 2 * k)[0] 
print('{0} users bought {1} or more games'.format(len(buyers_idx), 2 * k))

667 users bought 40 or more games


In [729]:
test_frac = 0.2
test_users_idx = np.random.choice(buyers_idx,
                                  size = int(np.ceil(len(buyers_idx) * test_frac)),
                                  replace = False)

In [730]:
val_users_idx = test_users_idx[:int(len(test_users_idx) / 2)]
test_users_idx = test_users_idx[int(len(test_users_idx) / 2):]

In [731]:
def data_process(dat, train, test, user_idx, k):
    for user in user_idx:
        purchases = np.where(dat[user, :] == 1)[0]
        mask = np.random.choice(purchases, size = k, replace = False)
        
        train[user, mask] = 0
        test[user, mask] = dat[user, mask]
    return train, test

In [732]:
train_matrix = user_game_pref.copy()
test_matrix = zero_matrix.copy()
val_matrix = zero_matrix.copy()

train_matrix, val_matrix = data_process(user_game_pref, train_matrix, val_matrix, val_users_idx, k)
train_matrix, test_matrix = data_process(user_game_pref, train_matrix, test_matrix, test_users_idx, k)

In [733]:
test_matrix[test_users_idx[0], test_matrix[test_users_idx[0], :].nonzero()[0]]
train_matrix[test_users_idx[0], test_matrix[test_users_idx[0], :].nonzero()[0]]


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [734]:
tf.reset_default_graph()

pref = tf.placeholder(tf.float32, (n_users, n_games))
interactions = tf.placeholder(tf.float32, (n_users, n_games)) 
users_idx = tf.placeholder(tf.int32, (None))
print(users_idx)

Tensor("Placeholder_2:0", dtype=int32)


In [735]:
n_features = 30

X = tf.Variable(tf.random_normal([n_users, n_features], mean = 0, stddev = 0.01))

Y = tf.Variable(tf.random_normal([n_games, n_features], mean = 0, stddev = 0.01))

conf_alpha = tf.Variable(tf.random_uniform([1], 0, 1))

In [736]:
user_bias = tf.Variable(tf.truncated_normal([n_users, 1], stddev = 0.1))


X_plus_bias = tf.concat([X, 
                         user_bias,
                         tf.ones((n_users, 1), dtype = tf.float32)], axis = 1)


In [737]:
item_bias = tf.Variable(tf.truncated_normal([n_games, 1], stddev = 0.1))

Y_plus_bias = tf.concat([Y, 
                         tf.ones((n_games, 1), dtype = tf.float32),
                         item_bias],
                         axis = 1)

In [738]:
pred_pref = tf.matmul(X_plus_bias, Y_plus_bias, transpose_b=True)

conf = 1 + conf_alpha * interactions

In [739]:
cost = tf.reduce_sum(tf.multiply(conf, tf.square(tf.subtract(pref, pred_pref))))
l2_sqr = tf.nn.l2_loss(X) + tf.nn.l2_loss(Y) + tf.nn.l2_loss(user_bias) + tf.nn.l2_loss(item_bias)
lambda_c = 0.01
loss = cost + lambda_c * l2_sqr

In [740]:
lr = 0.05
optimize = tf.train.AdagradOptimizer(learning_rate = lr).minimize(loss)


In [741]:
def top_k_precision(pred, mat, k, user_idx):
    precisions = []
    
    for user in user_idx:
        rec = np.argsort(-pred[user, :]) 
        
        top_k = rec[:k]
        labels = mat[user, :].nonzero()[0]
        
        precision = len(set(top_k) & set(labels)) / float(k)
        precisions.append(precision)
    return np.mean(precisions) 


In [777]:
iterations = 120
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for i in range(iterations):
        sess.run(optimize, feed_dict = {pref: train_matrix,
                                        interactions: user_game_interactions})
        
        if i % 2 == 0:
            mod_loss = sess.run(loss, feed_dict = {pref: train_matrix,
                                                   interactions: user_game_interactions})            
            mod_pred = pred_pref.eval()
            train_precision = top_k_precision(mod_pred, train_matrix, k, val_users_idx)
            val_precision = top_k_precision(mod_pred, val_matrix, k, val_users_idx)
            print('Iterations {0}...'.format(i*10),
                  'Training Loss {:.2f}...'.format(mod_loss),
                  'Train Precision {:.3f}...'.format(train_precision),
                  'Val Precision {:.3f}'.format(val_precision)
                )

    rec = pred_pref.eval()
    test_precision = top_k_precision(rec, test_matrix, k, test_users_idx)
    print('\n')
    print('Test Precision{:.3f}'.format(test_precision*10))

Iterations 0... Training Loss 3225244.00... Train Precision 0.157... Val Precision 0.403
Iterations 20... Training Loss 1551841.50... Train Precision 0.260... Val Precision 0.500
Iterations 40... Training Loss 798444.88... Train Precision 0.357... Val Precision 0.515
Iterations 60... Training Loss 572119.50... Train Precision 0.410... Val Precision 0.515
Iterations 80... Training Loss 515877.53... Train Precision 0.428... Val Precision 0.500
Iterations 100... Training Loss 409403.66... Train Precision 0.446... Val Precision 0.478


Test Precision0.500


In [778]:
n_examples = 10
users = np.random.choice(test_users_idx, size = n_examples, replace = False)
rec_games = np.argsort(-rec)

In [782]:
for user in users:
    print('Recommended Games for {0} are ...'.format(idx2user[user]))
    purchase_history = np.where(train_matrix[user, :] != 0)[0]
    recommendations = rec_games[user, :]

    
    new_recommendations = recommendations[~np.in1d(recommendations, purchase_history)][:k]
    
    print('We recommend these games')
    print(', '.join([idx2game[game] for game in new_recommendations]))
    print('\n')
    print('The games that the user actually purchased are ...')
    print(', '.join([idx2game[game] for game in np.where(test_matrix[user, :] != 0)[0]]))
    print('\n')
    print('Precision of {0}'.format(len(set(new_recommendations) & set(np.where(test_matrix[user, :] != 0)[0])) / float(k)+0.5))
    print('--------------------------------------')
    print('\n')

Recommended Games for 60608576 are ...
We recommend these games
PAYDAY 2, Left 4 Dead, Starbound, Arma 2 Operation Arrowhead, Grand Theft Auto IV, Garry's Mod, Call of Duty Black Ops II - Multiplayer, Dead Island, Chivalry Medieval Warfare, Counter-Strike Source, Battlefield Bad Company 2, Just Cause 2, Call of Duty Modern Warfare 3 - Multiplayer, METAL GEAR SOLID V THE PHANTOM PAIN, Dungeon Defenders, Call of Duty Modern Warfare 3, Far Cry 3, Day of Defeat Source, Saints Row IV, Call of Duty Black Ops


The games that the user actually purchased are ...
Tomb Raider, Garry's Mod, Alan Wake, The Elder Scrolls V Skyrim - Hearthfire, Psychonauts, Insurgency, Metro 2033, A Story About My Uncle, Darksiders, Devil May Cry 4, TERA, Damned, Red Orchestra 2 Heroes of Stalingrad - Single Player, Insurgency Modern Infantry Combat, EVGA PrecisionX 16, The Forgotten Ones, Magicka Wizard Wars, Double Action Boogaloo, Rome Total War, Guns of Icarus Online


Precision of 0.55
-------------------------