Install and import necessary packages and functions to load data from GCP and build neighbor-based collaborative filtering recommender systems

In [26]:
pip install --upgrade google-cloud-storage #install necessary packages to load data from GCP



In [27]:
pip install fsspec #install necessary packages to load data from GCP



In [28]:
pip install gcsfs #install necessary packages to load data from GCP



In [29]:
import os
from google.cloud import storage
import pandas as pd
import pandas as pd
import numpy as np
import random

In [30]:
# import necessary functions
%cd '/content/'
from demolib import mapdata
from demolib import getRecommendations_UU
from demolib import getRecommendations_II
from demolib import predictRatings
from demolib import getitemsimsmatrix
from demolib import pearsonsim
from demolib import wtavg
from demolib import makeRatingsMatrix

/content


Load data from Google Cloud Storage

In [31]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/content/eloquent-life-328614-f56ab675c052.json'
storage_client = storage.Client()
bucket = storage_client.get_bucket('data5006_batch')
ratings_df = pd.read_csv('gs://data5006_batch/Google_Game_Ratings.csv')
game_info = pd.read_csv('gs://data5006_batch/GameInfo.csv')
print(ratings_df)
print(game_info)

            user_id  item_id  rating
0                -9  Game133       3
1                 0    Game9       2
2                 0   Game14       4
3                 0  Game129       4
4                 0  Game152       5
...             ...      ...     ...
289582      zzz zzz    Game2       2
289583  Zzzdreaminq   Game78       5
289584      zzzlepy   Game37       5
289585    Zzzz Zzzz  Game138       5
289586         TRUE   Game15       3

[289587 rows x 3 columns]
      GameID  ...                                                url
0      Game1  ...  https://play.google.com/store/apps/details?id=...
1      Game2  ...  https://play.google.com/store/apps/details?id=...
2      Game3  ...  https://play.google.com/store/apps/details?id=...
3      Game4  ...  https://play.google.com/store/apps/details?id=...
4      Game5  ...  https://play.google.com/store/apps/details?id=...
..       ...  ...                                                ...
165  Game166  ...  https://play.google.com/sto

In [32]:
titlelookup = dict(zip(game_info["GameID"],game_info["title"]))
titlelookup

{'Game1': 'Grand Theft Auto: San Andreas',
 'Game10': 'Hitman Sniper',
 'Game100': 'The Room',
 'Game101': 'Bridge Constructor Portal',
 'Game102': 'Motorsport Manager Mobile 2',
 'Game103': 'Riptide GP2',
 'Game104': 'Real Drift Car Racing',
 'Game105': 'F1 2016',
 'Game106': 'Illegal Race Tuning - Real car racing multiplayer',
 'Game107': 'Need for Speed Most Wanted',
 'Game108': 'Bike Race Pro by T. F. Games',
 'Game109': 'GRID™ Autosport',
 'Game11': 'Zombie Age 3 Premium: Rules of Survival',
 'Game110': 'Draw Rider Plus',
 'Game111': 'Motorsport Manager Mobile 3',
 'Game112': 'Earn to Die',
 'Game113': 'Reckless Racing 3',
 'Game114': 'Dungeon Corporation P : (An auto-farming RPG game)',
 'Game115': 'Stardew Valley',
 'Game116': 'Star Wars™: KOTOR',
 'Game117': 'Mystery of Fortune 2',
 'Game118': 'Devil Twins: VIP',
 'Game119': 'Dungeon Corporation VIP: An auto-farming RPG game!',
 'Game12': 'Dungeon Shooter : The Forgotten Temple',
 'Game120': '[VIP] +9 God Blessing Knight - Cash

In [33]:
# define a function to map the user_id and item_id in the ratings_df to integer indexes
def mapdata(ratings_df):
  ratings_df["item_id"] = ratings_df["item_id"].astype(str)
  ratings_df["user_id"] = ratings_df["user_id"].astype(str)
  ratings_df["rating"]  = ratings_df["rating"].values.astype(np.float32)
  user_ids = np.sort(ratings_df["user_id"].unique()).tolist()
  umap = {x: i for i, x in enumerate(user_ids)}
  item_ids = np.sort(ratings_df["item_id"].unique()).tolist()
  imap = {x: i for i, x in enumerate(item_ids)}
  ratings_df["user_id"] = ratings_df["user_id"].map(umap) # swap userid for user index
  ratings_df["item_id"] = ratings_df["item_id"].map(imap) # swap itemid for item index
  return ratings_df, umap, imap

In [34]:
# define function to convert the ratings_df into the ratings matrix
def makeRatingsMatrix(ratings_df):
  ratings_df, umap, imap = mapdata(ratings_df)
  ratmatrix = pd.pivot_table(ratings_df, index=['user_id'], columns=['item_id'], values=['rating'],aggfunc=[np.mean]).values
  return ratmatrix, umap, imap

In [35]:
ratmatrix, umap, imap = makeRatingsMatrix(ratings_df)

In [36]:
# define function to calculate similarity by Pearson coefficients
def pearsonsim(x,y):
    xy = x*y
    x = x[np.isnan(xy)==False]
    y = y[np.isnan(xy)==False]
    if(len(x)==0): return np.nan
    mx=np.mean(x)
    my=np.mean(y)
    rt = np.sqrt(sum((x-mx)**2)*sum((y-my)**2))
    if (rt == 0): return np.nan 
    return sum((x-mx)*(y-my))/rt

User-based Collaborative Filtering Recommender System

In [37]:
# make user-based CF recommendations for a given  user
def getRecommendations_UU(targetrats, ratsmatrix, imap, simfun=pearsonsim, topN=5):

    # get similarity between target and all other users
    sims = []
    for row in ratsmatrix:
      sims.append(simfun(row,targetrats))
    sims = np.array(sims)
    sims[sims < 0] = np.nan

    # for each unseen item, get weighted average of all user ratings
    rats = []
    unseenitemidxs = np.where(np.isnan(targetrats)==True)[0]
    for col in unseenitemidxs:
      rats.append(wtavg(ratsmatrix[:,col], sims))

    # put results into a dataframe and reverse sort by predicted rating
    itemnames=list(imap.keys())
    rats = pd.DataFrame(rats, index=[itemnames[i] for i in unseenitemidxs], columns=['predrating'])
    rats = rats.sort_values(ascending = False, by=['predrating'])
    return rats[0:min(topN,len(rats))]

In [39]:
# Select a user called 'zzzpley' as the target user and make  recommendations by user-based CF
targetname = "zzzlepy"
targetrats = ratmatrix[umap[targetname],] 
simfun=pearsonsim
recs = getRecommendations_UU(targetrats, ratmatrix, imap, simfun=simfun, topN = 10)
for i in recs.index: 
  print("Predicted Rating: %2.2f, Index for Game: %s, Game Name: %s" % (recs['predrating'][i], i, titlelookup[i]))

Predicted Rating: nan, Index for Game: Game1, Game Name: Grand Theft Auto: San Andreas
Predicted Rating: nan, Index for Game: Game10, Game Name: Hitman Sniper
Predicted Rating: nan, Index for Game: Game100, Game Name: The Room
Predicted Rating: nan, Index for Game: Game101, Game Name: Bridge Constructor Portal
Predicted Rating: nan, Index for Game: Game102, Game Name: Motorsport Manager Mobile 2
Predicted Rating: nan, Index for Game: Game103, Game Name: Riptide GP2
Predicted Rating: nan, Index for Game: Game104, Game Name: Real Drift Car Racing
Predicted Rating: nan, Index for Game: Game105, Game Name: F1 2016
Predicted Rating: nan, Index for Game: Game106, Game Name: Illegal Race Tuning - Real car racing multiplayer
Predicted Rating: nan, Index for Game: Game107, Game Name: Need for Speed Most Wanted


It can be seen that the predicted ratings are all NA which may be due to the sparse of the matrix.

Item-based Collaborative Filtering Recommender System

In [41]:
# make item-based CF recommendations for a given user
def getRecommendations_II(targetrats, itemsims, imap, topN=5):

    unseenitemidxs = np.where(np.isnan(targetrats)==True)[0]
    seenitems = np.isnan(targetrats)==False
    rats = []
    for row in unseenitemidxs: rats.append(wtavg(targetrats[seenitems],itemsims[row,seenitems]))

    itemnames=list(imap.keys()) 
    rats = pd.DataFrame(np.array(rats), index=[itemnames[i] for i in unseenitemidxs], columns=['predrating'])
    rats = rats.sort_values(ascending = False, by=['predrating'])
    return rats[0:min(topN,len(rats))]

In [43]:
# create an item-item similarity matrix using Pearson coefficients and make recommendations by item-based CF 
simfun=pearsonsim
itemsims = getitemsimsmatrix(ratmatrix, simfun=simfun)
print(itemsims.shape)
recs = getRecommendations_II(targetrats, itemsims, imap, topN = 10)
for i in recs.index: 
  print("Predicted Rating: %2.2f, Index for Game: %s, Game Name: %s" % (recs['predrating'][i], i, titlelookup[i]))

(170, 170)
Predicted Rating: 5.00, Index for Game: Game127, Game Name: Evertale
Predicted Rating: 5.00, Index for Game: Game9, Game Name: Five Nights at Freddy's
Predicted Rating: 5.00, Index for Game: Game1, Game Name: Grand Theft Auto: San Andreas
Predicted Rating: 5.00, Index for Game: Game30, Game Name: THE KING OF FIGHTERS-A 2012
Predicted Rating: 5.00, Index for Game: Game45, Game Name: Through the Ages
Predicted Rating: 5.00, Index for Game: Game44, Game Name: Chess - Analyze This (Pro)
Predicted Rating: 5.00, Index for Game: Game42, Game Name: Monopoly - Board game classic about real-estate!
Predicted Rating: 5.00, Index for Game: Game40, Game Name: Max Payne Mobile
Predicted Rating: 5.00, Index for Game: Game38, Game Name: Grand Theft Auto: Vice City
Predicted Rating: 5.00, Index for Game: Game36, Game Name: Space Shooter: Alien vs Galaxy Attack (Premium)


Evaluate The Model

In [45]:
# evaluation: To test on a random set (20% of the dataset)
testsize = int(len(ratmatrix)*0.2)
random.seed(66)
testidx  = random.sample(range(ratings_df.shape[0]), testsize) 
testevents = ratings_df.iloc[testidx,].values
np.set_printoptions(suppress=True) 
testevents[0:3]

array([[ 16616.,     67.,      1.],
       [113840.,    130.,      4.],
       [163633.,     74.,      3.]])

In [47]:
# blank out the test ratings in ratmatrix
for [u,i,rating] in testevents: 
  ratmatrix[int(u),int(i)] = np.nan

In [56]:
# define a function to compute predicted ratings for the testing dataset
def predictRatings(testevents, ratsmatrix, itemsims):
    preds = []
    for testevent in testevents:
        print('.', end = '')
        testuser = int(testevent[0])
        testitem = int(testevent[1])
        testuserrats = ratsmatrix[testuser,:]
        testitemrats = ratsmatrix[:,testitem]
        seenitems = np.isnan(testuserrats) == False
        predrat = wtavg(testuserrats[seenitems],itemsims[testitem,seenitems])
        preds.append(predrat)
    return np.array(preds)

In [51]:
ratmatrixFull = np.nan_to_num(ratmatrix)
print(ratmatrixFull)
itemsims = np.corrcoef(ratmatrixFull, rowvar=False)
itemsims

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


array([[ 1.        , -0.00363179, -0.00688166, ..., -0.00560421,
        -0.00632676, -0.00644521],
       [-0.00363179,  1.        , -0.00893721, ..., -0.00435351,
        -0.00606103, -0.00541902],
       [-0.00688166, -0.00893721,  1.        , ..., -0.00424069,
         0.06793573,  0.05004875],
       ...,
       [-0.00560421, -0.00435351, -0.00424069, ...,  1.        ,
        -0.00249698,  0.0016583 ],
       [-0.00632676, -0.00606103,  0.06793573, ..., -0.00249698,
         1.        ,  0.06787903],
       [-0.00644521, -0.00541902,  0.05004875, ...,  0.0016583 ,
         0.06787903,  1.        ]])

In [53]:
# Define a function to compute similarities between items in the ratings matrix using Pearson Coefficients
def getitemsimsmatrix(ratsmatrix,simfun=pearsonsim):
    r,c = ratsmatrix.shape
    matrx = list()
    for col1 in range(0,c):
        simrow = [0]*col1
        for col2 in range(col1,c):
            simrow.append(simfun(ratsmatrix[:,col1],ratsmatrix[:,col2]))
        matrx.append(simrow)
    matrx = np.array(matrx)
    matrx = matrx + matrx.T - np.diag(np.diag(matrx))
    return matrx

In [57]:
# compute the item-item based recommendations
simfun=pearsonsim
itemsims = getitemsimsmatrix(ratmatrix, simfun=simfun)
print(itemsims.shape)
itemsims

(170, 170)


array([[ 1.        ,  0.03492152,  0.30029508, ..., -0.3503293 ,
        -0.50000002, -0.8660254 ],
       [ 0.03492152,  1.        , -0.24019224, ..., -0.13810711,
         0.86602541,  0.57735027],
       [ 0.30029508, -0.24019224,  1.        , ...,  0.07026634,
         0.74917034,  0.70644097],
       ...,
       [-0.3503293 , -0.13810711,  0.07026634, ...,  1.        ,
        -0.2903018 ,  0.5024481 ],
       [-0.50000002,  0.86602541,  0.74917034, ..., -0.2903018 ,
         1.        ,  0.7252305 ],
       [-0.8660254 ,  0.57735027,  0.70644097, ...,  0.5024481 ,
         0.7252305 ,  1.        ]])

In [58]:
# compute MAE for item-based CF using pearson coefficients
simfun=pearsonsim
preds = predictRatings(testevents, ratmatrix, itemsims)
print("\nMAE for Item-based Recommender System: ",np.nanmean(abs(preds-testevents[:,2])))

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................