In [1]:
% matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import shutil
import os
from scipy.sparse import coo_matrix

In [2]:
# Calculate RMSE for ratings prediction
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [3]:
# load in dataset
import mysql.connector
import yaml

cfg = yaml.safe_load(open('_inc.yaml'))
cnx = mysql.connector.connect(user=cfg['mysql']['user'], password=cfg['mysql']['pwd'],
                            host=cfg['mysql']['server'], database=cfg['mysql']['db'])

# load games dataset
games = pd.read_sql_query("select * from tblGame;", cnx)
# load tv shows dataset
tv = pd.read_sql_query("select * from tblTVShow;", cnx)
# load movies dataset
movies = pd.read_sql_query("select * from tblMovie;", cnx) 
# load Reviews
reviews = pd.read_sql_query("select * from tblReview;", cnx)

In [4]:
# products_dataset
products_dataset = games[['uniqueID', 'name']].append(movies[['uniqueID', 'name']]).append(tv[['uniqueID', 'name']])

In [5]:
# Restricting to user_reviews for 
users = reviews[reviews['reviewType']=='u']
users = users[['uniqueID','author', 'score']]
users.head(5)

Unnamed: 0,uniqueID,author,score
514304,20417,swing,3
514305,20417,cwongx,10
514306,20417,jmmarch,3
514307,20417,RatedRex,8
514308,20417,Morphine_OD,0


In [7]:
users = users[users.author!=""]
# filter with authors >1
authors = users.groupby(['author']).agg('count')
author_gtone = authors[authors['score'] > 5].index
# take only those authors
users = users[users['author'].isin(author_gtone)]

In [178]:
user_item_matrix = pd.pivot_table(users, values = 'score', index = ['author'], columns = ['uniqueID'], aggfunc=np.max)

# take off anonymous reviews
user_item_matrix = user_item_matrix.iloc[1:,]

# delete any games that only had anonymous reviews
user_item_matrix  = user_item_matrix.dropna(axis = 1, how = 'all')
# fill na to zero
data = user_item_matrix.fillna(0)

In [133]:
data = user_item_matrix.fillna(0)

In [18]:
# normalize item_user_matrix by each user's mean
R = data.as_matrix()
user_ratings_mean = np.mean(R, axis = 1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [20]:
#################### BASELINE RMSE FOR COMPARISON TO SVD MODEL 

In [19]:
# baseline 
R_demeaned_mean = np.mean(R_demeaned)
baseline_average_prediction = np.ones([R.shape[0], R.shape[1]])*R_demeaned_mean 
baseline_rmse = rmse(baseline_average_prediction, R)

In [20]:
baseline_rmse

7.7026747561506825

In [21]:
#################### SVD Implementation

In [22]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned, k = 50)
sigma = np.diag(sigma)

In [23]:
# Making a prediction from decomposed matrix
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [24]:
model_rmse = rmse(all_user_predicted_ratings, R)

In [25]:
print "Improvement from baseline model(in percentage): %.2f" %((baseline_rmse-model_rmse)/baseline_rmse*100)

Improvement from baseline model(in percentage): 13.06


In [None]:
################## Recommendation

In [26]:
# Making product recommendation
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = data.columns)
preds_df.head()

uniqueID,1,2,3,4,5,6,11,13,14,15,...,27849,27850,27851,27853,27855,27857,27858,27860,27862,27864
0,-0.001216,-0.001337,0.041548,0.0008,-0.001359,-0.005172,-0.006479,-0.004509,-0.001431,-0.002176,...,0.019494,-0.000445,0.002438,0.05859,0.006004,0.006264,-0.00073,0.000599,0.032835,0.015136
1,-0.003198,-0.004956,-0.036639,-0.00699,-0.002697,-0.012704,0.081787,0.069166,-0.009021,-0.008953,...,0.027094,-0.005373,-0.00571,0.061787,0.079513,-0.049541,-0.007635,-0.003778,0.05385,-0.028959
2,-0.00108,-0.000729,-0.028017,0.000612,-0.000724,-0.004801,0.042167,0.042412,-0.001774,-0.005651,...,0.014242,-0.003583,-0.003656,-0.027926,0.134519,0.012311,-0.00345,-0.004685,0.008711,-0.006352
3,0.006759,0.006783,0.020259,0.007162,0.006749,0.007779,0.005188,0.004937,0.006663,0.006676,...,-0.002384,0.006461,0.004706,-0.016631,-0.008687,0.024862,0.006295,0.00494,-0.009656,0.017565
4,0.004702,0.004225,0.011059,0.005586,0.004839,0.012784,0.046455,0.044684,0.003816,0.004478,...,-0.010046,0.004105,0.005222,0.041234,0.029648,0.006596,0.002873,0.001989,0.014901,0.01248


In [34]:
user_row_number = 4
sorted_user_predictions = preds_df.iloc[user_row_number].sort_values(ascending=False)

In [44]:
"""
reccommend returns even items that a user has recommended highly
"""
def recommend(user_row_number, preds_df, info_df, user_item_df, top_n=5):
    # History
    user_history = user_item_df.iloc[[user_row_number,]]
    ids = user_history.columns[~((user_history==0).all())]
    # add in data frame infromation
    history = pd.merge(pd.DataFrame(ids),products_dataset[['uniqueID', 'name']],how="left", on ="uniqueID") 
    # add ratings information
    history = pd.merge(history, user_history.T.reset_index(), on= "uniqueID", how="left")
    
    
    # prediction
    sorted_user_predictions = preds_df.iloc[user_row_number].sort_values(ascending=False)
    recommendation = sorted_user_predictions.sort_values(ascending=False).iloc[:top_n]
    result = pd.merge(recommendation.reset_index()[['uniqueID']], info_df[['uniqueID', 'name']],how="left", on ="uniqueID")
    return result, history
    
suggestions, history = recommend(4, preds_df, products_dataset, data, 5)    

In [45]:
history

Unnamed: 0,uniqueID,name,0Bennyman
0,362,No Time to Explain,9.0
1,422,Murdered: Soul Suspect,8.0
2,7542,Slender: The Arrival,7.0
3,15338,Game of Thrones: A Telltale Games Series,10.0
4,17678,Metal Gear Solid V: The Phantom Pain,9.0
5,18201,Pro Evolution Soccer 2016,8.0
6,18226,Star Wars Battlefront,4.0
7,18718,Dying Light,9.0
8,18801,Call of Duty: Black Ops III,9.0
9,18813,Call of Duty: Advanced Warfare,9.0


In [46]:
suggestions

Unnamed: 0,uniqueID,name
0,18297,Halo 5: Guardians
1,18013,Titanfall
2,18189,Halo: The Master Chief Collection
3,18837,Sunset Overdrive
4,17726,The Witcher 3: Wild Hunt


In [49]:
"""
collaborative_recommend returns new items only
"""
def collaborative_recommend(user_row_number, preds_df, info_df, user_item_df, top_n=5):
    # History
    user_history = user_item_df.iloc[[user_row_number,]]
    ids = user_history.columns[~((user_history==0).all())]
    # add in data frame infromation
    history = pd.merge(pd.DataFrame(ids),products_dataset[['uniqueID', 'name']],how="left", on ="uniqueID") 
    # add ratings information
    history = pd.merge(history, user_history.T.reset_index(), on= "uniqueID", how="left")
    
    # prediction
    sorted_user_predictions = preds_df.loc[user_row_number,user_item_df.iloc[user_row_number,:]==0].sort_values(ascending=False)
    recommendation = sorted_user_predictions.sort_values(ascending=False).iloc[:top_n]
    result = pd.merge(recommendation.reset_index()[['uniqueID']], info_df[['uniqueID', 'name']],how="left", on ="uniqueID")
    return result, history
    
suggestions, history = collaborative_recommend(4, preds_df,games,data,10)    
suggestions

Unnamed: 0,uniqueID,name
0,18297,Halo 5: Guardians
1,18013,Titanfall
2,18189,Halo: The Master Chief Collection
3,18837,Sunset Overdrive
4,17726,The Witcher 3: Wild Hunt
5,18052,Rise of the Tomb Raider
6,3770,Ryse: Son of Rome
7,19809,Dead Rising 3
8,18346,Destiny
9,17761,Quantum Break


In [50]:
history

Unnamed: 0,uniqueID,name,0Bennyman
0,362,No Time to Explain,9.0
1,422,Murdered: Soul Suspect,8.0
2,7542,Slender: The Arrival,7.0
3,15338,Game of Thrones: A Telltale Games Series,10.0
4,17678,Metal Gear Solid V: The Phantom Pain,9.0
5,18201,Pro Evolution Soccer 2016,8.0
6,18226,Star Wars Battlefront,4.0
7,18718,Dying Light,9.0
8,18801,Call of Duty: Black Ops III,9.0
9,18813,Call of Duty: Advanced Warfare,9.0


In [177]:
user_item_matrix

uniqueID,1,2,3,4,5,6,11,13,14,15,...,27849,27850,27851,27853,27855,27857,27858,27860,27862,27864
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00Liteyear,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
06Morrow,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0Bennyman,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0fof0fo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0hope,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0javis0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0phidi4n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [148]:
#save pickle file
import pickle
with open('user_item_matrix.pickle', 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [149]:
############ Functions for Flask App 

In [150]:
# load pickle file of preloaded pivot table
from scipy.sparse.linalg import svds
import pickle

#to load a pickle file
with open('user_item_matrix.pickle', 'rb') as f:
    user_item_matrix = pickle.load(f)


In [172]:
d ={1:10, 2:10}
last = user_item_matrix.shape[0]
user_item_matrix = add_score(d, user_item_matrix)
preds_df = predict_svd(user_item_matrix)
collaborative_recommend(last-1, preds_df, user_item_matrix,5) 

In [151]:
# function takes in input from users and adds it to user_item matrix
"""
add_score: adds a dictionary of values rated obtained from user to the user_matrix
d should be formated as: type: dictionary, format: {'uniqueID': 'score'}
"""
def add_score(d, user_matrix):
    last = len(user_matrix)
    user_matrix.loc[len(user_matrix),d.keys()[0]] = d.values()[0]
    if len(d)>1:
        for i in range(1, len(d)):
            user_matrix.loc[last,d.keys()[i]] = d.values()[i]
    return user_matrix.fillna(0)

In [167]:
"""
predict_svd: takes a user_item_matrix and returns a matrix of same shape with all ratings predictions
"""

def predict_svd(user_item_matrix):
    # change type and normalize ratings for SVD
    R = user_item_matrix.as_matrix()
    user_ratings_mean = np.mean(R, axis = 1)
    R_demeaned = R - user_ratings_mean.reshape(-1, 1)
    
    # svd
    U, sigma, Vt = svds(R_demeaned, k = 50)
    sigma = np.diag(sigma)
    
    all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
    
    preds_df = pd.DataFrame(all_user_predicted_ratings, columns = user_item_matrix.columns)
    
    return preds_df

In [170]:
"""
collaborative_recommend returns a list with of uniqueIDs of recommended items
"""
def collaborative_recommend(user_row_number, preds_df, user_item_df, top_n=5):
    sorted_user_predictions = preds_df.loc[user_row_number,user_item_df.iloc[user_row_number,:]==0].sort_values(ascending=False)
    recommendation = sorted_user_predictions.sort_values(ascending=False).iloc[:top_n]
    return recommendation.index
    
suggestions = collaborative_recommend(4, preds_df, user_item_matrix,5)    


Int64Index([18297, 18013, 18189, 18837, 17726], dtype='int64', name=u'uniqueID')

In [None]:
# Train/test split to check which value of K to use in the SVD model

In [None]:
# split train and test dataset

# Define a function for splitting train/test data.
def split_train_test(data, percent_test=10):
    """Split the data(user to item matrix) into train/test sets.
    :param int percent_test: Percentage of data to use for testing. Default 10.
    """
    n, m = data.shape             # # users, # games
    N = n * m                     # # cells in matrix

    # Prepare train/test ndarrays.
    train = data.copy().values
    test = np.ones(data.shape) * np.nan

    # Draw random sample of training data to use for testing.
    tosample = np.where(~np.isnan(train))       # ignore nan values in data
    idx_pairs = zip(tosample[0], tosample[1])   # tuples of row/col index pairs
    indices = np.arange(len(idx_pairs))         # indices of index pairs
    
    test_size = len(indices) / percent_test  # use 10% of data as test set
    train_size = len(indices) - test_size    # and remainder for training
    
    sample = np.random.choice(indices, replace=False, size=test_size)

    # Transfer random sample from train set to test set.
    for idx in sample:
        idx_pair = idx_pairs[idx]
        test[idx_pair] = train[idx_pair]  # transfer to test set
        train[idx_pair] = np.nan          # remove from train set

    # Return train set, test set
    return train, test

train, test = split_train_test(user_item_matrix)
# Training dataset
R_train = np.nan_to_num(train)
train_user_ratings_mean = np.mean(R_train, axis = 1)
R_train_demeaned = R_train - train_user_ratings_mean.reshape(-1, 1)
# Test dataset
R_test = np.nan_to_num(train)

In [None]:
# SVD for training set
U, sigma, Vt = svds(R_train_demeaned, k = 50)
sigma = np.diag(sigma)

In [None]:
# Making a prediction from decomposed train matrix
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + train_user_ratings_mean.reshape(-1, 1)

In [None]:
# train error
rmse(all_user_predicted_ratings, R_train_demeaned, R_train)
# test error
rmse(all_user_predicted_ratings, R_test_demeaned, R_test)

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [28]:
# Alternative Least Square Implemenation

In [30]:
# users dataset, with no id reviews

# get some shape parameters
n_users = users.author.unique().shape[0]
n_products = users.gameID.unique().shape[0]

users = users[users['author']!=""]

# create numeric identification for users
authors = users.author.unique()
authors_id = range(n_users)
id_dict = dict(zip(authors, authors_id))


print n_users, n_products 

92991 9317


In [31]:
# Convert author name to numeric ID
users['author'] = users.apply(lambda row: id_dict[row['author']], axis=1)
users['gameID'] = users.apply(lambda row: int(row['gameID']), axis=1)
users.head(2)

Unnamed: 0,gameID,author,score
398058,3,0,5
398069,3,1,4


In [32]:
# split into train and test
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(users,test_size=0.2)

In [33]:
R = pd.pivot_table(train_data, values = 'score', index = ['author'], columns = ['gameID'], aggfunc=np.max)
R = R.as_matrix()

In [116]:
train_data.gameID.max() 

14992

In [123]:
# Create train and test matrix
R = np.zeros((14993, n_users))
for line in train_data.itertuples():
    R[line[1], line[2]] = line[3]  

T = np.zeros((14993, n_users))
for line in test_data.itertuples():
    T[line[1], line[2]] = line[3]

In [127]:
# Selector for rows with and without entries
# Index matrix for training data
I = R.copy()
I[I > 0] = 1
I[I == 0] = 0

# Index matrix for test data
I2 = T.copy()
I2[I2 > 0] = 1
I2[I2 == 0] = 0

In [129]:
# ALS algo
lmbda = 0.1 # Regularisation weight
k = 20 # Dimensionality of latent feature space
m, n = R.shape # Number of users and items
n_epochs = 15 # Number of epochs

P = 3 * np.random.rand(k,m) # Latent user feature matrix
Q = 3 * np.random.rand(k,n) # Latent product feature matrix
Q[0,:] = R[R != 0].mean(axis=0) # Avg. rating for each product
E = np.eye(k) # (k x k)-dimensional idendity matrix

In [133]:
# # Repeat until convergence
for epoch in range(n_epochs):
    # Fix Q and estimate P
    for i, Ii in enumerate(I):
        nui = np.count_nonzero(Ii) # Number of items user i has rated
        if (nui == 0): nui = 1 # Be aware of zero counts!
    
        # Least squares solution
        Ai = np.dot(Q, np.dot(np.diag(Ii), Q.T)) + lmbda * nui * E
        Vi = np.dot(Q, np.dot(np.diag(Ii), R[i].T))
        P[:,i] = np.linalg.solve(Ai,Vi)
        
    # Fix P and estimate Q
    for j, Ij in enumerate(I.T):
        nmj = np.count_nonzero(Ij) # Number of users that rated item j
        if (nmj == 0): nmj = 1 # Be aware of zero counts!
        
        # Least squares solution
        Aj = np.dot(P, np.dot(np.diag(Ij), P.T)) + lmbda * nmj * E
        Vj = np.dot(P, np.dot(np.diag(Ij), R[:,j]))
        Q[:,j] = np.linalg.solve(Aj,Vj)
    
    train_rmse = rmse(I,R,Q,P)
    test_rmse = rmse(I2,T,Q,P)
    train_errors.append(train_rmse)
    test_errors.append(test_rmse)
    
    print "[Epoch %d/%d] train error: %f, test error: %f" \
    %(epoch+1, n_epochs, train_rmse, test_rmse)

MemoryError: 

In [None]:
# Check performance by plotting train and test errors
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(range(n_epochs), train_errors, marker='o', label='Training Data');
plt.plot(range(n_epochs), test_errors, marker='v', label='Test Data');
plt.title('ALS-WR Learning Curve')
plt.xlabel('Number of Epochs');
plt.ylabel('RMSE');
plt.legend()
plt.grid()
plt.show()

In [None]:
# Calculate prediction matrix R_hat (low-rank approximation for R)
R_hat = pd.DataFrame(np.dot(P.T,Q))
R = pd.DataFrame(R)

In [None]:
# Compare true ratings of user 17 with predictions
ratings = pd.DataFrame(data=R.loc[16,R.loc[16,:] > 0]).head(n=5)
ratings['Prediction'] = R_hat.loc[16,R.loc[16,:] > 0]
ratings.columns = ['Actual Rating', 'Predicted Rating']

In [None]:
predictions = R_hat.loc[16,R.loc[16,:] == 0] # Predictions for movies that the user 17 hasn't rated yet
top5 = predictions.sort_values(ascending=False).head(n=5)
recommendations = pd.DataFrame(data=top5)
recommendations.columns = ['Predicted Rating']

recommendations