In [91]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings(action='once')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

sns.set_style('white')

In [92]:
df_train  = pd.read_csv("C:/Users/salon/Desktop/Fifth Semester/Data Analytics/Project Datasets/Colaborative Filtering/Colab/collab_train.csv")
df_test  = pd.read_csv("C:/Users/salon/Desktop/Fifth Semester/Data Analytics/Project Datasets/Colaborative Filtering/Colab/collab_test.csv")
df_train  = df_train[['gameid','username','rating']]
df_test = df_test[['gameid','username','rating']]

In [93]:
df_train.head()

Unnamed: 0,gameid,username,rating
0,104347,Larry Chong,7.0
1,25613,bluekingzog,10.0
2,35497,Bundyman,6.5
3,47,laycelin,8.0
4,20551,gixmo,3.0


In [95]:
df_train.shape

(879, 3)

In [94]:
df_test.head()

Unnamed: 0,gameid,username,rating
0,160477,549sd,8.0
1,171669,aaronseeber,7.0
2,12942,agentzen,4.0
3,5867,Aiken Drum,6.0
4,164928,Aiken Drum,6.0


In [96]:
df_test.shape

(220, 3)

In [97]:
#Indexed values for gameid and username fields

trainrows = df_train.username.unique()
traincols = df_train['gameid'].unique()
df_train = df_train[['gameid', 'username', 'rating']]
df_train
idict = dict(zip(traincols, range(len(traincols))))
udict = dict(zip(trainrows, range(len(trainrows))))
idict
udict
df_train.username = [ udict[i] for i in df_train.username ]
df_train['gameid'] = [ idict[i] for i in df_train['gameid'] ]
trainmat = df_train.values
testrows = df_test.username.unique()
testcols = df_test['gameid'].unique()
df_test = df_test[['gameid', 'username', 'rating']]
idict = dict(zip(testcols, range(len(testcols))))
udict = dict(zip(testrows, range(len(testrows))))
df_test.username = [ udict[i] for i in df_test.username ]
df_test['gameid'] = [ idict[i] for i in df_test['gameid'] ]
testmat = df_test.values

def nonzerocount(a):
    count = 0
    for i in a:
        for j in i:
            if j != 0:
                count += 1
    return count
print("Sparsity of train matrix: ",nonzerocount(trainmat)/np.prod(trainmat.shape))
print("Sparsity of test matrix: ",nonzerocount(testmat)/np.prod(testmat.shape))

Sparsity of train matrix:  0.9992415623814941
Sparsity of test matrix:  0.996969696969697


In [99]:
print("Unique users in df_train", len(trainrows))
print("Unique games in df_train", len(traincols))
print("Unique users in df_test", len(testrows))
print("Unique games in df_test", len(testcols))

Unique users in df_train 617
Unique games in df_train 617
Unique users in df_test 201
Unique games in df_test 198


In [100]:
naive_train = np.zeros((len(trainrows),len(traincols))) #Pivot table for train dataset
for row in trainmat:
  naive_train[int(row[1]), int(row[0])] = int(row[2])
print(naive_train.shape)

(617, 617)


In [101]:
naive_test = np.zeros((len(testrows),len(testcols)))#Pivot table for test dataset
for row in testmat:
  naive_test[int(row[1]), int(row[0])] = int(row[2])
print(naive_test.shape)

(201, 198)


In [102]:
from sklearn.metrics.pairwise import cosine_similarity 

similarities = cosine_similarity(naive_train) #Calculating the user-user similairty matrix
print(similarities.shape)

#Finding k similar users
def ksimilar(user, SM, k):
    similarities = SM[user]
    similarities[user] = 0
    similar_users = []
    for i in range(k):
        m = np.argmax(similarities)
        similar_users.append(m)
        similarities[m] = 0
    return similar_users

(617, 617)


In [110]:
prediction_train = np.zeros_like(naive_train)
real_train = np.zeros_like(naive_train)

for row, val in enumerate(naive_train):
  avg_rating = np.mean(naive_train[row]) #On an average how has the user rated games
  sim = similarities[row]  #Stores all the similarity values of that user with respect to other users
  for col in val:
    prediction_train[int(row), int(col)] = avg_rating + np.sum(sim*naive_train[int(row)][int(col)])/np.sum(np.abs(sim))
    real_train[int(row), int(col)] = naive_train[int(row)][int(col)]

print(prediction_train)

[[7.01134522 0.         0.         ... 0.         0.         0.        ]
 [0.02755267 0.         0.         ... 0.         0.         0.        ]
 [0.00972447 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.00972447 0.         0.         ... 0.         0.         0.        ]
 [0.01458671 0.         0.         ... 0.         0.         0.        ]
 [0.01134522 0.         0.         ... 0.         0.         0.        ]]


In [104]:
print(prediction_train.shape)

(617, 617)


In [111]:
prediction_test = np.zeros_like(naive_test)
real_test = np.zeros_like(naive_test)

for row, val in enumerate(naive_test):
  avg_rating = np.mean(naive_test[row]) #On an average how has the user rated games
  sim = similarities[row]  #Stores all the similarity values of that user with respect to other users
  for col in val:
    prediction_test[int(row), int(col)] = avg_rating + np.sum(sim*naive_train[int(row)][int(col)])/np.sum(np.abs(sim))
    real_test[int(row), int(col)] = naive_test[int(row)][int(col)]

print(prediction_test)

[[7.04040404 0.         0.         ... 0.         0.         0.        ]
 [0.03535354 0.         0.         ... 0.         0.         0.        ]
 [0.02020202 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.03535354 0.         0.         ... 0.         0.         0.        ]
 [0.03535354 0.         0.         ... 0.         0.         0.        ]
 [0.03535354 0.         0.         ... 0.         0.         0.        ]]


In [113]:
print(prediction.shape)

(201, 198)


In [117]:
import math #Defining the rmse function
def rmse(a, b):
    return math.sqrt(np.square(a - b).mean())

In [118]:
err1 = rmse(np.array(prediction_train), np.array(real_train))
print('Train Error')
print('RMSE : %.4f' % err1)

Train Error
RMSE : 0.0012


In [119]:
err2 = rmse(np.array(prediction_test), np.array(real_test))
print('Test Error')
print('RMSE : %.4f' % err2)

Test Error
RMSE : 0.0713
