In [3]:
import pandas as pd
import numpy as np

In [4]:
import matplotlib.pyplot as plt 

In [6]:
data_cf = pd.read_csv('data_for_collab.csv')

In [5]:
data_cf.head()

Unnamed: 0,user_id,business_id,stars
0,---1lKK3aKOuomHnwAkAow,--9e1ONYQuAa-CB_Rrw7Tw,4.0
1,---1lKK3aKOuomHnwAkAow,7vHvQCjPq5pWj7Rio1A--w,1.0
2,---1lKK3aKOuomHnwAkAow,A0X1baHPgw9IiBRivu0G9g,5.0
3,---1lKK3aKOuomHnwAkAow,AZlnpvILz5cEWJifjr2CSQ,5.0
4,---1lKK3aKOuomHnwAkAow,GJBLjDkfJ4tvkpAjDeCikA,1.0


In [7]:
Unique_users = pd.DataFrame([i for i in data_cf['user_id'].unique()],columns = ['user_id'])
Unique_business_id = pd.DataFrame([i for i in data_cf['business_id'].unique()],columns = ['business_id'])

In [8]:
Unique_users['user_index'] = [i for i in range(len(Unique_users))]
Unique_business_id['business_index'] = [i for i in range(len(Unique_business_id))]

In [9]:
data_cf = data_cf.merge(Unique_users, how= 'left',left_on='user_id', right_on='user_id')
data_cf = data_cf.merge(Unique_business_id, how= 'left',left_on='business_id', right_on='business_id')

In [10]:
data_cf.head()

Unnamed: 0,user_id,business_id,stars,user_index,business_index
0,---1lKK3aKOuomHnwAkAow,--9e1ONYQuAa-CB_Rrw7Tw,4.0,0,0
1,---1lKK3aKOuomHnwAkAow,7vHvQCjPq5pWj7Rio1A--w,1.0,0,1
2,---1lKK3aKOuomHnwAkAow,A0X1baHPgw9IiBRivu0G9g,5.0,0,2
3,---1lKK3aKOuomHnwAkAow,AZlnpvILz5cEWJifjr2CSQ,5.0,0,3
4,---1lKK3aKOuomHnwAkAow,GJBLjDkfJ4tvkpAjDeCikA,1.0,0,4


In [11]:
data_cf.shape

(823199, 5)

In [12]:
data_cf.isna().sum()

user_id           0
business_id       0
stars             0
user_index        0
business_index    0
dtype: int64

In [13]:
data_cf[['user_index','business_index','stars']].head()

Unnamed: 0,user_index,business_index,stars
0,0,0,4.0
1,0,1,1.0
2,0,2,5.0
3,0,3,5.0
4,0,4,1.0


---
## Using surprise package

In [15]:
from surprise import Reader, Dataset,evaluate,accuracy

In [16]:
reader = Reader(rating_scale=(1, 5.0))

In [17]:
cf_surprise  = Dataset.load_from_df(data_cf[['user_index','business_index','stars']], reader)

In [18]:
import random
random.seed(1245) 

In [19]:
from surprise.model_selection import train_test_split
train, test = train_test_split(cf_surprise, test_size=.25)

---
### Baseline model

In [20]:
from surprise import BaselineOnly

In [21]:
Baseline_cf = BaselineOnly()

In [22]:
Baseline_cf.fit(train)
predictions_bl = Baseline_cf.test(test)

Estimating biases using als...


In [23]:
accuracy.rmse(predictions_bl)

RMSE: 1.0348


1.0347564873643547

In [39]:
predictions_bl[0]

Prediction(uid=15685, iid=8529, r_ui=4.0, est=3.6698428434483663, details={'was_impossible': False})

----
### Building a functions for KPIs


In [122]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [120]:
def MAR(predictions, top_n = 5):

    Top_5 = get_top_n(predictions,top_n)
    All_recalls = list()
    for uid in Top_5.keys():

        df = pd.DataFrame(Top_5[uid])
        est = [1 if x > 3.5 else 0 for x in df[1] ]
        act = [1 if x > 3.5 else 0 for x in df[2] ]
        All_recalls.append(recall_score(est,act))
        
    return(np.average(All_recalls))

In [123]:
def MAP(predictions, top_n = 5):

    Top_5 = get_top_n(predictions,top_n)
    All_recalls = list()
    for uid in Top_5.keys():

        df = pd.DataFrame(Top_5[uid])
        est = [1 if x > 3.5 else 0 for x in df[1] ]
        act = [1 if x > 3.5 else 0 for x in df[2] ]
        All_recalls.append(precision_score(est,act))
        
    return(np.average(All_recalls))

In [144]:
def ADCG(predictions, top_n = 5):

    Top_5 = get_top_n(predictions,top_n)
    All_DCG = list()
    for uid in Top_5.keys():
        
        df = pd.DataFrame(Top_5[uid])
        DCG = list()
        for i in range(len(df)):
            if i ==0:
                DCG.append(df[2][i])
            else:
                DCG.append(df[2][i]/np.log2(i+1))
        
        
        All_DCG.append(sum(DCG))
        
    return(np.average(All_DCG))

In [121]:
MAR(predictions_bl)

  'recall', 'true', average, warn_for)


0.8278981648369654

In [124]:
MAP(predictions_bl)

  'precision', 'predicted', average, warn_for)


0.8192869626959419

In [145]:
ADCG(predictions_bl)

12.76313496020085

---
## Predictions

In [29]:
from collections import defaultdict

In [92]:
# This is updated
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, est,r_ui,_ in predictions:
        top_n[uid].append((iid, est, r_ui))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


---
### Extracting recommendations for all users

In [0]:
top_5_recco = get_top_n(predictions_bl, n=5)
all_uids = []
all_top_res = []

# Store the recommended items for each user

for uid, user_ratings in top_5_recco.items():
    all_uids.append(Unique_users.user_id[Unique_users['user_index'] == uid].values[0])
    all_top_res.append([Unique_business_id.business_id[Unique_business_id['business_index'] == iid].values[0]
                    for (iid, _) in user_ratings])

In [0]:
Top_5_recco = pd.DataFrame(data = {'uids': all_uids , 'top_res' : all_top_res})

In [0]:
Top_5_recco.iloc[1]['top_res']

['OoKNxTMu5YAaNgQKQ4SrzA',
 'u-SJ5QUwrNquL9VnXwl8cg',
 'ohEnmKpF7i2_ujme1p_vUQ',
 'GdgGAINCqMXqcuKd2nKEBg',
 'wl0QZqAzr1DelslQ02JGCQ']

In [0]:
t = data_cf.business_id[data_cf['user_id']== '_jjbwHWLFA5GjWFWGLj4ZQ']
t[t.isin(Top_5_recco.iloc[1]['top_res'])]

489277    GdgGAINCqMXqcuKd2nKEBg
489295    OoKNxTMu5YAaNgQKQ4SrzA
489354    ohEnmKpF7i2_ujme1p_vUQ
489365    u-SJ5QUwrNquL9VnXwl8cg
489373    wl0QZqAzr1DelslQ02JGCQ
Name: business_id, dtype: object

---
### Trail

In [0]:
test01 = [('abcd','1',3),('abcd','b',0),('abcd',43,5),('abcd',5,5)]
test01

[('abcd', '1', 3), ('abcd', 'b', 0), ('abcd', 43, 5), ('abcd', 5, 5)]

In [0]:
predictions_test01 = Baseline_cf.test(test01)

In [0]:
predictions_test01

[Prediction(uid='abcd', iid='1', r_ui=3, est=3.8021122131872906, details={'was_impossible': False}),
 Prediction(uid='abcd', iid='b', r_ui=0, est=3.8021122131872906, details={'was_impossible': False}),
 Prediction(uid='abcd', iid=43, r_ui=5, est=4.344172026870368, details={'was_impossible': False}),
 Prediction(uid='abcd', iid=5, r_ui=5, est=4.073890050990429, details={'was_impossible': False})]

----
### KNN

In [26]:
from surprise import KNNBasic
Knn_cf = KNNBasic(sim_options={'user_based': True})

In [27]:
Knn_cf.fit(train)
predictions_knn = Knn_cf.test(test)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [28]:
accuracy.rmse(predictions_knn)

RMSE: 1.1117


1.1117434087887796

-----
### KNN means model

KNN with means takes into consideration how a user usually rates. But this can have drawbacks in cases where the user has rated only 2-3 restaurants and the user didn't like all of them.

In [40]:
from surprise import KNNWithMeans

In [41]:
Knnwithmeans_cf = KNNWithMeans(sim_options={'user_based': True})

In [42]:
Knnwithmeans_cf.fit(train)
predictions_knnwithmeans = Knnwithmeans_cf.test(test)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [59]:
accuracy.rmse(predictions_knnwithmeans), accuracy.mae(predictions_knnwithmeans)

RMSE: 1.0694
MAE:  0.8268


(1.0693752195674013, 0.8267623071557545)

In [60]:
from surprise.model_selection import GridSearchCV
param_grid = {
             'k': [200,250,300,700],\
             'sim_options': {'name': ['msd','cosine']  } \
            }

gsKNNMeans = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=3)

In [61]:
gsKNNMeans.fit(cf_surprise)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matr

In [62]:
gsKNNMeans.best_params

{'rmse': {'k': 700, 'sim_options': {'name': 'cosine', 'user_based': True}},
 'mae': {'k': 700, 'sim_options': {'name': 'cosine', 'user_based': True}}}

In [63]:
gsKNNMeans.best_score

{'rmse': 1.0715353437122157, 'mae': 0.8265848303883844}

------
### NMF

In [29]:
from surprise import NMF
NMF_cf = NMF()

In [30]:
NMF_cf.fit(train)
predictions_nmf = NMF_cf.test(test)

In [31]:
accuracy.rmse(predictions_nmf)

RMSE: 1.1192


1.1192437928532888

In [64]:
# Including a bias parameter
NMF_cf_v2 = NMF(biased = True)

In [65]:
NMF_cf_v2.fit(train)
predictions_nmf_v2 = NMF_cf_v2.test(test)

In [66]:
accuracy.rmse(predictions_nmf_v2)

RMSE: 1.0580


1.0580416007074052

In [34]:
# Fine tuning the NMF model
from surprise.model_selection import GridSearchCV

In [83]:
param_grid = { 'n_factors' : [15] , 'biased' : [True],
    'n_epochs': [10], 'init_low': [0.00, 0.001,0.0001],
              'lr_bu' : [0.0001, 0.001,0.01] , 'lr_bi' : [0.0001, 0.001,0.01]}

gsNMF = GridSearchCV(NMF, param_grid, measures=['rmse', 'mae'], cv=3)
#gs.fit(cf_surprise)

In [None]:
gsNMF.fit(cf_surprise)

In [81]:
gsNMF.best_params

{'rmse': {'n_factors': 15,
  'biased': True,
  'n_epochs': 10,
  'init_low': 0.0001,
  'lr_bu': 0.01,
  'lr_bi': 0.005},
 'mae': {'n_factors': 15,
  'biased': True,
  'n_epochs': 10,
  'init_low': 0.01,
  'lr_bu': 0.01,
  'lr_bi': 0.005}}

In [82]:
gsNMF.best_score

{'rmse': 1.0475839075365967, 'mae': 0.8180544161506189}

Using the best parameters 

---
### NOT using the surprise package
### Baseline model

In [0]:
from scipy.sparse import csr_matrix

In [0]:
data_cf_pivot = data_cf.pivot(
    index='business_index',
    columns='user_index',
    values='stars'
)

In [0]:
#data_cf_matrix = csr_matrix(data_cf_pivot.values)

In [0]:
total_mean = data_cf_pivot.mean()

In [0]:
avg_rating_user = data_cf_pivot.mean(1)
avg_rating_rest = data_cf_pivot.mean(0)
data_cf_pivot.shape

In [0]:
len(avg_rating_user),len(avg_rating_rest)

(16906, 24512)

In [0]:
type(data_cf_pivot)

pandas.core.frame.DataFrame

In [0]:
NANs = np.isnan(data_cf_pivot)

In [0]:
for i in range(avg_rating_user):
    for j in range(avg_rating_rest):
        if NANs[i][j] == True:
            b_mean = total_mean - data_cf_pivot[i].mean()
            b_r = total_mean - data_cf_pivot[j].mean()
            data_cf_pivot[i][j] = total_mean + ()

## KNN for collaborative filtering

Transforming the data into required/necessary form

In [0]:
from scipy.sparse import csr_matrix

In [0]:
# High runtime

In [0]:
data_cf_pivot = data_cf.pivot(
    index='user_index',
    columns='business_index',
    values='stars'
).fillna(0)

In [0]:
data_cf_pivot.head()

business_index,0,1,2,3,4,5,6,7,8,9,...,16896,16897,16898,16899,16900,16901,16902,16903,16904,16905
user_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.0,1.0,5.0,5.0,1.0,5.0,3.0,1.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
data_cf_pivot.shape

(24512, 16906)

In [0]:
# to make memory effective we will be using csr matrix

In [0]:
data_cf_matrix = csr_matrix(data_cf_pivot.values)

In [0]:
from sklearn.neighbors import NearestNeighbors

Top 10 restaurants

In [0]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=11, n_jobs=-1).fit(data_cf_matrix)

In [0]:
distances, indices = model_knn.kneighbors(data_cf_matrix)

In [0]:
indices[0],distances[0]

(array([   0,  167,  901,  902, 1185, 6479,  742,  763,  890,  892, 2641],
       dtype=int64),
 array([2.22044605e-16, 8.25606044e-01, 8.32054397e-01, 8.42598246e-01,
        8.44297572e-01, 8.49964124e-01, 8.52980981e-01, 8.60204906e-01,
        8.64273235e-01, 8.64819370e-01, 8.66019236e-01]))

In [0]:
data_cf_pivot.head()

user_index,0,1,2,3,4,5,6,7,8,9,...,24502,24503,24504,24505,24506,24507,24508,24509,24510,24511
business_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
