Importing required libraries: 

In [None]:
import pandas as pd
import numpy as np
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import SVD, KNNBasic
from surprise.prediction_algorithms import *
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise.model_selection import GridSearchCV,cross_validate
import seaborn as sns
import matplotlib.pyplot as plt

Importing data:

In [None]:
# Read the ratings CSV into a DataFrame; the path is relative to the current working directory.
Data = pd.read_csv(filepath_or_buffer="./data/FINALDATA.csv")

Creating the training and testing sets:

In [None]:
# Wrap the loaded table in a DataFrame so the rating columns can be selected below.
df = pd.DataFrame(data=Data)

# Surprise still needs a Reader for DataFrame input; only rating_scale is required.
reader = Reader(rating_scale=(1, 5))

# load_from_df expects the columns in the order: user id, item id, rating.
data = Dataset.load_from_df(
    df.loc[:, ['customer_id', 'id', 'vendor_rating']],
    reader,
)


In [None]:
# Hold out 25% of the ratings for testing. A fixed random_state makes the split,
# and therefore every metric and plot computed from it, reproducible after a
# kernel restart.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

SVD (model-based):

Hyperparameter tuning (cross-validation):

In [None]:
# Grid search over SVD hyperparameters, scored with 3-fold cross-validation.
# NOTE(review): the search runs on the full `data`, which also contains the rows
# later used as the test set — confirm this is intended.
param_grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.02, 0.1],
}

gs = GridSearchCV(SVD, param_grid, cv=3, measures=['rmse', 'mae'])
gs.fit(data)

# Print the best RMSE score first, then the parameter combination that produced it.
for report in (gs.best_score, gs.best_params):
    print(report['rmse'])

Modelling with best parameters:

In [None]:
# Build SVD from the grid-search winner rather than hand-copied values, so this
# cell cannot drift out of sync with the search when the grid or the data changes.
algo = SVD(**gs.best_params['rmse'])

# Fit on the training split, then predict every (user, item) pair in the test split.
algo.fit(trainset)
score1 = algo.test(testset)

# Compute (and display) the RMSE of the test-set predictions.
accuracy.rmse(score1)

Checking accuracy using testing data:

In [None]:
# Tabulate the SVD test predictions: one row per (customer, item) pair.
df_pred = pd.DataFrame(score1, columns=['customer_id', 'id', 'vendor_rating', 'pred_rating', 'details'])

# Pull the 'was_impossible' flag out of each prediction's details dict.
df_pred['impossible'] = df_pred['details'].apply(lambda x: x['was_impossible'])
df_pred['pred_rating_round'] = df_pred['pred_rating'].round()

# Absolute error against the actual rating. The actual rating is NOT rounded here,
# so the error is computed the same way as in the KNN analysis further down.
df_pred['abs_err'] = (df_pred['pred_rating'] - df_pred['vendor_rating']).abs()

# Reassign instead of mutating in place; the details dict is no longer needed.
df_pred = df_pred.drop(columns=['details'])

df_pred.sample(5)

In [None]:
# Side-by-side count plots of actual vs. predicted ratings for the test set.
palette = sns.color_palette("RdBu", 10)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))

sns.countplot(x='vendor_rating', data=df_pred, palette=palette, ax=ax1)
ax1.set_title('Distribution of actual ratings of Resturants in the test set')

# Count the rounded predictions: the raw predictions are continuous, so counting
# them directly would draw one bar per distinct float value.
sns.countplot(x='pred_rating_round', data=df_pred, palette=palette, ax=ax2)
ax2.set_title('Distribution of predicted ratings of Resturants in the test set')

plt.show()

In [None]:
# Mean absolute error for each actual rating value.
df_pred_err = df_pred.groupby('vendor_rating')['abs_err'].mean().reset_index()

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))

# histplot replaces the deprecated distplot; kde=True with stat='density' keeps the
# histogram-plus-density-curve view that distplot drew by default.
sns.histplot(df_pred['abs_err'], kde=True, stat='density', color='#2f6194', ax=ax1)
ax1.set_title('Distribution of absolute error in test set')

# `palette` is the colour palette created in the preceding plotting cell.
sns.barplot(x='vendor_rating', y='abs_err', data=df_pred_err, palette=palette, ax=ax2)
ax2.set_title('Mean absolute error for rating in test set')

plt.show()

KNNWithMeans (memory-based):

Hyperparameter tuning:

In [None]:
# Grid search for the item-based KNN model, scored with 3-fold cross-validation.
param_grid = {
    'k': [40, 45, 50],
    'min_k': [1, 3, 5],
    'sim_options': {
        'name': ['pearson'],
        'min_support': [1, 5],
        'user_based': [False],
    },
}

# Tune KNNWithMeans, the algorithm that is actually fitted afterwards; the search
# previously ran on KNNBaseline, so its "best" parameters belonged to another model.
gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])


Modelling with best parameters:

In [None]:
# Take the hyperparameters from the grid search above instead of hard-coding them,
# so the fitted model always matches the reported best configuration.
sim_options = gs.best_params['rmse']['sim_options']
algo2 = KNNWithMeans(
    k=gs.best_params['rmse']['k'],
    min_k=gs.best_params['rmse']['min_k'],
    sim_options=sim_options,
)

# Fit on the training split, then predict every (user, item) pair in the test split.
algo2.fit(trainset)
score2 = algo2.test(testset)

# Compute (and display) the RMSE of the test-set predictions.
accuracy.rmse(score2)

Distribution of actual and predicted ratings in the test set

In [None]:
# Tabulate the KNN test predictions and derive the helper columns in one chain:
# the 'was_impossible' flag, the rounded prediction and the absolute error.
# The raw details dict is dropped once the flag has been extracted.
df_pred = (
    pd.DataFrame(
        score2,
        columns=['customer_id', 'id', 'vendor_rating', 'pred_rating', 'details'],
    )
    .assign(
        impossible=lambda frame: frame['details'].map(lambda info: info['was_impossible']),
        pred_rating_round=lambda frame: frame['pred_rating'].round(),
        abs_err=lambda frame: (frame['pred_rating'] - frame['vendor_rating']).abs(),
    )
    .drop(columns=['details'])
)

df_pred.sample(5)

In [None]:
# Side-by-side count plots of actual vs. predicted ratings for the test set.
palette = sns.color_palette("RdBu", 10)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))

sns.countplot(x='vendor_rating', data=df_pred, palette=palette, ax=ax1)
ax1.set_title('Distribution of actual ratings of Resturants in the test set')

# Count the rounded predictions: the raw predictions are continuous, so counting
# them directly would draw one bar per distinct float value.
sns.countplot(x='pred_rating_round', data=df_pred, palette=palette, ax=ax2)
ax2.set_title('Distribution of predicted ratings of Resturants in the test set')

plt.show()

Absolute error of predicted ratings

In [None]:
# Mean absolute error for each actual rating value.
df_pred_err = df_pred.groupby('vendor_rating')['abs_err'].mean().reset_index()

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))

# histplot replaces the deprecated distplot; kde=True with stat='density' keeps the
# histogram-plus-density-curve view that distplot drew by default.
sns.histplot(df_pred['abs_err'], kde=True, stat='density', color='#2f6194', ax=ax1)
ax1.set_title('Distribution of absolute error in test set')

# `palette` is the colour palette created in the preceding plotting cell.
sns.barplot(x='vendor_rating', y='abs_err', data=df_pred_err, palette=palette, ax=ax2)
ax2.set_title('Mean absolute error for rating in test set')

plt.show()