# CASE STUDY ON RECOMMENDATION SYSTEMS

#### Build a collaborative filtering based recommendation system on jokes rating.

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('jokes-data.csv')
df.head()

Unnamed: 0,id,user_id,joke_id,Rating
0,31030_110,31030,110,2.75
1,16144_109,16144,109,5.094
2,23098_6,23098,6,-6.438
3,14273_86,14273,86,4.406
4,18419_134,18419,134,9.375


In [3]:
df.shape

(1092059, 4)

In [4]:
df.nunique()

id         1092059
user_id      40863
joke_id        139
Rating         641
dtype: int64

In [5]:
df.isna().sum()

id         0
user_id    0
joke_id    0
Rating     0
dtype: int64

In [6]:
# for building collaborative filtering u need:
# user_id, joke_id and rating

In [7]:
df = df.drop('id', axis=1)
df

Unnamed: 0,user_id,joke_id,Rating
0,31030,110,2.750
1,16144,109,5.094
2,23098,6,-6.438
3,14273,86,4.406
4,18419,134,9.375
...,...,...,...
1092054,9517,132,3.156
1092055,27767,118,-1.594
1092056,10580,81,2.000
1092057,31007,119,8.906


In [8]:
from surprise import Reader, Dataset, KNNBasic, SVD  #singular value decomposition

In [9]:
# define a reader object
# reader object helps in parsing the file or dataframe containing ratings

In [10]:
reader = Reader()

In [11]:
# create a dataset to be used for building the filter

In [12]:
data = Dataset.load_from_df(df, reader)
data

<surprise.dataset.DatasetAutoFolds at 0x1eacee26850>

In [13]:
algo = KNNBasic()

In [14]:
from surprise.model_selection import cross_validate

In [16]:
#singular value decomposition
algo2 = SVD()

In [17]:
cross_validate(algo2, data, measures=['RMSE'], cv=5, verbose=True)   #verbose = true means while training too it will tell what is the rmse scores for different folds

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    4.6546  4.6715  4.6603  4.6684  4.6649  4.6639  0.0060  
Fit time          9.77    10.08   10.52   10.50   10.48   10.27   0.30    
Test time         2.27    2.10    2.10    2.10    2.10    2.13    0.07    


{'test_rmse': array([4.65456849, 4.67152329, 4.66030075, 4.66837701, 4.66485067]),
 'fit_time': (9.773905515670776,
  10.077255010604858,
  10.520045042037964,
  10.498553991317749,
  10.482023477554321),
 'test_time': (2.2686679363250732,
  2.1024272441864014,
  2.1013879776000977,
  2.0982398986816406,
  2.101195812225342)}

In [None]:
# SVD is performing better as its mean rmse is smaller