In [1]:
# Import required libraries


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import the dataset


data = pd.read_csv('/content/jokes-data.csv')
data.head()

Unnamed: 0,id,user_id,joke_id,Rating
0,31030_110,31030,110,2.75
1,16144_109,16144,109,5.094
2,23098_6,23098,6,-6.438
3,14273_86,14273,86,4.406
4,18419_134,18419,134,9.375


In [3]:
#Find any null values

data.isna().sum()

id         0
user_id    0
joke_id    0
Rating     0
dtype: int64

In [4]:
#Find any duplicate values


data.duplicated().sum()

0

In [5]:
data.columns

Index(['id', 'user_id', 'joke_id', 'Rating'], dtype='object')

In [5]:
#drop id

data = data.drop('id', axis=1)
data.head()

Unnamed: 0,user_id,joke_id,Rating
0,31030,110,2.75
1,16144,109,5.094
2,23098,6,-6.438
3,14273,86,4.406
4,18419,134,9.375


In [6]:
!pip install scikit-surprise




In [7]:
from surprise import Reader, Dataset, KNNBasic, SVD, KNNWithMeans

In [8]:
# Load the data into the Surprise package

reader = Reader(rating_scale=(min(data['Rating']), max(data['Rating'])))
data1 = Dataset.load_from_df(data, reader)

In [9]:
#importing cross validation

from surprise.model_selection import cross_validate

In [10]:
# Create the Single Value Decomposition algorithm

algo_svd=SVD()
results = cross_validate(algo_svd, data1, measures=['RMSE'], cv=5, verbose = True)
results

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    4.2963  4.2954  4.2865  4.2951  4.2941  4.2935  0.0035  
Fit time          19.42   19.05   16.79   17.96   17.30   18.10   1.00    
Test time         2.84    2.26    2.99    2.35    2.00    2.49    0.37    


{'test_rmse': array([4.29625451, 4.29538088, 4.28651076, 4.29505072, 4.29409048]),
 'fit_time': (19.416359901428223,
  19.054348468780518,
  16.78693389892578,
  17.95866823196411,
  17.3011417388916),
 'test_time': (2.840986728668213,
  2.2591235637664795,
  2.9884793758392334,
  2.3546628952026367,
  1.9991815090179443)}

In [None]:
# Use KNNWithMeans  --->runtime crash

#algo_knnm = KNNWithMeans()
#result2 = cross_validate(algo_knnm, data1, measures = ['RMSE'], cv=5)
#result2

In [None]:
# Use KNNBasic   --->runtime crash

#algo_knn = KNNBasic()
#result3 = cross_validate(algo_knn, data1, measures = ['RMSE'], cv=5)
#result3

In [11]:
training_set=data1.build_full_trainset()


In [13]:
#List of all the ratings in the training set

ratings = list(training_set.all_ratings())

In [14]:
# Print the first few ratings
print(ratings[:8])

[(0, 0, 2.75), (0, 106, -9.719), (0, 43, -9.375), (0, 72, -5.5), (0, 30, 4.688), (0, 80, 1.469), (0, 108, -8.375), (0, 71, -9.062)]


In [15]:
#Predicting the results

prediction = algo_svd.predict('user_id','joke_id','Rating')
prediction

Prediction(uid='user_id', iid='joke_id', r_ui='Rating', est=1.7566906145266745, details={'was_impossible': False})

In [16]:
prediction.est

1.7566906145266745