In [1]:
# import necessary libraries
import numpy as np
import pandas as pd

# Familiarisation of the data

In [2]:
# load the joke ratings from 'jokes-data.csv' (path is relative to the working directory)
data=pd.read_csv('jokes-data.csv')

In [3]:
# work on a copy (df) so the frame loaded into `data` is left untouched
df=data.copy()

In [4]:
# preview the first 3 rows to see the columns and the value formats
df.head(3)

Unnamed: 0,id,user_id,joke_id,Rating
0,31030_110,31030,110,2.75
1,16144_109,16144,109,5.094
2,23098_6,23098,6,-6.438


The data contains four columns: `id`, `user_id`, `joke_id` and `Rating`.

In [5]:
# size of the data as (number of rows, number of columns)
df.shape

(1092059, 4)

dataset contains 1092059 rows & 4 columns

In [6]:
# number of distinct user_id values (this cell counts users only)
df['user_id'].nunique()

40863

There are only 40863 distinct `user_id` values, because the same user can give ratings to several different jokes.

In [7]:
# number of distinct joke_id values
df['joke_id'].nunique()

139

Likewise, there are only 139 distinct jokes, because multiple users can give ratings to the same joke.

In [8]:
# count rows that are exact duplicates of an earlier row (all columns compared)
df.duplicated().sum()

0

There are no duplicated rows; each row is unique.

In [9]:
# count the missing (NaN) values in each column
df.isna().sum()

id         0
user_id    0
joke_id    0
Rating     0
dtype: int64

There are no null values in the dataset.

In [10]:
# preview the last 3 rows of the dataset
df.tail(3)

Unnamed: 0,id,user_id,joke_id,Rating
1092056,10580_81,10580,81,2.0
1092057,31007_119,31007,119,8.906
1092058,8420_98,8420,98,-3.344


# Build a collaborative filtering based recommendation system on jokes rating

In [11]:
# report the lowest and the highest rating present in the data
rating_col = df['Rating']
print('Min value given as rating', rating_col.min())
print('Max value given as rating', rating_col.max())


Min value given as rating -10.0
Max value given as rating 10.0


So, rating range is between -10 to 10

In [12]:
# how many ratings each user submitted: report the smallest and largest count
reviews_per_user = df['user_id'].value_counts()
print('Min no. of reviews', reviews_per_user.min())
print('MAx no. of reviews', reviews_per_user.max())


Min no. of reviews 7
MAx no. of reviews 93


Lowest no of review is 7 and highest review count is 93.

In collaborative filtering, we can set our own rules for the recommendation engine.<br>
Rules:<br>
1. The user should be genuine. The limit could be set at 50; here we keep only users who have rated more than 30 jokes.<br>
2. The joke should have received more than 40 ratings.<br>
3. The rating should be a positive value (greater than 0).

In [13]:
# drop the 'id' column; in the rows shown above it just joins user_id and joke_id
# (e.g. '31030_110'), so it carries no extra information for the recommender.
# Rebinding df (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second run no longer raises KeyError for the missing column.
df = df.drop(columns='id', errors='ignore')

In [14]:
# flag the "genuine" users: one boolean per user_id,
# True when that user has submitted more than 30 ratings
x = df.groupby('user_id')['Rating'].count().gt(30)

In [15]:
# keep the user_ids whose flag in x is True
valid_users = x.index[x.to_numpy()]

In [16]:
# restrict the ratings to the rows written by the valid users
filtered_data = df.loc[df['user_id'].isin(valid_users)]

In [17]:
# first 3 rows of the user-filtered dataset
filtered_data.head(3)

Unnamed: 0,user_id,joke_id,Rating
0,31030,110,2.75
3,14273,86,4.406
4,18419,134,9.375


In [18]:
# (rows, columns) left after keeping only the valid users
filtered_data.shape

(662889, 3)

In [19]:
# from the user-filtered data, flag the jokes that received more than 40 ratings
y = filtered_data.groupby('joke_id')['Rating'].count().gt(40)

In [20]:
# keep the joke_ids whose flag in y is True
famous_jokes = y.index[y.to_numpy()]

In [21]:
# restrict the ratings to the rows that belong to those jokes
final_ratings = filtered_data.loc[filtered_data['joke_id'].isin(famous_jokes)]

In [22]:
# (rows, columns) left after the joke filter
final_ratings.shape

(662889, 3)

In [23]:
# first 3 rows of the joke-filtered dataset
final_ratings.head(3)

Unnamed: 0,user_id,joke_id,Rating
0,31030,110,2.75
3,14273,86,4.406
4,18419,134,9.375


In [24]:
# keep only the positive ratings (Rating strictly greater than 0)
final_ratings = final_ratings.loc[final_ratings['Rating'].gt(0)]

In [25]:
# turn the ratings into vectors: one row per joke_id, one column per user_id,
# each cell holding the Rating (NaN where no rating is available)
pivot1 = final_ratings.pivot_table(
    values='Rating',
    index='joke_id',
    columns='user_id',
)

In [26]:
# display the pivot table (pandas truncates the wide frame in the output)
pivot1

user_id,1,4,6,7,9,12,14,18,21,26,...,40822,40823,40827,40828,40836,40845,40846,40847,40848,40863
joke_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.219,6.906,,6.219,,,0.469,6.312,,,...,,,,,,,,,,
2,,,,,0.281,,,,,,...,,,0.812,,2.438,,,,1.375,
3,,,,,0.781,,,5.469,,,...,,,,3.500,2.531,3.094,,,,
4,,,,,,,,,,,...,,,,,8.125,,,1.344,,
5,0.875,,,0.531,,,,,,,...,,,,0.125,2.594,8.000,,5.344,0.125,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,,,,,,,,,,,...,8.281,8.906,,,,,,3.469,3.625,8.375
136,,,,,,,,,,,...,7.375,,,,8.562,,,,,8.938
137,,,,,,,,,,,...,9.000,3.219,,,,3.656,9.438,,,8.281
138,,,,,,,,,,,...,9.781,,,,,5.906,,4.688,,


In [27]:
# cosine similarity cannot be computed with NaN, so treat "no rating" as 0.
# Fill with the number 0, not the string '0': a string would turn the float
# columns into mixed object columns and leave the numeric conversion to chance.
pivot1 = pivot1.fillna(0)

In [28]:
# Do the cosine similiarity to calculate similiarity score between the vectors
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
# cosine similarity between the rows of pivot1 (one row per joke_id),
# which gives a square joke-by-joke matrix of similarity scores
similiarity_score=cosine_similarity(pivot1)

In [30]:
# display the similarity matrix (numpy abbreviates the large array)
similiarity_score

array([[1.        , 0.04929195, 0.06509902, ..., 0.00772449, 0.00394203,
        0.00654602],
       [0.04929195, 1.        , 0.33617447, ..., 0.20793893, 0.27405183,
        0.24307759],
       [0.06509902, 0.33617447, 1.        , ..., 0.28324717, 0.33211691,
        0.29545946],
       ...,
       [0.00772449, 0.20793893, 0.28324717, ..., 1.        , 0.41780454,
        0.42255829],
       [0.00394203, 0.27405183, 0.33211691, ..., 0.41780454, 1.        ,
        0.47845143],
       [0.00654602, 0.24307759, 0.29545946, ..., 0.42255829, 0.47845143,
        1.        ]])

In [31]:
# show floats in pandas output with 2 decimals and thousands separators
pd.set_option('display.float_format', '{:,.2f}'.format)

In [32]:
# similarity of the first joke with itself (expected to be ~1)
similiarity_score[0, 0]

0.9999999999999999

In [33]:
# shape of similiarity_score as (rows, columns)
similiarity_score.shape

(139, 139)

In [34]:
# (position, score) pairs for the first row of the similarity matrix
list(enumerate(similiarity_score[0]))

[(0, 0.9999999999999999),
 (1, 0.049291948931704216),
 (2, 0.06509902119511526),
 (3, 0.04378304865741081),
 (4, 0.03365224933601356),
 (5, 0.04165642677341311),
 (6, 0.059366855999738254),
 (7, 0.02610664604441116),
 (8, 0.03894762029357893),
 (9, 0.4875881573384897),
 (10, 0.07589835846788523),
 (11, 0.04090556912405909),
 (12, 0.053854830824470803),
 (13, 0.06302256345499972),
 (14, 0.04226101482503605),
 (15, 0.05807953689875331),
 (16, 0.40666982417089104),
 (17, 0.04241621200830606),
 (18, 0.04331867378229827),
 (19, 0.04915923201530282),
 (20, 0.38254799901806225),
 (21, 0.05310709019466626),
 (22, 0.037313511588670634),
 (23, 0.05041754391744829),
 (24, 0.06676547405001176),
 (25, 0.04144351060939817),
 (26, 0.02676398323325723),
 (27, 0.04140441882615632),
 (28, 0.054474974017887534),
 (29, 0.06106429174760494),
 (30, 0.0491021429435081),
 (31, 0.07022407910213829),
 (32, 0.3051476708257401),
 (33, 0.040819048650673206),
 (34, 0.038041350071968585),
 (35, 0.036398877587252854)

In [35]:
# five highest-scoring entries for the first row, skipping the top hit
# (the top hit is expected to be the joke itself)
sorted(enumerate(similiarity_score[0]), key=lambda pair: pair[1], reverse=True)[1:6]

[(9, 0.4875881573384897),
 (16, 0.40666982417089104),
 (20, 0.38254799901806225),
 (50, 0.3716815266151468),
 (32, 0.3051476708257401)]

In [36]:
# recommend the jokes that are most similar to a given joke
def return_similiar_5_jokes(joke_id):
    """Return the five jokes most similar to ``joke_id``.

    Similarity is read from the notebook-level ``similiarity_score`` matrix,
    whose rows and columns are addressed with positions in ``pivot1.index``.

    Parameters
    ----------
    joke_id
        A joke id that is present in ``pivot1.index``.

    Returns
    -------
    list of list
        Up to five ``[joke_id, rating]`` pairs, most similar joke first.
        ``rating`` is the ``Rating`` of the first row found for that joke in
        ``final_ratings`` (one individual rating, not an average).

    Raises
    ------
    IndexError
        If ``joke_id`` is not present in ``pivot1.index``.
    """
    positions = np.where(pivot1.index == joke_id)[0]
    if len(positions) == 0:
        # Same exception type as the bare [0][0] lookup, but with a readable message.
        raise IndexError(f"joke_id {joke_id!r} not found in pivot1.index")
    index = positions[0]

    # Leave the query joke out by position instead of discarding the first
    # sorted entry: the joke itself is not guaranteed to rank first (tied
    # scores, or a self-similarity of 0 for an all-zero row).
    candidates = [pair for pair in enumerate(similiarity_score[index]) if pair[0] != index]
    similar_items = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:5]

    data = []
    for position, _score in similar_items:
        # First row for this joke: yields its joke_id and one rating.
        matches = final_ratings[final_ratings['joke_id'] == pivot1.index[position]]
        first_row = matches.drop_duplicates('joke_id')
        data.append(list(first_row['joke_id']) + list(first_row['Rating']))

    return data

In [37]:
return_similiar_5_jokes(14) # similar jokes for joke_id 14

[[114, 3.312], [34, 7.875], [48, 8.719], [47, 3.531], [64, 8.219]]

In [38]:
return_similiar_5_jokes(70) # similar jokes for joke_id 70

[[63, 3.656],
 [21, 2.656],
 [90, 0.594],
 [17, 5.5310000000000015],
 [33, 6.093999999999999]]

Note: This is a collaborative recommendation system. That means the recommendations are based on similar user preferences.