In [1]:
## Importing libraries for Analysis and Visualization

import numpy as np,pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

In [2]:
## Parsing unstructured text data in jester_items.dat file using regular expressions.

import re

with open('jester_items.dat','r') as fo:
    x = fo.read()
    lst = []
    for joke in x.split('\n\n'):
        j = re.sub(r'(\<.+>)','',joke)
        k = re.sub(r'\W+',' ',j)
        l = re.sub(r'\d','',k).replace('quot','"')
        lst.append(l)
        
items_df = pd.DataFrame(lst)
items_df['item_id'] = range(1,len(lst)+1)
items_df.set_index('item_id',inplace=True)
items_df.drop(151,inplace=True)
items_df.rename(columns={0:'Joke'},inplace=True)
items_df.head()

Unnamed: 0_level_0,Joke
item_id,Unnamed: 1_level_1
1,"A man visits the doctor The doctor says "" I h..."
2,This couple had an excellent relationship goi...
3,Q What s feet long and has teeth A The fro...
4,Q What s the difference between a man and a ...
5,Q What s O J Simpson s web address A Slash ...


In [3]:
## Reading in Ratings df using pandas csv reader

ratings_df = pd.read_csv('jester_ratings.dat',header=None,delimiter='\t\t',engine='python')
ratings_df.rename(columns={0:'user_id',1:'item_id',2:'rating'},inplace=True)
ratings_df.head()

Unnamed: 0,user_id,item_id,rating
0,1,5,0.219
1,1,7,-9.281
2,1,8,-9.281
3,1,13,-6.781
4,1,15,0.875


In [4]:
## Combining items and user ratings

jester_df = ratings_df.merge(items_df,on='item_id')
jester_df.head()

Unnamed: 0,user_id,item_id,rating,Joke
0,1,5,0.219,Q What s O J Simpson s web address A Slash ...
1,2,5,-9.688,Q What s O J Simpson s web address A Slash ...
2,3,5,-9.844,Q What s O J Simpson s web address A Slash ...
3,4,5,-5.812,Q What s O J Simpson s web address A Slash ...
4,5,5,6.906,Q What s O J Simpson s web address A Slash ...


In [5]:
## Reducing size of Joke text to 25 characters for easy viewing.

jester_df_original = jester_df.copy() ## Taking a copy before editing.

jester_df['Joke'] = jester_df['Joke'].apply(lambda x: x[:25])

In [6]:
# Grouping the jokes based on rating and no of ratings received and sorting to get the best jokes.

best_jokes = jester_df.groupby('Joke').describe()['rating'][['count','mean']]
best_jokes
best_jokes.sort_values('mean',ascending=False,inplace=True) ## Sorting by mean rating

## Filtering best jokes that has received a minimum of 5K ratings.

best_jokes.columns = ['No.of.ratings','Mean']
best_jokes = best_jokes[best_jokes['No.of.ratings']>5000]

In [7]:
## Finding jokes of similar ilk as rated by users - BASIC RECOMMENDER SYSTEM VIA CORRELATION

joke_matrix = jester_df.pivot_table(index='user_id',columns='Joke',values='rating')

top_joke = best_jokes.index[0] ## Finding the Top joke 
similar_to_topjoke = joke_matrix[top_joke] ## Checking Top Joke's category in matrix 
Top5_correlates_jokes_to_top_joke = joke_matrix.corrwith(similar_to_topjoke).sort_values(ascending=False)[1:6]
Top5_correlates_jokes_to_top_joke

Joke
 This guy  s wife asks "     0.483581
 A little boy goes to his    0.477822
 On the first day of coll    0.476141
 President Clinton looks     0.472623
 A group of girlfriends i    0.472552
dtype: float64

**USING SURPRISE PACKAGE**

In [56]:
from surprise import KNNBasic,KNNWithMeans,KNNWithZScore,SVD ## Models
from surprise import Dataset,Reader,evaluate,GridSearch ## Data import and evaluation

In [23]:
jester_surprise = jester_df_original.copy()
jester_surprise.drop('Joke',axis=1,inplace=True)

jester_surprise2 = jester_surprise.iloc[:4000,:] ## Had to be cut short to 4000 rows to avoid Memory Error from happening.
#jester_surprise2.info()                     ## Surprise Documentation mentions standard use of 8 GB RAM system.

In [25]:
reader = Reader( rating_scale= [-10,10] )
data = Dataset.load_from_df(jester_surprise2,reader=reader)
data.split(n_folds=3)
user_based_cf = KNNBasic(sim_options={'name':'Cosine','user_based':True})

results = evaluate(user_based_cf,data)
print(results)

In [35]:
## Function tocheck multiple KNN based algos and results.

def algo_type(model,decision=True):
    algo = model(sim_options={'name':'Cosine','user_based':decision})
    return evaluate(algo,data)

algo_type(KNNBasic,False) ## Item Based Collaborative Filtering
#algo_type(KNNWithMeans,True) ## User Based CF taking into account mean ratings of each user.
#algo_type(KNNWithZScore,False) ## Item Based CF taking into account z score normalized ratings of each user.

In [51]:
## Applying Gridsearch

cv = GridSearch(KNNBasic,param_grid={'min_k':[2,4,6],'k':[20,30,40]})

cv.evaluate(data)

#cv.best_score
#cv.best_params



Running grid search for the following parameter combinations:
{'min_k': 2, 'k': 20}
{'min_k': 2, 'k': 30}
{'min_k': 2, 'k': 40}
{'min_k': 4, 'k': 20}
{'min_k': 4, 'k': 30}
{'min_k': 4, 'k': 40}
{'min_k': 6, 'k': 20}
{'min_k': 6, 'k': 30}
{'min_k': 6, 'k': 40}
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.




Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.




Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.




Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.




Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.




Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.




Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.




Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.




Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Resulsts:
{'min_k': 2, 'k': 20}
{'RMSE': 5.81681554990229, 'MAE': 4.992321811134398}
----------
{'min_k': 2, 'k': 30}
{'RMSE': 5.809542150997331, 'MAE': 4.988168074138952}
----------
{'min_k': 2, 'k': 40}
{'RMSE': 5.806312399728618, 'MAE': 4.988011914544899}
----------
{'min_k': 4, 'k': 20}
{'RMSE': 5.81681554990229, 'MAE': 4.992321811134398}
----------
{'min_k': 4, 'k': 30}
{'RMSE': 5.809542150997331, 'MAE': 4.988168074138952}
----------
{'min_k': 4, 'k': 40}
{'RMSE': 5.806312399728618, 'MAE': 4.988011914544899}
----------
{'min_k': 6, 'k': 20}
{'RMSE': 5.81681554990229, 'MAE': 4.992321811134398}
----------
{'min_k': 6, 'k': 30}
{'RMSE': 5.809542150997331, 'MAE': 4.988168074138952}
----------
{'min_k': 6, 'k': 40}
{'RMSE': 5.806312399728618, 'MAE': 4.988011914544899}
--