# Tutorial 8 Jester



In [1]:
import pandas
import numpy as np

import matplotlib.pyplot as plt 

import csv 
import ast
import re
%matplotlib inline

In [2]:
# Load the joke catalogue from the CSV in the working directory.
data = pandas.read_csv("jester_items.csv")
# Bare last expression so the notebook renders the frame as a table.
data

Unnamed: 0,jokeId,jokeText
0,1,"A man visits the doctor. The doctor says ""I ha..."
1,2,This couple had an excellent relationship goin...
2,3,Q. What's 200 feet long and has 4 teeth? \n\nA...
3,4,Q. What's the difference between a man and a t...
4,5,Q.\tWhat's O. J. Simpson's Internet address? \...
...,...,...
145,146,America: 8:00 - Welcome to work! 12:00 - Lunch...
146,147,It was the day of the big sale. Rumors of the ...
147,148,"Recently a teacher, a garbage collector, and a..."
148,149,"A little girl asked her father, ""Daddy? Do all..."


In [3]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
data.isna().sum()

jokeId      0
jokeText    0
dtype: int64

In [4]:
# Show the dtype of each column of the loaded frame.
data.dtypes

jokeId       int64
jokeText    object
dtype: object

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# One document per joke, in the same row order as `data`.
corpus = list(data["jokeText"])

# Plain bag-of-words counts, stop words included. An earlier
# CountVectorizer(stop_words='english') assignment was overwritten on the very
# next line and never used, so it has been removed; the effective behaviour
# (no stop-word filtering) is unchanged.
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(corpus)

# toarray() yields a regular ndarray rather than the legacy np.matrix that
# todense() returns.
doc_term_matrix = sparse_matrix.toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on older installations that lack it.
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# Rows are jokes, columns are vocabulary terms, cells are raw term counts.
df = pandas.DataFrame(doc_term_matrix, columns=feature_names)
df

Unnamed: 0,00,000,10,100,1040,11,12,125,13,14,...,yesterday,yet,york,you,young,younger,your,yourself,zipper,zo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,10,0,0,0,0,0,3,0,0,0,...,0,0,0,1,0,0,2,0,0,0
146,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
147,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
148,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Joke-to-joke cosine similarity on the raw term counts; with a single
# argument the matrix is compared against itself.
print(cosine_similarity(df))

[[1.         0.12811469 0.15593624 ... 0.38655501 0.         0.27717885]
 [0.12811469 1.         0.05216405 ... 0.25416307 0.14064217 0.20560171]
 [0.15593624 0.05216405 1.         ... 0.24422956 0.         0.18768782]
 ...
 [0.38655501 0.25416307 0.24422956 ... 1.         0.08779731 0.54548307]
 [0.         0.14064217 0.         ... 0.08779731 1.         0.07276316]
 [0.27717885 0.20560171 0.18768782 ... 0.54548307 0.07276316 1.        ]]


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting with English stop words removed.
vect = TfidfVectorizer(stop_words="english", min_df=1)
tfidf = vect.fit_transform(corpus)

# Sparse matrix product of the TF-IDF matrix with its transpose gives the
# joke-by-joke similarity matrix (the recorded output has 1.0 on the diagonal).
pairwise_similarity = tfidf @ tfidf.T
pairwise_similarity.toarray()



array([[1.        , 0.        , 0.        , ..., 0.0255217 , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.01774957, 0.03038234,
        0.09384287],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.0255217 , 0.01774957, 0.        , ..., 1.        , 0.01694724,
        0.01801729],
       [0.        , 0.03038234, 0.        , ..., 0.01694724, 1.        ,
        0.0052379 ],
       [0.        , 0.09384287, 0.        , ..., 0.01801729, 0.0052379 ,
        1.        ]])

In [8]:
import ipywidgets as widgets

# Dropdown for picking the joke to base recommendations on; the options are
# the joke ids from the data in ascending order, with id 1 preselected.
user_joke = widgets.Dropdown(
    description='Please choose a joke id:',
    options=sorted(data["jokeId"]),
    value=1,
    disabled=False,
)
user_joke

Dropdown(description='Please choose a joke id:', options=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…

In [9]:
# Dense copy of the TF-IDF similarity matrix to mask and rank.
arr = pairwise_similarity.toarray()
# Mask self-similarity with NaN so a joke is never recommended to itself.
np.fill_diagonal(arr, np.nan)

# Translate between jokeId and matrix row through the jokeId column instead of
# assuming jokeId == row + 1, which silently returns the wrong jokes if the
# ids have gaps or the CSV is not sorted by id.
joke_ids = np.asarray(data["jokeId"])
input_idx = joke_ids.tolist().index(user_joke.value)

# argsort places NaN last, so dropping the final position discards the masked
# self entry; the (up to) five positions before it are the most similar jokes,
# in ascending order of similarity.
top_rows = np.argsort(arr[input_idx])[-6:-1]
results_arr = joke_ids[top_rows]
results_arr

array([130,  68, 134, 103,  87], dtype=int64)

In [10]:
# results_arr is ordered from least to most similar; reverse it so the best
# match is listed first. Formatting every element (instead of a hard-coded
# range(5)) keeps this working when fewer or more than five results exist.
jokeList = ["joke number " + str(joke_id) for joke_id in list(results_arr)[::-1]]
print("If you liked the joke number "+ str(user_joke.value) + ", you might enjoy these jokes:\n"+'\n'.join(jokeList))

If you liked the joke number 1, you might enjoy these jokes:
joke number 87
joke number 103
joke number 134
joke number 68
joke number 130
