![](https://www.ieseg.fr/wp-content/uploads/IESEG-Logo-2012-rgb.jpg)


# Content based - TFIDF

# Jester Data
- Values from (-10.00 to +10.00) of 100 jokes from 73,421 users: collected between April 1999 - May 2003


In [1]:
import pandas as pd 
import numpy as np 
from IESEGRecSys import eval
from IESEGRecSys.model import ContentBased
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, KNNBasic, SVD

# NLP packages
import nltk # pip install nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\p.borchert\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\p.borchert\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
text = pd.read_csv('../Data/jester/JokeText.csv')
data = pd.read_csv('../Data/jester/UserRatings1.csv')

In [3]:
text.head()

Unnamed: 0,JokeId,JokeText
0,0,"A man visits the doctor. The doctor says ""I ha..."
1,1,This couple had an excellent relationship goin...
2,2,Q. What's 200 feet long and has 4 teeth? \n\nA...
3,3,Q. What's the difference between a man and a t...
4,4,Q.\tWhat's O. J. Simpson's Internet address? \...


In [4]:
data.head()

Unnamed: 0,JokeId,User1,User2,User3,User4,User5,User6,User7,User8,User9,...,User36701,User36702,User36703,User36704,User36705,User36706,User36707,User36708,User36709,User36710
0,0,5.1,-8.79,-3.5,7.14,-8.79,9.22,-4.03,3.11,-3.64,...,,,,,,,,,2.91,
1,1,4.9,-0.87,-2.91,-3.88,-0.58,9.37,-1.55,0.92,-3.35,...,,,,-5.63,,-6.07,,-1.6,-4.56,
2,2,1.75,1.99,-2.18,-3.06,-0.58,-3.93,-3.64,7.52,-6.46,...,,,,,,4.08,,,8.98,
3,3,-4.17,-4.61,-0.1,0.05,8.98,9.27,-6.99,0.49,-3.4,...,,,,,,,,,,
4,4,5.15,5.39,7.52,6.26,7.67,3.45,5.44,-0.58,1.26,...,2.28,-0.49,5.1,-0.29,-3.54,-1.36,7.48,-5.78,0.73,2.62


In [5]:
data = data.set_index('JokeId').T
data.head()

JokeId,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
User1,5.1,4.9,1.75,-4.17,5.15,1.75,4.76,3.3,-2.57,-1.41,...,5.34,-4.61,3.59,7.18,0.92,6.31,-4.95,-0.19,3.25,4.37
User2,-8.79,-0.87,1.99,-4.61,5.39,-0.78,1.6,1.07,-8.69,-4.66,...,3.59,1.21,2.86,-0.05,-1.75,-1.02,-0.97,4.13,-1.84,2.96
User3,-3.5,-2.91,-2.18,-0.1,7.52,1.26,-5.39,1.5,-8.4,4.37,...,1.84,-4.03,-1.41,1.65,-3.79,3.98,-6.46,-6.89,-2.33,-7.38
User4,7.14,-3.88,-3.06,0.05,6.26,6.65,-7.52,7.28,-5.15,-7.14,...,-4.47,6.36,4.71,-5.19,6.26,3.93,-2.57,1.07,2.33,-0.34
User5,-8.79,-0.58,-0.58,8.98,7.67,8.25,4.08,2.52,-9.66,2.48,...,-0.29,9.37,8.3,9.13,-3.45,9.13,9.17,9.17,9.08,8.98


In [6]:
# train-test split
train, test = train_test_split(data, test_size=0.3, random_state=42)

# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

print(data.shape)
print(train.shape)
print(test.shape)

(36710, 100)
(25697, 100)
(11013, 100)


In [7]:
# transform 
test_stack = test.stack().reset_index()
test_stack.columns = ['user', 'item', 'rating']

# transform 
train_stack = train.stack().reset_index()
train_stack.columns = ['user', 'item', 'rating']
train_stack.head()

Unnamed: 0,user,item,rating
0,0,0,7.48
1,0,1,6.6
2,0,2,4.13
3,0,3,-2.82
4,0,4,2.96


# Exercise 1

## Preprocessing text data

In [8]:
# Tokenize, case conversion & only alphabetic
tokens = text['JokeText'].apply(lambda txt: [word.lower() for word in word_tokenize(str(txt)) if word.isalpha()])

In [9]:
# setup stop words list
stop_words = stopwords.words('english')
stop_words.append('nan')

stemmer = SnowballStemmer("english")

# remove stopwords
# stem
token_stem = tokens.apply(lambda lst_token: [stemmer.stem(tok) for tok in lst_token if tok not in stop_words and len(tok) > 2])

# Exercise 2

## Term Frequency - Inverse Document Frequency (TF-IDF)

In [10]:
# TFIDF vectorizer
tfidf = TfidfVectorizer(min_df=5)

# apply tf-idf vectorizer -> document-term-matrix in sparse format
dtm = tfidf.fit_transform([" ".join(x) for x in token_stem])

print(dtm.shape)

df_dtm = pd.DataFrame(dtm.toarray(), columns=tfidf.get_feature_names_out(), index=text.index)
df_dtm.head()

(100, 71)


Unnamed: 0,anoth,answer,ask,back,bar,bill,call,car,chang,clinton,...,use,walk,want,well,wife,woman,work,would,year,yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.467335,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.238011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.295167,0.0,0.303372,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.650491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# count word occurences in text preprocessed corpus 
from collections import Counter

res_l = []
for l in token_stem: res_l+=l

token_dict = Counter(res_l)
token_dict = {k: v for k, v in sorted(token_dict.items(), key=lambda item: item[1], reverse=True)}
pd.DataFrame(token_dict.items(), columns=['token', 'count'])[:10]

Unnamed: 0,token,count
0,say,60
1,one,41
2,man,36
3,engin,27
4,ask,26
5,repli,23
6,two,21
7,said,20
8,blah,20
9,get,19


# Exercise 3

## Recommendations

In [12]:
reader = Reader(rating_scale=(-10, 10))
df_train = Dataset.load_from_df(train_stack, reader).build_full_trainset()
df_test = list(test_stack.itertuples(index=False, name=None))

In [13]:
# content-based
cb = ContentBased(NN=10)
cb.fit(df_dtm)
cb.fit_ratings(df_train)

# predict test ratings
cb_pred = cb.test(df_test)

  self.prediction = (np.matmul(df_pivot.values, self.matrixNN) / denom) + self.user_avg[:,np.newaxis]


In [14]:
# item-based
options = {'name':'cosine', 'user_based':False}
ib = KNNBasic(k=15, min_k=5, sim_options=options, random_state=42)
ib.fit(df_train)

# svd
mf = SVD(n_factors=20, biased=False, random_state=42)
mf.fit(df_train)

models = {"CB_10":cb, "IB_15":ib, "SVD_20":mf}
overview = pd.concat([eval.evaluate(mod.test(df_test), topn=5, rating_cutoff=5) for mod in models.values()], axis=1)
overview.columns = list(models.keys())
overview

Computing the cosine similarity matrix...
Done computing similarity matrix.


Unnamed: 0,CB_10,IB_15,SVD_20
RMSE,6.053173,5.962839,6.259536
MAE,4.958585,4.830249,5.057436
Recall,0.081892,0.15267,0.165661
Precision,0.258553,0.308126,0.31008
F1,0.124387,0.204176,0.21595
NDCG@5,0.841168,0.904337,0.875019
