# Importing

Importing the necessary libraries

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns

from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Data processing

Importing the datasets into a Dataframe

In [3]:
jester_items = pd.read_csv(r'C:\Users\vbraun\Downloads\SDM-Datasets\jester_items.csv')
jester_ratings = pd.read_csv(r'C:\Users\vbraun\Downloads\SDM-Datasets\jester_ratings.csv')
display(jester_items.head(3),jester_ratings.head(3))

Unnamed: 0,jokeId,jokeText
0,1,"A man visits the doctor. The doctor says ""I ha..."
1,2,This couple had an excellent relationship goin...
2,3,Q. What's 200 feet long and has 4 teeth? \n\nA...


Unnamed: 0,userId,jokeId,rating
0,1,5,0.219
1,1,7,-9.281
2,1,8,-9.281


As is shown above, only the 'jester_ratings' dataset is required for the creation of our RecSys model.

The jokeId will be renamed to itemId to have a more universal dataset.

In [4]:
jester_df = jester_ratings.rename(columns={'jokeId':'itemId'})

In [5]:
print('Original length of dataset:',len(jester_df))
print('The original dataset has', jester_df['userId'].nunique(), 'unique users')
print('The original dataset has', jester_df['itemId'].nunique(), 'unique jokes')
print('The original dataset has', jester_df['rating'].nunique(), 'unique ratings')
print('The unique ratings are', sorted(jester_df['rating'].unique()))

Original length of dataset: 1761439
The original dataset has 59132 unique users
The original dataset has 140 unique jokes
The original dataset has 641 unique ratings
The unique ratings are [-10.0, -9.969, -9.938, -9.906, -9.875, -9.844, -9.812, -9.781, -9.75, -9.719, -9.688, -9.656, -9.625, -9.594, -9.562, -9.531, -9.5, -9.469, -9.438, -9.406, -9.375, -9.344, -9.312, -9.281, -9.25, -9.219, -9.188, -9.156, -9.125, -9.094, -9.062, -9.031, -9.0, -8.969, -8.937999999999999, -8.906, -8.875, -8.844, -8.812000000000001, -8.781, -8.75, -8.719, -8.687999999999999, -8.656, -8.625, -8.594, -8.562000000000001, -8.531, -8.5, -8.469, -8.437999999999999, -8.406, -8.375, -8.344, -8.312000000000001, -8.281, -8.25, -8.219, -8.187999999999999, -8.156, -8.125, -8.094, -8.062000000000001, -8.031, -8.0, -7.968999999999999, -7.938, -7.906000000000001, -7.875, -7.843999999999999, -7.812, -7.781000000000001, -7.75, -7.718999999999999, -7.688, -7.656000000000001, -7.625, -7.593999999999999, -7.562, -7.531000000

The ratings vary from -10 to 10 which may allow problems when working with mean ratings of users. Therefore each rating will be increased by 10 to create ratings from 0 to 20. 

In [6]:
jester_df['rating'] = jester_df['rating'] + 10
print('The unique ratings are', sorted(jester_df['rating'].unique()))

The unique ratings are [0.0, 0.031000000000000583, 0.06199999999999939, 0.09399999999999942, 0.125, 0.15600000000000058, 0.1880000000000006, 0.21899999999999942, 0.25, 0.2810000000000006, 0.3119999999999994, 0.3439999999999994, 0.375, 0.4060000000000006, 0.4380000000000006, 0.4689999999999994, 0.5, 0.5310000000000006, 0.5619999999999994, 0.5939999999999994, 0.625, 0.6560000000000006, 0.6880000000000006, 0.7189999999999994, 0.75, 0.7810000000000006, 0.8119999999999994, 0.8439999999999994, 0.875, 0.9060000000000006, 0.9380000000000006, 0.9689999999999994, 1.0, 1.0310000000000006, 1.0620000000000012, 1.0939999999999994, 1.125, 1.1560000000000006, 1.1879999999999988, 1.2189999999999994, 1.25, 1.2810000000000006, 1.3120000000000012, 1.3439999999999994, 1.375, 1.4060000000000006, 1.4379999999999988, 1.4689999999999994, 1.5, 1.5310000000000006, 1.5620000000000012, 1.5939999999999994, 1.625, 1.6560000000000006, 1.6879999999999988, 1.7189999999999994, 1.75, 1.7810000000000006, 1.812000000000001

# Data filtering

In order to filter the dataset based on activity and reduce the sparsity of the data, the data will be grouped and filtered based on jokes and users. The datasets will show how many ratings each joke has gotten and the amount of ratings a user has given.

In [7]:
jester_item_count = jester_df[['itemId','userId']].groupby('itemId').count().reset_index().rename(columns={'userId':'user_count'})
jester_user_count = jester_df[['itemId','userId']].groupby('userId').count().reset_index().rename(columns={'itemId':'item_count'})

display(jester_item_count.head(3),jester_user_count.head(3))

Unnamed: 0,itemId,user_count
0,5,661
1,7,59122
2,8,57720


Unnamed: 0,userId,item_count
0,1,62
1,2,34
2,3,18


To reduce the sparcity of data in the dataset, we will filter out users that have rated fewer than 5% of the total amount of jokes.

In [8]:
filtered_jester_df = jester_df[jester_df['userId'].isin(jester_user_count[jester_user_count['item_count']/len(jester_item_count) > 0.05]['userId'])]

print('Length of dataset:',len(filtered_jester_df))

Length of dataset: 1729140


Finally, jokes that have been rated by fewer than 20 people will be filtered out of the dataset.

In [9]:
filtered_jester_df = filtered_jester_df[filtered_jester_df['itemId'].isin(jester_item_count[jester_item_count['user_count']>20]['itemId'])]

print('Length of dataset:',len(filtered_jester_df))

Length of dataset: 1729140


# EDA

t.b.d.

# KNN Model Creation

Pivot the dataset into a matrix with index='itemId', columns='userId', values='rating' in order to later perform content-based collaborative filtering. Moreover, fill_value = 0 in order to remove NaN values and save them as 0. Finally, the matrix is directly stored as a sparse matrix to save memory, instead of first saving the entire matrix into memory. 

In [10]:
sparse_matrix = csr_matrix(filtered_jester_df.pivot_table(index='itemId', columns='userId', values='rating').subtract(filtered_jester_df.pivot_table(index='itemId', columns='userId', values='rating').mean(axis=0), axis = 'columns').fillna(0).values)
sparse_matrix.check_format

<bound method _cs_matrix.check_format of <140x50712 sparse matrix of type '<class 'numpy.float64'>'
	with 1727548 stored elements in Compressed Sparse Row format>>

Splitting the dataset into 70% training, 15% validation and 15% testing data

In [11]:
train_data, test_data = train_test_split(sparse_matrix, test_size=.30)
test_data, validation_data = train_test_split(test_data, test_size=.50)

A function has to be created to perform hyper parameter tuning for our KNN RecSys model, as NearestNeighbors and kneighbors do not work using the standard GridSearch function.

In [12]:
def evaluate_predictions(pred, truth):
    pred = pred[truth.nonzero()].flatten()
    truth = truth[truth.nonzero()].flatten()
    rmse = np.sqrt(mean_squared_error(pred,truth))
    mae = mean_absolute_error(pred,truth)
    return rmse, mae

1. Create NearestNeighbors model
1. Fit the model with train data
1. Use kneighbors to find the k amount of neighbors of the jokes in the test data
1. Calculate the prediction by taking the average score of the k most similar jokes
1. Evaluate the model by comparing the actual ratings with the predicted ratings

Algorithm is set at brute (force) because the inputdata is sparse

In [30]:
def create_knn_model(metric='cosine',k = 5,n_neighbors = 20,testdata=validation_data):
    knn_model = NearestNeighbors(metric=metric,algorithm='brute',n_neighbors=n_neighbors,n_jobs=-1)

    knn_model_fitted = knn_model.fit(train_data.toarray())
    distance, indices = knn_model_fitted.kneighbors(testdata.toarray(),k)

    similarity = 1 - distance
    predictions = similarity.T.dot(testdata.toarray()) / np.array([np.abs(similarity.T).sum(axis = 1)]).T
    truth = testdata.toarray()[similarity.argsort()[0]]
    
    rmse, mae = evaluate_predictions(predictions,truth)
    
    return rmse, knn_model_fitted

Hyperparameter tuning by hand:

In [31]:
n_neighbors = [5,10,20,50]
recommendation_amount = [3,5,10]
metric = ['euclidean','manhattan','cosine','minkowski']

hpt_results = []
for met in metric:
    for k in recommendation_amount:
        for n in n_neighbors:
            hpt_results.append([create_knn_model(metric=met, k=k,n_neighbors=n,testdata=validation_data)[0],met,k,n])
print(hpt_results)

[[4.10518002301106, 'euclidean', 3, 5], [4.10518002301106, 'euclidean', 3, 10], [4.10518002301106, 'euclidean', 3, 20], [4.10518002301106, 'euclidean', 3, 50], [4.07125004122666, 'euclidean', 5, 5], [4.07125004122666, 'euclidean', 5, 10], [4.07125004122666, 'euclidean', 5, 20], [4.07125004122666, 'euclidean', 5, 50], [4.127616453538912, 'euclidean', 10, 5], [4.127616453538912, 'euclidean', 10, 10], [4.127616453538912, 'euclidean', 10, 20], [4.127616453538912, 'euclidean', 10, 50], [4.232916152990279, 'manhattan', 3, 5], [4.232916152990279, 'manhattan', 3, 10], [4.232916152990279, 'manhattan', 3, 20], [4.232916152990279, 'manhattan', 3, 50], [4.178803063955458, 'manhattan', 5, 5], [4.178803063955458, 'manhattan', 5, 10], [4.178803063955458, 'manhattan', 5, 20], [4.178803063955458, 'manhattan', 5, 50], [4.2616701543585584, 'manhattan', 10, 5], [4.2616701543585584, 'manhattan', 10, 10], [4.2616701543585584, 'manhattan', 10, 20], [4.2616701543585584, 'manhattan', 10, 50], [4.08936232834994

In [32]:
best_parameters = sorted(hpt_results, key=lambda x: x[0])[0]
print(best_parameters)

[3.9879673072753534, 'cosine', 10, 5]


Evaluating the model with test data

In [33]:
rmse, model = create_knn_model(metric=best_parameters[1], k=best_parameters[2],n_neighbors=best_parameters[3],testdata = test_data)

print(rmse)

3.8440879086703252
