In [74]:
from surprise import KNNBasic,NormalPredictor
from surprise import Dataset,Reader
from surprise.model_selection import cross_validate,train_test_split
from surprise import accuracy
from evaluation_metrics import precision_recall_at_k
import pandas as pd
import pickle
import joblib

Collaborative Filtering 
*   Loading the user_item_rating dataset that I have created in the last stage.
*   Using Random predictor as a baseline algorithem for recommendation of items to user.
*   Applying two approches of colaborative filtering - USER_BASE and ITEM_BASE 
*   Similarity between users and items will be calculated by cosin-sim.
*   The results of the algorithem will be validated by using RMSE score.


In [11]:
user_item_rating = pd.read_csv('../data/user_item_rating.csv')

Using KNN basic (user and item based) and normal algorithem(random).
* Executing cross validation over each algorithm

In [28]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(user_item_rating[['user_id', 'item_id', 'rating']], reader)
# User-based KNN
user_knn_algo = KNNBasic(sim_options = {'name': 'cosine', 'user_based': True})
# Item-based KNN
item_knn_algo = KNNBasic(sim_options = {'name': 'cosine', 'user_based': False})
#Random predictor
random_algo = NormalPredictor()

algos= [('user_knn_algo',user_knn_algo),('item_knn_algo',item_knn_algo),('random_algo',random_algo)]
for algo in algos :
    print(algo[0])
    cross_validate(algo[1], data, measures=['RMSE'], cv=3, verbose=True)

user_knn_algo
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.2262  1.2271  1.2317  1.2283  0.0024  
Fit time          8.31    8.22    8.44    8.32    0.09    
Test time         1.34    1.53    1.34    1.40    0.09    
item_knn_algo
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.1527  1.1522  1.1521  1.1523  0.0003  
Fit time          13.25   12.71   13.13   13.03   0.23

According to the cross-validation resluts above:
* Random algorotim rmse is 1.41 on average - used as a baseline.
* Both CF algo perform better than random algorithem with USER:1.22 and ITEM :1.15
* Even though the results of the knn are better than random it is hard to validate by those scores the real accuracy of the algorithems.

In [29]:
trainset, testset = train_test_split(data, test_size=0.25,shuffle=False)
#Train the model on the entire dataset
user_knn_algo.fit(trainset)
item_knn_algo.fit(trainset)
random_algo.fit(trainset)

#Evaluate on the same dataset
user_knn_predictions = user_knn_algo.test(testset)
item_knn_predictions = item_knn_algo.test(testset)
random_knn_predictions = random_algo.test(testset)

# Calculate and print RMSE
accuracy.rmse(user_knn_predictions)
# Calculate and print RMSE
accuracy.rmse(item_knn_predictions)
# Calculate and print RMSE
accuracy.rmse(random_knn_predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.2126
RMSE: 1.2142
RMSE: 1.4093


1.4093492071932534

In order to evaluate better the performce of the system we have to use more informative metric than rmse.
*   Mostly used metrics for recommendation systems are types of hit rate perfomances.
*   Two of them are Preision@K and Recall@K
*   Preision@K  - Proportion of recommended items that are relevant.(for given rating and K threshold )
*   Recall@K  - Proportion of relevant items that are recommended.(for given rating and K threshold )

In [31]:
for algo in algos:
    predictions = algo[1].test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)

    precision_at_k = sum(prec for prec in precisions.values()) / len(precisions)
    recall_at_k = sum(rec for rec in recalls.values()) / len(recalls)
    # Precision and recall can then be averaged over all users
    print(f'Preision@K for algo {algo[0]} is  {precision_at_k}')
    print(f'Recall@K for algo {algo[0]} is  {precision_at_k}')

Preision@K for algo user_knn_algo is  0.8018351031216056
Recall@K for algo user_knn_algo is  0.8018351031216056
Preision@K for algo item_knn_algo is  0.7728669011142516
Recall@K for algo item_knn_algo is  0.7728669011142516
Preision@K for algo random_algo is  0.6997323494017224
Recall@K for algo random_algo is  0.6997323494017224


Accoding to the results (K=10 , threshold=4):
*  Both knn are better than random.
*  On contraty to RMSE metric , user_knn perform better when considering recall@k and precision@K

Fit each algorithem on the full dataset for application usage.

In [32]:
# First train an SVD algorithm on the movielens dataset.
full_trainset = data.build_full_trainset()
user_knn_algo.fit(trainset)
item_knn_algo.fit(trainset)
random_algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.random_pred.NormalPredictor at 0x1d57f3d2080>

Test if item_knn algorithem find similar to a given item.
*   I have chosen the most frqu item in the dataset which is 'Puppia Dog Harnesses'
*   According to the top 5 item we can see that 3 out of 5 are realted to dogs.

In [52]:
import json
with open('../outputs/IDs/items_ids_dict.json', 'r') as file:
    items_ids_dict = json.load(file)
with open('../outputs/IDs/ids_items_dict.json', 'r') as file:
    ids_items_dict = json.load(file)

In [70]:
# Retrieve raw_id
item_raw_id = items_ids_dict["Puppia Dog Harnesses"]
item_inner_id = item_knn_algo.trainset.to_inner_iid(item_raw_id)
item_neighbors = item_knn_algo.get_neighbors(item_inner_id, k=5)
# Convert inner ids of the neighbors into names.
item_neighbors = (
    item_knn_algo.trainset.to_raw_iid(inner_id) for inner_id in item_neighbors
)
item_neighbors = (ids_items_dict[str(rid)] for rid in item_neighbors)
print("The 5 nearest neighbors of Items are:")
for item in item_neighbors:
    print(item)

The 5 nearest neighbors of Items are:
KONG Sitting Frog Dog Toy Medium Green
2LB Cafe Pablo Subtle Earth Organic Gourmet Coffee Dark Roast Whole Bean Coffee USDA Certified Organic Arabica Coffee 2 lb Bag
PetSafe Pet Screen Door Dog Cat Door Screen Door Window Porch Use
Tropicana Orange Juice 10 Ounce Pack 24
Best Bully Sticks 100 Natural Cow Ear Dog Treats 15 Pack


Save the trainset and the fitted algo 
*   KNN algo are highly memory consuming , thus I will not be ables to use them in the application as pre-saved
*   Saving the random algo for baseline comparsion of the following algos.

In [71]:
# Save the trained model to a file
user_knn_algo_filename = '../outputs/CF/user_knn_algo.sav'
# Save the trained KNN model to the file
joblib.dump(user_knn_algo, user_knn_algo_filename, compress=('zlib', 9))  # Use zlib compression with level 9 (maximum compression)

item_knn_algo_filename = '../outputs/CF/item_knn_algo.sav'
# Save the trained KNN model to the file
joblib.dump(item_knn_algo, item_knn_algo_filename, compress=('zlib', 9))  # Use zlib compression with level 9 (maximum compression)

with open('../outputs/Random/random_algo.sav', 'wb') as file:
    pickle.dump(random_algo, file)
with open('../outputs/CF/trainset.sav', 'wb') as file:
    pickle.dump(full_trainset, file)