In [207]:
import json
import pickle

import joblib
import numpy as np
import pandas as pd
from surprise import Dataset, Reader
from surprise import KNNBasic, NormalPredictor
from surprise import accuracy
from surprise.model_selection import KFold, cross_validate, train_test_split

import helper_functions as hf
from evaluation_metrics import precision_recall_at_k

Collaborative Filtering 
*   Loading the user_item_rating dataset that I created in the previous stage.
*   Using a random predictor as the baseline algorithm for recommending items to users.
*   Applying two approaches to collaborative filtering - USER_BASED and ITEM_BASED.
*   Similarity between users and between items is calculated with cosine similarity.
*   The results of the algorithms are validated using the RMSE score.


In [30]:
# Ratings table written by the previous stage of the project.
ratings_csv_path = '../data/user_item_rating.csv'
user_item_rating = pd.read_csv(ratings_csv_path)

Using KNNBasic (user-based and item-based) and the NormalPredictor algorithm (random).
* Executing cross-validation for each algorithm

In [31]:
# Fixed seed so the fold assignment and the random baseline repeat on every run.
CV_SEED = 42

# Declare the 1-5 rating scale and hand Surprise the (user, item, rating) triplets.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(user_item_rating[['user_id', 'item_id', 'rating']], reader)

# User-based KNN
user_knn_algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
# Item-based KNN
item_knn_algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
# Random predictor (baseline)
random_algo = NormalPredictor()

algos = [('user_knn_algo', user_knn_algo), ('item_knn_algo', item_knn_algo), ('random_algo', random_algo)]

# An integer-seeded KFold yields the same 3 folds on every split() call, so all
# algorithms are scored on identical splits instead of three unrelated random ones.
cv_folds = KFold(n_splits=3, random_state=CV_SEED, shuffle=True)
# Seed numpy's global RNG as the Surprise FAQ recommends for reproducible runs;
# this is what keeps the random baseline's predictions repeatable.
np.random.seed(CV_SEED)

# Keep the per-fold scores so the averages quoted in the text can be looked up
# (e.g. cv_results['item_knn_algo']['test_rmse'].mean()) rather than read off the log.
cv_results = {}
for algo_name, algo_model in algos:
    print(algo_name)
    cv_results[algo_name] = cross_validate(algo_model, data, measures=['RMSE'], cv=cv_folds, verbose=True)

user_knn_algo
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.2098  1.1991  1.1918  1.2002  0.0074  
Fit time          11.14   10.45   10.45   10.68   0.32    
Test time         2.32    2.56    2.77    2.55    0.18    
item_knn_algo
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.0915  1.0940  1.0950  1.0935  0.0015  
Fit time          24.80   59.14   25.95   36.63   15.9

According to the cross-validation results above:
* The random algorithm's RMSE is 1.41 on average - used as a baseline.
* Both CF algorithms perform better than the random algorithm, with USER: 1.19 and ITEM: 1.09.
* Even though the KNN results are better than random, it is hard to judge the real accuracy of the algorithms from these scores alone.

In [32]:
# Fixed seed so the 75/25 split (and therefore every score below) repeats on rerun.
SPLIT_SEED = 42
trainset, testset = train_test_split(data, test_size=0.25, shuffle=True, random_state=SPLIT_SEED)

# Train every model on the training portion only
user_knn_algo.fit(trainset)
item_knn_algo.fit(trainset)
random_algo.fit(trainset)

# Seed numpy's global RNG as the Surprise FAQ recommends for reproducible runs,
# so the random baseline's predictions repeat as well.
np.random.seed(SPLIT_SEED)

# Evaluate on the held-out test portion
user_knn_predictions = user_knn_algo.test(testset)
item_knn_predictions = item_knn_algo.test(testset)
random_knn_predictions = random_algo.test(testset)

# accuracy.rmse prints the score; order is user-based KNN, item-based KNN, random baseline
accuracy.rmse(user_knn_predictions)
accuracy.rmse(item_knn_predictions)
accuracy.rmse(random_knn_predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.1873
RMSE: 1.0774
RMSE: 1.4109


1.4109355341848548

To better evaluate the performance of the system we need a more informative metric than RMSE.
*   The most commonly used metrics for recommendation systems are variants of hit-rate performance.
*   Two of them are Precision@K and Recall@K.
*   Precision@K - proportion of recommended items that are relevant (for a given rating threshold and K).
*   Recall@K - proportion of relevant items that are recommended (for a given rating threshold and K).

In [41]:
# Ranking quality (K=10, relevance threshold 4) of each model on the test set.
for algo_name, fitted_model in algos:
    predictions = fitted_model.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)

    # Collapse the per-user scores into a single average per metric
    precision_at_k = sum(precisions.values()) / len(precisions)
    recall_at_k = sum(recalls.values()) / len(recalls)

    print(f'Preision@K for algo {algo_name} is  {precision_at_k}')
    print(f'Recall@K for algo {algo_name} is  {recall_at_k}')

Preision@K for algo user_knn_algo is  0.801067348462194
Recall@K for algo user_knn_algo is  0.8050359405758389
Preision@K for algo item_knn_algo is  0.7670414397685479
Recall@K for algo item_knn_algo is  0.7849318018590545
Preision@K for algo random_algo is  0.6756440950850918
Recall@K for algo random_algo is  0.5746636796301922


According to the results (K=10, threshold=4):
*  Both KNN algorithms are better than random.
*  Contrary to the RMSE metric, user_knn performs better when considering Recall@K and Precision@K.

Fit each algorithm on the full dataset for application usage.

In [108]:
# First train an KNN algorithm on the dataset.
full_trainset = data.build_full_trainset()
user_knn_algo.fit(full_trainset)
item_knn_algo.fit(full_trainset)
random_algo.fit(full_trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.random_pred.NormalPredictor at 0x282bb4e2630>

Test whether the item_knn algorithm finds items similar to a given item.
*   I have chosen the most frequent item in the dataset, which is 'Puppia Dog Harnesses'.
*   Looking at the top 5 items, we can see that 3 out of 5 are related to dogs.

In [206]:
# Lookup table keyed by user name; it is indexed below as usernames_ids_dict[userName].
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead of
# relying on open()'s platform-dependent default.
with open('../outputs/IDs/usernames_ids_dict.json', 'r', encoding='utf-8') as file:
    usernames_ids_dict = json.load(file)

In [188]:
# Example user whose recommendations are inspected, and how many items to recommend.
K = 5
userName = 'Kristie D'

# Preprocessed table; its first CSV column becomes the index.
df = pd.read_csv('../data/preprocessed.csv', index_col=[0])
user_id = usernames_ids_dict[userName]

# Rows already recorded for this user
df.loc[df['userName'] == userName]

Unnamed: 0,userName,itemName,brand,category,price,rating,vote,user_id,item_id
128729,Kristie D,quot Guinea Habitat rdquo Guinea Pig Cage amp ...,MidWest Homes Pets,Pet Supplies,55.77,5.0,0,7642,11046
138704,Kristie D,Kaytee Clean amp Cozy Colored Small Animal Bed...,Kaytee,Pet Supplies,8.27,5.0,0,7642,5003
138810,Kristie D,Oxbow Animal Health Hamster Gerbil Fortified Food,Oxbow,Pet Supplies,20.178349,5.0,0,7642,6831


Getting top K items for user by using KNN ITEM based

In [204]:
# Top-K recommendations for the selected user from the item-based KNN model.
# Uses the K constant defined with the example user instead of repeating the literal 5.
predictions = hf.get_top_n(user_id, full_trainset, item_knn_algo, K)
hf.convect_predictions_to_items(predictions, df)

Unnamed: 0,itemName,brand,category
0,Prince Peace Organic Tea Oolong 100 Tea Bags,Prince Peace,Grocery Gourmet Food
1,Pet Champion Adjustable Harness Collar Leash M...,Pet Champion,Pet Supplies
2,Blue Buffalo Life Protection Formula Natural P...,Blue Buffalo,Pet Supplies
3,Elk Antlers Dogs Large quot Premium Antler Che...,Heartland Antlers,Pet Supplies
4,BIC Round Stic Xtra Life Ballpoint Pen Medium ...,BIC,Office Products


Getting top K items for user by using KNN USER based

In [205]:
# Top-K recommendations for the selected user from the user-based KNN model.
# Uses the K constant defined with the example user instead of repeating the literal 5.
predictions = hf.get_top_n(user_id, full_trainset, user_knn_algo, K)
hf.convect_predictions_to_items(predictions, df)

Unnamed: 0,itemName,brand,category
0,Prince Peace Organic Tea Oolong 100 Tea Bags,Prince Peace,Grocery Gourmet Food
1,Elk Antlers Dogs Large quot Premium Antler Che...,Heartland Antlers,Pet Supplies
2,Kaytee Crittertrail Lazy Lookout Accessory Kit,Kaytee,Pet Supplies
3,PetSafe Nylon Dog Leash Strong Durable Traditi...,PetSafe,Pet Supplies
4,Smead File Folder Tab Letter Size Assorted Col...,Smead,Office Products


Save the trainset and the fitted algorithms 
*   The KNN algorithms are highly memory-consuming, so I will not be able to use them in the application as pre-saved models.
*   Saving the random algorithm for baseline comparison with the following algorithms.

In [37]:
# Save the trained model to a file
user_knn_algo_filename = '../outputs/CF/user_knn_algo.sav'
# Save the trained KNN model to the file
joblib.dump(user_knn_algo, user_knn_algo_filename, compress=('zlib', 9))  # Use zlib compression with level 9 (maximum compression)

item_knn_algo_filename = '../outputs/CF/item_knn_algo.sav'
# Save the trained KNN model to the file
joblib.dump(item_knn_algo, item_knn_algo_filename, compress=('zlib', 9))  # Use zlib compression with level 9 (maximum compression)

with open('../outputs/Random/random_algo.sav', 'wb') as file:
    pickle.dump(random_algo, file)
with open('../outputs/CF/trainset.sav', 'wb') as file:
    pickle.dump(full_trainset, file)