# Import libraries

In [None]:
!pip install numpy
!pip install scikit-surprise
!pip install streamlit
kjlksdjf

In [77]:
import pandas as pd
import streamlit as st
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise import KNNBasic, SVD, KNNWithMeans, SVDpp
from surprise import accuracy
from surprise.model_selection import GridSearchCV

# Load Data

In [62]:
# Read data
df = pd.read_csv('data/kaggle_full_df.csv')
df['country'].fillna('unknown', inplace=True)

  df = pd.read_csv('data/kaggle_full_df.csv')


In [63]:
df.shape

(1005487, 12)

# Baseline Model (Popularity Model)

A common (and usually hard-to-beat) baseline approach is the Popularity model. This model is not actually personalized - it simply recommends to a user the most popular items that the user has not previously consumed. As the popularity accounts for the "wisdom of the crowds", it usually provides good recommendations, generally interesting for most people. The main objective of a recommender system is to leverage the long-tail items to the users with very specific interests, which goes far beyond this simple technique.

In [None]:
# Recommend to every user the five books with the highest mean rating
top_5_books = df.groupby('common_identifier')['book_rating'].mean().sort_values(ascending=False).head(5)
top_5_books

In [None]:
# Get the common identifiers of the top 5 books
top_5_common_identifiers = top_5_books.index

# Filter the original DataFrame to get the corresponding book titles
top_5_books_info = df[df['common_identifier'].isin(top_5_common_identifiers)][['common_identifier', 'book_title']].drop_duplicates()

# Display the top 5 books by name
print(top_5_books_info)

In [1]:
# Get recommendations for a user
def get_recommendations(user_id, num_books=5):
    # Get the books the user has rated
    user_rated_books = df[df['user_id'] == user_id][['common_identifier', 'book_rating']]
    # Get the books the user has not rated
    user_unrated_books = df[~df['common_identifier'].isin(user_rated_books['common_identifier'])][['common_identifier', 'book_rating']]
    # Merge user_unrated_books with top_5_books_info to get the book titles
    recommendations = pd.merge(user_unrated_books, top_5_books_info, on='common_identifier', how='inner')
    # Sort the recommendations by rating in descending order
    recommendations = recommendations.sort_values(by='book_rating', ascending=False)
    # Return the top num_books recommendations
    return recommendations.head(num_books)

# Difference between User-based and Item-based Collaborative Filtering

- User-based collaborative filtering recommends items to a user based on the preferences of similar users. It finds users who have similar tastes and preferences to the target user and recommends items that those similar users have liked or rated highly. For a given user, the algorithm looks for other users who have rated or liked similar items and calculates the similarity between the target user and those similar users. The algorithm then predicts the ratings or preferences of the target user for items that they have not yet interacted with, based on the rating or preferences of the similar users. User-based collaborative filtering is more suitable when the user-item matrix is sparse and there are more users than items.

- Item-based collaborative filtering recommends items to a user based on the similarity between items. It finds items that are similar to the ones the target user has already interacted with and recommends those similar items. For a given item, the algorithm looks for other items that have been rated or liked by users who have also rated or liked the target item. The algorithm calculates the similarity between the target item and those similar items based on the rating or preferences of the users who have interacted with both items. The algorithm then predicts the ratings or preferences of the target user for items that are similar to the ones they have already interacted with. Item-based collaborative filtering is more suitable when the user-item matrix is dense and there are more items than users.

- In summary, user-based filtering focuses on finding similar users to make recommendations, while item-based collaborative filtering focuses on finding similar items. The choice between the two depens on the characteristics of the dataset and the specific requirements of the recommendation system.

# Modelling

## Subset data

In [6]:
# Only consider explicit ratings (i.e. ratings > 0)
df_explicit = df[df['book_rating'] != 0]
df_explicit.shape

(370785, 12)

#### Subset data (Oliwia's Approach)
- only EXCPLICIT rating and users from USA & Canada

In [64]:
#Only Rating above 0
df = df[df['book_rating']>0]

#Only users from US or Canada
df = df[df['country'].str.contains("usa|canada")]

df.shape

(303032, 12)

#### Create a new variable: Rating Count

In [65]:
#Add a new column with a total rating count for each book by common identifier
df['rating_count'] = df.groupby(['book_title', 'book_author'])['book_rating'].transform('count')

#Show a list of books that got the highest rating count, group by title and author to show unique books

df.groupby(['book_title', 'book_author', 'rating_count']).size().reset_index(name='Count').sort_values(by='rating_count', ascending=False).head(5)

Unnamed: 0,book_title,book_author,rating_count,Count
86042,The Lovely Bones: A Novel,Alice Sebold,614,614
79268,The Da Vinci Code,Dan Brown,420,420
91346,The Secret Life Of Bees,Sue Monk Kidd,387,387
103977,Wild Animus,Rich Shapero,352,352
90242,The Red Tent (Bestselling Backlist),Anita Diamant,351,351


In [66]:
df.head(3)

Unnamed: 0,book_title,book_author,year_of_publication,publisher,image_url_m,common_identifier,user_id,isbn,book_rating,age,city,country,rating_count
1,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,1,269782,801319536,7,30,edmonton,canada,1
2,Pay It Forward: A Novel,Catherine Ryan Hyde,2000,Simon &amp; Schuster,http://images.amazon.com/images/P/0684862719.0...,2392,269782,684862719,8,30,edmonton,canada,26
3,Watership Down,Richard Adams,1976,Avon,http://images.amazon.com/images/P/0380002930.0...,3172,269782,140039589,10,30,edmonton,canada,99


#### Define user activity threshold

In [67]:
#Subset only users with more than 30 ratings

user_rating_counts = df['user_id'].value_counts()
df = df[df['user_id'].isin(user_rating_counts[user_rating_counts >= 30].index)]
df.shape

(151974, 13)

In [68]:
popularity_threshold = 50
df = df[df['rating_count'] >= popularity_threshold]
df.shape

(19161, 13)

In [69]:
#Add a new column with a total rating count for each book by common identifier
df['rating_count'] = df.groupby(['book_title', 'book_author'])['book_rating'].transform('count')

#Show a list of books that got the highest rating count, group by title and author to show unique books

df.groupby(['book_title', 'book_author', 'rating_count']).size().reset_index(name='Count').sort_values(by='rating_count', ascending=False).head(5)

Unnamed: 0,book_title,book_author,rating_count,Count
425,The Lovely Bones: A Novel,Alice Sebold,165,165
173,Harry Potter And The Chamber Of Secrets (Book 2),J. K. Rowling,140,140
370,The Da Vinci Code,Dan Brown,133,133
457,The Secret Life Of Bees,Sue Monk Kidd,126,126
176,Harry Potter And The Prisoner Of Azkaban (Book 3),J. K. Rowling,121,121


In [70]:
df.head(3)

Unnamed: 0,book_title,book_author,year_of_publication,publisher,image_url_m,common_identifier,user_id,isbn,book_rating,age,city,country,rating_count
109,To Kill A Mockingbird,Harper Lee,1988,Little Brown &amp; Company,http://images.amazon.com/images/P/0446310786.0...,38,85526,60935464,9,36,victoria,canada,101
115,Girl With A Pearl Earring,Tracy Chevalier,2001,Plume Books,http://images.amazon.com/images/P/0452282152.0...,232,85526,452282152,7,36,victoria,canada,77
120,Empire Falls,Richard Russo,2002,Vintage Books USA,http://images.amazon.com/images/P/0375726403.0...,480,85526,375726403,9,36,victoria,canada,54


## Build Models

In [71]:
# Define rating scale
reader = Reader(rating_scale=(1, 10))
# Load data into Surprise dataset
data = Dataset.load_from_df(df[['user_id', 'common_identifier', 'book_rating']], reader)
#data_all = Dataset.load_from_df(df[['user_id', 'common_identifier', 'book_rating']], reader)

In [72]:
# Split the dataset into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
#trainset_all, testset_all = train_test_split(data_all, test_size=0.2, random_state=42)

## SVD

In [73]:
# Build SVD model 
svd_model = SVD()
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10a88e2d0>

In [74]:
# Evaluation of SVD model
svd_predictions = svd_model.test(testset)
svd_rmse = accuracy.rmse(svd_predictions)
# RMSE: 1.5334

RMSE: 1.5334


In [87]:
# Perform cross validation
svd_cv = cross_validate(svd_model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Print Mean RMSE Score
print("Mean RMSE Score: ", svd_cv['test_rmse'].mean())
# Mean RMSE Score:  1.5288

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5096  1.5531  1.5612  1.4721  1.5479  1.5288  0.0335  
MAE (testset)     1.1704  1.1806  1.1837  1.1340  1.1867  1.1711  0.0193  
Fit time          0.08    0.06    0.06    0.06    0.06    0.07    0.01    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Mean RMSE Score:  1.528783126904197


## SVDpp

In [19]:
# Build SVDpp model 
## The SVD++ algorithm, an extension of SVD taking into account implicit ratings.
svdpp_model = SVDpp()
svdpp_model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x17d8aa9d0>

In [20]:
# Evaluation of svdpp_predictions = svdpp_model.test(testset)
svdpp_rmse = accuracy.rmse(svdpp_predictions)SVDpp model
# RMSE: 1.5474

RMSE: 1.5474


In [88]:
# Perform cross validation
svdpp_cv = cross_validate(svdpp_model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Print Mean RMSE Score
print("Mean RMSE Score: ", svdpp_cv['test_rmse'].mean())
# Mean RMSE Score:  1.5223

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5378  1.5189  1.5402  1.5168  1.4976  1.5223  0.0156  
MAE (testset)     1.1826  1.1789  1.1926  1.1675  1.1547  1.1753  0.0131  
Fit time          0.13    0.10    0.10    0.10    0.10    0.11    0.01    
Test time         0.04    0.04    0.04    0.04    0.04    0.04    0.00    
Mean RMSE Score:  1.5222709761619835


## KNN

In [95]:
# Build KNN model with cross-validation
knn_model = KNNWithMeans()
knn_model.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x178a120d0>

In [96]:
# Evaluation of KNN model
knn_predictions = knn_model.test(testset)
knn_rmse = accuracy.rmse(knn_predictions)

RMSE: 1.6465


In [97]:
# Perform cross validation
knn_cv = cross_validate(knn_model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
# Print Mean RMSE Score
print("Mean RMSE Score: ", knn_cv['test_rmse'].mean())
# Mean RMSE Score:  1.6157

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5780  1.6221  1.6419  1.6110  1.6256  1.6157  0.0213  
MAE (testset)     1.1705  1.2040  1.2115  1.2008  1.2038  1.1981  0.0142  
Fit time          0.04    0.02    0.02    0.02    0.02    0.03    0.01    
Test time         0.06    0.06    0.06    0.06    0.06    0.06    0.00    
Mean RMSE Score:  1.6156944605093642


RMSE measures the average difference between the predicted ratings and the actual ratings given by users. A lower RMSE value indicates that the model's predictions are closer to the actual ratings, indicating better accuracy.

# Tuning

## SVD

In [113]:
param_grid = {
    "n_epochs": [5, 10, 15, 20], # number of iterations
    "lr_all": [0.002, 0.005, 0.01, 0.02], # learning rate for all parameters, which decides how much the parameters are adjusted in each iteration
    "reg_all": [0.4, 0.6, 0.8, 1.0]# regularization term for all parameters, which is a penalty term added to prevent overfitting
}

gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=5)

gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

# RMSE: 1.5217

1.519450252229476
{'n_epochs': 20, 'lr_all': 0.02, 'reg_all': 0.4}


In [114]:
# Build SVD model with best parameters
svd_model = SVD(n_epochs=20, lr_all=0.02, reg_all=0.4)
svd_model.fit(trainset)
svd_predictions = svd_model.test(testset)
svd_rmse = accuracy.rmse(svd_predictions)

# Perform cross validation with best parameters
svd_cv = cross_validate(svd_model, data, measures=['rmse'], cv=5, verbose=True)

# Print Mean RMSE Score
print("Mean RMSE Score: ", svd_cv['test_rmse'].mean())

RMSE: 1.5307
Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4987  1.5296  1.5327  1.5161  1.5367  1.5227  0.0139  
Fit time          0.06    0.06    0.06    0.06    0.06    0.06    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Mean RMSE Score:  1.5227479762204075


## KNN

In [115]:
# grid search for KNN
param_grid = {
    "k": [10, 20, 30, 40, 50, 100], # number of neighbors
    "sim_options": {
        "name": ["msd", "cosine"], # similarity measure
        "min_support": [1, 2, 3, 4, 5, 10, 20], # minimum number of common items for two users
        "user_based": [True, False] # user-based or item-based
    }
}
gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse"], cv=5)

gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [116]:
# Build KNN with best parameters
knn_model = KNNWithMeans(k=100, sim_options={'name': 'cosine', 'min_support': 1, 'user_based': False})
knn_model.fit(trainset)
knn_predictions = knn_model.test(testset)
knn_rmse = accuracy.rmse(knn_predictions)

# Perform cross validation with best parameters
knn_cv = cross_validate(knn_model, data, measures=['rmse'], cv=5, verbose=True) 

# Print Mean RMSE Score
print("Mean RMSE Score: ", knn_cv['test_rmse'].mean())


Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.5897
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5731  1.5299  1.5469  1.6033  1.6140  1.5734  0.0321  
Fit time          0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.03    0.03    0.03    0.03    0.03    0.03    0.00    
Mean RMSE Score:  1.5734332509482976


## SVDpp

In [117]:
param_grid = {
    'n_epochs': [5, 10, 15],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.4, 0.6, 0.8]
}

gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=5)
gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])


1.5279679467634113
{'n_epochs': 15, 'lr_all': 0.01, 'reg_all': 0.4}


In [119]:
# Build SVDpp model with best parameters
## The SVD++ algorithm, an extension of SVD taking into account implicit ratings.
svdpp_model = SVDpp(n_epochs=15, lr_all=0.01, reg_all=0.4)
svdpp_model.fit(trainset)
svdpp_predictions = svdpp_model.test(testset)
svdpp_rmse = accuracy.rmse(svdpp_predictions)

# Perform cross validation with best parameters
svdpp_cv = cross_validate(svdpp_model, data, measures=['rmse'], cv=5, verbose=True) 

# Print Mean RMSE Score
print("Mean RMSE Score: ", svdpp_cv['test_rmse'].mean())


RMSE: 1.5313
Evaluating RMSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4901  1.5233  1.5560  1.5196  1.5442  1.5267  0.0226  
Fit time          0.10    0.10    0.11    0.10    0.12    0.11    0.01    
Test time         0.04    0.04    0.04    0.04    0.05    0.04    0.00    
Mean RMSE Score:  1.5266607544081967


# Recommendations

In [154]:
def get_svd_recommendations(user_id, svd_model, num_books=5):
    # Get the books the user has rated
    user_rated_books = df[df['user_id'] == user_id][['common_identifier', 'book_title', 'book_author']]
    
    # Get the books the user has not rated
    user_unrated_books = df[~df['common_identifier'].isin(user_rated_books['common_identifier'])][['common_identifier', 'book_title', 'book_author']]
    
    # Make predictions for unrated books using SVD model
    unrated_books = list(user_unrated_books['common_identifier'])
    user_predictions = [
        (book_id, svd_model.predict(user_id, book_id).est) for book_id in unrated_books
    ]
    
    # Remove duplicate entries based on common_identifier
    user_predictions = list(set(user_predictions))
    
    # Sort the predictions by estimated rating in descending order
    user_predictions.sort(key=lambda x: x[1], reverse=True)
    
    # Take the top num_books predictions
    user_predictions = user_predictions[:num_books]
    
    # Create a DataFrame from the sorted predictions
    recommendations = pd.DataFrame(user_predictions, columns=['common_identifier', 'estimated_rating'])
    
    # Merge with book information to get the book titles and authors
    recommendations = pd.merge(
        recommendations,
        user_unrated_books[['common_identifier', 'book_title', 'book_author']],
        on='common_identifier',
        how='inner'
    )
    
    # Drop duplicates based on common_identifier
    recommendations = recommendations.drop_duplicates(subset=['common_identifier'])
    
    # Drop the book_rating column
    recommendations = recommendations[['common_identifier', 'estimated_rating', 'book_title', 'book_author']]
    
    return recommendations.head(num_books)

# Replace 'user_id' with the actual user ID for which you want to get recommendations
user_id = 276729
recommended_books = get_svd_recommendations(user_id, svd_model, num_books=5)
print(recommended_books)

     common_identifier  estimated_rating  \
0                 4935          8.934163   
49                  75          8.930062   
68                3760          8.918632   
100               3016          8.913113   
129               4207          8.872582   

                                            book_title  \
0                                            Outlander   
49                                          Seabiscuit   
68                                   The Secret Garden   
100                                     Atlas Shrugged   
129  The Return Of The King (The Lord Of The Rings,...   

                 book_author  
0             Diana Gabaldon  
49         Laura Hillenbrand  
68   Frances Hodgson Burnett  
100                 Ayn Rand  
129           J.R.R. Tolkien  
