### **Using Surprise Library to train a collaborative filtering model**

In [4]:
from surprise import (
    Reader, Dataset, accuracy,
    NormalPredictor, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline,
    SVD, SVDpp, BaselineOnly, NMF, SlopeOne, CoClustering
)
from surprise.accuracy import rmse
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV

import numpy as np
import pandas as pd

In [12]:
# Load processed data
books_df = pd.read_csv('books_df_processed.csv')
users_df = pd.read_csv('users_df_processed.csv')
rating_df = pd.read_csv('rating_df_processed.csv')

### **Scenario 1: Randomly sample 50,000 books as kernel keeps on dying**

In [4]:
books_sampled = books_df.sample(n=50000, random_state=42)

books_sampled.reset_index(drop=True, inplace=True)

In [5]:
df = pd.merge(books_sampled, rating_df, on='ISBN', how='inner')

df = df[~(df['Book-Rating'] == 0)] # removing book rating that are marked as 0

distinct_books = df['ISBN'].nunique()
print(f"Number of distinct books: {distinct_books}")
distinct_users = df['User-ID'].nunique()
print(f"Number of distinct users: {distinct_users}")

Number of distinct books: 27314
Number of distinct users: 23682


In [6]:
df.count()

ISBN                   67644
Book-Title             67644
Book-Author            67644
Year-Of-Publication    67644
Publisher              67644
Image-URL-S            67644
Image-URL-M            67644
Image-URL-L            67644
User-ID                67644
Book-Rating            67644
dtype: int64

In [7]:
reader = Reader(rating_scale=(1, 10))
data_full = Dataset.load_from_df(df[['User-ID', 'ISBN', 'Book-Rating']], reader)

In [8]:
benchmark = []

# Iterate over different algorithms, do cross-validation, and store the RMSE and compare them
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    results = cross_validate(algorithm, data_full, measures=['RMSE'], cv=3, verbose=False)

    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    algo_name = str(algorithm).split(' ')[0].split('.')[-1]
    tmp = pd.concat([tmp, pd.Series({'Algorithm': algo_name})])
    benchmark.append(tmp)

benchmark_df = pd.DataFrame(benchmark)
print(benchmark_df)


Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


### **Scenaior 2: creating another dataframe where each book has at least 2 ratings and each user has 20 ratings**

In [13]:

min_book_ratings = 2
min_user_ratings = 20

book_counts = rating_df['ISBN'].value_counts()
valid_books = book_counts[book_counts > min_book_ratings].index

user_counts = rating_df['User-ID'].value_counts()
valid_users = user_counts[user_counts > min_user_ratings].index

# Remove top 1% of books by rating count
top_1_percent = int(len(book_counts) * 0.01)  # Number of books in top 1%
top_books = book_counts.head(top_1_percent).index
valid_books = valid_books[~valid_books.isin(top_books)]

filtered_ratings = rating_df[
    (rating_df['ISBN'].isin(valid_books)) &
    (rating_df['User-ID'].isin(valid_users)) &
    (rating_df['Book-Rating'] != 0)  # Removing zero ratings
]

df2 = pd.merge(filtered_ratings, books_df, on='ISBN', how='inner')

distinct_books = df2['ISBN'].nunique()
distinct_users = df2['User-ID'].nunique()
print(f"Number of distinct books: {distinct_books}")
print(f"Number of distinct users: {distinct_users}")

# Save filtered dataset
df2.to_csv('filtered_ratings.csv', index=False)

Number of distinct books: 43968
Number of distinct users: 6511


In [7]:
reader = Reader(rating_scale=(1, 10))
data_full2 = Dataset.load_from_df(df2[['User-ID', 'ISBN', 'Book-Rating']], reader)

In [49]:
benchmark2 = []

# Iterate over different algorithms, do cross-validation, and store the RMSE and compare them
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    results = cross_validate(algorithm, data_full2, measures=['RMSE'], cv=3, verbose=False)

    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    algo_name = str(algorithm).split(' ')[0].split('.')[-1]
    tmp = pd.concat([tmp, pd.Series({'Algorithm': algo_name})])
    benchmark2.append(tmp)

benchmark2_df = pd.DataFrame(benchmark2)
print(benchmark2_df)


Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


### **Scenario 2 seems to be giving a slightly better RMSE number**

### **Now I will do  grid search for 4 individual algorithms SVD, SVDpp, BaselineOnly, and KNNBaseline**

In [50]:
# parameter grid for SVD
param_grid_svd = {
    'n_epochs': [10, 25, 50],
    'lr_all': [0.001, 0.005, 0.01],
    'reg_all': [0.1, 0.4, 0.08]
}

# Perform GridSearchCV on SVD
gs_svd = GridSearchCV(SVD, param_grid_svd, measures=['rmse'], cv=3, n_jobs=-1)
gs_svd.fit(data_full2)

In [51]:
print(gs_svd.best_score['rmse'])

1.5974818683637941


In [52]:
print(gs_svd.best_params['rmse'])

{'n_epochs': 25, 'lr_all': 0.01, 'reg_all': 0.1}


In [53]:
# parameter grid for SVDpp
param_grid_svdpp = {'n_epochs': [10, 25, 50], 'lr_all': [0.001, 0.005, 0.01], 
              'reg_all': [0.1, 0.4, 0.08]}

gs_svdpp = GridSearchCV(SVD, param_grid_svdpp, measures = ['rmse'], cv = 3)
gs_svdpp.fit(data_full2)

In [54]:
print(gs_svdpp.best_score['rmse'])

1.5927848804736808


In [55]:
print(gs_svdpp.best_params['rmse'])

{'n_epochs': 50, 'lr_all': 0.005, 'reg_all': 0.1}


In [None]:
# gridsearch on BaselineOnly
param_grid_baselineonly = {'bsl_options': {'method': ['als', 'sgd'],
                                  'n_epochs': [10, 25, 50]}
              }
gs_baselineonly = GridSearchCV(BaselineOnly, param_grid_baselineonly, measures = ['rmse'], cv = 3)
gs_baselineonly.fit(data_full2)

In [14]:
print(gs_baselineonly.best_score['rmse'])

1.61840953325


In [38]:
print(gs_baselineonly.best_params['rmse'])

{'bsl_options': {'method': 'sgd', 'n_epochs': 25}}


In [None]:
# gridsearch on KNNBaseline
param_grid_knnbaseline = {'bsl_options': {'method': ['als']},
              'k': [10, 30, 50],
              'sim_options': {'name': ['msd', 'cosine', 'pearson', 'pearson_baseline'],
                              'min_support': [1, 5],
                              'user_based': [True, False]}
              }
gs_knnbaseline = GridSearchCV(KNNBaseline, param_grid_knnbaseline, measures = ['rmse'], cv = 3)
gs_knnbaseline.fit(data_full2)

In [15]:
print(gs_knnbaseline.best_score['rmse'])

1.6293849543


In [39]:
print(gs_knnbaseline.best_params['rmse'])

{'bsl_options': {'method': 'als'}, 'k': 10, 'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': False}}


### ** Both SVD and SVDpp seems to be performing the best as compared to other models**

In [57]:
# save the best model
from joblib import dump

# After grid search
best_model = gs_svd.best_estimator['rmse']

# Save to a pickle file using joblib
best_model.fit(data_full2.build_full_trainset())
dump(best_model, 'best_svd_model.joblib')

['best_svd_model.joblib']

#### **using all ratings including 0 and not doing gridsearch**

In [None]:
import joblib
user_item_matrix = joblib.load('U_I_M_joblib.pkl')

In [44]:
# replacing null values with 0
user_item_matrix = user_item_matrix.fillna(0)

In [45]:
# Since using the Surprise library directly we need to get the data in the required format

reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(user_item_matrix.stack().reset_index(), reader)

# doing a 90:10 split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [46]:
# Specify the number of factors for the model
n_factors = 150
model = SVD(n_factors=n_factors, random_state=42 , n_epochs = 10 , biased = False ,  lr_all=0.01)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f40c4c49750>

In [48]:
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
print(f'RMSE: {rmse}')

RMSE: 0.3401
RMSE: 0.3401000346136176


### **Considering 0 as part of the training definitely underestimates the RMSE**

In [8]:
# How much sparsity still remains in the data
import pandas as pd

# Load your processed ratings file if not already loaded
rating_df = pd.read_csv('filtered_ratings.csv')

# Exclude implicit (0) ratings if needed
rating_df_explicit = rating_df[rating_df['Book-Rating'] > 0]

# Unique users and items
num_users = rating_df_explicit['User-ID'].nunique()
num_items = rating_df_explicit['ISBN'].nunique()

# Number of ratings
num_ratings = len(rating_df_explicit)

# Total possible interactions
total_possible = num_users * num_items

# Sparsity calculation
sparsity = 1 - (num_ratings / total_possible)

print(f"Number of users: {num_users}")
print(f"Number of items: {num_items}")
print(f"Number of ratings: {num_ratings}")
print(f"Total possible ratings: {total_possible}")
print(f"📉 Sparsity of the user-item matrix: {sparsity:.4f} ({sparsity * 100:.2f}%)")


Number of users: 3641
Number of items: 2894
Number of ratings: 16918
Total possible ratings: 10537054
📉 Sparsity of the user-item matrix: 0.9984 (99.84%)
