Dataset: https://www.kaggle.com/arashnic/book-recommendation-dataset


Ratings: https://an-utd-course.s3.us-west-1.amazonaws.com/CompDS/Ratings.csv

Books: https://an-utd-course.s3.us-west-1.amazonaws.com/CompDS/Books.csv

In [None]:
# Imports
!pip install surprise

import re
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from surprise.model_selection import GridSearchCV
from surprise import BaselineOnly
from surprise import SVD, KNNBasic, KNNBaseline
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, KFold
from tqdm import tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Read data into pandas dataframes
RATINGS_URL = 'https://an-utd-course.s3.us-west-1.amazonaws.com/CompDS/Ratings.csv'
BOOKS_URL = 'https://an-utd-course.s3.us-west-1.amazonaws.com/CompDS/Books.csv'

# ISBN is an identifier, not a number (values such as '034545104X' appear in the data),
# so it is always parsed as text to keep the merge key consistent in both frames.
# low_memory=False makes pandas infer each column's dtype from the whole file instead
# of chunk by chunk, which is what triggers mixed-type warnings on load.
# NOTE(review): the warning in this cell's saved output is presumably pandas'
# DtypeWarning for a mixed-type column in Books.csv -- confirm on a fresh run.
ratings = pd.read_csv(RATINGS_URL, dtype = {'ISBN': str}, low_memory = False)
books = pd.read_csv(BOOKS_URL, dtype = {'ISBN': str}, low_memory = False)

display(ratings)
display(books)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [None]:
# Size of each raw table
print("Number of Ratings (count of ratings, # of rating attributes):", ratings.shape)
print("Number of Books (count of books, # of book attributes):", books.shape)

# Missing values per column before cleaning
for frame in (ratings, books):
    print(frame.isna().sum())

# Drop every row that has at least one missing value
ratings, books = ratings.dropna(), books.dropna()

# Missing values per column after cleaning (all zeros expected)
for frame in (ratings, books):
    print(frame.isna().sum())

# Remaining rows / columns
for frame in (ratings, books):
    print(frame.shape)

Number of Ratings (count of ratings, # of rating attributes): (1149780, 3)
Number of Books (count of books, # of book attributes): (271360, 8)
User-ID        0
ISBN           0
Book-Rating    0
dtype: int64
ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64
User-ID        0
ISBN           0
Book-Rating    0
dtype: int64
ISBN                   0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher              0
Image-URL-S            0
Image-URL-M            0
Image-URL-L            0
dtype: int64
(1149780, 3)
(271354, 8)


In [None]:
# Merge CSV: inner-join the book metadata with the ratings on their shared ISBN key
df = pd.merge(books, ratings, how = 'inner', on = 'ISBN')

def clean(title):
    """Normalise a book title: title-case every word and trim outer whitespace.

    ``str.title()`` treats an apostrophe as a word boundary, so it turns
    "suzanne's" into "Suzanne'S". Here a word is a run of letters optionally
    followed by apostrophe-joined letter runs, and each such word is
    capitalised as a whole, so contractions and possessives stay intact.

    Parameters
    ----------
    title : any
        Raw title; it is converted with ``str()`` first, so non-string values
        (numbers, NaN) are accepted.

    Returns
    -------
    str
        The cleaned title.
    """
    # [^\W\d_] matches any Unicode letter (a word character that is neither digit nor underscore)
    return re.sub(r"[^\W\d_]+(?:'[^\W\d_]+)*",
                  lambda word: word.group(0).capitalize(),
                  str(title)).strip()

df['Book-Title'] = df['Book-Title'].map(clean)

### This is to decrease the sample size and include the most popular books
MIN_RATINGS_PER_USER = 150    # keep users with more than this many rows
MIN_RATINGS_PER_TITLE = 75    # keep titles with more than this many rows

# Active users first ...
user_counts = df['User-ID'].value_counts()
df = df.loc[df['User-ID'].map(user_counts).gt(MIN_RATINGS_PER_USER)]

# ... then popular titles, counted on the already-reduced frame
title_counts = df['Book-Title'].value_counts()
df = df.loc[df['Book-Title'].map(title_counts).gt(MIN_RATINGS_PER_TITLE)]

df = df.reset_index(drop = True)

def top10(df, n = 10, quantile = 0.5):
  """Return the best-rated titles among the books with enough ratings.

  Rows are grouped by 'Book-Title'. For every title the number of ratings,
  the mean rating and the largest ISBN in the group are computed. Titles
  whose rating count is below the requested quantile of all rating counts
  are discarded, and the remainder is ordered by mean rating, best first.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns 'Book-Title', 'Book-Rating' and 'ISBN'.
  n : int, default 10
      Number of rows to return.
  quantile : float, default 0.5
      Quantile of the per-title rating counts used as the minimum count
      (0.5 is the median, which is what the original code used).

  Returns
  -------
  pandas.DataFrame
      Columns 'ISBN', 'Book-Title', 'Rating-Avg', 'Count-Rating', at most
      ``n`` rows, with a fresh 0..k-1 index.
  """
  # One aggregation pass instead of three groupbys that are merged back together
  book = (
      df.groupby('Book-Title')
        .agg(**{'Count-Rating': ('Book-Rating', 'count'),
                'Rating-Avg': ('Book-Rating', 'mean'),
                'ISBN': ('ISBN', 'max')})
        .reset_index()
  )

  # Filter by those with adequate ratings
  min_count = book['Count-Rating'].quantile(quantile)
  book = book[book['Count-Rating'] >= min_count]
  book = book.sort_values(by = 'Rating-Avg', ascending = False)

  return book[['ISBN', 'Book-Title', 'Rating-Avg', 'Count-Rating']].reset_index(drop = True).head(n)

# Ten best titles by mean rating, restricted to books whose number of ratings
# is at least the median rating count
top_books = top10(df)
display(top_books)


Unnamed: 0,ISBN,Book-Title,Rating-Avg,Count-Rating
0,786222743,Harry Potter And The Prisoner Of Azkaban (Book 3),4.641026,156
1,439139600,Harry Potter And The Goblet Of Fire (Book 4),4.566434,143
2,439358078,Harry Potter And The Order Of The Phoenix (Boo...,4.333333,126
3,439420105,Harry Potter And The Chamber Of Secrets (Book 2),4.060302,199
4,440998050,A Wrinkle In Time,3.735849,159
5,899668585,To Kill A Mockingbird,3.615385,208
6,1565116674,The Fellowship Of The Ring (The Lord Of The Ri...,3.52459,122
7,1401397522,The Five People You Meet In Heaven,3.480916,131
8,312980140,Seven Up (A Stephanie Plum Novel),3.364407,118
9,312924585,Silence Of The Lambs,3.362903,124


In [None]:
# Create a custom dataset using the surprise library
def get_subset(df, number, seed = None):
    """Return a random sample of rows from ``df`` as an independent copy.

    The row positions are shuffled and the first ``number`` of them are kept,
    so the rows come back in random order and without repetition. If
    ``number`` exceeds the number of rows, every row is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    number : int
        Number of rows to keep.
    seed : int, optional
        When given, a dedicated generator seeded with this value is used and
        the same rows are returned on every call. When omitted, NumPy's
        global random state is used, exactly as before, so ``np.random.seed``
        still controls the result.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected rows (original index labels are preserved).
    """
    rng = np.random if seed is None else np.random.default_rng(seed)
    rids = np.arange(df.shape[0])
    rng.shuffle(rids)
    return df.iloc[rids[:number], :].copy()

# Subset data
# Seed NumPy's global generator first: get_subset shuffles with it, so without a seed
# a different sample is drawn (and different scores are reported) on every run.
np.random.seed(42)
df_ratings_1000 = get_subset(ratings, 1000)
df_df1_100 = get_subset(books, 100)

# Surprise reader
# NOTE(review): the scale starts at 0 because the data contains 0 ratings; in the
# Book-Crossing data a 0 presumably marks an implicit interaction rather than a
# score -- confirm against the dataset description before interpreting RMSE.
reader = Reader(rating_scale = (0, 10))

# Loader
ratings1 = Dataset.load_from_df(df_ratings_1000[['User-ID', 'ISBN', 'Book-Rating']], reader)

# NOTE(review): the sample is taken from the raw ratings table, not from the filtered
# `df`, so almost every user and item occurs only once (see the counts printed below).
dataset = ratings1.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  823 

Number of items:  977


##### Choose a book at random and use the KNNBasic algorithm to find out its 10 closest neighbors. Do the results make sense?
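After comparing the neighbors with the query title and researching the books, the results do make sense: the nearest books are closely related to the chosen one. In the run shown below, for example, the neighbors of *The Notebook* include other Nicholas Sparks novels such as *A Walk To Remember*, *The Rescue* and *Message In A Bottle*.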


In [None]:
# Create pivot table: one row per title, one column per user, 0 where a user has no rating
df = df.drop_duplicates(['User-ID', 'Book-Title'])
df_pivot = df.pivot(index = 'Book-Title', columns = 'User-ID', values = 'Book-Rating').fillna(0)
df_matrix = csr_matrix(df_pivot.values)

# Use KNN (brute-force cosine distance between the title rows)
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(df_matrix)
query = np.random.choice(df_pivot.shape[0])

# Ask for one more than needed because the query row is part of the fitted data,
# but never for more rows than exist (kneighbors raises in that case).
n_neighbors = min(11, df_pivot.shape[0])
distances, indices = model_knn.kneighbors(df_pivot.iloc[query, :].values.reshape(1, -1), n_neighbors = n_neighbors)

# Find the 10 closest books
# The query is removed by index rather than by assuming it is the first hit:
# with tied distances another title can be returned ahead of it.
neighbor_ids = [idx for idx in indices.flatten() if idx != query][:10]

print('10 Nearest Neighbors for {0}:\n'.format(df_pivot.index[query]))
for rank, idx in enumerate(neighbor_ids, start = 1):
    print('{0}. {1}'.format(rank, df_pivot.index[idx]))

10 Nearest Neighbors for The Notebook:

1. A Walk To Remember
2. The Rescue
3. Message In A Bottle
4. Suzanne'S Diary For Nicholas
5. The Five People You Meet In Heaven
6. A Child Called \It\": One Child'S Courage To Survive"
7. One Door Away From Heaven
8. Where The Heart Is (Oprah'S Book Club (Paperback))
9. White Oleander : A Novel
10. The Red Tent (Bestselling Backlist)


In [None]:
mark = []

# Determine if there are significant differences from each algorithm
for algo in [SVD(), KNNBaseline(), BaselineOnly()]:
    # CV
    results = cross_validate(algo, ratings1, measures = ['RMSE'], cv = 10, verbose = False)
    # Average every reported measure over the folds and label the row with the
    # algorithm's class name. The row is a plain dict because Series.append,
    # used here before, no longer exists in pandas 2.x.
    row = pd.DataFrame.from_dict(results).mean(axis = 0).to_dict()
    row['Algorithm'] = type(algo).__name__
    mark.append(row)

# One row per algorithm, best (lowest) test RMSE first
pd.DataFrame(mark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matr

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,3.809746,0.064446,0.000882
BaselineOnly,3.819704,0.006099,0.000749
KNNBaseline,3.821079,0.023122,0.0009


In [None]:
# SVD GridSearch: 2 x 2 x 2 candidates, each scored with 10-fold cross-validation
param_grid = dict(
    n_epochs = [10, 15],
    lr_all = [0.001, 0.005],
    reg_all = [0.4, 0.6],
)
gs = GridSearchCV(SVD, param_grid, measures = ['rmse'], cv = 10)
gs.fit(ratings1)

# SVD RMSE Score, followed by the parameter combination that produced it
for report in (gs.best_score, gs.best_params):
    print(report['rmse'])

3.820343774227004
{'n_epochs': 15, 'lr_all': 0.005, 'reg_all': 0.4}


In [None]:
# KNN GridSearch
# KNNBaseline is tuned through k, min_k, sim_options and bsl_options. The previous
# grid used n_epochs / lr_all / reg_all, which are SVD parameters and not parameters
# of KNNBaseline, so the candidates did not differ from one another.
# verbose=False silences the per-fit progress messages.
param_grid = {'k': [20, 40],
              'min_k': [1, 5],
              'sim_options': {'name': ['msd', 'cosine'],
                              'user_based': [True, False]},
              'verbose': [False]}
gs = GridSearchCV(KNNBaseline, param_grid, measures = ['rmse'], cv = 10)
gs.fit(ratings1)

# KNN RMSE Score
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matr

In [None]:
# ALS GridSearch
# The ALS baseline is controlled by n_epochs and by separate regularisation terms for
# users (reg_u) and items (reg_i). lr_all / reg_all are SVD parameters and are not
# options of the ALS baseline, so they are replaced by the keys ALS actually reads.
# verbose=False silences the per-fit progress messages.
param_grid = {'bsl_options' : {'method' : ['als'],
                               'n_epochs' : [5, 10],
                               'reg_u': [10, 15],
                               'reg_i': [5, 10]},
              'verbose': [False]}
# NOTE(review): bsl_algo is not used by the grid search below; kept only in case a
# later cell refers to it.
bsl_algo = BaselineOnly()
gs = GridSearchCV(BaselineOnly, param_grid, measures = ['rmse'], cv = 10)
gs.fit(ratings1)

# ALS RMSE Score
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

In [None]:
# SGD GridSearch
# The SGD baseline is controlled by n_epochs, learning_rate and reg. lr_all / reg_all
# are SVD parameters and are not options of the SGD baseline, so the same candidate
# values are now passed under the keys SGD actually reads.
# verbose=False silences the per-fit progress messages.
param_grid = {'bsl_options' : {'method' : ['sgd'],
                               'n_epochs' : [5, 10],
                               'learning_rate': [0.002, 0.005],
                               'reg': [0.4, 0.6]},
              'verbose': [False]}
# NOTE(review): bsl_algo is not used by the grid search below; kept only in case a
# later cell refers to it.
bsl_algo = BaselineOnly()
gs = GridSearchCV(BaselineOnly, param_grid, measures = ['rmse'], cv = 10)
gs.fit(ratings1)

# SGD RMSE Score
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimati