In [None]:
import kagglehub
from pathlib import Path

In [None]:
# Download the dataset through kagglehub and keep the returned location,
# which later cells use as the base directory for the CSV files.
dataset_handle = "saurabhbagchi/books-dataset"
data_path = kagglehub.dataset_download(dataset_handle)
print(f"Data downloaded to: {data_path}")

In [5]:
import numpy as np
import pandas as pd

In [None]:
# The three CSV files share the same delimiter and encoding, so the reader
# options are defined once and reused.
# NOTE(review): on_bad_lines="skip" silently drops malformed rows; nothing here
# reports how many rows were lost -- confirm that is acceptable.
csv_dir = Path(data_path) / "books_data"
read_opts = dict(sep=";", encoding="iso8859", on_bad_lines="skip")

books = pd.read_csv(csv_dir / "books.csv", **read_opts)
users = pd.read_csv(csv_dir / "users.csv", **read_opts)
ratings = pd.read_csv(csv_dir / "ratings.csv", **read_opts)

In [7]:
books.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [8]:
users.head(3)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [9]:
ratings.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


In [10]:
# Report (rows, columns) for each raw table, in the order books, users, ratings.
for table in (books, users, ratings):
    print(table.shape)

(271360, 8)
(278858, 3)
(1149780, 3)


In [11]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [12]:
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [13]:
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [14]:
# Number of fully duplicated rows in each table, in the order books, users, ratings.
for table in (books, users, ratings):
    print(table.duplicated().sum())

0
0
0


In [15]:
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [16]:
# Only the large cover image (Image-URL-L) is used further down; discard the
# small and medium thumbnail URLs.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps the cell
# re-runnable: executing it a second time no longer raises KeyError because
# the columns are already gone.
books = books.drop(columns=["Image-URL-S", "Image-URL-M"], errors="ignore")

In [17]:
# Shorter column names for the books table; ISBN and Publisher are left as they are.
book_column_names = {
    "Book-Title": "Title",
    "Book-Author": "Author",
    "Year-Of-Publication": "Year",
    "Image-URL-L": "image_url",
}
books.rename(columns=book_column_names, inplace=True)

In [18]:
books.head(2)

Unnamed: 0,ISBN,Title,Author,Year,Publisher,image_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


In [19]:
users.columns

Index(['User-ID', 'Location', 'Age'], dtype='object')

In [20]:
# Snake-case column names for the users table.
user_column_names = {"User-ID": "user_id", "Location": "location", "Age": "age"}
users.rename(columns=user_column_names, inplace=True)

In [21]:
users.head(2)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [22]:
ratings.columns

Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')

In [23]:
# Snake-case column names for the ratings table; ISBN keeps its name because it
# is the join key shared with the books table.
rating_column_names = {"User-ID": "user_id", "Book-Rating": "rating"}
ratings.rename(columns=rating_column_names, inplace=True)

In [24]:
ratings.head(2)

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5


#### Cleaning Book Titles

In [25]:
import html

# Decode any HTML entities contained in the titles.
# NOTE(review): only Title is unescaped; later previews of the merged frame
# still show "&amp;" in the Publisher column.
books["Title"] = books["Title"].map(html.unescape)

In [26]:
# Remove escaping artefacts from the titles. The substitutions run in this
# order: \' -> ', \" -> removed, any remaining " -> removed, any remaining
# backslash -> removed.
title_fixes = [
    (r"\\'", "'"),
    (r'\\"', ''),
    (r'"', ""),
    (r"\\", ""),
]
cleaned_titles = books["Title"]
for pattern, replacement in title_fixes:
    cleaned_titles = cleaned_titles.str.replace(pattern, replacement, regex=True)
books["Title"] = cleaned_titles

# Popularity Based Recommendation

In [27]:
# Inner join on ISBN: rating rows whose ISBN has no match in `books` are dropped.
df = pd.merge(ratings, books, on="ISBN", how="inner")

In [28]:
df.head(3)

Unnamed: 0,user_id,ISBN,rating,Title,Author,Year,Publisher,image_url
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...


In [29]:
df.shape

(1031136, 8)

In [30]:
# Number of rating rows recorded for each title.
num_rating_df = (
    df.groupby("Title")["rating"]
    .count()
    .reset_index(name="tot_ratings")
)
num_rating_df

Unnamed: 0,Title,tot_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241052,Ã?Â?lpiraten.,2
241053,Ã?Â?rger mit Produkt X. Roman.,4
241054,Ã?Â?sterlich leben.,1
241055,Ã?Â?stlich der Berge.,3


In [31]:
# Keep only the two columns needed to average ratings per title.
df1 = df.loc[:, ["Title", "rating"]]

In [32]:
# Mean rating per title.
# NOTE(review): the rating column contains 0 values (visible in the previews
# above) and they are averaged in as-is. If 0 marks "no explicit score" in this
# dataset, these means are biased low -- confirm against the dataset description.
avg_rating_df = (
    df1.groupby("Title")["rating"]
    .mean()
    .reset_index(name="avg_ratings")
)
avg_rating_df

Unnamed: 0,Title,avg_ratings
0,A Light in the Storm: The Civil War Diary of ...,2.250000
1,Always Have Popsicles,0.000000
2,Apple Magic (The Collector's series),0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,0.000000
...,...,...
241052,Ã?Â?lpiraten.,0.000000
241053,Ã?Â?rger mit Produkt X. Roman.,5.250000
241054,Ã?Â?sterlich leben.,7.000000
241055,Ã?Â?stlich der Berge.,2.666667


In [33]:
# One row per title carrying both the rating count and the mean rating.
popular_df = pd.merge(num_rating_df, avg_rating_df, on="Title", how="inner")
popular_df

Unnamed: 0,Title,tot_ratings,avg_ratings
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Always Have Popsicles,1,0.000000
2,Apple Magic (The Collector's series),1,0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
...,...,...,...
241052,Ã?Â?lpiraten.,2,0.000000
241053,Ã?Â?rger mit Produkt X. Roman.,4,5.250000
241054,Ã?Â?sterlich leben.,1,7.000000
241055,Ã?Â?stlich der Berge.,3,2.666667


In [34]:
# Top 50 most popular books: titles with at least `min_ratings` ratings,
# ranked by mean rating (highest first).
min_ratings, top_k = 250, 50
has_enough_ratings = popular_df["tot_ratings"] >= min_ratings
ranked = popular_df.loc[has_enough_ratings].sort_values("avg_ratings", ascending=False)
popular_df = ranked.iloc[:top_k]
popular_df.head()

Unnamed: 0,Title,tot_ratings,avg_ratings
80498,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
80486,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
80505,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741
80490,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
80478,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453


In [35]:
# Attach book metadata by title. A title can match several rows in `books`
# (different ISBNs/editions), so the result has more rows than titles until the
# duplicates are removed in a later cell.
popular_df = pd.merge(popular_df, books, on="Title", how="inner")

In [36]:
popular_df.head()

Unnamed: 0,Title,tot_ratings,avg_ratings,ISBN,Author,Year,Publisher,image_url
0,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804,439136350,J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...
1,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804,439136369,J. K. Rowling,2001,Scholastic,http://images.amazon.com/images/P/0439136369.0...
2,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804,786222743,J. K. Rowling,2000,Thorndike Press,http://images.amazon.com/images/P/0786222743.0...
3,Harry Potter and the Goblet of Fire (Book 4),387,5.824289,439139597,J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439139597.0...
4,Harry Potter and the Goblet of Fire (Book 4),387,5.824289,439139600,J. K. Rowling,2002,Scholastic Paperbacks,http://images.amazon.com/images/P/0439139600.0...


In [37]:
popular_df.shape

(196, 8)

In [38]:
# Keep the first matched edition for every title.
popular_df = popular_df.drop_duplicates(subset="Title", keep="first")
popular_df.shape

(50, 8)

In [39]:
popular_df.columns

Index(['Title', 'tot_ratings', 'avg_ratings', 'ISBN', 'Author', 'Year',
       'Publisher', 'image_url'],
      dtype='object')

In [40]:
# Columns kept for presenting the popular books, in display order.
display_columns = ["Title", "Author", "tot_ratings", "avg_ratings", "image_url"]
popular_df = popular_df.loc[:, display_columns]

In [41]:
popular_df

Unnamed: 0,Title,Author,tot_ratings,avg_ratings,image_url
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,428,5.852804,http://images.amazon.com/images/P/0439136350.0...
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,387,5.824289,http://images.amazon.com/images/P/0439139597.0...
5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,278,5.73741,http://images.amazon.com/images/P/0590353403.0...
9,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,347,5.501441,http://images.amazon.com/images/P/043935806X.0...
13,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,556,5.183453,http://images.amazon.com/images/P/0439064872.0...
16,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,281,5.007117,http://images.amazon.com/images/P/0345339681.0...
17,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,368,4.94837,http://images.amazon.com/images/P/0345339703.0...
26,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,575,4.895652,http://images.amazon.com/images/P/059035342X.0...
28,"The Two Towers (The Lord of the Rings, Part 2)",J.R.R. TOLKIEN,260,4.880769,http://images.amazon.com/images/P/0345339711.0...
39,To Kill a Mockingbird,Harper Lee,510,4.7,http://images.amazon.com/images/P/0446310786.0...


# Collaborative Filtering based Recommendation System

In [42]:
df.head(3)

Unnamed: 0,user_id,ISBN,rating,Title,Author,Year,Publisher,image_url
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...


In [43]:
# Per-user count of non-null values in every column; the "rating" column of the
# result is read in the following cells as the number of ratings per user.
user_rating_df = df.groupby(by="user_id").count()

In [44]:
user_rating_df["rating"]

user_id
2          1
8         17
9          3
10         1
12         1
          ..
278846     1
278849     4
278851    23
278852     1
278854     8
Name: rating, Length: 92106, dtype: int64

In [45]:
# Boolean mask over users: True where the user has at least 200 ratings.
x = user_rating_df["rating"].ge(200)
x.loc[x]

user_id
254       True
2276      True
2766      True
2977      True
3363      True
          ... 
274308    True
275970    True
277427    True
277639    True
278418    True
Name: rating, Length: 816, dtype: bool

In [46]:
# user_ids of the users selected by the mask `x`.
good_users = x.index[x.to_numpy()]
len(good_users)

816

In [47]:
# Ratings given by the selected users only.
is_good_user = df["user_id"].isin(good_users)
filtered_ratings = df.loc[is_good_user]

In [48]:
# Number of distinct users left after filtering.
filtered_ratings["user_id"].nunique()

816

In [49]:
filtered_ratings.head(3)

Unnamed: 0,user_id,ISBN,rating,Title,Author,Year,Publisher,image_url
1150,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1151,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,http://images.amazon.com/images/P/0026217457.0...
1152,277427,003008685X,8,Pioneers,James Fenimore Cooper,1974,Thomson Learning,http://images.amazon.com/images/P/003008685X.0...


In [50]:
# Boolean mask over titles: True where the selected users left at least 50 ratings.
y = filtered_ratings.groupby("Title")["rating"].count().ge(50)
y.loc[y]

Title
1984                                                                 True
1st to Die: A Novel                                                  True
2nd Chance                                                           True
4 Blondes                                                            True
A Bend in the Road                                                   True
                                                                     ... 
Wuthering Heights                                                    True
Year of Wonders                                                      True
You Belong To Me                                                     True
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values    True
Zoya                                                                 True
Name: rating, Length: 707, dtype: bool

In [51]:
# Titles selected by the mask `y`.
good_books = y.index[y.to_numpy()]

In [52]:
# Ratings restricted to both the selected users and the selected titles.
is_good_book = filtered_ratings["Title"].isin(good_books)
final_ratings = filtered_ratings.loc[is_good_book]

In [53]:
final_ratings.head()

Unnamed: 0,user_id,ISBN,rating,Title,Author,Year,Publisher,image_url
1150,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1163,277427,0060930535,0,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.0...
1165,277427,0060934417,0,Bel Canto: A Novel,Ann Patchett,2002,Perennial,http://images.amazon.com/images/P/0060934417.0...
1168,277427,0061009059,9,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich,1995,HarperTorch,http://images.amazon.com/images/P/0061009059.0...
1174,277427,006440188X,0,The Secret Garden,Frances Hodgson Burnett,1998,HarperTrophy,http://images.amazon.com/images/P/006440188X.0...


In [54]:
# Sanity checks on the filtered ratings: shape, total number of missing values,
# and number of fully duplicated rows.
sanity_checks = (
    final_ratings.shape,
    final_ratings.isna().sum().sum(),
    final_ratings.duplicated().sum(),
)
for value in sanity_checks:
    print(value)

(58823, 8)
0
0


In [55]:
# Title-by-user matrix of ratings. When a user has several rows for one title
# (e.g. different ISBNs of the same title), pivot_table combines them with its
# default aggregation (mean).
book_pt = pd.pivot_table(
    final_ratings,
    index="Title",
    columns="user_id",
    values="rating",
)

In [56]:
# (title, user) pairs without a rating become 0 so the matrix is fully numeric.
book_pt.fillna(value=0, inplace=True)

In [57]:
book_pt.head()

user_id,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
from sklearn.metrics.pairwise import cosine_similarity


In [59]:
# Pairwise cosine similarity between the title rows of the pivot table; entry
# [i, j] compares title i with title j.
similarity_scores = cosine_similarity(X=book_pt)

In [60]:
similarity_scores

array([[1.        , 0.0999137 , 0.01189468, ..., 0.01724147, 0.11799012,
        0.07158663],
       [0.0999137 , 1.        , 0.2364573 , ..., 0.19309722, 0.07446129,
        0.16773875],
       [0.01189468, 0.2364573 , 1.        , ..., 0.16565383, 0.04558758,
        0.04938579],
       ...,
       [0.01724147, 0.19309722, 0.16565383, ..., 1.        , 0.0521682 ,
        0.19430288],
       [0.11799012, 0.07446129, 0.04558758, ..., 0.0521682 , 1.        ,
        0.07085128],
       [0.07158663, 0.16773875, 0.04938579, ..., 0.19430288, 0.07085128,
        1.        ]], shape=(707, 707))

In [61]:
similarity_scores.shape

(707, 707)

In [62]:
# Recommendation function
def recommend(book_name, top_n=5):
    """Print the titles most similar to ``book_name``.

    Similarity values are read from the notebook-level ``similarity_scores``
    matrix; its rows and columns are addressed by position in
    ``book_pt.index``.

    Parameters
    ----------
    book_name : str
        Title exactly as it appears in ``book_pt.index``.
    top_n : int, default 5
        Number of titles to print.

    Returns
    -------
    None
        The titles are printed, one per line, most similar first.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``book_pt.index``.
    """
    matches = np.flatnonzero(book_pt.index == book_name)
    if matches.size == 0:
        raise IndexError(f"{book_name!r} is not a title in book_pt.index")
    index = matches[0]

    scores = np.asarray(similarity_scores[index])
    # Stable descending order: ties keep their original (positional) order.
    order = np.argsort(-scores, kind="stable")
    # Exclude the queried title by position instead of assuming it sorts first;
    # another title with an equal top score could otherwise push it into the
    # recommendations.
    order = order[order != index][:top_n]

    for position in order:
        print(book_pt.index[position])

In [63]:
recommend("Message in a Bottle")

Nights in Rodanthe
The Mulberry Tree
A Walk to Remember
River's End
Nightmares & Dreamscapes


In [64]:
recommend("The Notebook")

A Walk to Remember
The Rescue
One Door Away from Heaven
Toxin
The Five People You Meet in Heaven


### ML model training

In [65]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [None]:
# Build a SciPy CSR matrix from the pivot table values; this sparse matrix is
# what the nearest-neighbour model is fitted on.
sparse_df = csr_matrix(book_pt.to_numpy())

In [67]:
sparse_df

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 14345 stored elements and shape (707, 815)>

In [68]:
# Brute-force nearest-neighbour search over the sparse title-by-user matrix.
# NOTE(review): no metric is passed, and the estimator repr below reports
# metric='minkowski' with p=2, whereas `recommend` ranks by cosine similarity.
# The two recommenders give different results for "Message in a Bottle" --
# confirm the metric choice is intended.
knn = NearestNeighbors(algorithm="brute")
model = knn.fit(sparse_df)
model

0,1,2
,n_neighbors,5
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'minkowski'
,p,2
,metric_params,
,n_jobs,


In [69]:
# Query the model with the pivot-table row at position 1, as a 1 x n_users array,
# asking for 6 neighbours.
query_row = book_pt.iloc[[1]].to_numpy()
dist, suggestions = model.kneighbors(query_row, n_neighbors=6)

In [None]:
# dist : gives the distance values with the 6 nearest points.
# suggestions : gives the similar books' index IDs.
dist, suggestions

(array([[ 0.        , 50.8502704 , 51.20790955, 51.6309016 , 51.69139193,
         51.93986908]]),
 array([[  1, 175, 512, 142,   7, 357]]))

In [71]:
# Books recommended by model 
recommended_books = []
for book_id in range(len(suggestions)):
    recommended_books.append(book_pt.index[suggestions[book_id]])

recommended_books

[Index(['1st to Die: A Novel', 'Exclusive', 'The Cradle Will Fall',
        'Deck the Halls (Holiday Classics)', 'A Civil Action', 'No Safe Place'],
       dtype='object', name='Title')]

In [72]:
# fetching image url
id = np.where(final_ratings["Title"] == "The Notebook")[0][0]
final_ratings.iloc[id]['image_url']

'http://images.amazon.com/images/P/0446520802.01.LZZZZZZZ.jpg'

In [73]:
# Recommended Book ids
id_index = []
for book in recommended_books[0]:
    id = np.where(final_ratings["Title"] == book)[0][0]
    id_index.append(id)

id_index

[np.int64(793),
 np.int64(739),
 np.int64(2269),
 np.int64(2718),
 np.int64(507),
 np.int64(22)]

In [74]:
# Image urls of the recommended books
img_url = []
for id in id_index:
    url = final_ratings.iloc[id]['image_url']
    img_url.append(url)

img_url

['http://images.amazon.com/images/P/0446610038.01.LZZZZZZZ.jpg',
 'http://images.amazon.com/images/P/0446604232.01.LZZZZZZZ.jpg',
 'http://images.amazon.com/images/P/0440115450.01.LZZZZZZZ.jpg',
 'http://images.amazon.com/images/P/0743418131.01.LZZZZZZZ.jpg',
 'http://images.amazon.com/images/P/0679772677.01.LZZZZZZZ.jpg',
 'http://images.amazon.com/images/P/0345404777.01.LZZZZZZZ.jpg']

In [None]:
# All titles available for recommendation: the row labels of the pivot table.
book_names = book_pt.index

In [76]:
book_names

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       'A Bend in the Road', 'A Case of Need',
       'A Child Called It: One Child's Courage to Survive', 'A Civil Action',
       'A Day Late and a Dollar Short', 'A Fine Balance',
       ...
       'Winter Moon', 'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya'],
      dtype='object', name='Title', length=707)

#### Save the model and required files as pickle objects

In [77]:
import pickle

# open() does not create missing parent directories, so make sure the output
# folder exists before writing into it.
artifacts_dir = Path("artifacts")
artifacts_dir.mkdir(parents=True, exist_ok=True)

# File name -> object to persist.
artifacts = {
    "model.pkl": model,
    "book_names.pkl": book_names,
    "books_final_ratings.pkl": final_ratings,
    "books_pivot_table.pkl": book_pt,
}

for filename, obj in artifacts.items():
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(artifacts_dir / filename, "wb") as fh:
        pickle.dump(obj, fh)

## Model Testing 

In [92]:
def recommend_book(book_name, n_recommendations=5):
    """Print nearest-neighbour recommendations for ``book_name``.

    The row of ``book_name`` in the notebook-level ``book_pt`` pivot table is
    passed to ``model.kneighbors``; the returned positions are mapped back to
    titles through ``book_pt.index``.

    Parameters
    ----------
    book_name : str
        Title exactly as it appears in ``book_pt.index``.
    n_recommendations : int, default 5
        Maximum number of titles to print.

    Returns
    -------
    None
        A header and the recommended titles are printed.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``book_pt.index``.
    """
    matches = np.flatnonzero(book_pt.index == book_name)
    if matches.size == 0:
        raise IndexError(f"{book_name!r} is not a title in book_pt.index")
    book_id = matches[0]

    # One extra neighbour is requested because the queried row is normally
    # returned as its own nearest neighbour; never ask for more rows than exist.
    n_neighbors = min(n_recommendations + 1, len(book_pt.index))
    query = book_pt.iloc[book_id, :].values.reshape(1, -1)
    _, suggestions = model.kneighbors(query, n_neighbors=n_neighbors)

    # Print the header unconditionally and first, rather than only when the
    # queried title happens to show up in the neighbour list.
    print(f"Searched for: {book_name}\n")
    print("Recommended books:\n")

    # Exclude the query by position, wherever it appears among the neighbours.
    neighbour_ids = [pos for pos in suggestions[0] if pos != book_id]
    for pos in neighbour_ids[:n_recommendations]:
        print(book_pt.index[pos])

In [93]:
recommend_book("Message in a Bottle")

Searched for: Message in a Bottle

Recommended books:

The Bourne Supremacy
Nights in Rodanthe
The Mulberry Tree
The Most Wanted
The Bourne Ultimatum


In [99]:
recommend_book("Harry Potter and the Chamber of Secrets (Book 2)")

Searched for: Harry Potter and the Chamber of Secrets (Book 2)

Recommended books:

Harry Potter and the Prisoner of Azkaban (Book 3)
Harry Potter and the Goblet of Fire (Book 4)
Harry Potter and the Sorcerer's Stone (Book 1)
Exclusive
Tom Clancy's Op-Center (Tom Clancy's Op Center (Paperback))
