Import Dependencies

In [24]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2025-06-14 10:39:47--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip.2’


2025-06-14 10:39:48 (131 MB/s) - ‘book-crossings.zip.2’ saved [26085508/26085508]

Archive:  book-crossings.zip
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace BX-Books.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace BX-Users.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [8]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [9]:
df_books

Unnamed: 0,isbn,title,author
0,0195153448,Classical Mythology,Mark P. O. Morford
1,0002005018,Clara Callan,Richard Bruce Wright
2,0060973129,Decision in Normandy,Carlo D'Este
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,0393045218,The Mummies of Urumchi,E. J. W. Barber
...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger
271375,0525447644,From One to One Hundred,Teri Sloat
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker
271377,0192126040,Republic (World's Classics),Plato


In [10]:
df_books.isnull().sum()

Unnamed: 0,0
isbn,0
title,0
author,2


In [11]:
df_books = df_books.dropna()

In [12]:
df_ratings.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


In [13]:
print(df_books.shape)
print(df_ratings.shape)

(271377, 3)
(1149780, 3)


In [46]:
combined = pd.merge(df_ratings, df_books, on='isbn')
combined.head()

Unnamed: 0,user,isbn,rating,title,author
0,276725,034545104X,0.0,Flesh Tones: A Novel,M. J. Rose
1,276726,0155061224,5.0,Rites of Passage,Judith Rae
2,276727,0446520802,0.0,The Notebook,Nicholas Sparks
3,276729,052165615X,3.0,Help!: Level 1,Philip Prowse
4,276729,0521795028,6.0,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather


In [15]:
#Checking null values in df_ratings

df_ratings.isnull().sum()

Unnamed: 0,0
user,0
isbn,0
rating,0


In [16]:
df_books.info()

<class 'pandas.core.frame.DataFrame'>
Index: 271377 entries, 0 to 271378
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   isbn    271377 non-null  object
 1   title   271377 non-null  object
 2   author  271377 non-null  object
dtypes: object(3)
memory usage: 8.3+ MB


In [17]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   user    1149780 non-null  int32  
 1   isbn    1149780 non-null  object 
 2   rating  1149780 non-null  float32
dtypes: float32(1), int32(1), object(1)
memory usage: 17.5+ MB


In [18]:
combined_book_rating = df_books.merge(df_ratings, on='isbn')
combined_book_rating.shape

(1031173, 5)

In [47]:
combined.isnull().sum()

Unnamed: 0,0
user,0
isbn,0
rating,0
title,0
author,0


In [48]:
combined.head()

Unnamed: 0,user,isbn,rating,title,author
0,276725,034545104X,0.0,Flesh Tones: A Novel,M. J. Rose
1,276726,0155061224,5.0,Rites of Passage,Judith Rae
2,276727,0446520802,0.0,The Notebook,Nicholas Sparks
3,276729,052165615X,3.0,Help!: Level 1,Philip Prowse
4,276729,0521795028,6.0,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather


In [49]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031173 entries, 0 to 1031172
Data columns (total 5 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   user    1031173 non-null  int32  
 1   isbn    1031173 non-null  object 
 2   rating  1031173 non-null  float32
 3   title   1031173 non-null  object 
 4   author  1031173 non-null  object 
dtypes: float32(1), int32(1), object(3)
memory usage: 31.5+ MB


In [50]:
combined_features = (
    combined['isbn']+ ' ' +
    combined['title']+ ' ' +
    combined['author'] + ' ' +
    combined['user'].astype(str) + ' ' +
    combined['rating'].astype(str) + ' '
)
combined_features

Unnamed: 0,0
0,034545104X Flesh Tones: A Novel M. J. Rose 276...
1,0155061224 Rites of Passage Judith Rae 276726 ...
2,0446520802 The Notebook Nicholas Sparks 276727...
3,052165615X Help!: Level 1 Philip Prowse 276729...
4,0521795028 The Amsterdam Connection : Level 4 ...
...,...
1031168,0876044011 Edgar Cayce on the Akashic Records:...
1031169,1563526298 Get Clark Smart : The Ultimate Guid...
1031170,0679447156 Eight Weeks to Optimum Health: A Pr...
1031171,0515107662 The Sherbrooke Bride (Bride Trilogy...


In [51]:
feature_extraction = TfidfVectorizer(stop_words='english')
feature_vectors = feature_extraction.fit_transform(combined_features)

In [52]:
feature_vectors

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7891617 stored elements and shape (1031173, 473982)>

In [53]:
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(feature_vectors)

In [54]:
print(combined_book_rating['title'].sample(10).values)

['Haven' 'Remember When' "Bram Stoker's Dracula" 'The Naked Heart'
 'Shenandoah Christmas (You, Me &amp; The Kids) (Harlequin Superromance, No. 1024)'
 'Day of the East Wind' 'The Reader'
 'The Prayer of Jabez: Breaking Through to the Blessed Life'
 'Emily Dickinson (Illustrated Poets)' 'Heartbreaker']


In [55]:
def recommend_books_knn(title, n_recommendations=5):
    if title not in combined['title'].values:
        print(f"❌ Book '{title}' not found.")
        return

    index = combined[combined_book_rating['title'] == title].index[0]
    distances, indices = knn_model.kneighbors(feature_vectors[index], n_neighbors=n_recommendations+1)

    print(f"\n📖 Recommendations for: '{title}'\n")
    for i in range(1, len(indices.flatten())):
        similar_title = combined.iloc[indices.flatten()[i]]['title']
        similarity_score = 1 - distances.flatten()[i]
        print(f"{i}. {similar_title} (Similarity Score: {similarity_score:.2f})")

recommend_books_knn("The Mulberry Tree", n_recommendations=5)


📖 Recommendations for: 'The Mulberry Tree'

1. Skinner's Festival (Similarity Score: 0.85)
2. Skinner's Festival (Similarity Score: 0.84)
3. Skinner's Mission (Similarity Score: 0.62)
4. Skinner's Rules (Similarity Score: 0.49)
5. Thursday Legends (Similarity Score: 0.49)
