In [2]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [3]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2025-03-13 16:41:07--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: â€˜book-crossings.zip.1â€™


2025-03-13 16:41:07 (247 MB/s) - â€˜book-crossings.zip.1â€™ saved [26085508/26085508]

Archive:  book-crossings.zip
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: BX-Book-Ratings.csv     
replace BX-Books.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 
error:  invalid response [{ENTER}]
replace BX-Books.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [4]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [6]:
#1 remove instances from the dataset users with less than 200 ratings and books with less than 100 ratings.
user_counts = df_ratings['user'].value_counts()
book_counts = df_ratings['isbn'].value_counts()

filtered_users = user_counts[user_counts >= 100].index
df_ratings = df_ratings[df_ratings['user'].isin(filtered_users)]

filtered_books = book_counts[book_counts >= 200].index
df_ratings = df_ratings[df_ratings['isbn'].isin(filtered_books)]

df_books = df_books[df_books['isbn'].isin(df_ratings['isbn'])]

Remaining books: 194
Remaining ratings: 28969


In [100]:
book_pivot = df_ratings.pivot(index='isbn', columns='user', values='rating')
print(book_pivot[book_pivot > 0])
isbn_to_title = df_books.set_index('isbn')['title'].to_dict()

nn_model = NearestNeighbors(n_neighbors=5, algorithm='auto', metric='euclidean')
nn_model.fit(book_pivot)


user        254     507     882     1424    1435    1733    1903    2033    \
isbn                                                                         
0060392452     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
0060502258     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
0060915544     NaN     NaN     NaN     NaN    10.0     NaN     NaN     NaN   
0060928336     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
0060930535     NaN     NaN     NaN     7.0     NaN     NaN     NaN     NaN   
...            ...     ...     ...     ...     ...     ...     ...     ...   
1400031354     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
1400034779     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
155874262X     NaN     NaN     NaN     NaN    10.0     NaN     NaN     NaN   
1558743669     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
1573229326     NaN     NaN     NaN     NaN     NaN     NaN     N

ValueError: Input X contains NaN.
NearestNeighbors does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [98]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  book_isbn = df_books[df_books['title'] == book]['isbn'].values[0]
  print("Book ISBN:", book_isbn)
  print(df_books[df_books['isbn'] == book_isbn])
  print("Yessir", [book_pivot.loc[book_isbn]])
  distances, indices = nn_model.kneighbors([book_pivot.loc[book_isbn]])

  recommended_isbns = book_pivot.index[indices[0]].tolist()

  recommended_books = []
  recommended_books.append(book)

  recommendations = []

  for idx in range(1,len(recommended_isbns)):
    recommendations.append([isbn_to_title[recommended_isbns[idx]], distances[0][idx]])

  recommended_books.append(recommendations)
  return recommended_books

In [99]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! ðŸŽ‰ðŸŽ‰ðŸŽ‰ðŸŽ‰ðŸŽ‰")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

Book ISBN: 0446672211
           isbn                                              title  \
706  0446672211  Where the Heart Is (Oprah's Book Club (Paperba...   

           author  
706  Billie Letts  
Yessir [user
254       0.0
507       0.0
882       0.0
1424      0.0
1435      8.0
         ... 
277478    0.0
277639    0.0
278137    0.0
278188    0.0
278418    0.0
Name: 0446672211, Length: 1722, dtype: float32]
["Where the Heart Is (Oprah's Book Club (Paperback))", [["Songs in Ordinary Time (Oprah's Book Club (Paperback))", 81.21575927734375], ['Here on Earth', 81.61495208740234], ['The Last Time They Met : A Novel', 81.70679473876953], ['Digital Fortress : A Thriller', 82.60750579833984]]]
Book ISBN: 0446672211
           isbn                                              title  \
706  0446672211  Where the Heart Is (Oprah's Book Club (Paperba...   

           author  
706  Billie Letts  
Yessir [user
254       0.0
507       0.0
882       0.0
1424      0.0
1435      8.0
         ..