In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [None]:
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2024-11-29 09:00:44--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2024-11-29 09:00:44 (152 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [None]:
# Both files are semicolon-delimited and ISO-8859-1 encoded. header=0 consumes the
# file's own header row so it can be replaced by the short names passed in `names`.
book_columns = ['isbn', 'title', 'author']
df_books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=book_columns,
    usecols=book_columns,
    dtype={column: 'str' for column in book_columns},
)

# Column name -> dtype for the ratings file; the key order is the column order.
rating_dtypes = {'user': 'int32', 'isbn': 'str', 'rating': 'float32'}
df_ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=list(rating_dtypes),
    usecols=list(rating_dtypes),
    dtype=rating_dtypes,
)

In [None]:
# Preview the first rows of the books table.
df_books.head()

Unnamed: 0,isbn,title,author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [None]:
# Preview the last rows of the ratings table.
df_ratings.tail()

Unnamed: 0,user,isbn,rating
1149775,276704,1563526298,9.0
1149776,276706,679447156,0.0
1149777,276709,515107662,10.0
1149778,276721,590442449,10.0
1149779,276723,5162443314,8.0


In [None]:
# Summarize the ratings table: row count, column dtypes and non-null counts.
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   user    1149780 non-null  int32  
 1   isbn    1149780 non-null  object 
 2   rating  1149780 non-null  float32
dtypes: float32(1), int32(1), object(1)
memory usage: 17.5+ MB


In [None]:
# Summarize the books table: row count, column dtypes and non-null counts.
df_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   isbn    271379 non-null  object
 1   title   271379 non-null  object
 2   author  271377 non-null  object
dtypes: object(3)
memory usage: 6.2+ MB


In [None]:
# Keep only ratings from users who have submitted at least 10 ratings
user_counts = df_ratings['user'].value_counts()
valid_users = user_counts.loc[user_counts.ge(10)].index
df_ratings = df_ratings.loc[df_ratings['user'].isin(valid_users)]

# Among the remaining ratings, keep only books that still have at least 10 ratings
book_counts = df_ratings['isbn'].value_counts()
valid_books = book_counts.loc[book_counts.ge(10)].index
df_ratings_filtered = df_ratings.loc[df_ratings['isbn'].isin(valid_books)]
df_ratings_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 419075 entries, 31 to 1149772
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   user    419075 non-null  int32  
 1   isbn    419075 non-null  object 
 2   rating  419075 non-null  float32
dtypes: float32(1), int32(1), object(1)
memory usage: 9.6+ MB


In [None]:
# Create a utility matrix: one row per book (ISBN), one column per user.
# (ISBN, user) pairs that have no rating are filled with 0.
utility_matrix = df_ratings_filtered.pivot_table(
    index='isbn',
    columns='user',
    values='rating',
    fill_value=0
)

# Normalize the data for KNN. StandardScaler standardizes each column (here: each
# user). By default fit_transform returns a plain NumPy array without the ISBN
# labels, so rows are later located by their position in utility_matrix.index.
scaler = StandardScaler()
normalized_matrix = scaler.fit_transform(utility_matrix)

In [None]:
# Exhaustive (brute-force) neighbour search over the book rows, ranked by cosine distance.
knn = NearestNeighbors(algorithm='brute', metric='cosine')
knn.fit(normalized_matrix)

In [None]:
def get_recommends(book_title, n_recommendations=5):
    """Recommend the books whose rating vectors are closest to ``book_title``.

    Relies on the notebook-level objects ``df_books``, ``utility_matrix``,
    ``normalized_matrix`` and ``knn`` built in earlier cells.

    Parameters
    ----------
    book_title : str
        Exact title to look up in ``df_books['title']``.
    n_recommendations : int, default 5
        Maximum number of neighbours to return. Fewer are returned when the
        utility matrix does not contain enough other books.

    Returns
    -------
    list or str
        ``[book_title, [[title, distance], ...]]`` ordered as returned by
        ``knn.kneighbors``. A neighbour whose ISBN is missing from ``df_books``
        is reported by its ISBN instead of a title. If the title is unknown, or
        none of its ISBNs is present in ``utility_matrix``, an explanatory
        message string is returned instead.

    Raises
    ------
    ValueError
        If ``n_recommendations`` is negative.
    """
    if n_recommendations < 0:
        raise ValueError("n_recommendations must be non-negative")

    # A title can map to several ISBNs (one catalogue row per edition).
    matching_isbns = df_books.loc[df_books['title'] == book_title, 'isbn']
    if matching_isbns.empty:
        return f"Book '{book_title}' not found in the dataset."

    # Use the first edition that is actually present in the utility matrix,
    # rather than giving up when only the first catalogue row was filtered out.
    rated_isbns = matching_isbns[matching_isbns.isin(utility_matrix.index)]
    if rated_isbns.empty:
        return f"Book '{book_title}' is not part of the filtered dataset (insufficient ratings)."
    isbn = rated_isbns.iloc[0]

    # Row position of the book; the pivoted index holds each ISBN once.
    book_index = utility_matrix.index.get_loc(isbn)

    # Ask for one extra neighbour because the query book itself is normally among
    # the results; never request more neighbours than there are rows.
    n_neighbors = min(n_recommendations + 1, len(utility_matrix.index))
    distances, indices = knn.kneighbors([normalized_matrix[book_index]], n_neighbors=n_neighbors)

    # Drop the query book by position (on ties it is not guaranteed to be ranked
    # first), then keep at most the requested number of neighbours.
    neighbor_pairs = [
        (utility_matrix.index[idx], dist)
        for idx, dist in zip(indices.flatten(), distances.flatten())
        if idx != book_index
    ][:n_recommendations]

    # Resolve all neighbour titles in a single pass over the catalogue; the first
    # catalogue row wins when an ISBN is listed more than once.
    neighbor_isbns = [rec_isbn for rec_isbn, _ in neighbor_pairs]
    catalogue_rows = df_books[df_books['isbn'].isin(neighbor_isbns)]
    title_by_isbn = catalogue_rows.drop_duplicates('isbn').set_index('isbn')['title']

    # Rated ISBNs are not guaranteed to exist in the catalogue; fall back to the
    # ISBN itself instead of failing on an empty lookup.
    recommendations = [
        [title_by_isbn.get(rec_isbn, rec_isbn), dist]
        for rec_isbn, dist in neighbor_pairs
    ]

    return [book_title, recommendations]


In [None]:
# Show catalogue rows 15-24 (selected by position).
df_books.iloc[15:25]

Unnamed: 0,isbn,title,author
15,1567407781,The Witchfinder (Amos Walker Mystery Series),Loren D. Estleman
16,1575663937,More Cunning Than Man: A Social History of Rat...,Robert Hendrickson
17,1881320189,Goodbye to the Buttermilk Sky,Julia Oliver
18,440234743,The Testament,John Grisham
19,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison
20,609804618,Our Dumb Century: The Onion Presents 100 Years...,The Onion
21,1841721522,New Vegetarian: Bold and Beautiful Recipes for...,Celia Brooks Brown
22,1879384493,If I'd Known Then What I Know Now: Why Not Lea...,J. R. Parrish
23,61076031,Mary-Kate &amp; Ashley Switching Goals (Mary-K...,Mary-Kate &amp; Ashley Olsen
24,439095026,Tell Me This Isn't Happening,Robynn Clairday


In [None]:
# Example query: nearest neighbours of "The Testament" with their distances.
get_recommends("The Testament")

['The Testament',
 [['The Prize', 0.85532016],
  ['Year Zero', 0.8691697],
  ["The Valley of Horses (Earth's Children)", 0.88160455],
  ['EVERYTHING SHE EVER WANTED', 0.8891807],
  ['Forty Words for Sorrow', 0.89643824]]]