In [1]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2024-10-14 20:12:55--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip.4’


2024-10-14 20:12:55 (75.8 MB/s) - ‘book-crossings.zip.4’ saved [26085508/26085508]

Archive:  book-crossings.zip
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: Y
  inflating: BX-Book-Ratings.csv     
replace BX-Books.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [3]:
# Load the books and ratings CSVs into dataframes.
# Both files are read with the same encoding, separator and header row.
csv_format = dict(encoding="ISO-8859-1", sep=";", header=0)

# Column name -> dtype; the keys double as the column list to keep.
book_dtypes = {'isbn': 'str', 'title': 'str', 'author': 'str'}
rating_dtypes = {'user': 'int32', 'isbn': 'str', 'rating': 'float32'}

df_books = pd.read_csv(
    books_filename,
    names=list(book_dtypes),
    usecols=list(book_dtypes),
    dtype=book_dtypes,
    **csv_format)

df_ratings = pd.read_csv(
    ratings_filename,
    names=list(rating_dtypes),
    usecols=list(rating_dtypes),
    dtype=rating_dtypes,
    **csv_format)

In [4]:
# Peek at the first rows of both frames.
# Only the last expression of a cell is rendered automatically, so the books
# preview needs an explicit display() or it is silently discarded.
display(df_books.head(5))
df_ratings.head(5)

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


In [5]:
# Load the users table (same separator and encoding as the other CSVs)
user_filename = 'BX-Users.csv'
df_users = pd.read_csv(user_filename, sep=";", header=0, encoding="ISO-8859-1")

In [6]:
# Count how many ratings each ISBN received and show the most-rated ones
rating_count = df_ratings.groupby('isbn')['rating'].count().to_frame()
rating_count.sort_values(by='rating', ascending=False).head()

Unnamed: 0_level_0,rating
isbn,Unnamed: 1_level_1
971880107,2502
316666343,1295
385504209,883
60928336,732
312195516,723


In [7]:
# Check Most Rated Books
# Take the five most-rated ISBNs straight from rating_count instead of
# hand-copying them from the previous cell's output, so the summary stays
# correct if the ratings data changes.
most_rated_books = (
    rating_count.sort_values('rating', ascending=False)
    .head(5)
    .reset_index()[['isbn']]
)
# Look up title and author for those ISBNs
most_rated_books_summary = pd.merge(most_rated_books, df_books, on='isbn')
most_rated_books_summary

Unnamed: 0,isbn,title,author
0,971880107,Wild Animus,Rich Shapero
1,316666343,The Lovely Bones: A Novel,Alice Sebold
2,385504209,The Da Vinci Code,Dan Brown
3,60928336,Divine Secrets of the Ya-Ya Sisterhood: A Novel,Rebecca Wells
4,312195516,The Red Tent (Bestselling Backlist),Anita Diamant


In [8]:
# Mean rating and number of ratings per ISBN, most-rated first
ratings_by_isbn = df_ratings.groupby('isbn')['rating']
average_rating = ratings_by_isbn.mean().to_frame()
average_rating['ratingCount'] = ratings_by_isbn.count()
average_rating.sort_values(by='ratingCount', ascending=False).head()

Unnamed: 0_level_0,rating,ratingCount
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
971880107,1.019584,2502
316666343,4.468726,1295
385504209,4.652322,883
60928336,3.448087,732
312195516,4.334716,723


In [9]:
# Attach title and author to each rating by joining on ISBN
combine_book_rating = df_ratings.merge(df_books, on='isbn')
combine_book_rating.head()

Unnamed: 0,user,isbn,rating,title,author
0,276725,034545104X,0.0,Flesh Tones: A Novel,M. J. Rose
1,276726,0155061224,5.0,Rites of Passage,Judith Rae
2,276727,0446520802,0.0,The Notebook,Nicholas Sparks
3,276729,052165615X,3.0,Help!: Level 1,Philip Prowse
4,276729,0521795028,6.0,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather


In [10]:
# Drop rows that have no title, then count the ratings per title
combine_book_rating = combine_book_rating.dropna(axis=0, subset=['title'])

book_ratingCount = (
    combine_book_rating
    .groupby('title')['rating']
    .count()
    .rename('totalratingCount')
    .reset_index()
)
book_ratingCount.head()

Unnamed: 0,title,totalratingCount
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [11]:
# Add each title's total rating count to every individual rating row
rating_with_totalRatingCount = pd.merge(
    combine_book_rating,
    book_ratingCount,
    on='title',
    how='left',
)
rating_with_totalRatingCount.head()

Unnamed: 0,user,isbn,rating,title,author,totalratingCount
0,276725,034545104X,0.0,Flesh Tones: A Novel,M. J. Rose,60
1,276726,0155061224,5.0,Rites of Passage,Judith Rae,14
2,276727,0446520802,0.0,The Notebook,Nicholas Sparks,650
3,276729,052165615X,3.0,Help!: Level 1,Philip Prowse,1
4,276729,0521795028,6.0,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,1


In [12]:
# Render floats with three decimals
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Keep only the ratings of titles rated at least `popularity_threshold` times
popularity_threshold = 9.0
is_popular = rating_with_totalRatingCount['totalratingCount'] >= popularity_threshold
rating_popular_book = rating_with_totalRatingCount[is_popular]
rating_popular_book.head()

Unnamed: 0,user,isbn,rating,title,author,totalratingCount
0,276725,034545104X,0.0,Flesh Tones: A Novel,M. J. Rose,60
1,276726,0155061224,5.0,Rites of Passage,Judith Rae,14
2,276727,0446520802,0.0,The Notebook,Nicholas Sparks,650
5,276733,2080674722,0.0,Les Particules Elementaires,Michel Houellebecq,14
6,276744,038550120X,7.0,A Painted House,JOHN GRISHAM,838


In [13]:
# Attach user details to every rating of a popular book
combined = rating_popular_book.merge(df_users, left_on = 'user', right_on = 'User-ID', how = 'left')

# Reducing the size of the dataset: keep users whose location mentions usa or canada.
# The left join leaves Location missing for ratings whose user has no match
# (or no location) in df_users; na=False counts those as non-matches instead
# of putting NaN into the mask, which boolean indexing would reject.
in_us_or_canada = combined['Location'].str.contains("usa|canada", na=False)
us_canada_user_rating = combined[in_us_or_canada].drop(columns='Age')
us_canada_user_rating.head()


Unnamed: 0,user,isbn,rating,title,author,totalratingCount,User-ID,Location
0,276725,034545104X,0.0,Flesh Tones: A Novel,M. J. Rose,60,276725,"tyler, texas, usa"
1,276726,0155061224,5.0,Rites of Passage,Judith Rae,14,276726,"seattle, washington, usa"
4,276744,038550120X,7.0,A Painted House,JOHN GRISHAM,838,276744,"torrance, california, usa"
10,276747,0060517794,9.0,Little Altars Everywhere,Rebecca Wells,85,276747,"iowa city, iowa, usa"
11,276747,0451192001,0.0,How Stella Got Her Groove Back,Terry McMillan,210,276747,"iowa city, iowa, usa"


In [14]:
# Build the title-by-user rating table used for training.
# One rating per (user, title) pair; missing pairs are filled with 0.
us_canada_user_rating = us_canada_user_rating.drop_duplicates(subset=['user', 'title'])
us_canada_user_rating_pivot = (
    us_canada_user_rating
    .groupby(['title', 'user'])['rating']
    .mean()
    .unstack(fill_value=0)
)
us_canada_user_rating_matrix = csr_matrix(us_canada_user_rating_pivot.to_numpy())

# Fit a brute-force cosine nearest-neighbours model on the sparse matrix
# (NearestNeighbors is already imported in the first cell).
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(us_canada_user_rating_matrix)

In [17]:
def find_query_index(title="Where the Heart Is (Oprah's Book Club (Paperback))"):
  """Return the row position of a book in ``us_canada_user_rating_pivot``.

  An exact title match is preferred. If there is none, the first title that
  contains ``title`` as a substring is used. The position is printed before
  it is returned.

  Args:
    title: Book title (or part of one) to look up. Defaults to the title
      that was previously hard-coded here.

  Returns:
    Integer row position of the matched title in the pivot table.

  Raises:
    ValueError: If no title in the pivot table matches, instead of silently
      returning None and failing later inside ``iloc``.
  """
  titles = us_canada_user_rating_pivot.index

  # Exact match first, so a longer title that merely contains the query
  # cannot shadow the book that was actually asked for.
  exact_hits = np.flatnonzero(titles == title)
  if exact_hits.size:
    position = int(exact_hits[0])
  else:
    position = next(
      (i for i, candidate in enumerate(titles) if title in candidate),
      None,
    )
    if position is None:
      raise ValueError('No book title matching {0!r} in the rating table'.format(title))

  print(position)
  return position

# Look up the query book and ask the model for its 6 nearest rows
query_index = find_query_index()
query_vector = us_canada_user_rating_pivot.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors = 6)

# The first neighbour only produces the header line; the rest are listed with
# their rank and distance.
print('Recommendations for {0}:\n'.format(us_canada_user_rating_pivot.index[query_index]))
neighbor_rows = zip(indices.flatten()[1:], distances.flatten()[1:])
for rank, (row, distance) in enumerate(neighbor_rows, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, us_canada_user_rating_pivot.index[row], distance))

18449
Recommendations for Where the Heart Is (Oprah's Book Club (Paperback)):

1: The Sunday Wife: A Novel, with distance of 0.8995596170425415:
2: Frozen Summer, with distance of 0.9020170569419861:
3: Thrill!, with distance of 0.9044653177261353:
4: Forever Yours, Faithfully: My Love Story, with distance of 0.904478907585144:
5: Fortunes Rocks, with distance of 0.9080843329429626:


In [18]:
# function to return recommended books - this will be tested
def get_recommends(book = "", n_neighbors = 5):
  """Return the books closest to ``book`` according to ``model_knn``.

  Args:
    book: Title to look up. An exact match in the pivot table is preferred;
      otherwise the first title containing ``book`` as a substring is used.
    n_neighbors: Number of neighbours requested from the model. The first
      neighbour is not reported, so the default of 5 yields four
      recommendations.

  Returns:
    A list whose first element is the matched title, followed by one
    ``(title, distance)`` tuple per remaining neighbour, in the order the
    model returned them.

  Raises:
    ValueError: If no title in the pivot table matches ``book``.
  """
  titles = us_canada_user_rating_pivot.index

  def find_query_index(book):
    # Exact match first, so a longer title that merely contains the query
    # cannot shadow the book that was actually asked for.
    exact_hits = np.flatnonzero(titles == book)
    if exact_hits.size:
      return int(exact_hits[0])
    for i, candidate in enumerate(titles):
      if book in candidate:
        return i
    # Fail with a clear message instead of passing None on to iloc.
    raise ValueError('No book title matching {0!r} in the rating table'.format(book))

  query_index = find_query_index(book)
  query_vector = us_canada_user_rating_pivot.iloc[query_index, :].values.reshape(1, -1)
  distances, indices = model_knn.kneighbors(query_vector, n_neighbors = n_neighbors)

  query_title = titles[query_index]
  print('Recommendations for {0}:\n'.format(query_title))

  recommended_books = [query_title]
  # The first neighbour is skipped by position; presumably it is the query
  # book itself at distance 0 (not verified here).
  for row, distance in zip(indices.flatten()[1:], distances.flatten()[1:]):
    recommended_books.append((titles[row], float(distance)))
  return recommended_books
# Try the recommender on the title used by the challenge test
sample_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
books = get_recommends(sample_title)
print(books)

Recommendations for Where the Heart Is (Oprah's Book Club (Paperback)):

["Where the Heart Is (Oprah's Book Club (Paperback))", ('The Sunday Wife: A Novel', 0.8995596170425415), ('Frozen Summer', 0.9020170569419861), ('Thrill!', 0.9044653177261353), ('Forever Yours, Faithfully: My Love Story', 0.904478907585144)]


In [20]:
# Show the recommendations once more before running the challenge test
sample_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
books = get_recommends(sample_title)
print(books)

def test_book_recommendation():
  """Check get_recommends against the expected challenge result.

  The first entry must be the queried title. Each of the next four entries
  must name one of the expected books and have a distance within 0.05 of the
  expected value at that position. Prints a pass or fail message.
  """
  target = "Where the Heart Is (Oprah's Book Club (Paperback))"
  expected_titles = ["The Sunday Wife: A Novel", 'Frozen Summer', 'Thrill!', 'Forever Yours, Faithfully: My Love Story']
  expected_distances = [0.89, 0.9, 0.9, 0.9]

  recommends = get_recommends(target)

  # Collect every individual check; a single True marks the run as failed.
  mismatches = [recommends[0] != target]
  for rank, approx in enumerate(expected_distances, start=1):
    neighbor = recommends[rank]
    mismatches.append(neighbor[0] not in expected_titles)
    mismatches.append(abs(neighbor[1] - approx) >= 0.05)

  if any(mismatches):
    print("You havn't passed yet. Keep trying!")
  else:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")

test_book_recommendation()

Recommendations for Where the Heart Is (Oprah's Book Club (Paperback)):

["Where the Heart Is (Oprah's Book Club (Paperback))", ('The Sunday Wife: A Novel', 0.8995596170425415), ('Frozen Summer', 0.9020170569419861), ('Thrill!', 0.9044653177261353), ('Forever Yours, Faithfully: My Love Story', 0.904478907585144)]
Recommendations for Where the Heart Is (Oprah's Book Club (Paperback)):

You passed the challenge! 🎉🎉🎉🎉🎉
