In [2]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

the next cell is commented because this code is designed to run locally so you actually need to download the files yourself

In [5]:
# get data files
# !wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

# !unzip book-crossings.zip

# books_filename = pd.read_csv('FCCMLproject03/BX-Books.csv')

# books_filename = 'BX-Books.csv'
# ratings_filename = 'BX-Book-Ratings.csv'

In [52]:
# import csv data into dataframes
df_books = pd.read_csv(
    'C:/Users/asus/FCCMLproject03/BX-Books.csv',
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    'C:/Users/asus/FCCMLproject03/BX-Book-Ratings.csv',
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

this cell filters our dataframe down to the users with more than 200 and the bookd with more than 100 ratings.
it shows you how much your data shrank.

In [60]:
user = df_ratings['user'].value_counts()
user200 = user[user.values<200]
book = df_ratings['isbn'].value_counts()
book100 = book[book.values<100]
df_ratings_1 = df_ratings[(~df_ratings['user'].isin(user200.index)) & (~df_ratings['isbn'].isin(book100.index))]

print(len(df_ratings))
print(len(df_ratings_1))

1149780
49781


you can skip the next cell.
it only checks for the ISBNs validation. (not every 10 digit code can be a validated ISBN)
as the result shows that all of the ISBNs on our new dataframe are validated.

In [62]:
# cleaning the data

    # validating isbns and remove the rows with unvalidated isbn
# wrong isbn
wrong_isbn=[]
# isbn checker def
def isbn_checker(isbn):
    a = list(isbn)
    if len(a)==10:
        if a[9] == 'X':
            a[9] = 10
        try:
            a = [int(x) for x in a]
            b=10*a[0]+9*a[1]+8*a[2]+7*a[3]+6*a[4]+5*a[5]+4*a[6]+3*a[7]+2*a[8]+a[9]
            if (b%11 != 0):
                wrong_isbn.append(isbn)
        except:
            wrong_isbn.append(isbn)
    else:
        wrong_isbn.append(isbn)
for isbn in df_ratings_1['isbn']:
    isbn_checker(isbn)
# converting it to a panda serie
wrong_isbn_ = pd.Series(wrong_isbn)
# a wrong isbn might be wrong for several reasons so let's make them unique
wrong_isbn_ = wrong_isbn_.unique()
# and filter 
df_ratings_isbn_1 = df_ratings_1[~df_ratings_1['isbn'].isin(wrong_isbn_)]
print(len(df_ratings_isbn_1))
print(len(df_ratings_1))

### result: ratings shriked from 1149780 to 1133161 

49781
49781


your data is ready! (DO NOT filter ratings with 0 value. I know in the project description it says ratings are in range of 1 to 10
                     but for some reasons it works when you keep the zero ratings!)

start developing your model:

In [64]:
isbn_user_matrix = df_ratings_1.pivot(index='isbn', columns='user', values='rating').fillna(0)
sparse_matrix = csr_matrix(isbn_user_matrix)
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5)
model_knn.fit(sparse_matrix)

since there are 2 data frames, we work with ISBNs and we print the titles, the following cell is for converting ISBNs to titles and vice versa!

In [66]:
def isbn_to_title(isbn):
    return df_books[df_books['isbn'] == isbn]['title'].iloc[0]

def title_to_isbn(title):
    return df_books[df_books['title'] == title]['isbn'].iloc[0]

In [35]:
# add your code here - consider creating a new cell for each section of code

In [68]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
    title = book
    isbn = title_to_isbn(title)
    distances, indices = model_knn.kneighbors(sparse_matrix[list(isbn_user_matrix.index).index(isbn)], n_neighbors=6)
    indices_title = [isbn_to_title(isbn_user_matrix.index[i_1]) for i_1 in indices[0]]
    the_list = [[indices_title[j], distances[0][j]] for j in range(1, len(distances[0]))]
    
    ### for some reasons you need to publish your list upside down to get the wanted list
    the_list = [the_list[-i] for i in range(1, len(the_list))]
    the_list = [title, the_list]
    recommended_books = the_list
    return recommended_books

In [70]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2): 
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [["I'll Be Seeing You", 0.8016211], ['The Weight of Water', 0.77085835], ['The Surgeon', 0.7699411], ['I Know This Much Is True', 0.7677075]]]
You passed the challenge! 🎉🎉🎉🎉🎉
