In [2]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [3]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2024-06-13 22:29:27--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.2.33, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2024-06-13 22:29:28 (106 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [4]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [5]:
# add your code here - consider creating a new cell for each section of code

# EDA df_books

In [6]:
df_books.shape

(271379, 3)

In [7]:
df_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   isbn    271379 non-null  object
 1   title   271379 non-null  object
 2   author  271377 non-null  object
dtypes: object(3)
memory usage: 6.2+ MB


In [8]:
df_books.sample(3)

Unnamed: 0,isbn,title,author
29379,441062253,Bio of an Ogre: The Autobiography of Piers Ant...,Piers Anthony
23429,345340418,Death in a Tenured Position (Kate Fansler Nove...,Amanda Cross
74115,590216880,Star Signs,Lori Reid


- nulls

In [9]:
df_books.isnull().sum()

isbn      0
title     0
author    2
dtype: int64

In [10]:
df_books[df_books['author'].isnull()]

Unnamed: 0,isbn,title,author
118038,751352497,A+ Quiz Masters:01 Earth,
187700,9627982032,The Credit Suisse Guide to Managing Your Perso...,


In [11]:
# Fill nulls with None because this books don't have an author

df_books.author.fillna('None', inplace=True)

In [12]:
df_books.isnull().any().sum()

0

- isbn

In [13]:
df_books.isbn.dtype

dtype('O')

In [14]:
df_books.isbn.nunique()

271379

In [15]:
df_books.isbn.nunique() == df_books.shape[0]

True

ISBN: Object. There are 271379 different values, one for each row in the dataset

- title

In [16]:
df_books.title.dtype

dtype('O')

In [17]:
df_books.title.nunique()

242154

In [18]:
df_books.title.value_counts()[df_books.title.value_counts() > 1].shape

(19907,)

In [19]:
df_books.title.value_counts(ascending=False).head(10)

title
Selected Poems                    27
Little Women                      24
Wuthering Heights                 21
The Secret Garden                 20
Dracula                           20
Adventures of Huckleberry Finn    20
Jane Eyre                         19
The Night Before Christmas        18
Pride and Prejudice               18
Great Expectations                17
Name: count, dtype: int64

In [20]:
df_books[df_books['title'] == 'Selected Poems']

Unnamed: 0,isbn,title,author
4523,081120958X,Selected Poems,William Carlos Williams
39417,0811201465,Selected Poems,K. Patchen
41317,0679750800,Selected Poems,Rita Dove
106890,0060931744,Selected Poems,Gwendolyn Brooks
118780,0517101548,Selected Poems,John Donne
127468,0520012984,Selected Poems,Paul Verlaine
156902,0871401541,Selected Poems,E. E. Cummings
158321,0060909897,Selected Poems,Gwendolyn Brooks
175611,0571050190,Selected Poems,T. T Gunn
183918,1550651498,Selected Poems,Ralph Gustafson


In [21]:
df_books[df_books['title'] == 'Pride and Prejudice']

Unnamed: 0,isbn,title,author
41,055321215X,Pride and Prejudice,Jane Austen
6745,0553213105,Pride and Prejudice,Jane Austen
13197,0451523652,Pride and Prejudice,Jane Austen
16831,1566190932,Pride and Prejudice,Jane Austen
18485,0451525884,Pride and Prejudice,Jane Austen
20542,0451519167,Pride and Prejudice,Jane Austen
24230,0893756113,Pride and Prejudice,Jane Austen
43982,0553210181,Pride and Prejudice,Jane Austen
46629,0192815032,Pride and Prejudice,Jane Austen
51204,0140238212,Pride and Prejudice,Jane Austen


TITLE: Object. There are more rows in the dataset than book titles, because some book titles are the same for different authors and also because other book titles have different isbn and their author written in different ways (it's necessary to standarize this).

- author

In [22]:
df_books.author.dtype

dtype('O')

In [23]:
df_books.author.value_counts()

author
Agatha Christie        632
William Shakespeare    567
Stephen King           524
Ann M. Martin          423
Carolyn Keene          373
                      ... 
Boissard                 1
Norris Houghton          1
Alain Layrac             1
Lorena-A Hickok          1
Christopher  Biffle      1
Name: count, Length: 102042, dtype: int64

In [24]:
df_books.groupby('author')['title'].nunique().sort_values(ascending=False)

author
William Shakespeare    496
Agatha Christie        476
Ann M. Martin          395
Carolyn Keene          371
Francine Pascal        349
                      ... 
Isabel Nitzsche          1
Isabel Marie             1
Isabel Maestre           1
Isabel Losada            1
Ã?Â?ric Holder           1
Name: title, Length: 102042, dtype: int64

In [25]:
(df_books.author.value_counts() > 2).sum()

19472

In [26]:
df_books[df_books['author'] == 'William Shakespeare']

Unnamed: 0,isbn,title,author
397,0198320264,Julius Caesar (Oxford School Shakespeare),William Shakespeare
1487,0174434642,Othello (3rd Series),William Shakespeare
2259,0671722727,King Lear,William Shakespeare
2366,0671722816,Othello,William Shakespeare
3327,0451521285,The Tragedy of Hamlet Prince of Denmark (Signe...,William Shakespeare
...,...,...,...
268294,052129455X,Macbeth (The New Cambridge Shakespeare),William Shakespeare
269397,0521634970,Romeo and Juliet (Cambridge School Shakespeare),William Shakespeare
269399,0748702563,Macbeth: Shakespeare Made Easy (Shakespeare Ma...,William Shakespeare
269815,0460871811,Antony and Cleopatra (Everyman Paperback Class...,William Shakespeare


AUTHOR: Object. There are 102042 different authors. 19472 of them have more than one book. It's necessary to standarize titles because some of them are repeated many times.

# EDA df_ratings

In [27]:
df_ratings.shape

(1149780, 3)

In [28]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   user    1149780 non-null  int32  
 1   isbn    1149780 non-null  object 
 2   rating  1149780 non-null  float32
dtypes: float32(1), int32(1), object(1)
memory usage: 17.5+ MB


In [29]:
df_ratings.sample(3)

Unnamed: 0,user,isbn,rating
609870,147686,312866151,0.0
133074,30628,446347809,0.0
965285,233361,312263996,8.0


In [30]:
df_ratings.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user,1149780.0,140386.395126,80562.277719,2.0,70345.0,141010.0,211028.0,278854.0
rating,1149780.0,2.86695,3.854184,0.0,0.0,0.0,7.0,10.0


- user

In [31]:
df_ratings.user.dtype

dtype('int32')

In [32]:
df_ratings.user.nunique()

105283

In [33]:
df_ratings.user.value_counts(ascending=False)

user
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
116180        1
116166        1
116154        1
116137        1
276723        1
Name: count, Length: 105283, dtype: int64

In [34]:
mean_rev = df_ratings.user.value_counts(ascending=False).mean()
median_rev = df_ratings.user.value_counts(ascending=False).median()
total_rev = df_ratings.user.value_counts(ascending=False).sum()

print(f"Total reviews: {total_rev}\nMean reviews per user: {mean_rev}\nMedian reviews per user: {median_rev}")

Total reviews: 1149780
Mean reviews per user: 10.920851419507423
Median reviews per user: 1.0


In [35]:
(df_ratings.user.value_counts(ascending=False) == 1).sum()

59166

In [36]:
num_users_less200revs = (df_ratings.user.value_counts() < 200).sum()

print(f"There are {num_users_less200revs} users with less than 200 reviews. They will be dropped")

There are 104378 users with less than 200 reviews. They will be dropped


USER: int32. There are 105283 different users. 104378 of them have less than 200 reviews, they will be dropped for statistical significance.

In [37]:
users_more200revs = df_ratings.user.value_counts()[df_ratings.user.value_counts() >= 200].index.tolist()

df_ratings_UsersReduced = df_ratings[df_ratings.user.isin(users_more200revs)]

In [38]:
df_ratings_UsersReduced.user.value_counts()

user
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
225595      200
83671       200
252827      200
36554       200
99955       200
Name: count, Length: 905, dtype: int64

In [39]:
df_ratings_UsersReduced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 527556 entries, 1456 to 1147616
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   user    527556 non-null  int32  
 1   isbn    527556 non-null  object 
 2   rating  527556 non-null  float32
dtypes: float32(1), int32(1), object(1)
memory usage: 12.1+ MB


- isbn

In [40]:
df_ratings.isbn.dtype

dtype('O')

In [41]:
df_ratings.isbn.nunique()

340556

In [42]:
isbn_more100revs = df_ratings.isbn.value_counts()[(df_ratings.isbn.value_counts() >= 100)]

print(f"There are {len(isbn_more100revs)} isbn which have 100 or more reviews")

There are 731 isbn which have 100 or more reviews


ISBN: Objcet. There are 340556 different books in the original dataset, 207699 in the reduced dataset (without the less active users). Only 731 of them have 100 reviews or more. Books with less than 100 reviews will be dropped for statistical significance.

In [43]:
isbn_more100revs

isbn
0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
              ... 
0786866586     100
0449203794     100
0425151867     100
0312966806     100
0425136981     100
Name: count, Length: 731, dtype: int64

In [44]:
df_ratings_isbnReduced = df_ratings[df_ratings.isbn.isin(isbn_more100revs.index.tolist())]

In [45]:
df_ratings_isbnReduced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 138461 entries, 2 to 1149772
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   user    138461 non-null  int32  
 1   isbn    138461 non-null  object 
 2   rating  138461 non-null  float32
dtypes: float32(1), int32(1), object(1)
memory usage: 3.2+ MB


- rating

In [46]:
df_ratings.rating.dtype

dtype('float32')

In [47]:
df_ratings.rating.describe().to_frame().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rating,1149780.0,2.86695,3.854184,0.0,0.0,0.0,7.0,10.0


In [48]:
df_ratings.groupby('isbn')['rating'].agg([len, 'mean']).sort_values(by='len', ascending=False)

Unnamed: 0_level_0,len,mean
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
0971880107,2502,1.019584
0316666343,1295,4.468726
0385504209,883,4.652322
0060928336,732,3.448087
0312195516,723,4.334716
...,...,...
0801064090,1,0.000000
0801064252,1,5.000000
0801064279,1,0.000000
0801064333,1,0.000000


RATING: float32. 2.86 (mean rating), 3.85 (std), 0 (min), 10 (max), 0 (median), 7 (75%).

- df_ratings_Reduced for statitistical significance

In [49]:
df_ratings_Reduced = df_ratings[(df_ratings.user.isin(users_more200revs)) & (df_ratings.isbn.isin(isbn_more100revs.index.tolist()))]

In [50]:
df_ratings_Reduced

Unnamed: 0,user,isbn,rating
1456,277427,002542730X,10.0
1469,277427,0060930535,0.0
1471,277427,0060934417,0.0
1474,277427,0061009059,9.0
1484,277427,0140067477,0.0
...,...,...,...
1147304,275970,0804111359,0.0
1147436,275970,140003065X,0.0
1147439,275970,1400031346,0.0
1147440,275970,1400031354,0.0


In [51]:
df_ratings_Reduced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49781 entries, 1456 to 1147441
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   user    49781 non-null  int32  
 1   isbn    49781 non-null  object 
 2   rating  49781 non-null  float32
dtypes: float32(1), int32(1), object(1)
memory usage: 1.1+ MB


In [52]:
df_ratings_Reduced.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user,49781.0,140993.388562,80901.415891,254.0,70415.0,140358.0,213350.0,278418.0
rating,49781.0,2.024086,3.576591,0.0,0.0,0.0,3.0,10.0


In [53]:
# ratings_isbn = df_ratings.isbn.unique().tolist()
# books_isbn = df_books.isbn.unique().tolist()
# ratings_count_per_isbn = df_ratings.isbn.value_counts()

# problem_isbn = []

# for i in ratings_isbn:
#   if (i not in books_isbn) and (ratings_count_per_isbn[i] >= 100):
#     problem_isbn.append(i)

# print(f"{len(problem_isbn)} isbn not in df_book")
# for b in problem_isbn:
#   print(b)

In [54]:
df_ratings[df_ratings.isbn == '3257224281']

Unnamed: 0,user,isbn,rating
6,276736,3257224281,8.0
359579,86583,3257224281,6.0
496124,119485,3257224281,5.0
823718,198736,3257224281,8.0
1095325,262974,3257224281,0.0


In [55]:
df_books[df_books.isbn == '3257224281']

Unnamed: 0,isbn,title,author


## Merge datasets

In [56]:
df_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   isbn    271379 non-null  object
 1   title   271379 non-null  object
 2   author  271379 non-null  object
dtypes: object(3)
memory usage: 6.2+ MB


In [57]:
df_ratings_Reduced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49781 entries, 1456 to 1147441
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   user    49781 non-null  int32  
 1   isbn    49781 non-null  object 
 2   rating  49781 non-null  float32
dtypes: float32(1), int32(1), object(1)
memory usage: 1.1+ MB


In [58]:
df_merged = pd.merge(
    left = df_ratings_Reduced,
    right= df_books,
    how = 'left',
    on = 'isbn'
)

In [59]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49781 entries, 0 to 49780
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   user    49781 non-null  int32  
 1   isbn    49781 non-null  object 
 2   rating  49781 non-null  float32
 3   title   49517 non-null  object 
 4   author  49517 non-null  object 
dtypes: float32(1), int32(1), object(3)
memory usage: 1.5+ MB


In [60]:
df_merged.sample(3)

Unnamed: 0,user,isbn,rating,title,author
24902,138844,449221490,0.0,L Is for Lawless,Sue Grafton
26507,147847,553574574,0.0,Beach Music,Pat Conroy
39442,225810,553279912,6.0,A Is for Alibi (Kinsey Millhone Mysteries (Pap...,SUE GRAFTON


In [61]:
# Drop nulls

df_merged.dropna(axis=0, inplace=True)

In [62]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49517 entries, 0 to 49780
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   user    49517 non-null  int32  
 1   isbn    49517 non-null  object 
 2   rating  49517 non-null  float32
 3   title   49517 non-null  object 
 4   author  49517 non-null  object 
dtypes: float32(1), int32(1), object(3)
memory usage: 1.9+ MB


In [63]:
df_merged.isbn.nunique()

727

In [64]:
df_merged.title.nunique()

673

In [65]:
df_merged.title.nunique()

673

In [66]:
df_merged.user.nunique()

888

In [67]:
df_merged.user = df_merged.user.astype('object')

### Utility matrix

In [68]:
isbn_user_matrix = df_merged.pivot(index='isbn', columns='user', values='rating').fillna(0)

In [69]:
isbn_user_matrix

user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
0060008032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060096195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
006016848X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060173289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1573227331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1573229326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1573229571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1592400876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
sparse_isbn_user_matrix = csr_matrix(isbn_user_matrix.values)

### KNN Model

In [71]:
model = NearestNeighbors(metric='cosine', algorithm='brute')
model.fit(sparse_isbn_user_matrix)

In [72]:
# Find 5 k-nearest neighbors for each book (plus itself)

k = 6
distances, indices = model.kneighbors(sparse_isbn_user_matrix, n_neighbors=k)

- Book title to book isbn function

In [73]:
df_merged.groupby('title')['isbn'].agg('first')

title
1984                                                                              0451524934
1st to Die: A Novel                                                               0446610038
2nd Chance                                                                        0446612790
4 Blondes                                                                         0451203895
A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash    0743224574
                                                                                     ...    
Without Remorse                                                                   0425143325
Year of Wonders                                                                   0142001430
You Belong To Me                                                                  0671004549
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values                 0553277472
\O\" Is for Outlaw"                                             

In [74]:
def title_to_isbn(item: str, df=df_merged, reverse=False):
  if not reverse:
    table = df_merged.groupby('title')['isbn'].agg('first')
  else:
    table = df_merged.groupby('isbn')['title'].agg('first')

  return table[item]


In [75]:
title_to_isbn(
    item = "Where the Heart Is (Oprah's Book Club (Paperback))"
)

'0446672211'

In [76]:
title_to_isbn(
    item = "0446672211",
    reverse = True
)

"Where the Heart Is (Oprah's Book Club (Paperback))"

## Get Recommends

In [77]:
indices

array([[  0, 682, 722, 138, 417, 690],
       [  1,  55, 495, 427, 428,  29],
       [  2, 662, 165, 612, 232,  98],
       ...,
       [724, 481,  55, 255, 663,   6],
       [725, 179,  75, 699, 710, 612],
       [726, 674,  65,   9, 580, 169]])

In [78]:
distances

array([[0.        , 0.7898418 , 0.8051655 , 0.8069457 , 0.8131442 ,
        0.82237244],
       [0.        , 0.7128463 , 0.73452413, 0.74777335, 0.75061035,
        0.7533023 ],
       [0.        , 0.70882016, 0.73184365, 0.78030056, 0.8016283 ,
        0.8157513 ],
       ...,
       [0.        , 0.6837633 , 0.7092408 , 0.73659056, 0.7375815 ,
        0.7495322 ],
       [0.        , 0.7694863 , 0.7790514 , 0.78266   , 0.7845751 ,
        0.7929697 ],
       [0.        , 0.6593015 , 0.7600024 , 0.792287  , 0.7958724 ,
        0.80388385]], dtype=float32)

In [79]:
isbn_user_matrix

user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
0060008032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060096195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
006016848X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060173289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1573227331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1573229326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1573229571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1592400876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
book_index = np.random.choice(sparse_isbn_user_matrix.shape[0])
book_index

202

In [81]:
similar_books_indices = indices[book_index]
similar_books_indices

array([202, 207, 264,  54, 252, 122])

In [82]:
similar_books = isbn_user_matrix.index[similar_books_indices].tolist()
similar_books

['0375706771',
 '0375727345',
 '0385720106',
 '0140293248',
 '038550120X',
 '0316899984']

In [83]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  isbn = title_to_isbn(book)
  isbn_index = isbn_user_matrix.index.tolist().index(isbn)
  recommended_isbn_index = indices[isbn_index]
  recommended_isbn = isbn_user_matrix.index[recommended_isbn_index].tolist()
  recommended_titles = [title_to_isbn(isbn, reverse=True) for isbn in recommended_isbn][2:]
  recommended_dict = {k: v for k, v in zip(recommended_titles, distances[isbn_index][2:])}

  recommended_books = []
  recommended_books.append(book)
  books_list = [[k, v] for k, v in recommended_dict.items()]
  recommended_books.append(books_list)

  return recommended_books

In [84]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [['I Know This Much Is True', 0.7677075], ['The Surgeon', 0.7699411], ['The Weight of Water', 0.77085835], ["I'll Be Seeing You", 0.8016211]]]
You passed the challenge! 🎉🎉🎉🎉🎉


## Test: Recommend a Stephen King book

In [85]:
df_merged[df_merged.author == 'Stephen King']

Unnamed: 0,user,isbn,rating,title,author
54,277427,0451169522,0.0,Misery,Stephen King
55,277427,0451177096,0.0,Dolores Claiborne,Stephen King
109,277639,0451170385,0.0,Four Past Midnight,Stephen King
180,278418,0451153553,0.0,Misery,Stephen King
181,278418,0451156609,0.0,The Tommyknockers,Stephen King
...,...,...,...,...,...
49505,274061,0743211383,10.0,Dreamcatcher,Stephen King
49543,274301,0451169530,7.0,The Stand: Complete and Uncut,Stephen King
49645,274308,0451153553,0.0,Misery,Stephen King
49646,274308,0451156609,0.0,The Tommyknockers,Stephen King


In [88]:
get_recommends("Gerald's Game")

["Gerald's Game",
 [['Dolores Claiborne', 0.60728693],
  ['Four Past Midnight', 0.6227506],
  ['The Tommyknockers', 0.626948],
  ['Bag of Bones', 0.67829126]]]