In [1]:
import pandas as pd
import numpy as np
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
books = pd.read_csv('data.csv')

In [3]:
books.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0


In [4]:
books.shape

(6810, 12)

In [5]:
books.columns

Index(['isbn13', 'isbn10', 'title', 'subtitle', 'authors', 'categories',
       'thumbnail', 'description', 'published_year', 'average_rating',
       'num_pages', 'ratings_count'],
      dtype='object')

In [6]:
selected_features= books[[ 'title', 'authors', 'categories', 'published_year','thumbnail']]

In [7]:
books.head(2)

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0


In [8]:
#replace null values 
for feature in selected_features:
    books[feature]= books[feature].fillna('')

In [9]:
#combining all features
com_features =books['title']+' '+books['authors']+' '+books['categories']+' '+ f"{books['published_year']}"
com_features

0       Gilead Marilynne Robinson Fiction 0       2004...
1       Spider's Web Charles Osborne;Agatha Christie D...
2       The One Tree Stephen R. Donaldson American fic...
3       Rage of angels Sidney Sheldon Fiction 0       ...
4       The Four Loves Clive Staples Lewis Christian l...
                              ...                        
6805    I Am that Sri Nisargadatta Maharaj;Sudhakar S....
6806    Secrets Of The Heart Khalil Gibran Mysticism 0...
6807    Fahrenheit 451 Ray Bradbury Book burning 0    ...
6808    The Berlin Phenomenology Georg Wilhelm Friedri...
6809    'I'm Telling You Stories' Helena Grice;Tim Woo...
Length: 6810, dtype: object

In [10]:
vectorizer = TfidfVectorizer()
ft_vectors=vectorizer.fit_transform(com_features)

In [11]:
print (ft_vectors)

  (0, 6894)	0.06443750710614471
  (0, 2840)	0.06443750710614471
  (0, 143)	0.06443750710614471
  (0, 5557)	0.06443750710614471
  (0, 7629)	0.06443750710614471
  (0, 6655)	0.06443750710614471
  (0, 102)	0.06443750710614471
  (0, 142)	0.06443750710614471
  (0, 91)	0.06443750710614471
  (0, 141)	0.06443750710614471
  (0, 140)	0.06443750710614471
  (0, 139)	0.06443750710614471
  (0, 103)	0.06443750710614471
  (0, 138)	0.06443750710614471
  (0, 110)	0.06443750710614471
  (0, 98)	0.12887501421228942
  (0, 92)	0.06443750710614471
  (0, 108)	0.06443750710614471
  (0, 112)	0.12887501421228942
  (0, 3422)	0.11189605685054442
  (0, 8035)	0.4370004681370611
  (0, 6013)	0.5885172279797456
  (0, 3882)	0.5885172279797456
  (1, 9065)	0.2252600458713649
  (1, 6625)	0.3013956698681646
  :	:
  (6809, 10330)	0.33175135989257093
  (6809, 2240)	0.2206043839379868
  (6809, 9492)	0.30218650531542857
  (6809, 10435)	0.26439634843355025
  (6809, 5684)	0.20438577371569946
  (6809, 9065)	0.21319651041165227
  (68

In [12]:
#similarity calculation using cosine similarity

similarity= cosine_similarity(ft_vectors,ft_vectors)



In [13]:
print (similarity)

[[1.         0.08005311 0.11641123 ... 0.08492469 0.08320896 0.07576596]
 [0.08005311 1.         0.08011208 ... 0.06549297 0.06416982 0.10645452]
 [0.11641123 0.08011208 1.         ... 0.08498725 0.09570742 0.07582177]
 ...
 [0.08492469 0.06549297 0.08498725 ... 1.         0.06807484 0.06198557]
 [0.08320896 0.06416982 0.09570742 ... 0.06807484 1.         0.06073329]
 [0.07576596 0.10645452 0.07582177 ... 0.06198557 0.06073329 1.        ]]


Testing the system 

In [14]:
#creaating a list of all the book names 

list_of_titles=books['title'].tolist()
print (list_of_titles)



In [15]:
#asking user for their favorite book 
book_name = input('Enter Your Favorite Book Name : ')

In [16]:
#finding the book that matches 
close_match_finder =difflib.get_close_matches(book_name,list_of_titles)
print(close_match_finder)

['Notes from Underground']


In [17]:
close_match= close_match_finder[0]
book_index=books[books.title == close_match].index[0]

In [18]:
similar_score= list(enumerate(similarity[book_index]))
print(similarity)

[[1.         0.08005311 0.11641123 ... 0.08492469 0.08320896 0.07576596]
 [0.08005311 1.         0.08011208 ... 0.06549297 0.06416982 0.10645452]
 [0.11641123 0.08011208 1.         ... 0.08498725 0.09570742 0.07582177]
 ...
 [0.08492469 0.06549297 0.08498725 ... 1.         0.06807484 0.06198557]
 [0.08320896 0.06416982 0.09570742 ... 0.06807484 1.         0.06073329]
 [0.07576596 0.10645452 0.07582177 ... 0.06198557 0.06073329 1.        ]]


In [19]:
sorted_similar_books =sorted(similar_score, key =lambda x:x[1],reverse=True)
print (sorted_similar_books)


[(803, 1.0000000000000002), (2107, 0.5135091802145987), (3947, 0.48504620729576153), (4008, 0.48504620729576153), (5676, 0.48504620729576153), (845, 0.4800794813951192), (3413, 0.4800794813951192), (2108, 0.4321717152668201), (3205, 0.4111330840649004), (2199, 0.38664070733841605), (5437, 0.3503427006428775), (6767, 0.3370207331893142), (586, 0.32523487835863074), (797, 0.29330468653312874), (2042, 0.28050434993348455), (5281, 0.237736506504679), (3941, 0.2315189400118712), (1464, 0.2248199463633788), (1202, 0.21944629722944534), (5351, 0.21709021103969423), (5766, 0.21023379886820634), (4733, 0.20733258933895737), (6031, 0.19953306859099443), (1102, 0.19442398580191939), (3971, 0.19285290195599614), (158, 0.19127897537409588), (1767, 0.18891534976579819), (1877, 0.18593879786580697), (192, 0.18487802210509627), (3001, 0.18063061619500065), (2256, 0.1804018580993848), (2793, 0.18019868434997796), (3423, 0.1792831652654074), (5071, 0.17731611189623897), (3565, 0.17412251248986205), (533

In [20]:
most_similar= sorted_similar_books[:5]
most_similar

[(803, 1.0000000000000002),
 (2107, 0.5135091802145987),
 (3947, 0.48504620729576153),
 (4008, 0.48504620729576153),
 (5676, 0.48504620729576153)]

In [24]:
#print name from the index 
i=1
for book in sorted_similar_books:
    index= book[0]
    title_from_index=books[books.index==index]['title'].values[0]
    pictures = books[books.index==index]['thumbnail'].values[0]
    if (i< 6):
        print(i,'-', title_from_index, ' ' ,pictures )
        i+=1
 

1 - The Adolescent   http://books.google.com/books/content?id=nKJ27jiKJK4C&printsec=frontcover&img=1&zoom=1&source=gbs_api
2 - The Brothers Karamazov   http://books.google.com/books/content?id=HOf-64Go9cgC&printsec=frontcover&img=1&zoom=1&source=gbs_api
3 - The Brothers Karamazov   http://books.google.com/books/content?id=ZI2ncEieZloC&printsec=frontcover&img=1&zoom=1&source=gbs_api
4 - Crime and Punishment   http://books.google.com/books/content?id=hG5P6jrzJLAC&printsec=frontcover&img=1&zoom=1&source=gbs_api
5 - Crime and Punishment   http://books.google.com/books/content?id=PJTlydN3Mf0C&printsec=frontcover&img=1&zoom=1&source=gbs_api


In [22]:
book_name=input("Enter your favorite book :")

list_of_titles= books['title'].tolist()

find_close_match=difflib.get_close_matches(book_name,list_of_titles)

close_match= find_close_match[0]

book_index=books[books.title == close_match].index[0]

similarity_score=list(enumerate (similarity[book_index]))

sorted_similar_books= sorted(similarity_score , key = lambda x:x[1] ,reverse = True)

print ("Books Suggested for you : \n")

for book in sorted_similar_books:
    index = book[0]
    title_from_index =books[books.index==index]['title'].values[0]
    
    if (i<30):
        print ( i ,' ' , title_from_index)
        i+=1





Books Suggested for you : 

6   The Adolescent
7   The Brothers Karamazov
8   The Brothers Karamazov
9   Crime and Punishment
10   Crime and Punishment
11   Crime and Punishment
12   The Double and the Gambler
13   Notes from Underground
14   The Idiot
15   The Grand Inquisitor
16   The Gambler, Bobok, A Nasty Story
17   The Brothers Karamazov
18   The Karamazov Brothers
19   Love
20   The Gunslinger
21   The Book of the Dragon
22   The Lighthouse
23   Atonement
24   The Lord of the Rings
25   The Lord of the Rings
26   The Odyssey
27   The Gunslinger
28   The Mysterious Island
29   The Mind Parasites
