In [59]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [60]:
books=pd.read_csv('books.csv')

In [61]:
books.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0


In [62]:
books.isnull().sum()

isbn13               0
isbn10               0
title                0
subtitle          4429
authors             72
categories          99
thumbnail          329
description        262
published_year       6
average_rating      43
num_pages           43
ratings_count       43
dtype: int64

In [63]:
book=books[['isbn13','title','subtitle','authors','categories','description','thumbnail']]

In [64]:
book.fillna('', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book.fillna('', inplace=True)


In [65]:
book.isnull().sum()

isbn13         0
title          0
subtitle       0
authors        0
categories     0
description    0
thumbnail      0
dtype: int64

In [66]:
book['tags']=book['title']+' '+book['subtitle']+' '+book['authors']+' '+book['categories']+' '+book['description']
book['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book['tags']=book['title']+' '+book['subtitle']+' '+book['authors']+' '+book['categories']+' '+book['description']


0       gilead  marilynne robinson fiction a novel tha...
1       spider's web a novel charles osborne;agatha ch...
2       the one tree  stephen r. donaldson american fi...
3       rage of angels  sidney sheldon fiction a memor...
4       the four loves  clive staples lewis christian ...
                              ...                        
6805    i am that talks with sri nisargadatta maharaj ...
6806       secrets of the heart  khalil gibran mysticism 
6807           fahrenheit 451  ray bradbury book burning 
6808    the berlin phenomenology  georg wilhelm friedr...
6809    'i'm telling you stories' jeanette winterson a...
Name: tags, Length: 6810, dtype: object

In [67]:
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
vectors=tfidf.fit_transform(book['tags']).toarray()

In [68]:
similarity=cosine_similarity(vectors)

In [69]:
similarity

array([[1.        , 0.00569211, 0.00291048, ..., 0.        , 0.01205719,
        0.        ],
       [0.00569211, 1.        , 0.02085471, ..., 0.        , 0.00514629,
        0.03939511],
       [0.00291048, 0.02085471, 1.        , ..., 0.        , 0.01358364,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.01205719, 0.00514629, 0.01358364, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.03939511, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [70]:
def recommend_books(book_title,n=5):
    idx=book[book['title']==book_title].index[0]
    distances=similarity[idx]
    book_list=sorted(list(enumerate(distances)),key=lambda x:x[1],reverse=True)[1:n+1]
    for i in book_list:
        print(book.iloc[i[0]]['title'])

In [74]:
recommend_books("Rage of angels")

Organized Crime
Trevayne
For the Sins of My Father
Crime Partners
If Tomorrow Comes


In [72]:
import pickle

In [73]:
pickle.dump(book,open('books_rec.pkl','wb'))
pickle.dump(similarity,open('book_similarity.pkl','wb'))