In [71]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import pickle

import datasets

In [2]:
books = pd.read_csv("BX-Books.csv", sep=";", encoding="latin1", on_bad_lines='skip')
users = pd.read_csv("BX-Users.csv", sep=";", encoding="latin1", on_bad_lines='skip')
ratings = pd.read_csv("BX-Book-Ratings.csv", sep=";", encoding="latin1", on_bad_lines='skip')

  books = pd.read_csv("BX-Books.csv", sep=";", encoding="latin1", on_bad_lines='skip')


BOOKS

In [3]:
books.head(2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...


In [4]:
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [5]:
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-L']]

In [6]:
books.rename(columns={
        'Book-Title': 'title',
        'Book-Author': 'author',
        'Year-Of-Publication': 'year',
        'Publisher': 'publisher',
        'Image-URL-L': 'image_url'
}, inplace=True)


USERS

In [7]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [8]:
users.rename(columns={'User-ID': 'user_id',
                    'Location': 'location',
                    'Age': 'age'}, inplace=True)

RATINGS

In [9]:
ratings.head(2)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5


In [10]:
ratings.rename(columns={'User-ID': 'user_id',
                    'Book-Rating': 'rating'}, inplace=True)

In [11]:
#users who had at least rated more than 200 books
x = ratings['user_id'].value_counts() >= 200


In [12]:
y = x[x].index


In [13]:
ratings = ratings[ratings['user_id'].isin(y)]

In [14]:
ratings.head()

Unnamed: 0,user_id,ISBN,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [15]:
#merge the ratings with books
ratings_with_books = ratings.merge(books, on='ISBN')

In [16]:
ratings_with_books.head(2)

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,image_url
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,http://images.amazon.com/images/P/0026217457.0...


In [17]:
#no. of ratings with the book title
ratings_per_book = ratings_with_books.groupby('title')['rating'].count().reset_index()

In [18]:
ratings_per_book.head(2)

Unnamed: 0,title,rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1


In [19]:
ratings_per_book.rename(columns={'rating': 'number_of_rating'}, inplace=True)

In [20]:
final_rating = ratings_with_books.merge(ratings_per_book, on='title')

In [21]:
final_rating.head(2)

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,image_url,number_of_rating
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,http://images.amazon.com/images/P/0026217457.0...,7


In [22]:
final_rating = final_rating[final_rating['number_of_rating'] >= 50]

In [23]:
#drop duplicates
final_rating.drop_duplicates(['user_id', 'title'], inplace=True)

In [24]:
#check for null
final_rating.isnull().sum()

user_id             0
ISBN                0
rating              0
title               0
author              0
year                0
publisher           0
image_url           0
number_of_rating    0
dtype: int64

**Pivot Table**

In [25]:
book_pivot = final_rating.pivot_table(columns='user_id', index='title', values='rating')

In [26]:
book_pivot.head(2)

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,,,,,,0.0,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,


In [27]:
book_pivot.fillna(0, inplace=True)

In [28]:
#sparse metrics
book_sparse = csr_matrix(book_pivot)

In [29]:
book_sparse

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 15059 stored elements and shape (746, 894)>

**Model**

In [30]:
model = NearestNeighbors(algorithm='brute')

In [31]:
model.fit(book_sparse)

0,1,2
,n_neighbors,5
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'minkowski'
,p,2
,metric_params,
,n_jobs,


In [38]:
book_pivot.iloc[555,:]

user_id
254       0.0
2276      0.0
2766      0.0
2977      0.0
3363      0.0
         ... 
275970    0.0
277427    0.0
277478    0.0
277639    0.0
278418    0.0
Name: The Eyre Affair: A Novel, Length: 894, dtype: float64

In [41]:
distance, suggestion = model.kneighbors(book_pivot.iloc[555,:].values.reshape(1,-1), n_neighbors=10)

In [42]:
distance

array([[ 0.        , 32.81767816, 33.13608305, 33.63034344, 33.80828301,
        34.2928564 , 34.525353  , 34.62657939, 35.14256678, 35.22782991]])

In [45]:
for i in range(len(suggestion)):
    print(book_pivot.index[suggestion[i]])

Index(['The Eyre Affair: A Novel', 'No Safe Place', 'Foucault's Pendulum',
       'Long After Midnight', 'Exclusive', 'Lake Wobegon days',
       'Pleading Guilty', 'CAT'S EYE', 'Abduction', 'Jacob Have I Loved'],
      dtype='object', name='title')


In [46]:
book_names = book_pivot.index

In [58]:
book_names

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       '84 Charing Cross Road', 'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Cry In The Night',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='title', length=746)

In [59]:
np.where(book_names == 'Wizard and Glass (The Dark Tower, Book 4)')[0][0]

np.int64(739)

In [None]:
# for t in books[books['title'].str.contains("narnia", case=False, na=False)]['title']:
#     print(t)


The Silver Chair (full color) (Narnia)
The Magician's Nephew (Narnia)
The Magician's Nephew (The Chronicles of Narnia, Book 1, Full-Color Collector's Edition)
The Land of Narnia: Brian Sibley Explores the World of C.S. Lewis
The Lion, the Witch, and the Wardrobe (The Chronicles of Narnia, Book 2)
Complete Chronicles of Narnia
Prince Caspian (rack) : The Return to Narnia (Narnia)
The Voyage of the Dawn Treader (rack) (Narnia)
The Last Battle (The Chronicles of Narnia Book 7)
The Last Battle (The Chronicles of Narnia, Book 7)
The Silver Chair (The Chronicles of Narnia, Book 6)
The Voyage of the 'Dawn Treader' (The Chronicles of Narnia, Book 5)
Prince Caspian : The Return to Narnia (Narnia)
The Horse and His Boy (The Chronicles of Narnia, Book 3)
The Lion, the Witch and the Wardrobe (rpkg) (Narnia)
The Voyage of the Dawn Treader (rpkg) (Narnia)
The Magician's Nephew (rack) (Narnia)
The Chronicles of Narnia Boxed Set
Prince Caspian (rpkg) : The Return to Narnia (Narnia)
The Chronicles of N

In [61]:
ids = np.where(final_rating['title'] == "Wizard and Glass (The Dark Tower, Book 4)")[0][0]

In [62]:
ids

np.int64(2432)

In [63]:
final_rating.iloc[ids]['image_url']

'http://images.amazon.com/images/P/0452279178.01.LZZZZZZZ.jpg'

In [64]:
book_name = []

for book_id in suggestion:
    book_name.append(book_pivot.index[book_id])

In [65]:
book_name

[Index(['The Eyre Affair: A Novel', 'No Safe Place', 'Foucault's Pendulum',
        'Long After Midnight', 'Exclusive', 'Lake Wobegon days',
        'Pleading Guilty', 'CAT'S EYE', 'Abduction', 'Jacob Have I Loved'],
       dtype='object', name='title')]

In [68]:
ids_index = []

for name in book_name[0]:
    ids = np.where(final_rating['title'] == name)[0][0]
    ids_index.append(ids)

In [69]:
ids_index

[np.int64(2009),
 np.int64(22),
 np.int64(157),
 np.int64(76),
 np.int64(787),
 np.int64(1234),
 np.int64(785),
 np.int64(716),
 np.int64(498),
 np.int64(868)]

In [70]:
for idx in ids_index:
    url = final_rating.iloc[idx]['image_url']
    print(url)

http://images.amazon.com/images/P/0142001805.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/0345404777.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/0345368754.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/0553571818.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/0446604232.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/0140092323.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/0446365505.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/0553282476.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/042517736X.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/0064403688.01.LZZZZZZZ.jpg


In [72]:
pickle.dump(model, open('artifacts/model.pkl', 'wb'))
pickle.dump(book_pivot, open('artifacts/book_pivot.pkl', 'wb'))
pickle.dump(final_rating, open('artifacts/final_rating.pkl', 'wb'))
pickle.dump(book_names, open('artifacts/book_names.pkl', 'wb'))

__**Testing**__

In [78]:
def recommend_book(book_name):
    book_id = np.where(book_pivot.index == book_name)[0][0]
    distance, suggestion = model.kneighbors(book_pivot.iloc[book_id,:].values.reshape(1,-1), n_neighbors=11 )
    
    for i in range(len(suggestion)):
            books = book_pivot.index[suggestion[i]]
            for j in books:
                if j == book_name:
                    print(f"You searched '{book_name}'\n")
                    print("The suggestion books are: \n")
                else:
                    print(j)

In [79]:
book_name = "CAT'S EYE"
recommend_book(book_name)

You searched 'CAT'S EYE'

The suggestion books are: 

No Safe Place
Long After Midnight
Exclusive
Lake Wobegon days
Pleading Guilty
Abduction
Jacob Have I Loved
Waiting to Exhale
Journey
Deck the Halls (Holiday Classics)
