In [1]:
# Importing necessary library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
books = pd.read_csv('../books_data/books.csv', sep=";", error_bad_lines=False, encoding='latin-1')
users = pd.read_csv('../books_data/users.csv', sep=";", error_bad_lines=False, encoding='latin-1')
ratings = pd.read_csv('../books_data/ratings.csv', sep=";", error_bad_lines=False, encoding='latin-1')


In [4]:
books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
users.columns = ['userID', 'Location', 'Age']
ratings.columns = ['userID', 'ISBN', 'bookRating']

In [5]:
counts1 = ratings['userID'].value_counts()
ratings = ratings[ratings['userID'].isin(counts1[counts1 >= 200].index)]
counts = ratings['bookRating'].value_counts()
ratings = ratings[ratings['bookRating'].isin(counts[counts >= 100].index)]

In [6]:
combine_book_rating = pd.merge(ratings, books, on='ISBN')
columns = ['yearOfPublication', 'publisher', 'bookAuthor', 'imageUrlS', 'imageUrlM', 'imageUrlL']
combine_book_rating = combine_book_rating.drop(columns, axis=1)
combine_book_rating.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...


In [7]:
combine_book_rating = combine_book_rating.dropna(axis = 0, subset = ['bookTitle'])

In [None]:
combine_book_rating

In [9]:
book_ratingCount = (combine_book_rating.
     groupby(by = ['bookTitle'])['bookRating'].
     count().
     reset_index().
     rename(columns = {'bookRating': 'totalRatingCount'})
     [['bookTitle', 'totalRatingCount']]
    )
book_ratingCount.head()

Unnamed: 0,bookTitle,totalRatingCount
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


In [10]:
rating_with_totalRatingCount = combine_book_rating.merge(book_ratingCount, left_on = 'bookTitle', right_on = 'bookTitle', how = 'left')
rating_with_totalRatingCount.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,82
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,82
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,82
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,82
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,82


In [11]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(book_ratingCount['totalRatingCount'].describe())

count   160576.000
mean         3.044
std          7.428
min          1.000
25%          1.000
50%          1.000
75%          2.000
max        365.000
Name: totalRatingCount, dtype: float64


In [12]:
print(book_ratingCount['totalRatingCount'].quantile(np.arange(.9, 1, .01)))

0.900    5.000
0.910    6.000
0.920    7.000
0.930    7.000
0.940    8.000
0.950   10.000
0.960   11.000
0.970   14.000
0.980   19.000
0.990   31.000
Name: totalRatingCount, dtype: float64


In [13]:
popularity_threshold = 50
rating_popular_book = rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
rating_popular_book.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,82
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,82
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,82
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,82
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,82


In [14]:
rating_popular_book.shape

(62149, 5)

In [17]:
combined = rating_popular_book.merge(users, left_on = 'userID', right_on = 'userID', how = 'left')

us_canada_user_rating = combined[combined['Location'].str.contains("usa|canada")]
us_canada_user_rating=us_canada_user_rating.drop('Age', axis=1)


Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount,Location
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,82,"gilbert, arizona, usa"
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,82,"knoxville, tennessee, usa"
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,82,"byron, minnesota, usa"
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,82,"cordova, tennessee, usa"
5,16795,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,82,"mechanicsville, maryland, usa"


In [43]:
us_canada_user_rating[us_canada_user_rating['userID'] == 254]

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount,Location
81,254,0060930535,0,The Poisonwood Bible: A Novel,133,"minneapolis, minnesota, usa"
597,254,014100018X,0,Chocolat,104,"minneapolis, minnesota, usa"
656,254,0142001740,9,The Secret Life of Bees,209,"minneapolis, minnesota, usa"
1283,254,0316569321,0,White Oleander : A Novel,116,"minneapolis, minnesota, usa"
1679,254,0316776963,0,Me Talk Pretty One Day,146,"minneapolis, minnesota, usa"
2877,254,0399501487,0,Lord of the Flies,120,"minneapolis, minnesota, usa"
3673,254,0439064872,9,Harry Potter and the Chamber of Secrets (Book 2),184,"minneapolis, minnesota, usa"
3782,254,0439136369,9,Harry Potter and the Prisoner of Azkaban (Book 3),138,"minneapolis, minnesota, usa"
6702,254,0684872153,0,Angela's Ashes (MMP) : A Memoir,92,"minneapolis, minnesota, usa"
6896,254,080410753X,0,The Kitchen God's Wife,111,"minneapolis, minnesota, usa"


In [18]:
from scipy.sparse import csr_matrix
us_canada_user_rating = us_canada_user_rating.drop_duplicates(['userID', 'bookTitle'])
us_canada_user_rating_pivot = us_canada_user_rating.pivot(index = 'bookTitle', columns = 'userID', values = 'bookRating').fillna(0)
us_canada_user_rating_matrix = csr_matrix(us_canada_user_rating_pivot.values)

from sklearn.neighbors import NearestNeighbors


model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(us_canada_user_rating_matrix)

In [64]:
us_canada_user_rating_pivot.loc['Night Whispers']

userID
254      0.000
2276     0.000
2766     0.000
2977     0.000
3363     0.000
          ... 
274808   0.000
275970   0.000
277427   0.000
277639   0.000
278418   0.000
Name: Night Whispers, Length: 734, dtype: float64

In [86]:
us_canada_user_rating_pivot.index.get_loc("Harry Potter and the Sorcerer's Stone (Book 1)")

241

In [87]:
query_index = np.random.choice(us_canada_user_rating_pivot.shape[0])
query_index

408

In [88]:
query_index = np.random.choice(us_canada_user_rating_pivot.shape[0])
print(query_index)
distances, indices = model_knn.kneighbors(us_canada_user_rating_pivot.iloc[241,:].values.reshape(1, -1), n_neighbors = 6)

668


In [78]:
us_canada_user_rating_pivot.index[368]

'Night Whispers'

In [93]:
us_canada_user_rating_pivot.index.get_value

<bound method Index.get_value of Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       '84 Charing Cross Road', 'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Cry In The Night',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='bookTitle', length=746)>

In [89]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(us_canada_user_rating_pivot.index[241]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, us_canada_user_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Harry Potter and the Sorcerer's Stone (Book 1):

1: Harry Potter and the Chamber of Secrets (Book 2), with distance of 0.5651248104368343:
2: Harry Potter and the Prisoner of Azkaban (Book 3), with distance of 0.5663644552176471:
3: Harry Potter and the Goblet of Fire (Book 4), with distance of 0.6115492007941644:
4: Harry Potter and the Order of the Phoenix (Book 5), with distance of 0.687867075646224:
5: The Bonesetter's Daughter, with distance of 0.796039640311702:


In [41]:
#Build Pickle
import pickle

with open('../ML_Recommendation/User_rating_pivot.pkl','wb') as f:
    pickle.dump(us_canada_user_rating_pivot, f)

with open('../ML_Recommendation/NearestNeighbors_model.pkl', 'wb') as f:
    pickle.dump(model_knn, f)

In [31]:
with open('../ML_Recommendation/User_rating_pivot.pkl','rb') as f:
    loaded_obj = pickle.load(f)

loaded_obj.index[query_index]

'Nothing Lasts Forever'

In [97]:
with open('../ML_Recommendation/Book_List.pkl', 'wb') as f:
    pickle.dump(us_canada_user_rating_pivot.index.to_list(),f)



TypeError: file must have 'read' and 'readline' attributes

In [99]:
with open('../ML_Recommendation/Book_List.pkl','rb') as f:
    book_list = pickle.load(f)

In [100]:
book_list

['1984',
 '1st to Die: A Novel',
 '2nd Chance',
 '4 Blondes',
 '84 Charing Cross Road',
 'A Bend in the Road',
 'A Case of Need',
 'A Child Called \\It\\": One Child\'s Courage to Survive"',
 'A Civil Action',
 'A Cry In The Night',
 'A Darkness More Than Night',
 'A Day Late and a Dollar Short',
 'A Fine Balance',
 'A Great Deliverance',
 'A Heartbreaking Work of Staggering Genius',
 'A Is for Alibi (Kinsey Millhone Mysteries (Paperback))',
 'A Lesson Before Dying (Vintage Contemporaries (Paperback))',
 'A Man Named Dave: A Story of Triumph and Forgiveness',
 'A Man in Full',
 'A Map of the World',
 'A Painted House',
 'A Patchwork Planet',
 'A Prayer for Owen Meany',
 'A Thin Dark Line (Mysteries &amp; Horror)',
 "A Thousand Acres (Ballantine Reader's Circle)",
 'A Time to Kill',
 "A Virtuous Woman (Oprah's Book Club (Paperback))",
 'A Walk to Remember',
 'A Widow for One Year',
 'A Wrinkle In Time',
 'A Wrinkle in Time',
 'A Year in Provence',
 "ANGELA'S ASHES",
 'Abduction',
 'Abou

In [116]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

match = process.extractBests('Night', book_list, limit=1)

match[0][0]

'Night'

In [117]:
with open('../ML_Recommendation/Book_List.pkl', 'rb') as f:
    Book_List = pickle.load(f)

with open('../ML_Recommendation/NearestNeighbors_model.pkl', 'rb') as f:
    model_knn = pickle.load(f)

with open('../ML_Recommendation/User_rating_pivot.pkl', 'rb') as f:
    user_rating_pivot = pickle.load(f)



In [126]:
match = process.extractBests('Harry Potter and the S', Book_List, limit=1)
match[0][0]

"Harry Potter and the Sorcerer's Stone (Book 1)"

In [120]:
user_rating_pivot.index.get_loc(match[0][0])

237

In [121]:
distances, indices = model_knn.kneighbors(user_rating_pivot.iloc[237,:].values.reshape(1, -1), n_neighbors = 6)

In [122]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(us_canada_user_rating_pivot.index[241]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, us_canada_user_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Harry Potter and the Sorcerer's Stone (Book 1):

1: Harry Potter and the Prisoner of Azkaban (Book 3), with distance of 0.39610115844747384:
2: Harry Potter and the Goblet of Fire (Book 4), with distance of 0.439160784751057:
3: Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)), with distance of 0.516662521855756:
4: Harry Potter and the Sorcerer's Stone (Book 1), with distance of 0.5651248104368343:
5: Harry Potter and the Order of the Phoenix (Book 5), with distance of 0.6408129701529381:
