In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [2]:
path = ".\\data\\"

users_data = pd.read_csv(path+"BX-Users.csv", sep=";", error_bad_lines=False, encoding='latin-1') # 'ISO-8859-1'
books_data = pd.read_csv(path+"BX_Books.csv", sep=";", error_bad_lines=False, encoding='latin-1')
rating_data = pd.read_csv(path+"BX-Book-Ratings.csv", sep=";", error_bad_lines=False, encoding='latin-1')

In [3]:
books_data.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
rating_data.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


##### Delete unusable information from books frame and merge with ratings frame

In [5]:
books_data = books_data.drop(['Year-Of-Publication', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1)

In [6]:
books_rating_data = books_data.merge(rating_data, on='ISBN')
books_rating_data

Unnamed: 0,ISBN,Book-Title,Book-Author,Publisher,User-ID,Book-Rating
0,0195153448,Classical Mythology,Mark P. O. Morford,Oxford University Press,2,0
1,0002005018,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,8,5
2,0002005018,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,11400,0
3,0002005018,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,11676,8
4,0002005018,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,41385,0
...,...,...,...,...,...,...
1031170,0440400988,There's a Bat in Bunk Five,Paula Danziger,Random House Childrens Pub (Mm),276463,7
1031171,0525447644,From One to One Hundred,Teri Sloat,Dutton Books,276579,4
1031172,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,HarperSanFrancisco,276680,0
1031173,0192126040,Republic (World's Classics),Plato,Oxford University Press,276680,0


##### Add the number of assigned ratings. Clear from users who have rated a small number of times

In [7]:
number_of_ratings = books_rating_data.groupby('ISBN')['Book-Rating'].count().reset_index()
number_of_ratings.rename(columns={'Book-Rating': 'Number of Book-Rating'}, inplace=True)

In [8]:
books_rating_data = books_rating_data.merge(number_of_ratings, on='ISBN')
books_rating_data

Unnamed: 0,ISBN,Book-Title,Book-Author,Publisher,User-ID,Book-Rating,Number of Book-Rating
0,0195153448,Classical Mythology,Mark P. O. Morford,Oxford University Press,2,0,1
1,0002005018,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,8,5,14
2,0002005018,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,11400,0,14
3,0002005018,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,11676,8,14
4,0002005018,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,41385,0,14
...,...,...,...,...,...,...,...
1031170,0440400988,There's a Bat in Bunk Five,Paula Danziger,Random House Childrens Pub (Mm),276463,7,1
1031171,0525447644,From One to One Hundred,Teri Sloat,Dutton Books,276579,4,1
1031172,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,HarperSanFrancisco,276680,0,1
1031173,0192126040,Republic (World's Classics),Plato,Oxford University Press,276680,0,1


In [9]:
books_rating_data = books_rating_data[books_rating_data['Number of Book-Rating'] >= 30]
books_rating_data.shape

(315065, 7)

In [10]:
books_rating_data.drop_duplicates(['User-ID', 'Book-Title'], inplace=True)
books_rating_data.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


(313546, 7)

### Colloaborative Recommendation

In [11]:
books_rating_pivot = books_rating_data.pivot_table(columns='User-ID', index='Book-Title', values='Book-Rating')
books_rating_pivot.fillna(0, inplace=True)
books_rating_pivot

User-ID,8,9,10,14,16,17,26,32,39,42,...,278831,278832,278836,278838,278843,278844,278846,278849,278851,278854
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Salem's Lot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"\O\"" Is for Outlaw""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
book_csr_matrix = csr_matrix(books_rating_pivot)

In [13]:
model = NearestNeighbors(algorithm='brute')

In [14]:
model.fit(book_csr_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [15]:
books_rating_pivot.iloc[54, :].values.reshape(1,-1)

array([[0., 0., 0., ..., 0., 0., 0.]])

In [16]:
_, recommendations = model.kneighbors(books_rating_pivot.iloc[54, :].values.reshape(1,-1))
recommendations

array([[  54, 1180, 2622, 1181, 2845]], dtype=int64)

In [17]:
for i in range(len(recommendations)):
    print(recommendations[i])
    print(books_rating_pivot.index[recommendations[i]])

[  54 1180 2622 1181 2845]
Index(['A Fine Balance', 'Golden Cup', 'Tall, Dark, and Deadly',
       'Golden Orange', 'The Curse of the Mummy's Tomb (Goosebumps, No 5)'],
      dtype='object', name='Book-Title')


In [18]:
def recomend_book(book_name):
    book_id = np.where(books_rating_pivot.index == book_name)[0][0]
    _, recommendations = model.kneighbors(books_rating_pivot.iloc[book_id,:].values.reshape(1,-1))
    for i in range(len(recommendations)):
        if i == 0:
            print(f"For book \"{book_name}\" is recommended")
        if not i:
            print(books_rating_pivot.index[recommendations[i]])

In [19]:
recomend_book('Exclusive')

For book "Exclusive" is recommended
Index(['Exclusive', 'Best Kept Secrets', 'Fatal Terrain', 'Golden Cup',
       'Hidden Leaves (Debeers)'],
      dtype='object', name='Book-Title')
