In [1]:
import numpy as np
import pandas as pd

In [2]:
# Locations of the three zipped BX CSV archives (books, users, ratings),
# given relative to the notebook's working directory; update if they live elsewhere.
path_books = 'BX-Books.csv.zip'
path_users = 'BX-Users.csv.zip'
path_ratings = 'BX-Book-Ratings.csv.zip'

In [3]:
# Load the books table from the zipped CSV (pandas infers the compression
# from the file extension).
books = pd.read_csv(
    # use the path configured at the top of the notebook instead of repeating it
    path_books,
    # handles special characters
    encoding='latin1',
    # semicolon-separated file
    sep=';',
    # properly handles semicolons inside quoted text
    quotechar='"',
    # skips broken rows with incorrect column counts
    on_bad_lines='skip',  # pandas 1.3.0+
    # parse the file in one pass so every column gets a single, consistent dtype
    # NOTE(review): the stray echoed call in this cell's output looks like a
    # mixed-dtype warning from chunked inference — confirm it disappears.
    low_memory=False,
)


  books = pd.read_csv(


In [4]:
# Inspect the column labels of the freshly loaded books table.
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [5]:
# Column selection: keep the bibliographic fields and drop the three Image-URL columns.
# .copy() makes the result an independent frame, so later in-place edits
# (e.g. renaming columns) never operate on a slice of the raw table.
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']].copy()

In [6]:
# Shorten the column labels. Rebinding the renamed frame (instead of
# inplace=True) avoids mutating a frame that was obtained by slicing.
books = books.rename(
    columns={
        'Book-Title': 'title',
        'Book-Author': 'author',
        'Year-Of-Publication': 'year',
        'Publisher': 'publisher',
    }
)

In [7]:
# Preview the first two rows to confirm the renamed columns.
books.head(2)

Unnamed: 0,ISBN,title,author,year,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada


In [8]:
# Load the users table with the same CSV dialect as the books file.
users = pd.read_csv(
    path_users,  # path configured at the top of the notebook, not a repeated literal
    encoding='latin1',
    sep=';',
    on_bad_lines='skip',  # skip rows with the wrong number of fields
    quotechar='"',
)

In [9]:
# Switch the user columns to short snake_case labels.
users = users.rename(
    columns={
        'User-ID': 'user_id',
        'Location': 'location',
        'Age': 'age',
    }
)

In [10]:
# Preview the first two users; `age` can be missing (NaN).
users.head(2)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [11]:
# Load the ratings table with the same CSV dialect as the books file.
ratings = pd.read_csv(
    path_ratings,  # path configured at the top of the notebook, not a repeated literal
    encoding='latin1',
    sep=';',
    quotechar='"',
    on_bad_lines='skip',  # skip rows with the wrong number of fields
)

In [12]:
# Switch the rating columns to short snake_case labels (ISBN is left as is).
ratings = ratings.rename(
    columns={
        'User-ID': 'user_id',
        'Book-Rating': 'rating',
    }
)

In [13]:
# Preview the first two ratings (user_id, ISBN, rating).
ratings.head(2)

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5


In [14]:
# (rows, columns) of the books table after column selection.
books.shape


(271360, 5)

In [15]:
# (rows, columns) of the users table.
users.shape

(278858, 3)

In [16]:
# Boolean Series indexed by user_id: True for users with more than 200 ratings.
x = ratings['user_id'].value_counts().gt(200)

In [17]:
# Masking x with itself keeps only the True entries; their index holds the
# user_ids of the users with more than 200 ratings.
y=x[x].index

In [18]:
# Number of users that passed the >200 ratings threshold.
y.shape

(899,)

In [19]:
# Keep only the ratings made by the users collected in `y`.
ratings = ratings.loc[ratings['user_id'].isin(y)]

In [20]:
 # (rows, columns) of the ratings table after keeping only the selected users.
 ratings.shape

(526356, 3)

In [21]:
# Preview the filtered ratings; the original row labels are retained.
ratings.head()

Unnamed: 0,user_id,ISBN,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [22]:
# Attach book metadata to every rating by joining on ISBN.
# This is an inner join, so ratings whose ISBN does not appear in `books`
# are dropped and the row count shrinks accordingly.
ratings_with_books = pd.merge(ratings, books, on='ISBN', how='inner')

In [23]:
# (rows, columns) after joining ratings with book metadata.
ratings_with_books.shape

(487671, 7)

In [24]:
# Count the ratings each title received; reset_index turns the grouped
# Series back into a two-column frame (title, rating).
number_rating = (
    ratings_with_books
    .groupby('title')['rating']
    .count()
    .reset_index()
)

In [25]:
# The `rating` column now holds counts, so label it accordingly.
number_rating = number_rating.rename(columns={'rating': 'number of ratings'})

In [26]:
# Add each title's rating count as an extra column on every rating row.
final_rating = pd.merge(ratings_with_books, number_rating, on='title')

In [27]:
# Same row count as before the merge, with one extra column for the counts.
final_rating.shape

(487671, 8)

In [28]:
# Keep only titles with at least 50 ratings. The counts were computed on the
# already-filtered table, i.e. among users who each have more than 200 ratings.
final_rating=final_rating[final_rating['number of ratings']>=50]

In [29]:
# (rows, columns) after keeping only titles with at least 50 ratings.
final_rating.shape

(61853, 8)

In [30]:
# Remove rows where the same user rated the same title more than once
# (the first occurrence is kept). The result is rebound rather than dropped
# in place because `final_rating` was produced by a boolean filter, and
# mutating such a slice in place is what triggers SettingWithCopyWarning.
final_rating = final_rating.drop_duplicates(['user_id', 'title'])

In [31]:
# (rows, columns) after removing duplicate (user_id, title) pairs.
final_rating.shape

(59850, 8)

In [32]:
# Build a title x user_id matrix of ratings: one row per title, one column
# per user. (title, user) pairs without a rating come out as NaN.
book_pivot=final_rating.pivot_table(columns='user_id',index='title',values='rating')

In [33]:
# Replace NaN (no rating for that title/user pair) with 0.
book_pivot = book_pivot.fillna(0)

In [34]:
# Display the filled title x user_id matrix (pandas truncates the rendering).
book_pivot

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84 Charing Cross Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# scipy = "scientific Python"; csr_matrix is its compressed-sparse-row matrix type.
from scipy.sparse import csr_matrix
# Convert the mostly-zero pivot table to sparse form, which stores only the non-zero entries.
book_sparse=csr_matrix(book_pivot)

In [36]:
# Confirm the conversion produced a CSR sparse matrix.
type(book_sparse)

scipy.sparse._csr.csr_matrix

In [37]:
from sklearn.neighbors import NearestNeighbors
# algorithm='brute' selects brute-force search: the query is compared against
# every fitted row. No metric is passed, so scikit-learn's default is used.
model=NearestNeighbors(algorithm='brute')

In [38]:
# Fit on the sparse title x user matrix: each row (one title) is one sample.
model.fit(book_sparse)

In [39]:
# Query the 6 nearest titles to the title at positional row 237.
# reshape(1,-1) turns the row into a single-sample 2-D array for kneighbors.
distances, suggestions=model.kneighbors(book_pivot.iloc[237,:].values.reshape(1,-1),n_neighbors=6)

In [40]:
# Distances to the 6 neighbours; the first is 0 because the query row is part of the fitted data.
distances

array([[ 0.        , 68.78953409, 69.5413546 , 72.64296249, 76.83098333,
        77.28518616]])

In [41]:
# Positional row numbers (into book_pivot) of the 6 neighbours, nearest first.
suggestions

array([[237, 240, 238, 241, 184, 536]], dtype=int64)

In [42]:
# `suggestions` holds one row of neighbour positions per query;
# translate each row into the corresponding titles and print them.
for neighbour_positions in suggestions:
    print(book_pivot.index[neighbour_positions])

Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive',
       'The Cradle Will Fall'],
      dtype='object', name='title')


In [43]:
# Positional row of 'Animal Farm' in book_pivot: np.where returns a tuple of
# index arrays, and [0][0] picks the first matching position.
np.where(book_pivot.index=='Animal Farm')[0][0]

54

In [44]:
def recommend_book(book_name, n_neighbors=6):
    """Print the titles whose rating vectors are nearest to ``book_name``.

    The title is located in ``book_pivot.index``, its row is passed to the
    fitted ``model`` and the titles at the returned neighbour positions are
    printed after a header line. Because the query row is itself part of the
    fitted data, the printed list usually starts with ``book_name``.

    Parameters
    ----------
    book_name : str
        Title exactly as it appears in ``book_pivot.index``.
    n_neighbors : int, default 6
        Number of neighbours requested from ``model.kneighbors``.

    Raises
    ------
    IndexError
        If ``book_name`` is not a title in ``book_pivot``.
    """
    matches = np.flatnonzero(book_pivot.index == book_name)
    if matches.size == 0:
        # Same exception type as the bare ``[0][0]`` lookup raised before,
        # but the message now names the title that could not be found.
        raise IndexError(f"{book_name!r} is not a title in book_pivot")
    book_id = matches[0]

    # kneighbors expects a 2-D array, so reshape the row to a single sample.
    query = book_pivot.iloc[book_id, :].values.reshape(1, -1)
    _, suggestions = model.kneighbors(query, n_neighbors=n_neighbors)

    print("The suggestions for", book_name, "are :")
    # One query row gives exactly one row of neighbour positions.
    print(book_pivot.index[suggestions[0]])

In [45]:
# Example call: the title must match an entry of book_pivot.index exactly.
recommend_book('Harry Potter and the Chamber of Secrets (Book 2)')

The suggestions for Harry Potter and the Chamber of Secrets (Book 2) are :
Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive',
       'The Cradle Will Fall'],
      dtype='object', name='title')
