# Book Recommendation Model

In [1]:
#Importing Data and Libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Read the three source tables (book catalogue, ratings, user profiles) from
# CSV files addressed relative to the current working directory.
books, ratings, users = (
    pd.read_csv(csv_name)
    for csv_name in ('Books.csv', 'Ratings.csv', 'Users.csv')
)

In [2]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [3]:
# Drop the cover-image URLs and publication metadata; the rest of the notebook
# only uses ISBN and Book-Title.
# Re-assigning instead of inplace=True, together with errors='ignore', makes
# this cell safe to re-run: columns that are already gone are skipped instead
# of raising KeyError.
books = books.drop(
    columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L',
             'Year-Of-Publication', 'Book-Author', 'Publisher'],
    errors='ignore',
)

In [4]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [6]:
# Show the (rows, columns) shape of each table, one per line.
print(books.shape, ratings.shape, users.shape, sep='\n')

(271360, 2)
(1149780, 3)
(278858, 3)


In [7]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   ISBN        271360 non-null  object
 1   Book-Title  271360 non-null  object
dtypes: object(2)
memory usage: 4.1+ MB


In [8]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [9]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


### Now we merge books with ratings on ISBN and merge users with ratings on User-ID

In [10]:
# Join books to their ratings on ISBN, then attach the rating user's profile
# on User-ID (merge defaults to an inner join, so unmatched rows are dropped).
book_rating = (
    books
    .merge(ratings, on='ISBN')
    .merge(users, on='User-ID')
)

In [11]:
book_rating.head()

Unnamed: 0,ISBN,Book-Title,User-ID,Book-Rating,Location,Age
0,195153448,Classical Mythology,2,0,"stockton, california, usa",18.0
1,2005018,Clara Callan,8,5,"timmins, ontario, canada",
2,60973129,Decision in Normandy,8,0,"timmins, ontario, canada",
3,374157065,Flu: The Story of the Great Influenza Pandemic...,8,0,"timmins, ontario, canada",
4,393045218,The Mummies of Urumchi,8,0,"timmins, ontario, canada",


In [12]:
# Location and Age are not used by either recommender below.
# Re-assigning instead of inplace=True, together with errors='ignore', makes
# this cell safe to re-run: columns that are already gone are skipped instead
# of raising KeyError.
book_rating = book_rating.drop(columns=['Location', 'Age'], errors='ignore')

### Now we calculate the number of ratings for each book and its average rating

In [13]:
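# Disabled: the line below would replace every 0 in book_rating with NaN, so
# that pandas' count/mean (which skip NaN) would ignore those entries.
#book_rating[book_rating==0]=np.NAN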

In [14]:
# Number of ratings recorded for each book title.
rating_count = (
    book_rating
    .groupby('Book-Title')['Book-Rating']
    .count()
    .rename('count_rating')
    .reset_index()
)

In [15]:
# Mean rating for each book title.
rating_avg = (
    book_rating
    .groupby('Book-Title')['Book-Rating']
    .mean()
    .rename('avg_rating')
    .reset_index()
)

In [16]:
rating_count.head()

Unnamed: 0,Book-Title,count_rating
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [17]:
# One row per title carrying both its rating count and its average rating.
cum_rating = pd.merge(rating_count, rating_avg, on='Book-Title')

In [18]:
cum_rating.head()

Unnamed: 0,Book-Title,count_rating,avg_rating
0,A Light in the Storm: The Civil War Diary of ...,4,2.25
1,Always Have Popsicles,1,0.0
2,Apple Magic (The Collector's series),1,0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.0
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.0


### Popularity based filtering for top 50 books

In [19]:
# Popularity ranking: among titles with more than 250 ratings, take the 50
# with the highest average rating and renumber them from 0.
popular_books = (
    cum_rating
    .loc[cum_rating['count_rating'].gt(250)]
    .sort_values(by='avg_rating', ascending=False)
    .head(50)
    .reset_index(drop=True)
)

In [20]:
popular_books.head()

Unnamed: 0,Book-Title,count_rating,avg_rating
0,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
1,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
2,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741
3,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
4,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453


### Collaborative Filtering:
**Consider users who have rated more than 200 books, and books that have more than 50 ratings from those users**

In [21]:
# IDs of users who have more than 200 ratings in book_rating.
ratings_per_user = book_rating.groupby('User-ID')['Book-Rating'].count()
active_users = ratings_per_user[ratings_per_user.gt(200)].index

In [22]:
# Keep only the ratings made by the active users selected above.
book_rating = book_rating.loc[lambda df: df['User-ID'].isin(active_users)]

In [23]:
# Titles with more than 50 ratings in the current (already user-filtered) book_rating.
ratings_per_title = book_rating.groupby('Book-Title')['Book-Rating'].count()
rated_books = ratings_per_title[ratings_per_title.gt(50)].index

In [24]:
# Restrict the ratings to the frequently rated titles selected above.
final = book_rating.loc[lambda df: df['Book-Title'].isin(rated_books)]

In [25]:
#We have a final data frame that has only those users who have rated more than 200 books and those books with more than 50 ratings from these users.
final.head()

Unnamed: 0,ISBN,Book-Title,User-ID,Book-Rating
81,399135782,The Kitchen God's Wife,11676,9
84,440234743,The Testament,11676,9
85,452264464,Beloved (Plume Contemporary Fiction),11676,8
88,971880107,Wild Animus,11676,6
89,345402871,Airframe,11676,0


### Now we create a pivot table with Book-Title as index, User-ID as columns and ratings as values

In [26]:
# Book x user rating matrix. aggfunc='mean' is pivot_table's default, spelled
# out here: several ratings for the same (title, user) pair are averaged, and
# pairs with no rating are left as NaN.
pt = final.pivot_table(
    index='Book-Title',
    columns='User-ID',
    values='Book-Rating',
    aggfunc='mean',
)

In [27]:
pt.head()

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,10.0,,,,,,0.0,,,
1st to Die: A Novel,,,,,,,,,,9.0,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,0.0,...,,,,,,0.0,,,0.0,
4 Blondes,,,,,,,,0.0,,,...,,,,,,,,,,
A Bend in the Road,0.0,,7.0,,,,,,,,...,,0.0,,,,,,,,


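### Centering each book's ratings around 0 to reduce the impact of 0 ratings.
**This is done because a 0 rating does not necessarily mean that the user rated the book 0; it may instead mean that the book was not rated by the user. Each book's mean rating is subtracted from its row, and the missing book/user pairs are then filled with 0, i.e. placed at that book's average.**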

In [33]:
# Subtract each book's mean rating (NaN entries are skipped by default) from
# every entry in that book's row.
pt_centered = pt.sub(pt.mean(axis='columns'), axis='index')

In [34]:
# After centering, 0 is each book's own mean, so missing book/user pairs are
# filled with that neutral value.
pt_centered = pt_centered.fillna(value=0)

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
# Pairwise cosine similarity between the rows (books) of the centred matrix;
# row/column i of the result corresponds to pt_centered.index[i].
cos_sim = cosine_similarity(X=pt_centered)

In [37]:
cos_sim

array([[ 1.00000000e+00,  1.71520630e-02, -6.43447390e-04, ...,
         7.60594340e-02,  2.65502527e-02, -2.23220463e-03],
       [ 1.71520630e-02,  1.00000000e+00,  1.33646462e-01, ...,
         3.84735618e-02,  1.46908371e-01,  6.81761353e-02],
       [-6.43447390e-04,  1.33646462e-01,  1.00000000e+00, ...,
         8.87766513e-03,  2.49756102e-02,  1.29601220e-01],
       ...,
       [ 7.60594340e-02,  3.84735618e-02,  8.87766513e-03, ...,
         1.00000000e+00,  5.71196682e-02,  1.11333641e-02],
       [ 2.65502527e-02,  1.46908371e-01,  2.49756102e-02, ...,
         5.71196682e-02,  1.00000000e+00,  8.40210457e-02],
       [-2.23220463e-03,  6.81761353e-02,  1.29601220e-01, ...,
         1.11333641e-02,  8.40210457e-02,  1.00000000e+00]])

In [72]:
def recommend(book_title, top_n=5, item_matrix=None, similarity=None):
    """Print the titles of the books most similar to ``book_title``.

    Parameters
    ----------
    book_title : str
        Title to look up; it must match an entry of the matrix index exactly.
    top_n : int, default 5
        Maximum number of similar titles to print.
    item_matrix : pandas.DataFrame, optional
        Frame whose index holds the book titles, in the same order as the rows
        of ``similarity``. Defaults to the notebook-level ``pt_centered``.
    similarity : array-like, optional
        Square book-by-book similarity matrix. Defaults to the notebook-level
        ``cos_sim``.

    Returns
    -------
    None
        The titles are printed, one per line, most similar first.

    Raises
    ------
    IndexError
        If ``book_title`` is not in the index. This is the exception type the
        bare ``[0][0]`` lookup raised before; it now carries a clear message.
    """
    # Fall back to the notebook globals only when no explicit matrices are given.
    if item_matrix is None:
        item_matrix = pt_centered
    if similarity is None:
        similarity = cos_sim

    titles = item_matrix.index
    matches = np.where(titles == book_title)[0]
    if len(matches) == 0:
        raise IndexError(f"Book title {book_title!r} not found in the rating matrix")
    index = matches[0]

    # Stable descending sort: equal scores keep their original row order.
    ranked = np.argsort(-np.asarray(similarity)[index], kind='stable')
    # Exclude the queried book by position rather than assuming it sorts first:
    # with a tie at the top (or an all-zero row) slicing [1:] could return the
    # queried book itself as a recommendation.
    similar_books = [i for i in ranked if i != index][:top_n]
    for i in similar_books:
        print(titles[i])

In [73]:
recommend("2nd Chance")

Four Blind Mice
The Next Accident
The Murder Book
Flesh and Blood
Mortal Prey


In [74]:
recommend("4 Blondes")

Pleading Guilty
The House of the Spirits
The Edge
Schindler's List
Pride and Prejudice


### We have successfully built a popularity-based and collaborative filtering-based recommendation system