In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Load the books, users and ratings tables from the local Data/ directory.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The previous run of this cell emitted a warning
# that echoed the books read (presumably a mixed-dtype warning - confirm on re-run).
books = pd.read_csv('Data/Books.csv', low_memory=False)
users = pd.read_csv('Data/Users.csv')
ratings = pd.read_csv('Data/Ratings.csv')

  books = pd.read_csv('Data/Books.csv')


In [3]:
# Report (rows, columns) for each of the three tables, in load order.
for table in (books, users, ratings):
    print(table.shape)

(271360, 8)
(278858, 3)
(1149780, 3)


In [4]:
# Preview the first three rows of the books table.
books.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [5]:
# Preview the first three rows of the users table.
users.head(3)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [6]:
# Preview the first three rows of the ratings table.
ratings.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


## Checking Null values and Duplicates

In [7]:
# Count the missing values in each column of books.
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [8]:
# Count the missing values in each column of users.
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [9]:
# Count the missing values in each column of ratings.
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [10]:
# Count the fully duplicated rows in each table, in load order.
for table in (books, users, ratings):
    print(table.duplicated().sum())

0
0
0


## Popularity Based Recommender System

> Filtering the top 50 books that receive at least 250 ratings.

In [11]:
# Attach the book metadata to every rating row by joining the two tables on ISBN.
ratings_with_name = pd.merge(ratings, books, on='ISBN')

In [12]:
# Preview the first three rows of the merged ratings/books table.
ratings_with_name.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...


In [13]:
# Count how many rating rows each title received; only the rating column is
# selected before counting, and the result is named in the same chain.
num_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .count()
    .rename('Number_of_ratings')
    .reset_index()
)
num_rating_df.head(3)

Unnamed: 0,Book-Title,Number_of_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1


#### Average Rating Calculation
> Some values in the 'Book-Rating' column may be non-numeric, so they are coerced to NaN and dropped before the average is computed.

In [14]:
# Parse the ratings as numbers; anything that cannot be parsed becomes NaN.
parsed_rating = pd.to_numeric(ratings_with_name['Book-Rating'], errors='coerce')
ratings_with_name['Book-Rating'] = parsed_rating
# Keep the rows that failed to parse available for inspection.
error_rows = ratings_with_name[parsed_rating.isnull()]

# Discard the unparseable rows, then store the remaining ratings as integers.
ratings_with_name = (
    ratings_with_name
    .dropna(subset=['Book-Rating'])
    .astype({'Book-Rating': int})
)

# Mean rating per title, rounded to one decimal place and stored as 'avg_rating'.
avg_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .mean()
    .round(1)
    .rename('avg_rating')
    .reset_index()
)

avg_rating_df.head()



Unnamed: 0,Book-Title,avg_rating
0,A Light in the Storm: The Civil War Diary of ...,2.2
1,Always Have Popsicles,0.0
2,Apple Magic (The Collector's series),0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.0
4,Beyond IBM: Leadership Marketing and Finance ...,0.0


>Merging Number of Ratings and Average Ratings

In [15]:
# Combine the per-title rating count and the per-title average into one table.
popular_df = pd.merge(num_rating_df, avg_rating_df, on='Book-Title')
popular_df

Unnamed: 0,Book-Title,Number_of_ratings,avg_rating
0,A Light in the Storm: The Civil War Diary of ...,4,2.2
1,Always Have Popsicles,1,0.0
2,Apple Magic (The Collector's series),1,0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.0
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.0
...,...,...,...
241066,Ã?Â?lpiraten.,2,0.0
241067,Ã?Â?rger mit Produkt X. Roman.,4,5.2
241068,Ã?Â?sterlich leben.,1,7.0
241069,Ã?Â?stlich der Berge.,3,2.7


In [16]:
# Keep titles with at least 250 ratings, order them by average rating
# (highest first) and retain the first 50.
has_enough_ratings = popular_df['Number_of_ratings'] >= 250
popular_df = (
    popular_df[has_enough_ratings]
    .sort_values('avg_rating', ascending=False)
    .head(50)
)

In [17]:
# Attach author and cover image to each popular title. A title can match
# several rows in books, so only the first match per title is kept.
display_columns = ['Book-Title', 'Book-Author', 'Image-URL-M', 'Number_of_ratings', 'avg_rating']
with_book_details = popular_df.merge(books, on='Book-Title').drop_duplicates('Book-Title')
popular_df = with_book_details[display_columns]


In [18]:
# Check the size of the popularity table after attaching the book details.
popular_df.shape

(50, 5)

In [19]:
# Renumber the rows from 0 and discard the old index, then display the table.
popular_df = popular_df.reset_index(drop=True)
popular_df

Unnamed: 0,Book-Title,Book-Author,Image-URL-M,Number_of_ratings,avg_rating
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,http://images.amazon.com/images/P/0439136350.0...,428,5.9
1,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,http://images.amazon.com/images/P/0439139597.0...,387,5.8
2,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,http://images.amazon.com/images/P/0590353403.0...,278,5.7
3,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,http://images.amazon.com/images/P/043935806X.0...,347,5.5
4,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,http://images.amazon.com/images/P/0439064872.0...,556,5.2
5,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339681.0...,281,5.0
6,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339703.0...,368,4.9
7,"The Two Towers (The Lord of the Rings, Part 2)",J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339711.0...,260,4.9
8,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,http://images.amazon.com/images/P/059035342X.0...,575,4.9
9,To Kill a Mockingbird,Harper Lee,http://images.amazon.com/images/P/0446310786.0...,510,4.7


In [20]:
# Titles removed by hand from the popularity list.
titles_to_exclude = ["The Hitchhiker's Guide to the Galaxy", 'Outlander', 'The Color Purple']
# Filtering leaves holes in the row labels, so renumber afterwards: the table
# that gets pickled then has a gap-free 0..n-1 index.
popular_df = (
    popular_df[~popular_df['Book-Title'].isin(titles_to_exclude)]
    .reset_index(drop=True)
)


In [21]:
# Check the size of the popularity table after the manual exclusions.
popular_df.shape

(47, 5)

In [22]:
import pickle

# Save the popularity table. The with-block guarantees the file is flushed and
# closed even if pickling fails, instead of leaving the handle open.
with open('popular.pkl', 'wb') as popular_file:
    pickle.dump(popular_df, popular_file)

## Collaborative Filtering Based Recommender System

#### Filtering Users Who Have Rated More than 200 Books and Books with at Least 50 Ratings


In [23]:
# Number of rating rows submitted by each user.
ratings_per_user = ratings_with_name.groupby('User-ID')['Book-Rating'].count()
# Boolean mask: True for users with more than 200 ratings.
x = ratings_per_user.gt(200)
good_readers = x[x].index
good_readers.shape  # the recorded run shows 811 such users

(811,)

In [24]:
# Keep only the rating rows written by the users selected above.
from_good_reader = ratings_with_name['User-ID'].isin(good_readers)
good_readers_books = ratings_with_name[from_good_reader]
good_readers_books.shape  # the recorded run shows 474007 rating rows

(474007, 10)

In [25]:
# Ratings per title among the selected users; a title qualifies with 50 or more.
ratings_per_title = good_readers_books.groupby('Book-Title')['Book-Rating'].count()
good_readers_with_good_books = ratings_per_title >= 50
famous_books = good_readers_with_good_books[good_readers_with_good_books].index

In [26]:
# Number of titles that met the 50-rating threshold.
famous_books.shape

(706,)

In [27]:
# Restrict the selected users' ratings to the qualifying titles.
is_famous = good_readers_books['Book-Title'].isin(famous_books)
Final_ratings = good_readers_books[is_famous]
Final_ratings.shape

(58586, 10)

#### 811 dedicated readers (who have rated over 200 books) have collectively reviewed popular titles (with at least 50 ratings) a total of 58,586 times.

In [28]:
# Title-by-user rating matrix: one row per book, one column per user.
# Cells with no rating from that user are filled with 0.
pt = (
    Final_ratings
    .pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')
    .fillna(0)
)
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of pt, i.e. between book titles.
similarity_score = cosine_similarity(X=pt)

In [30]:
# The matrix is square: one row and one column per title in pt.
similarity_score.shape

(706, 706)

In [31]:
def recommend(book_name, top_n=5):
    """Return the books most similar to ``book_name``.

    Reads three notebook-level objects: ``pt`` (rating matrix indexed by book
    title), ``similarity_score`` (square matrix aligned with ``pt.index``) and
    ``books`` (metadata table with 'Book-Title', 'Book-Author', 'Image-URL-M').

    Parameters
    ----------
    book_name : str
        Title to look up; it must match an entry of ``pt.index`` exactly.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of list
        One ``[title, author, image_url]`` entry per recommended book, ordered
        from most to least similar. An entry is empty when the title has no row
        in ``books``.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``pt.index``.
    """
    matches = np.where(pt.index == book_name)[0]
    if matches.size == 0:
        # Same exception type as the bare [0][0] lookup, with a usable message.
        raise IndexError(f"{book_name!r} is not in the similarity index")
    index = matches[0]

    # Leave the queried book out explicitly. Slicing off the first sorted entry
    # is only correct when the book ranks first on its own; if another title
    # ties with it, the stable sort may put that title first and the book would
    # end up recommending itself.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_score[index])
        if position != index
    ]
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]

    data = []
    for position, _score in ranked:
        # A title can appear several times in books; keep the first match only.
        details = books[books['Book-Title'] == pt.index[position]].drop_duplicates('Book-Title')

        item = []
        for column in ('Book-Title', 'Book-Author', 'Image-URL-M'):
            item.extend(details[column].tolist())

        data.append(item)

    return data

In [32]:
# Example query against the recommender.
recommend('Message in a Bottle')

[['Nights in Rodanthe',
  'Nicholas Sparks',
  'http://images.amazon.com/images/P/0446531332.01.MZZZZZZZ.jpg'],
 ['The Mulberry Tree',
  'Jude Deveraux',
  'http://images.amazon.com/images/P/0743437640.01.MZZZZZZZ.jpg'],
 ['A Walk to Remember',
  'Nicholas Sparks',
  'http://images.amazon.com/images/P/0446608955.01.MZZZZZZZ.jpg'],
 ["River's End",
  'Nora Roberts',
  'http://images.amazon.com/images/P/0515127833.01.MZZZZZZZ.jpg'],
 ['Nightmares &amp; Dreamscapes',
  'Stephen King',
  'http://images.amazon.com/images/P/0451180232.01.MZZZZZZZ.jpg']]

In [33]:
# Save the rating matrix, the book metadata and the similarity matrix.
# Each file is opened in a with-block so it is flushed and closed right after
# the dump, instead of leaving the handle open.
for artifact, filename in (
    (pt, 'pt.pkl'),
    (books, 'books.pkl'),
    (similarity_score, 'similarity_score.pkl'),
):
    with open(filename, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [34]:
# Keep only the first books row for each distinct title.
books_without_duplicates = books.drop_duplicates(subset='Book-Title', keep='first')

# Turn the famous_books index into a DataFrame so it can be merged.
famous_books_df = famous_books.to_frame()
# Inner-join on title: the left key is the frame's index values, the right key
# is the 'Book-Title' column of the de-duplicated books table.
# NOTE(review): the following cells rely on this merge producing 'Book-Title_x',
# 'Book-Title_y' and a plain 'Book-Title' column - confirm if pandas is upgraded.
famous_book_details = pd.merge(famous_books_df, books_without_duplicates, left_on=famous_books_df.index, right_on='Book-Title', how='inner')


In [35]:
# Check the size of the merged details table.
famous_book_details.shape

(706, 10)

In [36]:
# Remove the two suffixed title columns left over from the merge, then the rows
# labelled 304 and 306.
# NOTE(review): the reason for dropping these two rows is not recorded - confirm.
famous_book_details = (
    famous_book_details
    .drop(columns=['Book-Title_x', 'Book-Title_y'])
    .drop(index=[304, 306])
)

In [37]:
# Manual cover-image overrides, applied in this order.
# Each entry is (column to match, value to match, replacement image URL).
# The URLs are assigned as plain strings: a trailing comma after the value
# would turn it into a one-element tuple, which pandas treats as a list-like
# instead of broadcasting a scalar to every matching row.
cover_overrides = [
    ('Book-Title', 'Call of the Wild', 'https://m.media-amazon.com/images/I/51nKN3ZhhVL._SY466_.jpg'),
    ('Book-Title', 'Carrie', 'https://m.media-amazon.com/images/I/71ifzjx0reL._SY466_.jpg'),
    # This title is removed again at the end of the cell.
    ('Book-Title', 'Hearts In Atlantis', 'https://m.media-amazon.com/images/I/51PP6GAN6EL._SY466_.jpg'),
    ('Book-Title', 'Insomnia', 'https://m.media-amazon.com/images/I/71iiP3hCMFL._SY466_.jpg'),
    ('Book-Title', 'GARDEN OF SHADOWS (Dollanger Saga (Paperback))', 'https://m.media-amazon.com/images/I/51tMXY6TO4L.jpg'),
    ('Book-Title', 'No Greater Love', 'https://m.media-amazon.com/images/I/518DmGOS3vL._SY466_.jpg'),
    ('Book-Title', 'Accident', 'https://m.media-amazon.com/images/I/516SoHsT5wL._SY466_.jpg'),
    ('Book-Title', 'The Color Purple', 'https://m.media-amazon.com/images/I/71f6DRbcrsL._SY466_.jpg'),
    ('Book-Title', 'Jacob Have I Loved', 'https://m.media-amazon.com/images/I/718iwgPAJML._SY466_.jpg'),
    ('Book-Title', 'Pride and Prejudice', 'https://m.media-amazon.com/images/I/613Sp+rFRuL._AC_UL480_FMwebp_QL65_.jpg'),
    # Matched by author, in both spellings present in the original cell.
    ('Book-Author', 'Laura Esquivel', 'https://m.media-amazon.com/images/I/41DK12KejsL._SY466_.jpg'),
    ('Book-Author', 'LAURA ESQUIVEL', 'https://m.media-amazon.com/images/I/41DK12KejsL._SY466_.jpg'),
    ('Book-Title', 'Christine', 'https://m.media-amazon.com/images/I/51V4cV0DRWL._SY445_SX342_.jpg'),
]
for match_column, match_value, cover_url in cover_overrides:
    famous_book_details.loc[famous_book_details[match_column] == match_value, 'Image-URL-M'] = cover_url


# Drop this title from the details table altogether.
famous_book_details = famous_book_details[famous_book_details['Book-Title'] != 'Hearts In Atlantis']






In [38]:
# Check the size of the details table after the manual clean-up.
famous_book_details.shape

(703, 8)

In [39]:
import pickle

# Save the details table. The with-block guarantees the file is flushed and
# closed even if pickling fails, instead of leaving the handle open.
with open('famous_book_details.pkl', 'wb') as details_file:
    pickle.dump(famous_book_details, details_file)

In [2]:
# A DataFrame exposes per-column types through .dtypes; it has no .dtype attribute.
# This cell needs famous_book_details from the cells above, so run it after them.
famous_book_details.dtypes

NameError: name 'famous_book_details' is not defined