In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
# A plotting library used for creating static, interactive, and animated visualizations in Python.
import matplotlib.pyplot as plt
# A statistical data visualization library based on matplotlib. It provides a high-level interface for creating attractive graphs.
import seaborn as sns
# A library to handle warnings that occur during code execution.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Loading all the dataset
books = pd.read_csv("Books.csv")
users = pd.read_csv("Users.csv")
ratings = pd.read_csv("Ratings.csv")

In [3]:
# Top 5 row of Books dataset
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
# Top 5 row of Users dataset
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [5]:
# Top 5 row of Ratings dataset
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [6]:
# shape of books dataset
books.shape

(271360, 8)

In [7]:
# shape of users dataset
users.shape

(278858, 3)

In [9]:
# shape of ratings dataset
ratings.shape

(1149780, 3)

In [10]:
# Checking null values
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [11]:
# Checking null values
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [12]:
# Checking null values
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [13]:
# chacking duplicate values
books.duplicated().sum()

0

In [24]:
# chacking duplicate values
ratings.duplicated().sum()

0

In [26]:
# chacking duplicate values
users.duplicated().sum()

0

In [27]:
# columns name in ratings dataset
ratings.columns

Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')

In [31]:
# columns name in ratings dataset
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [33]:
# Joining book dataset with ratings dataset
book_with_ratings = ratings.merge(books,on='ISBN')

In [34]:
# shape of new dataset
book_with_ratings.shape

(1031136, 10)

In [37]:
# top 5 rows of new joined dataset
book_with_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
3,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...


In [39]:
# Group the dataset by "Book-Title" and count the occurrences of "Book-Rating" for each book.
# Reset the index to convert the grouped data into a DataFrame.
# Rename the "Book-Rating" column to "Num-Rating", indicating the number of ratings for each book.

num_ratings =  book_with_ratings.groupby("Book-Title").count()["Book-Rating"].reset_index()
num_ratings.rename(columns={"Book-Rating":"Num-Rating"},inplace=True)

In [40]:
num_ratings

Unnamed: 0,Book-Title,Num-Rating
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [41]:
# shape of the dataset
num_ratings.shape

(241071, 2)

In [45]:
# Group the dataset by "Book-Title" and calculate the mean of "Book-Rating" for each book, considering numeric columns only.
# Reset the index to convert the grouped data into a DataFrame.
# Rename the "Book-Rating" column to "Avg-Rating" for clarity, indicating the average rating for each book.

avg_ratings =  book_with_ratings.groupby("Book-Title").mean(numeric_only=True)["Book-Rating"].reset_index()
avg_ratings.rename(columns={"Book-Rating":"Avg-Rating"},inplace=True)

In [46]:
avg_ratings

Unnamed: 0,Book-Title,Avg-Rating
0,A Light in the Storm: The Civil War Diary of ...,2.250000
1,Always Have Popsicles,0.000000
2,Apple Magic (The Collector's series),0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,0.000000
...,...,...
241066,Ã?Â?lpiraten.,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,5.250000
241068,Ã?Â?sterlich leben.,7.000000
241069,Ã?Â?stlich der Berge.,2.666667


In [49]:
# Merge the "num_ratings" and "avg_ratings" DataFrames on the "Book-Title" column.
# This creates a new DataFrame "popularity_df" containing both the number of ratings ("Num-Rating") 
# and the average rating ("Avg-Rating") for each book.

popularity_df = num_ratings.merge(avg_ratings,on='Book-Title')

In [51]:
popularity_df

Unnamed: 0,Book-Title,Num-Rating,Avg-Rating
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Always Have Popsicles,1,0.000000
2,Apple Magic (The Collector's series),1,0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
...,...,...,...
241066,Ã?Â?lpiraten.,2,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,4,5.250000
241068,Ã?Â?sterlich leben.,1,7.000000
241069,Ã?Â?stlich der Berge.,3,2.666667


In [53]:
# Filter the "popularity_df" DataFrame to include only books with 250 or more ratings.
# Sort the resulting DataFrame by "Avg-Rating" in descending order to rank books by their average rating.
# Select the top 50 books with the highest average ratings to create "popularity_df1".

popularity_df1 = popularity_df[popularity_df['Num-Rating']>=250].sort_values('Avg-Rating',ascending=False).head(50)

In [55]:
# shape of new dataset
popularity_df1.shape

(50, 3)

In [57]:
popularity_df1

Unnamed: 0,Book-Title,Num-Rating,Avg-Rating
80434,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
80422,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
80441,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741
80426,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
80414,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453
191612,The Hobbit : The Enchanting Prelude to The Lor...,281,5.007117
187377,The Fellowship of the Ring (The Lord of the Ri...,368,4.94837
80445,Harry Potter and the Sorcerer's Stone (Harry P...,575,4.895652
211384,"The Two Towers (The Lord of the Rings, Part 2)",260,4.880769
219741,To Kill a Mockingbird,510,4.7


In [59]:
# Merge the "books" DataFrame with "popularity_df1" on the "Book-Title" column 
# to include additional book details for the popular books.
# Drop duplicate entries based on the "Book-Title" column to ensure each book appears only once.
# The resulting DataFrame "popular_books" contains details of the most popular books.

popular_books = books.merge(popularity_df1,on="Book-Title").drop_duplicates("Book-Title")

In [61]:
popular_books.shape

(50, 10)

In [63]:
# Select specific columns from the "popular_books" DataFrame to include only relevant information:
# "Book-Title", "Book-Author", "Image-URL-M" (medium-sized image URL), "Num-Rating" (number of ratings), 
# and "Avg-Rating" (average rating). 
# This creates a cleaner and more concise DataFrame for further use or display.

popular_books = popular_books[["Book-Title","Book-Author","Image-URL-M","Num-Rating","Avg-Rating"]]

In [65]:
popular_books.shape

(50, 5)

In [67]:
popular_books

Unnamed: 0,Book-Title,Book-Author,Image-URL-M,Num-Rating,Avg-Rating
0,To Kill a Mockingbird,Harper Lee,http://images.amazon.com/images/P/0446310786.0...,510,4.7
1,Seabiscuit: An American Legend,LAURA HILLENBRAND,http://images.amazon.com/images/P/0449005615.0...,275,4.098182
2,The Catcher in the Rye,J.D. Salinger,http://images.amazon.com/images/P/0316769487.0...,449,4.545657
3,The Five People You Meet in Heaven,Mitch Albom,http://images.amazon.com/images/P/0786868716.0...,430,4.551163
4,The Hitchhiker's Guide to the Galaxy,Douglas Adams,http://images.amazon.com/images/P/0671461494.0...,268,4.328358
5,Fahrenheit 451,Ray Bradbury,http://images.amazon.com/images/P/3257208626.0...,409,4.264059
6,Girl with a Pearl Earring,Tracy Chevalier,http://images.amazon.com/images/P/0452282152.0...,526,4.218631
7,Life of Pi,Yann Martel,http://images.amazon.com/images/P/0151008116.0...,664,4.088855
8,"The Golden Compass (His Dark Materials, Book 1)",PHILIP PULLMAN,http://images.amazon.com/images/P/037582345X.0...,336,4.0
9,The Secret Life of Bees,Sue Monk Kidd,http://images.amazon.com/images/P/0142001740.0...,774,4.447028


In [69]:
# Group the "book_with_ratings" DataFrame by "User-ID" and count the number of ratings ("Book-Rating") for each user.
# Create a boolean mask to identify users who have rated 250 or more books.
# Filter the boolean Series to include only those users with 250 or more ratings and extract their indices.
# The resulting "user_rating_250" contains the User-IDs of users who have rated at least 250 books.

user_rating_250 = book_with_ratings.groupby("User-ID").count()["Book-Rating"]>=250
user_rating_250 = user_rating_250[user_rating_250].index

In [71]:
# Filter the "book_with_ratings" DataFrame to include only rows where the "User-ID" is in the "user_rating_250" list.
# This creates a new DataFrame, "filtered_rating," containing ratings from users who have rated at least 250 books.
# Display the "filtered_rating" DataFrame to verify the filtered data.

filtered_rating = book_with_ratings[book_with_ratings['User-ID'].isin(user_rating_250)]
filtered_rating

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
1150,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...
1151,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,http://images.amazon.com/images/P/0026217457.0...,http://images.amazon.com/images/P/0026217457.0...,http://images.amazon.com/images/P/0026217457.0...
1152,277427,003008685X,8,Pioneers,James Fenimore Cooper,1974,Thomson Learning,http://images.amazon.com/images/P/003008685X.0...,http://images.amazon.com/images/P/003008685X.0...,http://images.amazon.com/images/P/003008685X.0...
1153,277427,0030615321,0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau,1982,Henry Holt &amp; Co,http://images.amazon.com/images/P/0030615321.0...,http://images.amazon.com/images/P/0030615321.0...,http://images.amazon.com/images/P/0030615321.0...
1154,277427,0060002050,0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens,2002,Avon Books,http://images.amazon.com/images/P/0060002050.0...,http://images.amazon.com/images/P/0060002050.0...,http://images.amazon.com/images/P/0060002050.0...
...,...,...,...,...,...,...,...,...,...,...
1029357,275970,1931868123,0,There's a Porcupine in My Outhouse: Misadventu...,Mike Tougias,2002,Capital Books (VA),http://images.amazon.com/images/P/1931868123.0...,http://images.amazon.com/images/P/1931868123.0...,http://images.amazon.com/images/P/1931868123.0...
1029358,275970,3411086211,10,Die Biene.,Sybil GrÃ?Â¤fin SchÃ?Â¶nfeldt,1993,"Bibliographisches Institut, Mannheim",http://images.amazon.com/images/P/3411086211.0...,http://images.amazon.com/images/P/3411086211.0...,http://images.amazon.com/images/P/3411086211.0...
1029359,275970,3829021860,0,The Penis Book,Joseph Cohen,1999,Konemann,http://images.amazon.com/images/P/3829021860.0...,http://images.amazon.com/images/P/3829021860.0...,http://images.amazon.com/images/P/3829021860.0...
1029360,275970,4770019572,0,Musashi,Eiji Yoshikawa,1995,Kodansha International (JPN),http://images.amazon.com/images/P/4770019572.0...,http://images.amazon.com/images/P/4770019572.0...,http://images.amazon.com/images/P/4770019572.0...


In [73]:
filtered_rating.shape

(429537, 10)

In [75]:
# Group the "filtered_rating" DataFrame by "Book-Title" and count the number of ratings ("Book-Rating") for each book.
# Create a boolean mask to identify books with 50 or more ratings.
# Filter the boolean Series to include only those books with at least 50 ratings and extract their indices.
# The resulting "famous_book" contains the titles of books that are considered "famous" based on the threshold.

famous_book = filtered_rating.groupby('Book-Title').count()['Book-Rating']>=50
famous_book = famous_book[famous_book].index

In [77]:
famous_book

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Fine Balance',
       'A Heartbreaking Work of Staggering Genius',
       ...
       'Wild Animus', 'Winter Moon', 'Winter Solstice', 'Wish You Well',
       'Without Remorse', 'Wuthering Heights', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=574)

In [79]:
# Filter the "filtered_rating" DataFrame to include only rows where the "Book-Title" is in the "famous_book" list.
# This creates a new DataFrame, "final_ratings," containing ratings for books that have been rated at least 50 times.
# This ensures the dataset focuses on popular books with sufficient user engagement.

final_ratings = filtered_rating[filtered_rating['Book-Title'].isin(famous_book)]

In [81]:
final_ratings

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
1150,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...
1163,277427,0060930535,0,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.0...,http://images.amazon.com/images/P/0060930535.0...,http://images.amazon.com/images/P/0060930535.0...
1165,277427,0060934417,0,Bel Canto: A Novel,Ann Patchett,2002,Perennial,http://images.amazon.com/images/P/0060934417.0...,http://images.amazon.com/images/P/0060934417.0...,http://images.amazon.com/images/P/0060934417.0...
1168,277427,0061009059,9,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich,1995,HarperTorch,http://images.amazon.com/images/P/0061009059.0...,http://images.amazon.com/images/P/0061009059.0...,http://images.amazon.com/images/P/0061009059.0...
1174,277427,006440188X,0,The Secret Garden,Frances Hodgson Burnett,1998,HarperTrophy,http://images.amazon.com/images/P/006440188X.0...,http://images.amazon.com/images/P/006440188X.0...,http://images.amazon.com/images/P/006440188X.0...
...,...,...,...,...,...,...,...,...,...,...
1029065,275970,0804117683,0,Moo,Jane Smiley,1998,Ivy Books,http://images.amazon.com/images/P/0804117683.0...,http://images.amazon.com/images/P/0804117683.0...,http://images.amazon.com/images/P/0804117683.0...
1029192,275970,140003065X,0,A Fine Balance,Rohinton Mistry,2001,Vintage Books USA,http://images.amazon.com/images/P/140003065X.0...,http://images.amazon.com/images/P/140003065X.0...,http://images.amazon.com/images/P/140003065X.0...
1029196,275970,1400031354,0,Tears of the Giraffe (No.1 Ladies Detective Ag...,Alexander McCall Smith,2002,Anchor,http://images.amazon.com/images/P/1400031354.0...,http://images.amazon.com/images/P/1400031354.0...,http://images.amazon.com/images/P/1400031354.0...
1029270,275970,1573229725,0,Fingersmith,Sarah Waters,2002,Riverhead Books,http://images.amazon.com/images/P/1573229725.0...,http://images.amazon.com/images/P/1573229725.0...,http://images.amazon.com/images/P/1573229725.0...


In [83]:
# Create a pivot table from the "final_ratings" DataFrame:
# - Use "Book-Title" as the rows (index) to represent each book.
# - Use "User-ID" as the columns to represent individual users.
# - Populate the table with the "Book-Rating" values, showing ratings given by users for each book.
# The resulting "final_df" is a matrix where rows represent books, columns represent users, 
# and cell values indicate the ratings given by a specific user to a specific book.

final_df = final_ratings.pivot_table(index='Book-Title',columns='User-ID',values='Book-Rating')

In [85]:
final_df.shape

(574, 613)

In [87]:
final_df.head()

User-ID,254,2276,2766,3363,4385,6251,6543,6575,7158,7346,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,8.0,...,10.0,,,,,,0.0,,,
1st to Die: A Novel,,,,,,,9.0,,0.0,,...,,,,,,,,,,
2nd Chance,,10.0,,,,,0.0,,,,...,,,,,,0.0,,,0.0,
4 Blondes,,,,,,0.0,,,,,...,,,,,,,,,,
A Bend in the Road,0.0,,7.0,,,,,1.0,,,...,,0.0,,,,,,,,


In [89]:
# Replace all NaN (missing) values in the "final_df" DataFrame with 0.
# This ensures the pivot table is complete and ready for numerical operations, 
# where a value of 0 indicates no rating was given by the user for that book.
# The "inplace=True" parameter applies the change directly to the "final_df" DataFrame.

final_df.fillna(0,inplace=True)

In [91]:
final_df.head()

User-ID,254,2276,2766,3363,4385,6251,6543,6575,7158,7346,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
# Import the "cosine_similarity" function from the sklearn.metrics.pairwise module.
# This function is used to compute the cosine similarity between rows or columns of a matrix.
# It is often applied in recommendation systems to measure the similarity between items (e.g., books) 
# or users based on their rating patterns.

from sklearn.metrics.pairwise import cosine_similarity

### Formula for Cosine Similarity
Cosine similarity is calculated using the formula:

\[
\text{Cosine Similarity}(A, B) = \frac{A \cdot B}{\|A\| \times \|B\|}
\]

Where:
- \(A\) and \(B\) are vectors (e.g., rating patterns of two books).
- \(A \cdot B\) is the dot product of \(A\) and \(B\).
- \(\|A\|\) is the magnitude (Euclidean norm) of vector \(A\), calculated as \(\sqrt{\sum A_i^2}\).
- \(\|B\|\) is the magnitude of vector \(B\), calculated as \(\sqrt{\sum B_i^2}\).

### Steps:
1. Compute the dot product of the two vectors \(A\) and \(B\).
2. Compute the magnitude of each vector (\(\|A\|\) and \(\|B\|\)).
3. Divide the dot product by the product of the magnitudes.

This formula ensures that the similarity score is bounded between -1 (completely opposite) and 1 (identical), with 0 indicating no similarity. For recommendation systems, values closer to 1 are preferred.

In [95]:
# Compute the cosine similarity between rows of the "final_df" DataFrame using the "cosine_similarity" function.
# The resulting "similarity_score" is a matrix where each entry (i, j) represents the cosine similarity between
# the ratings of books i and j. The diagonal values (i, i) will be 1, as every book is identical to itself.

# Print the "similarity_score" matrix to view the calculated similarities.

similarity_score = cosine_similarity(final_df)
print(similarity_score)

[[1.         0.12637786 0.01584756 ... 0.1387288  0.08450515 0.0521102 ]
 [0.12637786 1.         0.28824811 ... 0.08010424 0.18117097 0.1617243 ]
 [0.01584756 0.28824811 1.         ... 0.05165773 0.05618509 0.12893887]
 ...
 [0.1387288  0.08010424 0.05165773 ... 1.         0.07113412 0.02067643]
 [0.08450515 0.18117097 0.05618509 ... 0.07113412 1.         0.11219793]
 [0.0521102  0.1617243  0.12893887 ... 0.02067643 0.11219793 1.        ]]


In [97]:
similarity_score.shape

(574, 574)

In [99]:
# Check the shape of the "similarity_score" matrix using the .shape attribute.
# This will return a tuple representing the dimensions of the matrix.
# Since "final_df" has rows representing books, the "similarity_score" matrix will be square,
# with dimensions equal to the number of books in "final_df".

similarity_score[0][0]

0.9999999999999998

In [101]:
def recommend(book_name):
    # index fetch
    index = np.where(final_df.index==book_name)[0][0]
    # Retrieve similarity scores for the book and sort them in descending order, excluding the book itself
    similar_items = sorted(list(enumerate(similarity_score[index])),key=lambda x:x[1],reverse=True)[1:5]

    # Initialize a list to store the recommended book details
    data = []

    # Loop through the similar items to collect details for each recommended book
    for i in similar_items:
        item = []
        # Filter the books DataFrame to find the book details matching the current index
        temp_df = books[books['Book-Title'] == final_df.index[i[0]]]
        # Add the book title, author, and image URL to the item list after removing duplicates
        item.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Title'].values))
        item.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Author'].values))
        item.extend(list(temp_df.drop_duplicates('Book-Title')['Image-URL-M'].values))

        # Append the item list to the data list
        data.append(item)
        
     # Return the list of recommended book details
    return data

In [103]:
recommend('1984')

[["The Handmaid's Tale",
  'Margaret Atwood',
  'http://images.amazon.com/images/P/0449212602.01.MZZZZZZZ.jpg'],
 ['The Vampire Lestat (Vampire Chronicles, Book II)',
  'ANNE RICE',
  'http://images.amazon.com/images/P/0345313860.01.MZZZZZZZ.jpg'],
 ['The Hours : A Novel',
  'Michael Cunningham',
  'http://images.amazon.com/images/P/0312243022.01.MZZZZZZZ.jpg'],
 ['Brave New World',
  'Aldous Huxley',
  'http://images.amazon.com/images/P/0060809833.01.MZZZZZZZ.jpg']]

In [105]:
final_df.index[240]

'Like Water for Chocolate: A Novel in Monthly Installments With Recipes, Romances and Home Remedies'

In [107]:
books.drop_duplicates('Book-Title')

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...
271354,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,http://images.amazon.com/images/P/0449906736.0...,http://images.amazon.com/images/P/0449906736.0...,http://images.amazon.com/images/P/0449906736.0...
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [109]:
import pickle
pickle.dump(popular_books,open("popular.pkl",'wb'))
pickle.dump(final_df,open('final_df.pkl','wb'))
pickle.dump(books,open('books.pkl','wb'))
pickle.dump(similarity_score,open('similarity_scores.pkl','wb'))

In [115]:
list(popular_books['Book-Title'].values)

['To Kill a Mockingbird',
 'Seabiscuit: An American Legend',
 'The Catcher in the Rye',
 'The Five People You Meet in Heaven',
 "The Hitchhiker's Guide to the Galaxy",
 'Fahrenheit 451',
 'Girl with a Pearl Earring',
 'Life of Pi',
 'The Golden Compass (His Dark Materials, Book 1)',
 'The Secret Life of Bees',
 'The Lovely Bones: A Novel',
 'Nickel and Dimed: On (Not) Getting By in America',
 "Left Behind: A Novel of the Earth's Last Days (Left Behind No. 1)",
 'The Bean Trees',
 'The Red Tent (Bestselling Backlist)',
 'The Vampire Lestat (Vampire Chronicles, Book II)',
 'Fast Food Nation: The Dark Side of the All-American Meal',
 "Where the Heart Is (Oprah's Book Club (Paperback))",
 'The Da Vinci Code',
 'Skipping Christmas',
 'The Two Towers (The Lord of the Rings, Part 2)',
 "Suzanne's Diary for Nicholas",
 'American Gods',
 'Neverwhere',
 'A Prayer for Owen Meany',
 'Good in Bed',
 'The Fellowship of the Ring (The Lord of the Rings, Part 1)',
 'The Alchemist: A Fable About Followi