### Load Dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the GoodReads dataset into a pandas DataFrame.
# The path uses a forward slash: "\G" inside a regular string literal is an
# invalid escape sequence (Python warns about it), and a backslash separator
# only resolves on Windows, whereas "/" works on every platform.
data = pd.read_csv("dataset/GoodReads_100k.csv")


# Remove rows that are exact duplicates of an earlier row
data = data.drop_duplicates()


In [2]:
# Discard rows whose rating falls outside the 1-5 range (both bounds inclusive)
data = data[data['rating'].between(1, 5)]

#### Remove unnecessary columns

In [3]:
# Display the column labels currently present in the frame
data.columns

Index(['author', 'bookformat', 'desc', 'genre', 'img', 'isbn', 'isbn13',
       'link', 'pages', 'rating', 'reviews', 'title', 'totalratings'],
      dtype='object')

In [4]:
# Keep only the listed columns, in this order, then preview the first rows
selected_columns = [
    'isbn', 'title', 'author', 'rating', 'reviews',
    'img', 'desc', 'genre', 'pages',
]
data = data[selected_columns]
data.head(5)

Unnamed: 0,isbn,title,author,rating,reviews,img,desc,genre,pages
0,002914180X,Between Two Fires: American Indians in the Civ...,Laurence M. Hauptman,3.52,5,https://i.gr-assets.com/images/S/compressed.ph...,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",0
1,1906863482,Fashion Sourcebook 1920s,"Charlotte Fiell,Emmanuelle Dirix",4.51,6,https://i.gr-assets.com/images/S/compressed.ph...,Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",576
2,948984147,Hungary 56,Andy Anderson,4.15,2,https://i.gr-assets.com/images/S/compressed.ph...,The seminal history and analysis of the Hungar...,"Politics,History",124
3,814327079,All-American Anarchist: Joseph A. Labadie and ...,Carlotta R. Anderson,3.83,1,https://i.gr-assets.com/images/S/compressed.ph...,"""All-American Anarchist"" chronicles the life a...","Labor,History",324
4,2761920813,Les oiseaux gourmands,Jean Leveille,4.0,1,https://i.gr-assets.com/images/S/compressed.ph...,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,177


#### Column rename

In [5]:
# Give the columns presentation-friendly names.
# The renamed frame is re-assigned instead of using inplace=True, so the cell
# produces a new object rather than mutating the frame shown by earlier cells.
# The former 'link' mapping is omitted: that column is no longer present after
# the column selection above, so the mapping had no effect.
# NOTE(review): 'No. of ratings' is filled from the 'reviews' column, not from
# 'totalratings' - confirm that this is the intended source.
data = data.rename(columns={
    'isbn': 'ISBN',
    'title': 'Title',
    'author': 'Author',
    'rating': 'Rating',
    'reviews': 'No. of ratings',
    'img': 'Image',
    'desc': 'Desc',
    'genre': 'Genre',
    'pages': 'Pages',
})
data.head(5)

Unnamed: 0,ISBN,Title,Author,Rating,No. of ratings,Image,Desc,Genre,Pages
0,002914180X,Between Two Fires: American Indians in the Civ...,Laurence M. Hauptman,3.52,5,https://i.gr-assets.com/images/S/compressed.ph...,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",0
1,1906863482,Fashion Sourcebook 1920s,"Charlotte Fiell,Emmanuelle Dirix",4.51,6,https://i.gr-assets.com/images/S/compressed.ph...,Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",576
2,948984147,Hungary 56,Andy Anderson,4.15,2,https://i.gr-assets.com/images/S/compressed.ph...,The seminal history and analysis of the Hungar...,"Politics,History",124
3,814327079,All-American Anarchist: Joseph A. Labadie and ...,Carlotta R. Anderson,3.83,1,https://i.gr-assets.com/images/S/compressed.ph...,"""All-American Anarchist"" chronicles the life a...","Labor,History",324
4,2761920813,Les oiseaux gourmands,Jean Leveille,4.0,1,https://i.gr-assets.com/images/S/compressed.ph...,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,177


#### Data Cleaning

In [6]:
# Count the missing values in each column
data.isna().sum()

ISBN              14338
Title                 1
Author                0
Rating                0
No. of ratings        0
Image              2489
Desc               6214
Genre              9056
Pages                 0
dtype: int64

In [7]:
# Drop every row that is missing a genre, description, image or ISBN
data = data.dropna(subset=['Genre', 'Desc', 'Image', 'ISBN'])
print(len(data))
print("____________________________________")

# Renumber the remaining rows from 0 and preview the result
data = data.reset_index(drop=True)
data.head(5)

73629
____________________________________


Unnamed: 0,ISBN,Title,Author,Rating,No. of ratings,Image,Desc,Genre,Pages
0,002914180X,Between Two Fires: American Indians in the Civ...,Laurence M. Hauptman,3.52,5,https://i.gr-assets.com/images/S/compressed.ph...,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",0
1,1906863482,Fashion Sourcebook 1920s,"Charlotte Fiell,Emmanuelle Dirix",4.51,6,https://i.gr-assets.com/images/S/compressed.ph...,Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",576
2,948984147,Hungary 56,Andy Anderson,4.15,2,https://i.gr-assets.com/images/S/compressed.ph...,The seminal history and analysis of the Hungar...,"Politics,History",124
3,814327079,All-American Anarchist: Joseph A. Labadie and ...,Carlotta R. Anderson,3.83,1,https://i.gr-assets.com/images/S/compressed.ph...,"""All-American Anarchist"" chronicles the life a...","Labor,History",324
4,875848419,The Human Equation: Building Profits by Puttin...,Jeffrey Pfeffer,3.73,7,https://i.gr-assets.com/images/S/compressed.ph...,Why is common sense so uncommon when it comes ...,"Business,Leadership,Romance,Historical Romance...",368


In [8]:
# Keep only the books whose 'No. of ratings' value is at least 50
final_data = data[data['No. of ratings'].ge(50)]
print(len(final_data))
print("_____________")
# Renumber the kept rows from 0 and preview the result
final_data = final_data.reset_index(drop=True)
final_data.head(5)

25151
_____________


Unnamed: 0,ISBN,Title,Author,Rating,No. of ratings,Image,Desc,Genre,Pages
0,143037013,Happiness: Lessons from a New Science,Richard Layard,3.73,85,https://i.gr-assets.com/images/S/compressed.ph...,There is a paradox at the heart of our lives. ...,"Psychology,Nonfiction,Economics,Science,Philos...",320
1,385333706,After Long Silence,Helen Fremont,4.0,328,https://i.gr-assets.com/images/S/compressed.ph...,"""To this day, I don't even know what my mother...","Autobiography,Memoir,Nonfiction,World War II,H...",368
2,1414270658,Parnassus on Wheels,Christopher Morley,4.03,1214,https://i.gr-assets.com/images/S/compressed.ph...,"Roger Mifflin is part pixie, part sage, part n...","Fiction,Writing,Books About Books,Classics,Hum...",152
3,767923634,Irrational Exuberance,Robert J. Shiller,3.98,275,https://i.gr-assets.com/images/S/compressed.ph...,As Robert Shillerâ€™s new 2009 preface to his ...,"Economics,Economics,Finance,Business,Nonfictio...",304
4,765315696,Halo: Contact Harvest,Joseph Staten,3.98,271,https://i.gr-assets.com/images/S/compressed.ph...,"This is how it began...,It is the year 2524. H...","Science Fiction,Fiction,Games,Video Games,Spor...",396


#### Models Used for Recommendation

In [13]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Create a TF-IDF vectorizer for the 'Desc' column: English stop words are
# ignored and the vocabulary is capped at 10,000 features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)

# Replace NaN values with an empty string so every description is a string
final_data['Desc'] = final_data['Desc'].fillna('')

# Learn the vocabulary and IDF weights, and vectorize every description
tfidf_matrix_desc = tfidf_vectorizer.fit_transform(final_data['Desc'])

# Convert the data type to float32 to reduce memory usage.
# The previous cast to float64 did not match this stated intent; the pairwise
# matrix computed below has one entry per pair of books, so a narrower dtype
# matters here.
tfidf_matrix_desc = tfidf_matrix_desc.astype(np.float32)

# Compute the pairwise similarity matrix for book descriptions.
# linear_kernel returns plain dot products; these are cosine similarities
# because TfidfVectorizer L2-normalizes each row by default.
cosine_sim_desc = linear_kernel(tfidf_matrix_desc, tfidf_matrix_desc)

 

AttributeError: module 'numpy' has no attribute 'float128'

In [None]:
# Sanity check: no missing descriptions remain, and the similarity matrix has
# one row per book in final_data
print(
    f"Final Data Null Values: {final_data['Desc'].isnull().sum()}",
    f"Lenght of Final Data: {len(final_data)}",
    "_______________________",
    f"Length of Cosine_Similarity: {len(cosine_sim_desc)}",
    sep="\n",
)

Final Data Null Values: 0
Lenght of Final Data: 25151
_______________________
Length of Cosine_Similarity: 25151


#### Recommendation Function


1. `tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)`: This line creates a TF-IDF (Term Frequency-Inverse Document Frequency) vectorizer. TF-IDF is a technique for quantifying how important a word is: each word is assigned a weight that reflects its importance in the document and in the corpus. The vectorizer is set to ignore common English stop words (like 'the', 'is', 'and', etc.) and to consider only the top 10,000 features ordered by term frequency across the corpus.

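2. `final_data['Desc'] = final_data['Desc'].fillna('')`: This line replaces any NaN (Not a Number) values in the 'Desc' column of the final_data DataFrame with an empty string. (`inplace=True` must not be combined with the assignment: `fillna` would then return `None`, and the column would be overwritten with `None`.)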

3. `tfidf_matrix_desc = tfidf_vectorizer.fit_transform(final_data['Desc'])`: This line applies the TF-IDF vectorizer to the 'Desc' column of the final_data DataFrame. The `fit_transform` function learns the vocabulary and idf, and returns a term-document matrix.

4. `tfidf_matrix_desc = tfidf_matrix_desc.astype(np.float32)`: This line converts the data type of the tfidf_matrix_desc to float32. This is done to reduce memory usage.

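5. `cosine_sim_desc = linear_kernel(tfidf_matrix_desc, tfidf_matrix_desc)`: This line computes the cosine similarity matrix for book descriptions. Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them. `linear_kernel` itself computes plain dot products; these equal cosine similarities here because `TfidfVectorizer` L2-normalizes every row by default.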

6. `def get_recommendations(book_title, cosine_sim)`: This line defines a function named get_recommendations that takes a book title and a cosine similarity matrix as input.

7. `if not final_data.empty:`: This line checks if the final_data DataFrame is empty. If it is not empty, the code inside the if block is executed.

8. `idx = final_data[final_data['Title'] == book_title].index`: This line gets the index of the book that matches the input book title.

9. `if len(idx) > 0:`: This line checks if the book title exists in the DataFrame. If it does, the code inside the if block is executed.

10. `idx = idx[0]`: This line gets the first index from the idx list.

11. `sim_scores = list(enumerate(cosine_sim[idx]))`: This line creates a list of tuples where the first element is the index and the second element is the cosine similarity score.

12. `sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)`: This line sorts the list of tuples based on the cosine similarity score in descending order.

13. `sim_scores = sim_scores[1:11]`: This line gets the top 10 tuples from the sorted list.

14. `book_indices = [i[0] for i in sim_scores]`: This line gets the indices of the top 10 tuples.

15. `return final_data[['Title', 'Image', 'Author','Pages']].iloc[book_indices]`: This line returns the title, image URL, author and page count of the books at the selected indices.

16. `else: return "Book not found"`: If the book title does not exist in the DataFrame, the function returns "Book not found".

17. `else: return "No data available"`: If the final_data DataFrame is empty, the function returns "No data available".

18. `get_recommendations('The Art of Love', cosine_sim_desc)`: This line calls the get_recommendations function with 'The Art of Love' as the book title and cosine_sim_desc as the cosine similarity matrix.

- Recommendation Function Store

In [11]:
def get_recommendations(book_title, cosine_sim, top_n=10):
    """Return the books most similar to ``book_title``.

    The title is looked up in the module-level ``final_data`` frame, and every
    other row is ranked by its score in the matching row of ``cosine_sim``.

    Parameters
    ----------
    book_title : str
        Value compared for exact equality with ``final_data['Title']``. If
        several rows share the title, the first match is used.
    cosine_sim : 2-D array-like
        Pairwise similarity scores. Row ``i`` must describe row ``i`` of
        ``final_data``; the index label of the matched row is used as a
        position, so ``final_data`` needs a 0..n-1 index.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        The ``Title``, ``Image``, ``Author`` and ``Pages`` columns of the
        recommended books, best match first. Returns ``"Book not found"`` when
        no row has the given title and ``"No data available"`` when
        ``final_data`` is empty.
    """
    # Check if the final_data DataFrame is empty
    if not final_data.empty:
        # Get the index of the book title
        idx = final_data[final_data['Title'] == book_title].index
        if len(idx) > 0:
            idx = idx[0]
            # Exclude the queried book by position instead of assuming it sorts
            # first: when another book ties with it on score (for example an
            # identical description) and has a lower index, dropping "the first
            # result" would drop that book and recommend the query itself.
            sim_scores = [
                (i, score)
                for i, score in enumerate(cosine_sim[idx])
                if i != idx
            ]
            # Highest similarity first; ties keep their original row order
            sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
            sim_scores = sim_scores[:top_n]
            book_indices = [i[0] for i in sim_scores]
            # Return title, image URL, author and page count of the matches
            return final_data[['Title', 'Image', 'Author','Pages']].iloc[book_indices]
        else:
            return "Book not found"
    else:
        return "No data available"
    
# Example query: recommendations for one title, using description similarity
get_recommendations(
    book_title="Happiness: Lessons from a New Science",
    cosine_sim=cosine_sim_desc,
)

Unnamed: 0,Title,Image,Author,Pages
13138,"Happier at Home: Kiss More, Jump More, Abandon...",https://i.gr-assets.com/images/S/compressed.ph...,"Gretchen Rubin,KÃ¤the Mazur",10
19954,The Myths of Happiness,https://i.gr-assets.com/images/S/compressed.ph...,Sonja Lyubomirsky,320
19723,Happy Money: The Science of Smarter Spending,https://i.gr-assets.com/images/S/compressed.ph...,"Elizabeth Dunn,Michael Norton",224
2263,Anna Hibiscus' Song,https://i.gr-assets.com/images/S/compressed.ph...,"Atinuke,Lauren Tobia",34
43,How We Choose to Be Happy: The 9 Choices of Ex...,https://i.gr-assets.com/images/S/compressed.ph...,"Rick Foster,Greg Hicks",256
7758,Dark Heart Surrender,https://i.gr-assets.com/images/S/compressed.ph...,Lee Monroe,390
6054,Spontaneous Happiness,https://i.gr-assets.com/images/S/compressed.ph...,Andrew Weil,288
24990,Hardwiring Happiness: The New Brain Science of...,https://i.gr-assets.com/images/S/compressed.ph...,Rick Hanson,304
14572,You Can Buy Happiness (and It's Cheap): How On...,https://i.gr-assets.com/images/S/compressed.ph...,Tammy Strobel,224
5625,What Makes Your Brain Happy and Why You Should...,https://i.gr-assets.com/images/S/compressed.ph...,David DiSalvo,309


### Save Models
- cosine_sim_desc
- final_data

In [12]:
import pickle
import csv
# Save cosine_sim_desc.
# The context manager closes the file handle once the dump finishes (or
# fails); passing a bare open(...) call to pickle.dump never closed it.
with open('model/cosine_sim_desc.pkl', "wb") as model_file:
    pickle.dump(cosine_sim_desc, model_file)

# Save final_data as CSV, without writing the row index as a column
final_data.to_csv("model/final_data.csv",index=False)
