### Load Dataset

In [None]:
import pandas as pd
import opendatasets as od

# Download the dataset, read the CSV into a pandas DataFrame and drop exact duplicate rows.
# read_csv expects the download step to have left GoodReads_100k.csv in the working directory.
od.download("https://drive.usercontent.google.com/download?id=1dDd2Kr9wxE44jARLPG_ysZjOHjeoKOLw&export=download&authuser=1&confirm=t&uuid=aee55275-8452-44f4-8a74-ad70554e1c79&at=APZUnTUB5u7-vOVHH3U0wHBtiDZ3%3A1708431658128")
data = pd.read_csv("GoodReads_100k.csv").drop_duplicates()


In [None]:
# Handle outliers: keep only rows whose rating falls in the closed interval [1, 5]
rating_in_range = data['rating'].between(1, 5)
data = data[rating_in_range]

# print(len(data))
# data.head(5)

#### Remove unnecessary columns

In [None]:
# Display the column labels of the loaded frame
data.columns

In [None]:
# Keep only the nine columns listed here, then preview the first five rows
selected_columns = ['isbn', 'title', 'author', 'rating', 'reviews', 'img', 'desc', 'genre', 'pages']
data = data[selected_columns]
data.head(5)

#### Column rename

In [None]:
# Rename the raw column names to display-friendly labels.
# The renamed frame is reassigned instead of using inplace=True, so this cell never
# mutates a frame that was derived from a selection of another frame.
# The 'link' entry has no effect when the frame has no 'link' column
# (rename ignores labels it cannot find by default).
column_labels = {
    'isbn': 'ISBN',
    'title': 'Title',
    'link': 'Link',
    'author': 'Author',
    'rating': 'Rating',
    'reviews': 'No. of ratings',
    'img': 'Image',
    'desc': 'Desc',
    'genre': 'Genre',
    'pages': 'Pages',
}
data = data.rename(columns=column_labels)
data.head(5)

#### Data Cleaning

In [None]:
# Count the missing values in each column
data.isna().sum()

In [None]:
# Drop every row that has a missing value in any of these columns
required_columns = ['Genre', 'Desc', 'Image', 'ISBN']
data = data.dropna(subset=required_columns)
print(len(data))
print("____________________________________")

# Renumber the remaining rows 0..n-1 and preview the first five
data = data.reset_index(drop=True)
data.head(5)

In [None]:
# Keep only books with at least 100 ratings, renumbering the rows 0..n-1
has_enough_ratings = data['No. of ratings'] >= 100
final_data = data[has_enough_ratings].reset_index(drop=True)
print(len(final_data))
print("_____________")
final_data.head(5)

#### Models Used for Recommendation

In [None]:
# import numpy as np
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import linear_kernel

# # Create a TF-IDF Vectorizer for the 'desc' column
# tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# # To check Output from above code: 
# # print(f"Final Data Null Values: {final_data['Desc'].isnull().sum()}")
# # print(f"Lenght of Final Data: {len(final_data)}")

# # print(f"TfidfVectorizer: {tfidf_vectorizer}")


# # Replace NaN values with an empty string
# final_data['Desc'] = final_data['Desc'].fillna('')

# # Apply the TF-IDF vectorizer to the 'desc' column
# tfidf_matrix_desc = tfidf_vectorizer.fit_transform(final_data['Desc'])

# # print(f"tfidf_matrix_desc: {tfidf_matrix_desc}") # To check Output from above code


# # Convert the data type to float32
# tfidf_matrix_desc = tfidf_matrix_desc.astype(np.float32)
# # print(f"tfidf_matrix_desc: {tfidf_matrix_desc}") # To check Output from above code


# # Compute the cosine similarity matrix for book descriptions
# cosine_sim_desc = linear_kernel(tfidf_matrix_desc, tfidf_matrix_desc)
# # print(f"cosine_sim_desc: {cosine_sim_desc}") # To check Output from above code

 

In [None]:
# Sanity check: missing descriptions and row count of the filtered frame
missing_desc_count = final_data['Desc'].isnull().sum()
final_row_count = len(final_data)
print(f"Final Data Null Values: {missing_desc_count}")
print(f"Lenght of Final Data: {final_row_count}")
print("_______________________")
# print(f"Length of Cosine_Similarity: {len(cosine_sim_desc)}")

#### Recommendation Function


1. `tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)`: This line creates a TF-IDF (Term Frequency-Inverse Document Frequency) vectorizer. TF-IDF is a technique for quantifying the words in a document: each word is given a weight that signifies its importance within the document and across the corpus. The vectorizer is set to ignore common English stop words (like 'the', 'is', 'and', etc.) and to consider only the top 5,000 features ordered by term frequency across the corpus.

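2. `final_data['Desc'] = final_data['Desc'].fillna('')`: This line replaces any NaN (Not a Number) values in the 'Desc' column of the final_data DataFrame with an empty string. Do not combine the assignment with `inplace=True`: `fillna(..., inplace=True)` returns `None`, so assigning its result would overwrite the whole column with `None`.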

3. `tfidf_matrix_desc = tfidf_vectorizer.fit_transform(final_data['Desc'])`: This line applies the TF-IDF vectorizer to the 'Desc' column of the final_data DataFrame. The `fit_transform` function learns the vocabulary and idf, and returns a term-document matrix.

4. `tfidf_matrix_desc = tfidf_matrix_desc.astype(np.float32)`: This line converts the data type of the tfidf_matrix_desc to float32. This is done to reduce memory usage.

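5. `cosine_sim_desc = linear_kernel(tfidf_matrix_desc, tfidf_matrix_desc)`: This line computes the cosine similarity matrix for book descriptions. Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them. `linear_kernel` computes plain dot products; because `TfidfVectorizer` L2-normalizes every row by default (`norm='l2'`), the dot product of two rows is equal to their cosine similarity.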

6. `def get_recommendations(book_title, cosine_sim)`: This line defines a function named get_recommendations that takes a book title and a cosine similarity matrix as input.

7. `if not final_data.empty:`: This line checks if the final_data DataFrame is empty. If it is not empty, the code inside the if block is executed.

8. `idx = final_data[final_data['Title'] == book_title].index`: This line gets the index of the book that matches the input book title.

9. `if len(idx) > 0:`: This line checks if the book title exists in the DataFrame. If it does, the code inside the if block is executed.

10. `idx = idx[0]`: This line gets the first index from the idx list.

11. `sim_scores = list(enumerate(cosine_sim[idx]))`: This line creates a list of tuples where the first element is the index and the second element is the cosine similarity score.

12. `sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)`: This line sorts the list of tuples based on the cosine similarity score in descending order.

13. `sim_scores = sim_scores[1:11]`: This line skips the first tuple of the sorted list (normally the book itself, since a book is most similar to itself) and keeps the next 10 tuples.

14. `book_indices = [i[0] for i in sim_scores]`: This line gets the indices of the top 10 tuples.

15. `return final_data[['Title', 'Image', 'Author','Pages']].iloc[book_indices]`: This line returns the title, image URL, author and page count of the books that correspond to the top 10 indices.

16. `else: return "Book not found"`: If the book title does not exist in the DataFrame, the function returns "Book not found".

17. `else: return "No data available"`: If the final_data DataFrame is empty, the function returns "No data available".

18. `get_recommendations('The Art of Love', cosine_sim_desc)`: This line calls the get_recommendations function with 'The Art of Love' as the book title and cosine_sim_desc as the cosine similarity matrix.

- Recommendation Function Store

In [None]:
# Function to get book recommendations based on book title
def get_recommendations(book_title, cosine_sim, books=None, top_n=10):
    """Return the books most similar to ``book_title``.

    Parameters
    ----------
    book_title : str
        Title to look up; it is matched exactly against the 'Title' column.
    cosine_sim : 2-D array-like
        Pairwise similarity scores; row ``i`` is read as the scores of the
        book at row position ``i`` of ``books`` against every other row.
    books : pandas.DataFrame, optional
        Frame with 'Title', 'Image', 'Author' and 'Pages' columns. Defaults to
        the notebook-level ``final_data`` frame.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.DataFrame or str
        The 'Title', 'Image', 'Author' and 'Pages' columns of the most similar
        books, best match first; the string "Book not found" when no row has
        that title; the string "No data available" when ``books`` is empty.
    """
    if books is None:
        books = final_data

    if books.empty:
        return "No data available"

    # Work with row positions rather than index labels: the similarity matrix is
    # addressed positionally, so this stays correct even if the index is not 0..n-1.
    matches = (books['Title'] == book_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        return "Book not found"
    book_pos = int(matches[0])

    # Exclude the queried book explicitly instead of assuming it sorts first.
    # That assumption fails when its self-similarity is 0 (e.g. an empty
    # description) or is tied with a duplicate entry.
    candidates = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[book_pos])
        if pos != book_pos
    ]
    # sorted() is stable, so equally scored books keep their original order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    top_positions = [pos for pos, _ in candidates[:top_n]]

    # return book title with image url and author
    return books[['Title', 'Image', 'Author', 'Pages']].iloc[top_positions]
    
# get_recommendations("Happiness: Lessons from a New Science",cosine_sim_desc)

### Save Models
- cosine_sim_desc
- final_data

In [None]:
import os
import pickle
# save cosine_sim_desc
# pickle.dump(cosine_sim_desc,open('model/cosine_sim_desc.pkl',"wb"), protocol=4)

# save final_data
# pickle.dump(final_data,open("model/final_data.pkl","wb"))

# save final_data as csv
# to_csv does not create missing parent directories, so make sure "model/" exists
# before writing; exist_ok=True keeps the cell re-runnable.
os.makedirs("model", exist_ok=True)
final_data.to_csv("model/final_data.csv",index=False)
