# Import libraries

In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data preparation/cleaning

In [2]:
# Import dataset and rename the first column to 'index'
df = pd.read_csv("/Users/vincent/Downloads/Goodreads_best1500books.csv") 

# Rename the first column to 'index'
df.rename(columns={df.columns[0]: 'index'}, inplace=True)

# Data preparation/cleaning

# Remove text between parentheses in 'book_name' column
df['book_name'] = df['book_name'].apply(lambda x: re.sub(r'\([^)]*\)', '', x))

# Replace commas in 'no_of_raters' column
df["no_of_raters"] = df["no_of_raters"].str.replace(",", "")

# Replace certain strings in 'avg_rating' column with NaN values and drop rows with NaN values
df = df[~df['avg_rating'].isin(['it', 'liked', 'really'])].dropna(subset=['avg_rating'])
"""
isin() find the rows where the 'avg_rating' column contains the values 'it', 'liked', or 'really'. 
The ~ operator inverts the Boolean mask to keep only the rows where the 'avg_rating' column does not contain these values.
Then we use dropna() to remove any remaining rows with NaN values in the 'avg_rating' column.
"""

# Convert 'avg_rating' and 'no_of_raters' columns to float
df["avg_rating"] = df["avg_rating"].astype(float)
df["no_of_raters"] = df["no_of_raters"].astype(float)

# Define the book the user likes

In [3]:
books_user_likes = "Exotic Neurotic"

# Recommendation part - Cosine similarity (content-based system) and popularity-based system

In [4]:
# Define list of features to use for recommendation
features = ['author_name', 'book_genre', 'year_published']

# Fill any missing values in these columns with an empty string
for feature in features:
    df[feature] = df[feature].fillna('')

# Define function to combine feature values for each row into a single string
def combined_features(row):
    return str(row['author_name']) + " " + str(row['book_genre']) + " " + str(row['year_published'])

# Apply the function to each row to create a new column 'combined_features'
df["combined_features"] = df.apply(combined_features, axis=1)

# Use CountVectorizer to convert 'combined_features' column into a matrix of token counts
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combined_features"])

# Use cosine similarity to calculate similarity scores between all books based on their 'combined_features' token counts
cosine_sim = cosine_similarity(count_matrix)

# Define the book the user likes
books_user_likes = "Exotic Neurotic"

# Define a function to get the index of a book by its name
def get_index_from_book_name(book_name):
    return df[df.book_name == book_name]["index"].values[0]

# Get the index of the book the user likes
books_index = get_index_from_book_name(books_user_likes)

# Get the similarity scores between the book the user likes and all other books
similar_books = list(enumerate(cosine_sim[books_index]))

# Sort the list of books by their similarity score (highest to lowest), and exclude the book the user likes
sorted_similar_books = sorted(similar_books, key=lambda x: x[1], reverse=True)[1:16]

# Define a function to get the name of a book by its index
def get_book_name_from_index(index):
    return df[df.index == index]["book_name"].values[0]

# Create a list of the names of the 15 most similar books
result = []
for book in sorted_similar_books:
    result.append(get_book_name_from_index(book[0]))

# Print the list of recommended books
for book in result:
    print(book)


Hunting Ground 
The Dive From Clausen's Pier
Dry
Station Eleven
The Chronicle of Sapta Sindhu
Shades of Grey 
The Hunger Games 
Cloud Atlas
The Host 
The Sea of Monsters 
The Titan's Curse 
City of Lost Souls 
Phantom Wolf 
State of Wonder
Master of the Senate
