In [None]:
# --- Settings supplied by the user ---
# Book to look up: the title goes in a one-element list, the author is a name fragment
search_term = ["stranger in a strange land"]
author = 'heinlein'
# Minimum number of ratings a book must have (among readers of the chosen book) to be compared
number_review_limit = 3
# Fuzzywuzzy cut-off score: the higher the value, the stricter the title comparison
threshold = 70

# import
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import warnings 
warnings.filterwarnings('ignore', category=RuntimeWarning)

                                        # IMPORT AND DATA CLEANING
# Download the zip archive from the author's GitHub; it contains two CSV files.
# Paste the path of the first file (ratings) between the quotes below.

# Load the ratings and keep only rows whose rating is present and non-zero
ratings = pd.read_csv('', encoding='cp1251', sep=';', on_bad_lines='skip')
ratings = ratings[ratings['Book-Rating'].notna() & (ratings['Book-Rating'] != 0)]

# Paste the path of the second file (books) between the quotes below.
books = pd.read_csv('', encoding='cp1251', sep=';', on_bad_lines='skip')

# The cover-image URL columns are unnecessary here; discarding them speeds up later steps
books = books.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'])

                                        # MERGING AND DATA STANDARDIZATION

# Attach book metadata to every rating (rows are matched on ISBN)
dataset = pd.merge(ratings, books, on='ISBN')


def _normalize_text_column(column):
    """Lowercase and trim one column if it holds text; return any other column untouched."""
    # Only object columns are lowercased
    if column.dtype == 'object':
        column = column.str.lower()
    # Surrounding whitespace is removed from object and string columns
    if column.dtype in ['object', 'string']:
        column = column.str.strip()
    return column


# Lowercased, whitespace-trimmed version of the merged data
dataset_lowercase = dataset.apply(_normalize_text_column)

# Fuzzy title search
def find_similar_books(search_term, titles, threshold=threshold, scorer=fuzz.token_set_ratio):
    """Return every title whose fuzzy score against ``search_term`` reaches ``threshold``.

    Args:
        search_term: Text to look for.
        titles: Collection of candidate titles to score.
        threshold: Minimum score (inclusive) a title needs in order to be kept.
            Defaults to the module-level ``threshold`` as it was when this
            function was defined.
        scorer: fuzzywuzzy scoring function used to compare the strings.

    Returns:
        A list of the titles that scored at least ``threshold``, in the order
        returned by ``process.extract``.
    """
    # process.extract returns only the 5 best candidates unless told otherwise;
    # limit=None makes it score and return every title so the threshold alone
    # decides what is kept.
    scored_titles = process.extract(search_term, titles, scorer=scorer, limit=None)

    # Keep the titles that are similar enough
    return [title for title, score in scored_titles if score >= threshold]

# Accept a bare string as well as a list: indexing a string with [0] would
# otherwise search for its first character only.
if isinstance(search_term, str):
    search_term = [search_term]
# Fail with a clear message before search_term[0] is used
if not search_term or not author:
    raise ValueError("Book title and author must be provided.")

# Unique book titles to search through; missing titles are dropped because
# they carry no text to compare against
book_titles = dataset_lowercase['Book-Title'].dropna().unique()

# Titles that are similar enough to the requested one
matches = find_similar_books(search_term[0], book_titles, threshold=threshold)

                                        # LIMITING DATASET TO SIMILAR USERS AND BOOKS THEY READ

# Rows belonging to the requested book: the title is one of the fuzzy matches
# AND the author column contains the given name fragment.
# na=False treats a missing author as "no match" so the mask stays purely boolean.
is_requested_book = (
    dataset_lowercase['Book-Title'].isin(matches)
    & dataset_lowercase['Book-Author'].str.contains(author, na=False)
)

# Distinct readers (sorted, de-duplicated) who rated the requested book
similar_readers = np.unique(dataset_lowercase.loc[is_requested_book, 'User-ID'])

# Table with the distinct titles that matched the user's input
books_matched = pd.DataFrame(
    dataset_lowercase.loc[is_requested_book, 'Book-Title'].unique(),
    columns=['Book-Title'],
)

# Everything that was rated by the readers of the requested book
books_of_similar_readers = dataset_lowercase[dataset_lowercase['User-ID'].isin(similar_readers)]

# Number of ratings each book received from those readers
number_of_rating_per_book = books_of_similar_readers.groupby('Book-Title').count().reset_index()

# Keep only books with at least `number_review_limit` ratings
books_to_compare = number_of_rating_per_book.loc[
    number_of_rating_per_book['User-ID'] >= number_review_limit, 'Book-Title'
]
books_to_compare2 = books_to_compare.tolist()

# Matched titles that also have enough reviews. An inner join is required:
# a left join on the only column would return every matched title unfiltered.
books_matched2 = pd.merge(books_matched, books_to_compare, on='Book-Title', how='inner')
# Title-case for display
books_matched2['Book-Title'] = books_matched2['Book-Title'].str.title()

# Ratings of the sufficiently reviewed books, restricted to the columns needed below
ratings_data_raw = books_of_similar_readers.loc[
    books_of_similar_readers['Book-Title'].isin(books_to_compare2),
    ['User-ID', 'Book-Rating', 'Book-Title'],
]

# One row per (reader, book): repeated ratings of the same title are averaged
ratings_data_raw_nodup = (
    ratings_data_raw.groupby(['User-ID', 'Book-Title'])['Book-Rating']
    .mean()
    .to_frame()
    .reset_index()
)

# Reader x book matrix of ratings used for the correlations
dataset_for_corr = ratings_data_raw_nodup.pivot(index='User-ID', columns='Book-Title', values='Book-Rating')

# Containers for the per-match tables of best and worst correlated books
result_list, worst_list = [], []

                                        # FAIL-SAFES

# Both a book title and an author are required
if not (search_term and author):
    raise ValueError("Book title and author must be provided.")

# A single title is wrapped so that it can be handled as a one-element list
search_term = search_term if isinstance(search_term, list) else [search_term]

# Make sure at least one row has a matching title and an author containing the given name
title_hit = dataset_lowercase['Book-Title'].isin(matches)
author_hit = dataset_lowercase['Book-Author'].str.contains(author, na=False)
book_exists = dataset_lowercase[title_hit & author_hit]
if book_exists.empty:
    raise ValueError(f"No book matching '{search_term[0].title()}' by '{author.title()}' found in the dataset. Check for typos in the book title")

                                        # CORRELATIONS

# Correlate each matched edition of the chosen book with every other book.

# Mean rating per book, computed once instead of re-filtering the ratings
# table for every book inside the loop
mean_rating_by_title = ratings_data_raw.groupby('Book-Title')['Book-Rating'].mean()

for match in matches:

    # Fail-safe: a fuzzy match may be absent from the rating matrix
    if match not in dataset_for_corr.columns:
        print(f"\nWarning: '{match.title()}' was written by a different author or it doesn't have enough reviews. Also, Try to decrease fuzz threshold.")
        continue

    # Ratings of the chosen book, and the matrix of all remaining books
    chosen_ratings = dataset_for_corr[match]
    dataset_of_other_books = dataset_for_corr.drop(columns=match)

    # Pearson correlation and mean rating for each remaining book.
    # Local names are used so the module-level `book_titles` is not overwritten.
    other_titles = list(dataset_of_other_books.columns)
    correlations = [chosen_ratings.corr(dataset_of_other_books[title]) for title in other_titles]
    avgrating = mean_rating_by_title.reindex(other_titles).to_numpy()

    # One row per book; books whose correlation could not be computed are dropped
    corr_book_chosen = pd.DataFrame({
        'book': other_titles,
        'corr': np.round(correlations, 3),
        'avg_rating': np.round(avgrating, 3),
    })
    corr_book_chosen = corr_book_chosen[corr_book_chosen['corr'].notnull()]

    # Top 10 books with the highest positive correlation
    result_list.append(
        corr_book_chosen[corr_book_chosen['corr'] > 0]
        .sort_values(['corr', 'avg_rating'], ascending=[False, False])
        .head(10)
    )

    # 10 books with the lowest (most negative) correlation. Sorting ascending
    # and taking the head returns the most negative rows; taking the tail of an
    # ascending list would return the rows closest to zero instead.
    worst_list.append(
        corr_book_chosen[corr_book_chosen['corr'] < 0]
        .sort_values(['corr', 'avg_rating'], ascending=[True, True])
        .head(10)
    )
  

# Report: the matched titles first, then the best and the worst correlated books
print(f"\nBooks that matched your input:")
print(books_matched2)

if not matches:
    # Nothing resembled the requested title
    print(f"\nUnfortunately, our database doesn't have reviews of your favorite book")
else:
    # Only the table of the first processed match is shown
    print(f"\nTop recommendations for '{search_term[0].title()}' by {author.title()}:")
    if not result_list or result_list[0].empty:
        print(f"No top recommendations found for '{search_term[0].title()}' by {author.title()}. "
              f"This may be due to insufficient ratings or correlations. Try decreasing the review limit.")
    else:
        print(result_list[0].to_string(index=False))

    print(f"\nBottom recommendations for '{search_term[0].title()}' by {author.title()}:")
    if not worst_list or worst_list[0].empty:
        print(f"No bottom recommendations found for '{search_term[0].title()}' by {author.title()}. "
              f"This may be due to insufficient ratings or correlations. Try decreasing the review limit.")
    else:
        print(worst_list[0].to_string(index=False))

  books = pd.read_csv('C:\\Users\\sasha\\Documents\\Data Analysis\\book_recommender\\BX-Books.csv',  encoding='cp1251', sep=';', on_bad_lines='skip')





Books that matched your input:
                                          Book-Title
0  Stranger In A Strange Land (Remembering Tomorrow)
1                         Stranger In A Strange Land

Top recommendations for 'Stranger In A Strange Land' by Heinlein:
                                                                book  corr  avg_rating
                                                         animal farm   1.0       9.667
                                         tales of the cthulhu mythos   1.0       9.667
                                                  the caves of steel   1.0       9.333
the ritual bath (peter decker &amp; rina lazarus novels (paperback))   1.0       9.333
   zen and the art of motorcycle maintenance: an inquiry into values   1.0       9.250
                                                 the mists of avalon   1.0       9.200
      anne of green gables (anne of green gables novels (paperback))   1.0       9.000
                                         lit