In [301]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [302]:
# Read the BX books and ratings CSV files into typed DataFrames.
books_path = "BX-Books.csv"
ratings_path = "BX-Book-Ratings.csv"

# Books table: only isbn, title and author are kept, all parsed as strings.
# header=0 together with `names` replaces the file's own header row.
df_books = pd.read_csv(
    books_path,
    sep=";",
    header=0,
    encoding="ISO-8859-1",
    names=["isbn", "title", "author"],
    usecols=["isbn", "title", "author"],
    dtype={"isbn": "str", "title": "str", "author": "str"},
)

# Ratings table: user id, isbn and the rating value.
df_ratings = pd.read_csv(
    ratings_path,
    sep=";",
    header=0,
    encoding="ISO-8859-1",
    names=["user", "isbn", "rating"],
    usecols=["user", "isbn", "rating"],
    dtype={"user": "int32", "isbn": "str", "rating": "float32"},
)

In [320]:
# Summarise the loaded books table: columns, non-null counts and dtypes.
df_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   isbn    271379 non-null  object
 1   title   271379 non-null  object
 2   author  271378 non-null  object
dtypes: object(3)
memory usage: 6.2+ MB


In [303]:
def test(df_books, df_ratings):
    df_cleaned_ratings = df_ratings[df_ratings.rating != 0]
    
    #select users which gives more than x ratings
    rating_treshold = 30
    # mask
    valid_users = df_cleaned_ratings["user"].value_counts() >= rating_treshold
    
    # Create index for DataFrame from valid users
    # Construct dataset with valid users which gives more than 100 ratings
    # Merge Books and Ratings df to get clean data
    user_index = valid_users[valid_users].index
    df_cleaned_ratings = df_cleaned_ratings[df_cleaned_ratings["user"].isin(user_index)]
    clean_dataset = df_cleaned_ratings.merge(df_books, on="isbn")
    
    # get rating counts for every title from all valid users and reset index
    # rename rating columns to rating counts
    # get DataFrame with rating counts for every Book
    # get books with more than x rating counts
    # drop duplicate rating by same user
    count_rating = clean_dataset.groupby('title')['rating'].count().reset_index()
    count_rating.rename(columns={"rating":"rating_counts"}, inplace=True)
    final_dataset = count_rating.merge(clean_dataset, on="title")
    mask_ratings = final_dataset["rating_counts"] >= 10
    final_dataset = final_dataset[mask_ratings].reset_index(drop=True)
    final_dataset.drop_duplicates(["user","title"])

    # contruct pivot table for recommendation engine
    pivot = final_dataset.pivot_table(index="title",columns="user",values="rating")
    pivot.fillna(0,inplace=True)

    return pivot


In [304]:
# Build the title x user rating matrix from the books and ratings frames.
pivot = test(df_books, df_ratings)

In [305]:
def book_suggestions(pivot, book_name, n_neighbors=6):
    """Print the titles whose rating vectors are closest to ``book_name``.

    Parameters
    ----------
    pivot : pandas.DataFrame
        Rating matrix with one row per title (titles in the index).
    book_name : str
        Title to look up; must match an index entry exactly.
    n_neighbors : int, default 6
        Number of neighbours to request. It is capped at the number of rows
        in ``pivot``.

    Returns
    -------
    None
        The neighbouring titles are printed as a pandas ``Index``.

    Raises
    ------
    ValueError
        If ``book_name`` is not present in ``pivot.index``.
    """
    # Titles come from the pivot that was passed in, not from a notebook global.
    book_titles = pivot.index.tolist()
    row_index = book_titles.index(book_name)

    book_sparse = csr_matrix(pivot.values)
    # kneighbors() refuses to return more neighbours than there are fitted rows.
    k = min(n_neighbors, len(book_titles))
    model = NearestNeighbors(metric='cosine', algorithm='auto', n_neighbors=k)
    model.fit(book_sparse)

    query = pivot.iloc[row_index, :].values.reshape(1, -1)
    _distances, suggestions = model.kneighbors(query)
    # One row of neighbour positions per query row (a single row here).
    for neighbor_rows in suggestions:
        print(pivot.index[neighbor_rows])


In [306]:
# Print the nearest neighbours of one title taken from the pivot's index.
like_book_2 = "Where the Heart Is (Oprah's Book Club (Paperback))"
book_suggestions(pivot, like_book_2)

Index(['Where the Heart Is (Oprah's Book Club (Paperback))', 'Blue Diary',
       'The Lovely Bones: A Novel', 'The Weight of Water',
       'The Pilot's Wife : A Novel',
       'The Book of Ruth (Oprah's Book Club (Paperback))'],
      dtype='object', name='title')


In [315]:
class BookSuggestionEngine():
    """Suggest books similar to one title using cosine nearest neighbours.

    Parameters
    ----------
    book_name : str
        Title to find neighbours for; must match a pivot index entry exactly.
    books : str, default "BX-Books.csv"
        Path of the books CSV.
    ratings : str, default "BX-Book-Ratings.csv"
        Path of the ratings CSV.
    min_user_ratings : int, default 30
        Keep only users with at least this many non-zero ratings.
    min_book_ratings : int, default 10
        Keep only titles with at least this many ratings from the kept users.
    n_neighbors : int, default 5
        Number of neighbours to request (capped at the number of titles).
    """

    def __init__(self, book_name, books="BX-Books.csv", ratings="BX-Book-Ratings.csv",
                 min_user_ratings=30, min_book_ratings=10, n_neighbors=5):
        self.book_name = book_name
        self.books = books
        self.ratings = ratings
        self.min_user_ratings = min_user_ratings
        self.min_book_ratings = min_book_ratings
        self.n_neighbors = n_neighbors
        # Built on first use by book_suggestions() when no pivot is passed in.
        self._pivot = None

    def prepare_data(self):
        """Read both CSV files and return the title x user rating matrix.

        Returns
        -------
        pandas.DataFrame
            One row per title, one column per user, 0 where the user did not
            rate the title.
        """
        df_books = pd.read_csv(
            self.books,
            encoding="ISO-8859-1",
            sep=";",
            header=0,
            names=['isbn', 'title', 'author'],
            usecols=['isbn', 'title', 'author'],
            dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

        df_ratings = pd.read_csv(
            self.ratings,
            encoding="ISO-8859-1",
            sep=";",
            header=0,
            names=['user', 'isbn', 'rating'],
            usecols=['user', 'isbn', 'rating'],
            dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

        # Rows with a rating of 0 are discarded before anything is counted.
        rated = df_ratings[df_ratings["rating"] != 0]

        # Keep only users with enough ratings.
        ratings_per_user = rated["user"].value_counts()
        active_users = ratings_per_user[ratings_per_user >= self.min_user_ratings].index
        rated = rated[rated["user"].isin(active_users)]

        # Attach title/author to every remaining rating.
        clean_dataset = rated.merge(df_books, on="isbn")

        # Number of ratings each title received from the kept users.
        count_rating = (
            clean_dataset.groupby('title')['rating']
            .count()
            .reset_index()
            .rename(columns={"rating": "rating_counts"})
        )
        final_dataset = count_rating.merge(clean_dataset, on="title")
        popular = final_dataset["rating_counts"] >= self.min_book_ratings
        final_dataset = final_dataset[popular].reset_index(drop=True)

        # drop_duplicates returns a new frame; without the assignment the
        # duplicates stay and pivot_table averages them.
        final_dataset = final_dataset.drop_duplicates(["user", "title"])

        # Construct the pivot table for the recommendation engine.
        pivot = final_dataset.pivot_table(index="title", columns="user", values="rating")
        return pivot.fillna(0)

    def book_suggestions(self, pivot=None):
        """Print the titles closest to ``self.book_name``.

        Parameters
        ----------
        pivot : pandas.DataFrame, optional
            Rating matrix with titles in the index. When omitted, the matrix
            is built once with ``prepare_data()`` and reused on later calls.

        Returns
        -------
        None
            Neighbouring titles are printed as a pandas ``Index``. If the
            title is not in the pivot, an apology message is printed instead.
        """
        if pivot is None:
            if self._pivot is None:
                self._pivot = self.prepare_data()
            pivot = self._pivot

        book_titles = pivot.index.tolist()
        try:
            row_index = book_titles.index(self.book_name)
        except ValueError:
            # Only an unknown title is reported this way; any other failure
            # (missing file, model error) propagates to the caller.
            print(f"Sorry, the book: {self.book_name} is not in database or you maybe misstype" )
            return None

        book_sparse = csr_matrix(pivot.values)
        # kneighbors() refuses to return more neighbours than there are fitted rows.
        k = min(self.n_neighbors, len(book_titles))
        model = NearestNeighbors(metric='cosine', algorithm='auto', n_neighbors=k)
        model.fit(book_sparse)

        query = pivot.iloc[row_index, :].values.reshape(1, -1)
        _distances, suggestions = model.kneighbors(query)
        # One row of neighbour positions per query row (a single row here).
        for neighbor_rows in suggestions:
            print(pivot.index[neighbor_rows])
        return None

In [323]:
# Candidate titles for trying out the engine.
like_book_1 = 'Dark Justice' ## fewer than 10 ratings
like_book_2 = "Where the Heart Is (Oprah's Book Club (Paperback))"
like_book_3 = 'The Queen of the Damned (Vampire Chronicles (Paperback))'
like_book_4 = 'The Fellowship of The Ring (the lord of the rings, part 1)'
like_book_5 = "The Fellowship of the Ring (The Lord of the Rings, Part 1)"
like_book_6 =  "Harry Potter and the Sorcerer's Stone (Book 1)"
a = BookSuggestionEngine(like_book_5)

# book_suggestions() needs the rating matrix, so build it and pass it in;
# calling it with no argument raises TypeError.
c = a.book_suggestions(a.prepare_data())

Index(['The Fellowship of the Ring (The Lord of the Rings, Part 1)',
       'The Two Towers (The Lord of the Rings, Part 2)',
       'The Return of the King (The Lord of the Rings, Part 3)',
       'The Return of the King (The Lord of The Rings, Part 3)',
       'The Hobbit : The Enchanting Prelude to The Lord of the Rings'],
      dtype='object', name='title')


In [246]:
# Sample titles for the index lookup helper.
# NOTE(review): ' Dark Justice' starts with a space here - confirm whether that is intentional.
like_book_1 = ' Dark Justice' ## fewer than 10 ratings
like_book_2 = "Where the Heart Is (Oprah's Book Club (Paperback))"
like_book_3 = 'The Queen of the Damned (Vampire Chronicles (Paperback))'

def get_index_by_title(like_book, pivot):
    """Return the row position of ``like_book`` in ``pivot``'s index.

    Parameters
    ----------
    like_book : str
        Title to look up; must match an index entry exactly.
    pivot : pandas.DataFrame
        Frame whose index holds the titles.

    Returns
    -------
    int
        Zero-based position of the first matching index entry.

    Raises
    ------
    ValueError
        If ``like_book`` is not in the index.
    """
    # Use the frame that was passed in rather than a notebook-level global.
    book_titles = pivot.index.tolist()
    return book_titles.index(like_book)



In [244]:
# Inspect the pivot row at position 1 (one value per user column).
pivot.iloc[1]

user
183       0.0
242       0.0
254       0.0
507       0.0
638       0.0
         ... 
278188    0.0
278356    0.0
278418    0.0
278582    0.0
278633    0.0
Name: 10 Lb. Penalty, Length: 2285, dtype: float32

In [195]:
# Build the rating matrix and bind it to `df` for the inspection cells that use that name.
df = test(df_books, df_ratings)

In [201]:
# Inspect the row at position 2365.
# NOTE(review): hard-coded position; which title it selects depends on the data the matrix was built from.
df.iloc[2365]

user
183       0.0
242       0.0
254       0.0
507       0.0
638       0.0
         ... 
278188    0.0
278356    0.0
278418    0.0
278582    0.0
278633    0.0
Name: Where the Heart Is (Oprah's Book Club (Paperback)), Length: 2285, dtype: float32

In [202]:

def get_index_by_title(like_book, df):
    """Look up the zero-based row position of ``like_book`` in ``df.index``.

    Raises ``ValueError`` when the title is not an index entry.
    """
    return df.index.tolist().index(like_book)


In [205]:
# Print the row position of one sample title in `df`.
like_book_1 = ' Dark Justice' ## fewer than 10 ratings
like_book_2 = "Where the Heart Is (Oprah's Book Club (Paperback))"
like_book_3 = 'The Queen of the Damned (Vampire Chronicles (Paperback))'
print(get_index_by_title(like_book_3, df))

2066


In [199]:
# Fetch a title's rating row by first resolving its position in the index.
like_book = "Where the Heart Is (Oprah's Book Club (Paperback))"
df.iloc[get_index_by_title(like_book, df)]

user
183       0.0
242       0.0
254       0.0
507       0.0
638       0.0
         ... 
278188    0.0
278356    0.0
278418    0.0
278582    0.0
278633    0.0
Name: Where the Heart Is (Oprah's Book Club (Paperback)), Length: 2285, dtype: float32