# Book recommender system

In [1]:
import numpy as np
import pandas as pd
import ast
import json

In [2]:
# The three Book-Crossing CSVs share the same delimiter and encoding.
BX_CSV_OPTIONS = {'sep': ';', 'encoding': 'latin-1'}

BX_books = pd.read_csv('./data/BX_Books.csv', **BX_CSV_OPTIONS)
BX_users = pd.read_csv('./data/BX-Users.csv', **BX_CSV_OPTIONS)
BX_ratings = pd.read_csv('./data/BX-Book-Ratings.csv', **BX_CSV_OPTIONS)

In [3]:
# Book metadata table (descriptions, genres, cover images, rating counts).
BEST_BOOKS_CSV = './data/Best_Books_Ever.csv'
best_books = pd.read_csv(BEST_BOOKS_CSV)

## Recommendations based on popularity

In [4]:
# Preview the first rows of the book metadata table.
best_books.head()

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780439023481,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...","['Katniss Everdeen', 'Peeta Mellark', 'Cato (H...",...,,['Locus Award Nominee for Best Young Adult Boo...,6376780,"['3444695', '1921313', '745221', '171994', '93...",96.0,"['District 12, Panem', 'Capitol, Panem', 'Pane...",https://i.gr-assets.com/images/S/compressed.ph...,2993816,30516,5.09
1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",4.5,There is a door at the end of a silent corrido...,English,9780439358071,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...","['Sirius Black', 'Draco Malfoy', 'Ron Weasley'...",...,06/21/03,['Bram Stoker Award for Works for Young Reader...,2507623,"['1593642', '637516', '222366', '39573', '14526']",98.0,['Hogwarts School of Witchcraft and Wizardry (...,https://i.gr-assets.com/images/S/compressed.ph...,2632233,26923,7.38
2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,The unforgettable novel of a childhood in a sl...,English,9999999999999,"['Classics', 'Fiction', 'Historical Fiction', ...","['Scout Finch', 'Atticus Finch', 'Jem Finch', ...",...,07/11/60,"['Pulitzer Prize for Fiction (1961)', 'Audie A...",4501075,"['2363896', '1333153', '573280', '149952', '80...",95.0,"['Maycomb, Alabama (United States)']",https://i.gr-assets.com/images/S/compressed.ph...,2269402,23328,
3,1885.Pride_and_Prejudice,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",4.26,Alternate cover edition of ISBN 9780679783268S...,English,9999999999999,"['Classics', 'Fiction', 'Romance', 'Historical...","['Mr. Bennet', 'Mrs. Bennet', 'Jane Bennet', '...",...,01/28/13,[],2998241,"['1617567', '816659', '373311', '113934', '767...",94.0,"['United Kingdom', 'Derbyshire, England (Unite...",https://i.gr-assets.com/images/S/compressed.ph...,1983116,20452,
4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,3.6,About three things I was absolutely positive.\...,English,9780316015844,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...","['Edward Cullen', 'Jacob Black', 'Laurent', 'R...",...,10/05/05,"['Georgia Peach Book Award (2007)', 'Buxtehude...",4964519,"['1751460', '1113682', '1008686', '542017', '5...",78.0,"['Forks, Washington (United States)', 'Phoenix...",https://i.gr-assets.com/images/S/compressed.ph...,1459448,14874,2.1


In [5]:
# Minimum 'numRatings' a book must have to stay in the popularity table.
NUM_RATINGS = 300_000
# How many of the most frequent genres are kept as "popular" genres.
NUM_GENRES = 10

In [6]:
# Keep only the columns the popularity view needs, discard rows with any
# missing value, and keep the first row seen for each title.
popular_columns = ['title', 'author', 'rating', 'genres', 'numRatings', 'coverImg']
popular_df = (
    best_books[popular_columns]
    .copy()
    .dropna()
    .drop_duplicates(subset='title', keep='first')
)

# Retain books with at least NUM_RATINGS ratings, highest-rated first.
popular_df = (
    popular_df
    .loc[popular_df['numRatings'] >= NUM_RATINGS]
    .sort_values(by='rating', ascending=False)
)

# 'genres' is a list in a string, convert to list
def to_list(str):
    """Parse the text of a Python list literal and make its entries space-free.

    `str` is the literal text of a list of strings, for example
    "['Young Adult', 'Fiction']". The text is parsed with ast.literal_eval
    and every space inside each entry is replaced by an underscore, giving
    ['Young_Adult', 'Fiction'].
    """
    parsed = ast.literal_eval(str)
    # Splitting on a single space and re-joining with '_' swaps every space.
    return ['_'.join(entry.split(' ')) for entry in parsed]
# Replace each stringified genre list with the parsed Python list.
parsed_genres = popular_df['genres'].apply(to_list)
popular_df['genres'] = parsed_genres

def clean_author(str):
    """Reduce an author field to the first listed author's name.

    Every ' (Goodreads Author)' tag is removed, then everything from the
    first comma onwards is dropped. Other parenthesised tags on the first
    author are left as they are.
    """
    without_tag = str.replace(' (Goodreads Author)', '')
    first_author, _sep, _rest = without_tag.partition(',')
    return first_author
# Keep only the first listed author for every book.
first_authors = popular_df['author'].apply(clean_author)
popular_df['author'] = first_authors

In [7]:
# Show the filtered popularity table (sorted by rating, highest first).
popular_df

Unnamed: 0,title,author,rating,genres,numRatings,coverImg
342,The Way of Kings,Brandon Sanderson,4.63,"[Fantasy, Fiction, Epic_Fantasy, High_Fantasy,...",302877,https://i.gr-assets.com/images/S/compressed.ph...
71,Harry Potter and the Deathly Hallows,J.K. Rowling,4.62,"[Fantasy, Young_Adult, Fiction, Magic, Childre...",2811637,https://i.gr-assets.com/images/S/compressed.ph...
322,A Court of Mist and Fury,Sarah J. Maas,4.62,"[Fantasy, Romance, Young_Adult, New_Adult, Fae...",332646,https://i.gr-assets.com/images/S/compressed.ph...
709,The Nightingale,Kristin Hannah,4.57,"[Historical_Fiction, Fiction, Historical, Worl...",697799,https://i.gr-assets.com/images/S/compressed.ph...
105,Harry Potter and the Half-Blood Prince,J.K. Rowling,4.57,"[Fantasy, Young_Adult, Fiction, Magic, Childre...",2437658,https://i.gr-assets.com/images/S/compressed.ph...
...,...,...,...,...,...,...
100,"Moby-Dick or, the Whale",Herman Melville,3.51,"[Classics, Fiction, Literature, Adventure, Nov...",479125,https://i.gr-assets.com/images/S/compressed.ph...
2331,The Nanny Diaries,Emma McLaughlin,3.43,"[Chick_Lit, Fiction, Contemporary, Humor, Roma...",360888,https://i.gr-assets.com/images/S/compressed.ph...
169,Heart of Darkness,Joseph Conrad,3.43,"[Classics, Fiction, Literature, Africa, Histor...",414628,https://i.gr-assets.com/images/S/compressed.ph...
128,The Scarlet Letter,Nathaniel Hawthorne,3.41,"[Classics, Fiction, Historical_Fiction, School...",706272,https://i.gr-assets.com/images/S/compressed.ph...


In [8]:
# Flatten every book's genre list into one long list of genre names.
all_genres = [name for book_genres in popular_df['genres'] for name in book_genres]

# Count each distinct genre, then rank from most to least frequent.
# np.unique returns names in sorted order and sorted() is stable, so genres
# with equal counts stay in alphabetical order.
genre_names, genre_counts = np.unique(all_genres, return_counts=True)
genres = sorted(zip(genre_names, genre_counts), key=lambda pair: pair[1], reverse=True)

# Keep only the names of the NUM_GENRES most frequent genres.
popular_genres = [name for name, _count in genres[:NUM_GENRES]]

In [9]:
# Show the most frequent genres selected above.
popular_genres

['Fiction',
 'Fantasy',
 'Classics',
 'Young_Adult',
 'Audiobook',
 'Adult',
 'Novels',
 'Romance',
 'Adventure',
 'Contemporary']

In [10]:
# Build the payload used by the JS side: a dictionary with each popular genre
# as key and a list of books as value, where every book is a dictionary with
# the fields 'index', 'title', 'author', 'rating' and 'coverImg'. An extra
# 'genres' key holds the ordered list of popular genres.

# Maximum number of books kept per genre. popular_df is expected to be sorted
# by rating (descending) at this point, so the first matching rows are the
# highest-rated books of that genre.
NUM_BOOKS_PER_GENRE = 30

genre_wise_dfs = []

for genre in popular_genres:
    # Named variables instead of `_`: assigning to `_` in IPython clobbers the
    # "last output" variable and stops it from being updated automatically.
    is_in_genre = popular_df['genres'].apply(lambda book_genres: genre in book_genres)
    top_books = popular_df[is_in_genre].iloc[:NUM_BOOKS_PER_GENRE]
    top_books = top_books[['title', 'author', 'rating', 'coverImg']]
    # reset_index() moves the row label of popular_df into an 'index' column,
    # so every exported record carries it alongside the selected fields.
    genre_wise_dfs.append(top_books.reset_index().to_dict(orient='records'))

popular_genre_dfs = dict(zip(popular_genres, genre_wise_dfs))
popular_genre_dfs['genres'] = popular_genres

In [11]:
# Serialize the per-genre payload and store it where the app reads it.
with open("./app/top_books.json", "w") as out_file:
    serialized = json.dumps(popular_genre_dfs)
    out_file.write(serialized)

## Recommendations based on collaborative filtering

## Recommendations based on content