In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet



import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

## Simple Weighted Rating

Weighted Rating (WR) = $\left(\frac{v}{v + m} \cdot R\right) + \left(\frac{m}{v + m} \cdot C\right)$

where,
* *v* is the number of votes for the book (total)
* *m* is the minimum votes required to be listed in the chart
* *R* is the average rating of the book
* *C* is the mean rating across the entire dataset

The next step is to determine an appropriate value for *m*,
the minimum number of ratings required to be listed in the chart.
We will use the **95th percentile** of the rating counts as our cutoff.
In other words, for a book to feature in the charts, it must have at
least as many ratings as 95% of the books in the list.


In [2]:
# Load the cleaned Goodreads export and pull out the two columns that the
# weighted-rating formula needs.
data = pd.read_csv("./goodreads_cleaned.csv")
ratings = data.loc[:, "Rating"]
rating_count = data.loc[:, "RatingDistTotal"]

# Mean rating over every book in the dataset (the prior used for weighting).
MR = ratings.mean()
MR

2.892806491434076

In [18]:
# Vote-count cutoff: the number of ratings at the 98th percentile of
# RatingDistTotal.
# NOTE(review): the narrative describes a 95th-percentile cutoff, and a later
# cell recomputes `m` with 0.95 -- confirm which value is intended here.
m = rating_count.quantile(q=0.98)
m

15602.0

### Weighting Metrics
- ``MR`` = mean rating across the whole dataset (*C* in the formula) = 2.89 (out of 5)
- ``m`` = minimum number of ratings required to be listed in the chart

In [7]:
def weighted_rating(book, min_votes=None, mean_rating=None):
    """Return the weighted rating of one book.

    Implements ``WR = v/(v+m) * R + m/(v+m) * C``: the book's own rating is
    blended with the dataset-wide mean, and the fewer ratings a book has the
    more it is pulled towards that mean.

    Parameters
    ----------
    book : mapping
        Row-like object exposing ``'RatingDistTotal'`` (v, the number of
        ratings) and ``'Rating'`` (R, the book's average rating).
    min_votes : number, optional
        The cutoff ``m``. When omitted, the notebook-level ``m`` is read at
        call time, which is the original behaviour.
    mean_rating : number, optional
        The dataset mean ``C``. When omitted, the notebook-level ``MR`` is
        read at call time.

    Returns
    -------
    number
        The weighted rating.
    """
    v = book['RatingDistTotal']
    R = book['Rating']
    # Explicit arguments win; otherwise fall back to the notebook globals so
    # existing calls such as ``df.apply(weighted_rating, axis=1)`` still work.
    threshold = m if min_votes is None else min_votes
    prior = MR if mean_rating is None else mean_rating
    total = v + threshold
    return (v / total * R) + (threshold / total * prior)

In [27]:
# Shortlist: books with at least `m` ratings whose rating beats the dataset mean.
has_enough_votes = data["RatingDistTotal"] >= m
is_above_mean = data["Rating"] > MR
# .copy() makes the shortlist an independent frame, so adding a column to it
# later does not raise pandas' SettingWithCopyWarning on a filtered slice.
top_books = data[has_enough_votes & is_above_mean].copy()
top_books.shape

(36525, 16)

In [30]:
# Score every shortlisted book. assign() builds a new frame instead of writing
# into what may be a filtered slice of `data`, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
top_books = top_books.assign(weighted=top_books.apply(weighted_rating, axis=1))

# Preview up to 100 random books, best first. The fixed seed makes the preview
# identical on every run; the size is capped so a short list cannot raise.
preview_size = min(100, len(top_books))
top_books.sample(preview_size, random_state=42).sort_values("weighted", ascending=False)

Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,PublishMonth,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Description,weighted
231103,113968,Le Grinch,Dr. Seuss,2266106015,4.36,2002,1,Presse Pocket,187127,74900,38530,9359,5084,315000,0,,4.290759
229222,110805,Pride and Prejudice,Jane Austen,1933652446,4.26,2005,7,Bed Book,1521610,774781,355933,108580,72614,2833518,15,,4.252513
238113,126452,On Writing: A Memoir of the Craft,Stephen King,1416549862,4.33,2007,25,Pocket Star,108991,69709,24232,4514,2079,209525,2,,4.230398
472388,2068057,"Going Postal (Discworld, #33)",Terry Pratchett,1417735481,4.39,2005,9,Turtleback Books,53413,34802,10637,1286,451,100589,3,Arch-swindler Moist Van Lipwig never believed ...,4.188958
410959,833468,"Going Postal (Discworld, #33)",Terry Pratchett,0385603428,4.39,2004,29,Doubleday,52538,34354,10504,1242,402,99040,60,<p>Moist von Lipwig was a con artist and a fra...,4.186242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164599,664737,How the Garcia Girls Lost Their Accents,Julia Alvarez,0945575572,3.64,1991,4,Algonquin Books,4299,8916,7673,1876,434,23198,47,Julia Alvarez's brilliant first book of fictio...,3.339543
387242,1287726,"I'm Ok, You're Ok",Thomas A. Harris,0099552418,3.74,1995,4,Arrow,4512,5043,4715,1419,373,16062,7,This practical guide to Transactional Analysis...,3.322557
1112041,381514,Bleachers,John Grisham,0385340877,3.48,2007,29,Delta,6741,10628,12287,4475,1335,35466,78,,3.300604
481031,2087619,The Gatecrasher,Madeleine Wickham,1427204373,3.27,2009,7,MacMillan Audio,3329,5617,8751,3722,1179,22598,6,The Secret Is Out!<br />Madeleine Wickham is S...,3.115943


In [None]:
# Rebuild the chart with the 95th-percentile cutoff. weighted_rating reads the
# notebook-level `m`, so it must be reassigned before scoring.
m = rating_count.quantile(0.95)

# .copy() so the column assignment below writes to an independent frame rather
# than a filtered slice of `data` (avoids SettingWithCopyWarning).
top_books = data[(data["RatingDistTotal"] >= m) & (data["Rating"] > MR)].copy()
top_books["weighted"] = top_books.apply(weighted_rating, axis=1)

# Final ranking, highest weighted rating first.
above_average = top_books.sort_values("weighted", ascending=False)
above_average.shape

# Content Based Recommendations

The goal here is to see if we can recommend books based on the similarity in the descriptions between books.


In [93]:
# Word-level TF-IDF over unigrams and bigrams, with English stop words removed.
# min_df=1 keeps every term that occurs in at least one description. That is
# the same vocabulary the integer 0 would give, but recent scikit-learn
# releases reject an integer min_df below 1 during parameter validation.
tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=1,
                     stop_words='english')

# Some books have no description (NaN); the vectorizer raises on NaN
# documents, so treat a missing description as empty text.
descriptions = above_average['Description'].fillna('')
tfidf_matrix = tf.fit_transform(descriptions)

In [94]:
# (number of descriptions, number of unigram/bigram features)
tfidf_matrix.shape

(91263, 2278115)

#### Cosine Similarity

I will be using the Cosine Similarity to calculate a numeric quantity that denotes the similarity between two books. Mathematically, it is defined as follows:

$cosine(x,y) = \frac{x \cdot y^\intercal}{\|x\| \cdot \|y\|}$

Since `TfidfVectorizer` L2-normalizes every document vector by default (`norm='l2'`), $\|x\| = \|y\| = 1$ and the dot product is already the cosine similarity score. Therefore, we will use sklearn's **linear_kernel** instead of `cosine_similarity`, since it skips the redundant normalization and is much faster.

In [95]:
# Pairwise dot products between all description vectors; with Y omitted,
# linear_kernel compares the matrix against itself.
# NOTE(review): the result is a dense (n_books, n_books) array. At the matrix
# size shown above that is tens of GB of float64 -- confirm this fits in memory.
cosine_sim = linear_kernel(tfidf_matrix)

In [96]:

# Lookup tables for get_recommendations. After reset_index the row position in
# `above_avg` matches the row/column position in `cosine_sim`.
above_avg = above_average.reset_index()
titles = above_avg['Name']
indices = pd.Series(above_avg.index, index=above_avg['Name'])


def get_recommendations(title, n=14):
    """Return the titles of the books whose descriptions best match `title`.

    Parameters
    ----------
    title : str
        Book title as it appears in ``above_avg['Name']``. When several rows
        share the title, the first one is used as the query.
    n : int, default 14
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the `n` most similar books, most similar first, indexed by
        their row position in ``above_avg``. The query row itself is excluded.

    Raises
    ------
    KeyError
        If `title` is not in the dataset.
    """
    # indices.loc[title] is a scalar for a unique title and a Series when
    # several rows share it; atleast_1d normalises both cases to a 1-D array,
    # so no exception handling or positional Series indexing is needed.
    matches = np.atleast_1d(np.asarray(indices.loc[title]))
    idx = int(matches[0])

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending sort: ties keep ascending row order.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the query row explicitly. It is not guaranteed to be ranked first,
    # for example when another edition has an identical description.
    ranked = ranked[ranked != idx][:n]
    return titles.iloc[ranked]

Int64Index([57699, 58314], dtype='int64')

In [127]:
# Pick one book from the ranked table and show what it is about.
index = 57699
book = above_avg.iloc[index]  # single row lookup, reused for every field
title = book["Name"]
desc = book["Description"]
author = book["Authors"]
year = book["PublishYear"]
print("Title:", title, "\nDescription:", desc, "\nAuthor:", author, "\nYear:", year)

Title: The Extraordinary Adventures of Arsene Lupin, Gentleman-Burglar 
Description: Leblanc was a French novelist and short story writer known for creating the character Arsene Lupin, who is the French counterpart to the English Sherlock Holmes. In this first of twenty volumes about the French detective, Lupin actually meets the famous Sherlock Holmes. They also met in second volume, but at this point legal problems ended their fictitious meetings. 
Author: Maurice Leblanc 
Year: 2008


In [128]:
# Recommend books for the title selected in the previous cell.
get_recommendations(title)

58314    The Extraordinary Adventures of Arsene Lupin, ...
58255                      Arsene Lupin, Gentleman Burglar
58838                         The Exploits of Arsene Lupin
349      Sherlock Holmes 6 Book Boxed Set (Collector's ...
348                   The Penguin Complete Sherlock Holmes
1998                        The Memoirs of Sherlock Holmes
45999    The Original Illustrated Sherlock Holmes (Sher...
1984                        The Memoirs of Sherlock Holmes
1709     The Adventures of Sherlock Holmes (Sherlock Ho...
1715                   The Adventures Of Sherlock Holmes (
1740     The Adventures of Sherlock Holmes (Sherlock Ho...
1745                         Adventures of Sherlock Holmes
57539                   Arsene Lupin, Gentleman Cambioleur
5930                                    A Study In Scarlet
Name: Name, dtype: object

In [129]:
# Let's try another book.
index = 20000
book = above_avg.iloc[index]  # single row lookup, reused for every field
title = book["Name"]
desc = book["Description"]
author = book["Authors"]
year = book["PublishYear"]
print("Title:", title, "\nDescription:", desc, "\nAuthor:", author, "\nYear:", year)

Title: The Long Patrol (Redwall, #10) 
Description: The Long Patrol -- that fighting unit of perilous hares -- is called out to draw off the murderous Rapscallion army, and fight them to the death if need be.And the lead sword of the Long Patrol will be taken up by the young, inexperienced hare Tammo -- in one of the most ferocious battles Redwall has ever faced... 
Author: Brian Jacques 
Year: 1999


In [130]:
# Recommend books for the title selected in the previous cell.
get_recommendations(title)

20014         The Long Patrol (Redwall, #10)
37357       Outcast of Redwall (Redwall, #8)
63852                            Space Cadet
10363       Martin the Warrior (Redwall, #6)
9041                   Redwall (Redwall, #1)
28919      The Pearls of Lutra (Redwall, #9)
22040           The Taggerung (Redwall, #14)
31218                 Marlfox (Redwall, #11)
53014               The Return of the Indian
9004                   Redwall (Redwall, #1)
23532                          Birds of Prey
23746                          Birds of Prey
27951        Mariel of Redwall (Redwall, #4)
64189    Beguilement (The Sharing Knife, #1)
Name: Name, dtype: object

In [137]:
# Let's try another book.
index = 20007
book = above_avg.iloc[index]  # single row lookup, reused for every field
title = book["Name"]
desc = book["Description"]
author = book["Authors"]
year = book["PublishYear"]
print("Title:", title, "\nDescription:", desc, "\nAuthor:", author, "\nYear:", year)

Title: The First American: The Life and Times of Benjamin Franklin 
Description: In the first comprehensive biography of Benjamin Franklin in over sixty years, acclaimed historian H. W. Brands brings vividly to life one of the most delightful, bawdy, brilliant, original, and important figures in American history.<br /><br />A groundbreaking scientist, leading businessman, philosopher, bestselling author, inventor, diplomat, politician, and wit, Benjamin Franklin was perhaps the most beloved and celebrated American of his age, or indeed of any age. Now, in a beautifully written and meticulously researched account of Franklin's life and times, his clever repartee, generous spirit, and earthy wisdom are brought compellingly to the page. <br /><br />His circle of friends and acquaintances extended around the globe, from Cotton Mather to Voltaire, from Edmund Burke to King George III, from Sir Isaac Newton to Immanuel Kant. Franklin was gifted with a restless curiosity, and his scientific e

In [138]:
# Recommend books for the title selected in the previous cell.
get_recommendations(title)

37019               The Compleated Autobiography 1757-1790
17298                  Benjamin Franklin: An American Life
18552                  Benjamin Franklin: An American Life
36600             The Americanization of Benjamin Franklin
61735                  Franklin Is Bossy (Franklin Series)
18554                           A Benjamin Franklin Reader
18559                  Benjamin Franklin: An American Life
18555                  Benjamin Franklin: An American Life
83640    Ben and Me: An Astonishing Life of Benjamin Fr...
35731               The Autobiography of Benjamin Franklin
35652           Benjamin Franklin: His Life as He Wrote It
37004    The Autobiography of Benjamin Franklin: From 1...
36445             The Americanization of Benjamin Franklin
18435                  Benjamin Franklin: An American Life
Name: Name, dtype: object

In [139]:
# One last example.
index = 20009
book = above_avg.iloc[index]  # single row lookup, reused for every field
title = book["Name"]
desc = book["Description"]
author = book["Authors"]
year = book["PublishYear"]
print("Title:", title, "\nDescription:", desc, "\nAuthor:", author, "\nYear:", year)

Title: The Pleasure Of Finding Things Out: The Best Short Works Of Richard Feynman 
Description: <i>The Pleasure of Finding Things Out</i> is a magnificent treasury of the best short works of Richard Feynman—from interviews and speeches to lectures and printed articles. A sweeping, wide-ranging collection, it presents an intimate and fascinating view of a life in science—a life like no other.From his ruminations on science in our culture and descriptions of the fantastic properties of quantum physics to his report on the Space Shuttle Challenger disaster and his Nobel Prize acceptance speech, this book will fascinate anyone interested in Feynman and anyone interested in the world of ideas. Newcomers to Feynman will be moved by his wit and deep understanding of the natural world, and of the human experience. 
Author: Richard P. Feynman 
Year: 1999


In [140]:
# Recommend books for the title selected in the previous cell.
get_recommendations(title)

20008    The Pleasure Of Finding Things Out: The Best S...
19921    The Pleasure of Finding Things Out: The Best S...
7570     What Do You Care What Other People Think? Furt...
12832    QED: The Strange Theory of Light & Matter: Ali...
7565     What Do You Care What Other People Think? (Lib...
12834          QED: The Strange Theory of Light and Matter
45277    The Meaning of It All: Thoughts of a Citizen-S...
7796               The Feynman Lectures on Physics, 3 Vols
2433     Surely You're Joking, Mr. Feynman: Adventures ...
2441     "Surely You're Joking, Mr. Feynman!" Adventure...
45142                                The Meaning of It All
11776                    Six Easy Pieces Book/tape Package
2736                  Vous Voulez Rire, Monsieur Feynman !
74089    Feynman's Rainbow: A Search for Beauty in Phys...
Name: Name, dtype: object