In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings; warnings.simplefilter('ignore')

In [3]:
# Build a full dataset of all datasets in the input dir with pandas
from pathlib import Path
full_dataset = pd.concat([pd.read_csv(str(x.resolve())) for x in Path("/kaggle/input/goodreads-book-datasets-10m/").glob("*k.csv")])
full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1850310 entries, 0 to 43621
Data columns (total 21 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   RatingDist4            object 
 1   RatingDist1            object 
 2   ISBN                   object 
 3   Authors                object 
 4   Id                     int64  
 5   pagesNumber            float64
 6   Language               object 
 7   RatingDist3            object 
 8   Name                   object 
 9   PublishYear            int64  
 10  CountsOfReview         int64  
 11  RatingDist5            object 
 12  PublishMonth           int64  
 13  RatingDist2            object 
 14  PublishDay             int64  
 15  RatingDistTotal        object 
 16  Rating                 float64
 17  Publisher              object 
 18  Description            object 
 19  Count of text reviews  float64
 20  PagesNumber            float64
dtypes: float64(4), int64(5), object(12)
memory usage: 310.6+

In [4]:
# Each rating column looks like x:432411 where the x is the star count.
# since this is redundant and ruins the data typing, we can take it out.
# turn the RatingDist columns into itegers by splitting of the first part.
for i in range(1,6):
    full_dataset['RatingDist'+str(i)] = full_dataset['RatingDist'+str(i)].str.split(':').str[1]
full_dataset["RatingDistTotal"] = full_dataset["RatingDistTotal"].str.split(':').str[1]

to_convert = ["RatingDist1", "RatingDist2", "RatingDist3", "RatingDist4", "RatingDist5", "RatingDistTotal"]
for col in to_convert:
    full_dataset[col] = full_dataset[col].astype(int)

full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1850310 entries, 0 to 43621
Data columns (total 21 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   RatingDist4            int64  
 1   RatingDist1            int64  
 2   ISBN                   object 
 3   Authors                object 
 4   Id                     int64  
 5   pagesNumber            float64
 6   Language               object 
 7   RatingDist3            int64  
 8   Name                   object 
 9   PublishYear            int64  
 10  CountsOfReview         int64  
 11  RatingDist5            int64  
 12  PublishMonth           int64  
 13  RatingDist2            int64  
 14  PublishDay             int64  
 15  RatingDistTotal        int64  
 16  Rating                 float64
 17  Publisher              object 
 18  Description            object 
 19  Count of text reviews  float64
 20  PagesNumber            float64
dtypes: float64(4), int64(11), object(6)
memory usage: 310.6+

In [5]:
# Make our dataset smaller in memory Thanks to : https://www.kaggle.com/aantonova/some-new-risk-and-clusters-feature
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

reduce_mem_usage(full_dataset)

Mem. usage decreased to 187.05 Mb (39.8% reduction)


Unnamed: 0,RatingDist4,RatingDist1,ISBN,Authors,Id,pagesNumber,Language,RatingDist3,Name,PublishYear,...,RatingDist5,PublishMonth,RatingDist2,PublishDay,RatingDistTotal,Rating,Publisher,Description,Count of text reviews,PagesNumber
0,10,2,080959577X,Mrs. Henry Wood,300002,487.0,eng,5,The Channings,2004,...,13,1,4,7,34,3.820312,Wildside Press,,,
1,1,0,1402158718,Mrs. Henry Wood,300005,310.0,,1,"William Allair; Or, Running Away To Sea",2005,...,0,30,0,11,2,3.500000,Adamant Media Corporation,,,
2,23,0,1598180339,Mrs. Henry Wood,300007,380.0,,11,Anne Hereford,2006,...,17,1,1,10,52,4.078125,Aegypan,,,
3,5969,278,0613607406,Celia Rees,300008,261.0,,4461,Witch Child,2002,...,4238,1,1122,4,16068,3.789062,Turtleback Books,,,
4,15,2,0859537803,Arden Druce,300009,32.0,,14,"Witch, Witch Come To My Party",1991,...,37,1,2,10,70,4.191406,Child's Play International,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43617,0,0,0199184836,Liz Miles,1299980,32.0,,0,Oxford Reading Tree: Stage 16A: TreeTops Class...,2006,...,0,28,0,9,0,0.000000,Oxford University Press,,0.0,
43618,1,0,0841913587,Colbert Nepaulsingh,1299992,149.0,,1,Apples of Gold in Filigress of Silver,1995,...,0,1,0,1,2,3.500000,Holmes & Meier Publishers,,0.0,
43619,9,0,0876044909,Michael Mirdad,1299994,109.0,,2,The Seven Initiations of the Spiritual Path,2004,...,11,1,1,3,23,4.300781,A.R.E. Press (Association of Research & Enligh...,"- Foreword by Dr. Michael Abrams, author of th...",2.0,
43620,104,3,0824068378,William Faulkner,1299998,297.0,,44,Unpublished Stories,1987,...,116,1,6,11,273,4.191406,Facsimiles-Garl,"This invaluable volume, which has been republi...",0.0,


In [6]:
# look at the dates
full_dataset[["PublishYear", "PublishMonth", "PublishDay"]].describe()

Unnamed: 0,PublishYear,PublishMonth,PublishDay
count,1850310.0,1850310.0,1850310.0
mean,1997.841,7.70055,7.987211
std,87.89461,7.75694,8.431777
min,1.0,1.0,1.0
25%,1993.0,1.0,1.0
50%,2000.0,6.0,5.0
75%,2005.0,11.0,11.0
max,65535.0,31.0,31.0


In [7]:
# so something is seriously wrong with publish month... 31st month??? huh? lets drop it.
full_dataset.drop(columns=["PublishMonth"], inplace=True)

In [8]:
# take a look at the years in the dataset
full_dataset[(full_dataset['PublishYear'] < 1400) | (full_dataset['PublishYear'] > 2020)]

Unnamed: 0,RatingDist4,RatingDist1,ISBN,Authors,Id,pagesNumber,Language,RatingDist3,Name,PublishYear,CountsOfReview,RatingDist5,RatingDist2,PublishDay,RatingDistTotal,Rating,Publisher,Description,Count of text reviews,PagesNumber
24627,22,3,0785379185,John Kurtz,343405,24.0,eng,19,Disney Princesses (Look and Find),200,9,23,7,12,74,3.740234,Publications International,,,
28788,47,5,0373292813,Margo Maguire,350490,304.0,eng,46,The Virtuous Knight (Medieval Brides #4),299,12,48,14,11,160,3.740234,Harlequin Historical,,,
816,85,3,0076074000,Charlotte Brontë,3001922,,,41,"Jane Eyre, Shirley, Villette & The Professor",3006,3,158,8,25,295,4.308594,Barnes and Noble,"In her fiction, Charlotte Bront",,1167.0
5032,0,0,0521614813,Alexander Sens,3012346,,,0,Hellenistic Epigrams: A Selection,2021,0,0,0,31,0,0.000000,Cambridge University Press,"Greek ""literary"" epigrams constitute one of th...",,320.0
30547,2773,54,3442310148,Michael Robotham,3080023,,ger,1635,"Amnesie (Joseph O'Loughlin, #2)",20016,3,1523,309,21,6294,3.859375,Goldmann Verlag,"Mit Schusswunden im Bein, einem abgeschossenen...",,448.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102144,13,3,1592122515,L. Ron Hubbard,4310347,,,7,Twenty Fathoms Down,2103,3,6,7,22,36,3.330078,Galaxy Press,<b>As daring and defiant as Kirk Douglas journ...,,136.0
160768,0,0,9640005126,نائوم لایکوفسکی,4488314,,,1,قدر يک لبخند,1387,2,3,0,6,4,4.500000,اميرکبير,,,382.0
222257,3,0,0520062418,Daniel D. Arnheim,4670608,,,0,The Power of he Center,65535,1,0,0,15,3,4.000000,,"Using a wealth of examples, Arnheim considers ...",,256.0
278158,0,0,,حمیدرضا جلائی‌پور,4838633,,,0,فراز و فرود جنبش کردی,1385,0,0,0,11,0,0.000000,لوح فکر,,,236.0


In [9]:
# pretty sure 3006 hasn't happenned yet, so let's filter those out
full_dataset = full_dataset[full_dataset['PublishYear'] > 1400]
full_dataset = full_dataset[full_dataset['PublishYear'] < 2022]
full_dataset.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1850262 entries, 0 to 43621
Data columns (total 20 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   RatingDist4            1850262 non-null  int32  
 1   RatingDist1            1850262 non-null  int32  
 2   ISBN                   1844343 non-null  object 
 3   Authors                1850262 non-null  object 
 4   Id                     1850262 non-null  int32  
 5   pagesNumber            835059 non-null   float32
 6   Language               251797 non-null   object 
 7   RatingDist3            1850262 non-null  int32  
 8   Name                   1850262 non-null  object 
 9   PublishYear            1850262 non-null  int32  
 10  CountsOfReview         1850262 non-null  int32  
 11  RatingDist5            1850262 non-null  int32  
 12  RatingDist2            1850262 non-null  int32  
 13  PublishDay             1850262 non-null  int8   
 14  RatingDistTotal     

In [10]:
print(full_dataset.Language.unique())
full_dataset["Language"].describe()

['eng' nan 'en-CA' 'en-US' 'spa' 'ger' 'fre' 'ypk' 'en-GB' 'grc' 'mul'
 'por' 'ita' 'tur' 'sco' 'frs' 'nl' 'jpn' 'lao' 'rus' 'zho' 'kor' 'ang'
 'hye' 'raj' 'fil' 'frm' 'tgl' 'gre' 'lat' 'per' 'swe' 'wel' 'slv' 'epo'
 'msa' 'ara' 'ind' 'dan' 'pol' 'nav' 'tli' 'san' 'srp' 'afr' 'chi' 'som'
 'sqi' 'cat' 'gle' 'rar' 'heb' 'rum' 'vie' 'hmn' 'tib' 'nor' 'yid' 'gmh'
 'nob' 'goh' 'jav' 'mri' 'che' 'sna' 'nld' 'ben' 'elx' 'tel' 'scr' 'aze'
 'bul' 'glg' '--' 'enm' 'fin' 'gla' 'mal' 'hin' 'est' 'myv' 'zul' 'cze'
 'tlh' 'nub' 'eus' 'ave' 'mga' 'fan' 'hun' 'cre' 'bos' 'tha' 'urd' 'wak'
 'guj' 'chp' 'tam' 'fro' 'myn' 'lav' 'cop' 'ada' 'zap' 'aus' 'kur' 'nep'
 'nai' 'lit' 'nqo' 'tah' 'slo' 'mar' 'isl' 'mah' 'kan' 'non' 'ira' 'ssw'
 'mis' 'zun' 'dum' 'gem' 'haw' 'sam']


count     251797
unique       124
top          eng
freq      180327
Name: Language, dtype: object

In [11]:
# so it's predominatly english books, but lets change en-GB and en-CA to english
full_dataset["Language"].replace(["en-GB", "en-CA", "en-US"], "eng", inplace=True)
print(full_dataset.Language.unique())
full_dataset["Language"].describe()

['eng' nan 'spa' 'ger' 'fre' 'ypk' 'grc' 'mul' 'por' 'ita' 'tur' 'sco'
 'frs' 'nl' 'jpn' 'lao' 'rus' 'zho' 'kor' 'ang' 'hye' 'raj' 'fil' 'frm'
 'tgl' 'gre' 'lat' 'per' 'swe' 'wel' 'slv' 'epo' 'msa' 'ara' 'ind' 'dan'
 'pol' 'nav' 'tli' 'san' 'srp' 'afr' 'chi' 'som' 'sqi' 'cat' 'gle' 'rar'
 'heb' 'rum' 'vie' 'hmn' 'tib' 'nor' 'yid' 'gmh' 'nob' 'goh' 'jav' 'mri'
 'che' 'sna' 'nld' 'ben' 'elx' 'tel' 'scr' 'aze' 'bul' 'glg' '--' 'enm'
 'fin' 'gla' 'mal' 'hin' 'est' 'myv' 'zul' 'cze' 'tlh' 'nub' 'eus' 'ave'
 'mga' 'fan' 'hun' 'cre' 'bos' 'tha' 'urd' 'wak' 'guj' 'chp' 'tam' 'fro'
 'myn' 'lav' 'cop' 'ada' 'zap' 'aus' 'kur' 'nep' 'nai' 'lit' 'nqo' 'tah'
 'slo' 'mar' 'isl' 'mah' 'kan' 'non' 'ira' 'ssw' 'mis' 'zun' 'dum' 'gem'
 'haw' 'sam']


count     251797
unique       121
top          eng
freq      209661
Name: Language, dtype: object

In [12]:
# we will then drop the rest of the languages as our simple rec-sys will be english only for now.
full_dataset = full_dataset[full_dataset["Language"] == "eng"]
full_dataset.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 209661 entries, 0 to 43614
Data columns (total 20 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   RatingDist4            209661 non-null  int32  
 1   RatingDist1            209661 non-null  int32  
 2   ISBN                   207684 non-null  object 
 3   Authors                209661 non-null  object 
 4   Id                     209661 non-null  int32  
 5   pagesNumber            141801 non-null  float32
 6   Language               209661 non-null  object 
 7   RatingDist3            209661 non-null  int32  
 8   Name                   209661 non-null  object 
 9   PublishYear            209661 non-null  int32  
 10  CountsOfReview         209661 non-null  int32  
 11  RatingDist5            209661 non-null  int32  
 12  RatingDist2            209661 non-null  int32  
 13  PublishDay             209661 non-null  int8   
 14  RatingDistTotal        209661 non-nul

In [13]:
# lets check out deplicates
full_dataset.Name.value_counts()[:20]

Romeo and Juliet                                              17
The Call of the Wild                                          14
To the Lighthouse                                             13
Beloved                                                       13
A Christmas Carol                                             13
Gulliver's Travels                                            13
The Trial                                                     12
The Jungle Book                                               12
The Collected Stories                                         12
The Brothers Karamazov                                        12
Bleak House                                                   12
Harry Potter and the Chamber of Secrets (Harry Potter, #2)    11
To Kill a Mockingbird                                         11
The Complete Works                                            11
Flatland: A Romance of Many Dimensions                        11
Aesop's Fables           

In [14]:
# so there are alot of duplicate listings here, lets drop all the books by the excat same name and authors
# this should remove books that were re-pulished by the same author and name but possibly by a different publisher
full_dataset.drop_duplicates(subset=["Authors", "Name"], inplace=True)
full_dataset.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187249 entries, 0 to 43614
Data columns (total 20 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   RatingDist4            187249 non-null  int32  
 1   RatingDist1            187249 non-null  int32  
 2   ISBN                   185638 non-null  object 
 3   Authors                187249 non-null  object 
 4   Id                     187249 non-null  int32  
 5   pagesNumber            125086 non-null  float32
 6   Language               187249 non-null  object 
 7   RatingDist3            187249 non-null  int32  
 8   Name                   187249 non-null  object 
 9   PublishYear            187249 non-null  int32  
 10  CountsOfReview         187249 non-null  int32  
 11  RatingDist5            187249 non-null  int32  
 12  RatingDist2            187249 non-null  int32  
 13  PublishDay             187249 non-null  int8   
 14  RatingDistTotal        187249 non-nul

In [15]:
full_dataset.Name.value_counts()[:20]

The Collected Stories        11
Selected Writings             9
The Arabian Nights            7
Collected Poems               7
The Major Works               7
The Emperor's New Clothes     7
Selected Poems                7
Paradise                      6
The Collected Poems           6
Collected Stories             6
The Snow Queen                6
The Bremen Town Musicians     6
The Complete Works            6
Political Writings            5
When I Grow Up                5
Aesop's Fables                5
Poems                         5
Collected Short Stories       5
Echoes                        5
Hunter's Moon                 5
Name: Name, dtype: int64

In [16]:
full_dataset.head()

Unnamed: 0,RatingDist4,RatingDist1,ISBN,Authors,Id,pagesNumber,Language,RatingDist3,Name,PublishYear,CountsOfReview,RatingDist5,RatingDist2,PublishDay,RatingDistTotal,Rating,Publisher,Description,Count of text reviews,PagesNumber
0,10,2,080959577X,Mrs. Henry Wood,300002,487.0,eng,5,The Channings,2004,3,13,4,7,34,3.820312,Wildside Press,,,
7,1861,92,0763621838,Celia Rees,300017,352.0,eng,1700,"Sorceress (Witch Child, #2)",2003,200,1281,474,3,5408,3.699219,Candlewick,,,
20,0,0,0689843798,Annie Auerbach,300036,12.0,eng,2,Bob's Recycling Day,2001,1,2,0,9,4,4.0,Simon Spotlight,,,
24,70,0,0571207375,Jonathan Sumption,300041,655.0,eng,14,"Trial by Fire: The Hundred Years War, Volume 2",2001,7,111,0,8,195,4.5,Faber & Faber,,,
26,1588,73,1416525033,Karen Hawkins,300043,313.0,eng,1146,"How to Abduct a Highland Lord (MacLean Curse, #1)",2007,222,1407,241,1,4455,3.900391,Pocket Books,,,


In [17]:
# drop some columns we won't need for now
full_dataset.drop(columns=["PagesNumber", "CountsOfReview", "Count of text reviews", "pagesNumber"], inplace=True)

In [18]:
# fill the empty descriptions with the names of the books
full_dataset.Description.fillna(full_dataset.Name, inplace=True)


In [19]:
full_dataset.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187249 entries, 0 to 43614
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   RatingDist4      187249 non-null  int32  
 1   RatingDist1      187249 non-null  int32  
 2   ISBN             185638 non-null  object 
 3   Authors          187249 non-null  object 
 4   Id               187249 non-null  int32  
 5   Language         187249 non-null  object 
 6   RatingDist3      187249 non-null  int32  
 7   Name             187249 non-null  object 
 8   PublishYear      187249 non-null  int32  
 9   RatingDist5      187249 non-null  int32  
 10  RatingDist2      187249 non-null  int32  
 11  PublishDay       187249 non-null  int8   
 12  RatingDistTotal  187249 non-null  int32  
 13  Rating           187249 non-null  float16
 14  Publisher        184576 non-null  object 
 15  Description      187249 non-null  object 
dtypes: float16(1), int32(8), int8(1), objec

In [20]:
# drop remaining columns
full_dataset.dropna(inplace=True)
full_dataset.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 183031 entries, 0 to 43614
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   RatingDist4      183031 non-null  int32  
 1   RatingDist1      183031 non-null  int32  
 2   ISBN             183031 non-null  object 
 3   Authors          183031 non-null  object 
 4   Id               183031 non-null  int32  
 5   Language         183031 non-null  object 
 6   RatingDist3      183031 non-null  int32  
 7   Name             183031 non-null  object 
 8   PublishYear      183031 non-null  int32  
 9   RatingDist5      183031 non-null  int32  
 10  RatingDist2      183031 non-null  int32  
 11  PublishDay       183031 non-null  int8   
 12  RatingDistTotal  183031 non-null  int32  
 13  Rating           183031 non-null  float16
 14  Publisher        183031 non-null  object 
 15  Description      183031 non-null  object 
dtypes: float16(1), int32(8), int8(1), objec

In [21]:
# use regex to clean up the descriptions as some of them have embedded HTML tags like <br>
import re
# compile once only
CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});') 

def cleanhtml(raw_html):
    cleantext = re.sub(CLEANR, '', raw_html)
    return cleantext

In [22]:
full_dataset["Description"] = full_dataset.Description.apply(cleanhtml)
full_dataset.Description.sample(100)

27296     Neighbors were unaware of what went on behind ...
5142      When Perennis Felix, favorite of the Roman Emp...
36945             One Piece, Volume 10: OK, Let's Stand Up!
39721     The Interdependence Handbook is a collection o...
109406                                       Of God and Man
                                ...                        
2284                  Sherlock Holmes: A Baker Street Dozen
51172     Birgivi's Manual Interpreted: Complete Fiqh of...
34355     Invisible Lives: The Erasure of Transsexual an...
1968      Catherine's words help us to experience her pr...
46404                                          Stuck on You
Name: Description, Length: 100, dtype: object

## Simple Weighted Rating

Weighted Rating (WR) = $(\frac{v}{v + q} . R) + (\frac{q}{v + q} . C)$

where,
* *v* is the number of votes for the book (total)
* *q* is the minimum votes required to be listed in the chart
* *R* is the average rating of all books
* *C* is the mean vote across the entire dataset

inspired by: https://www.kaggle.com/rounakbanik/movie-recommender-systems

In [23]:
ratings = full_dataset["Rating"]
rating_count = full_dataset["RatingDistTotal"]
MR = ratings.convert_dtypes(np.float16).mean()
print("Mean Rating (MR) accross all books: ", MR)

q = rating_count.quantile(0.98)
print("Minimum number of required ratings: ", q)


Mean Rating (MR) accross all books:  3.735411229433129
Minimum number of required ratings:  48706.59999999977


### Weighting Metrics
- ``MR`` = Mean rating = 3.75 (out of 5)
- ``q`` = minimum number of ratings required

In [24]:
# our weighted rating algorithm
def weighted_rating(book):
    v = book['RatingDistTotal']
    R = book['Rating']
    return (v/(v+q) * R) + (q/(q+v) * MR)

In [25]:
top_books = full_dataset[(full_dataset["RatingDistTotal"] >= q) & (full_dataset["Rating"] > MR)]
top_books["weighted"] = top_books.apply(weighted_rating, axis=1)
above_average = top_books.sort_values("weighted", ascending=False)
above_average.shape

(3340, 17)

In [26]:
# So clearly Harry Potter is a popular book. No surprise there.
above_average.head()

Unnamed: 0,RatingDist4,RatingDist1,ISBN,Authors,Id,Language,RatingDist3,Name,PublishYear,RatingDist5,RatingDist2,PublishDay,RatingDistTotal,Rating,Publisher,Description,weighted
9868,532119,22089,1551929767,J.K. Rowling,818056,eng,156299,Harry Potter and the Deathly Hallows (Harry Po...,2007,1960662,33318,7,2704487,4.621094,Bloomsbury Publishing Plc/Raincoast Books,Harry is waiting in Privet Drive. The Order of...,4.605425
59051,35788,1628,0739352245,J.K. Rowling,2132220,eng,8335,"Harry Potter Audio Collection (Harry Potter, #...",2007,200090,1665,17,247506,4.738281,Listening Library (Audio),Enjoy the complete Harry Potter series perform...,4.573378
22521,35557,1572,074759368X,J.K. Rowling,1668766,eng,8250,"Harry Potter Boxed Set (Harry Potter, #1-7)",2007,198894,1643,10,245916,4.738281,Bloomsbury,A fabulous opportunity to own all seven Harry ...,4.572488
22520,35557,1572,0747594562,J.K. Rowling,1668764,eng,8250,The Complete Harry Potter Collection Box Set (...,2007,198894,1643,10,245916,4.738281,Bloomsbury,A fabulous opportunity to own all seven Harry ...,4.572488
36205,35392,1549,0747593698,J.K. Rowling,988373,eng,8190,Complete Harry Potter Boxed Set,2007,197903,1634,10,244668,4.738281,Bloomsbury Publishing,A fabulous opportunity to own all seven Harry ...,4.571783


In [27]:
# lets take a random sample
above_average.sample(10).sort_values(by="weighted", ascending=False)

Unnamed: 0,RatingDist4,RatingDist1,ISBN,Authors,Id,Language,RatingDist3,Name,PublishYear,RatingDist5,RatingDist2,PublishDay,RatingDistTotal,Rating,Publisher,Description,weighted
33760,77854,1340,99428512,Diana Gabaldon,864696,eng,27570,Voyager,1995,135066,4110,8,245940,4.390625,Arrow Books Ltd,An alternate cover for this isbn can be found ...,4.282315
43191,29923,1519,892640308,Lao Tzu,276698,eng,14333,A Translation of Lao-tzu's Tao Te Ching and Wa...,1977,58136,3365,1,107276,4.300781,Center for Chinese Studies,A Translation of Lao-tzu's Tao Te Ching and Wa...,4.124241
26702,23449,787,679643249,Michael Shaara,44469,eng,8628,The Killer Angels: A Novel of the Civil War (T...,2004,38309,1948,12,73121,4.320312,Modern Library,The Killer Angels: A Novel of the Civil War (T...,4.086469
23293,26437,1125,141024917,Fyodor Dostoyevsky,843006,eng,11862,Notes from Underground,2006,33148,2754,10,75326,4.160156,Penguin Group(CA),How far would you go to escape the real world?...,3.993362
18503,107052,5919,61020710,Terry Pratchett,533613,eng,65031,"The Color of Magic (Discworld, #1)",2000,109875,15612,2,303489,3.990234,HarperTorch,"The Color of Magic (Discworld, #1)",3.954994
36853,16338,1159,671582089,Anthony Robbins,1285192,eng,8933,Awaken The Giant Within,2000,25414,2391,2,54235,4.148438,Simon Schuster Audio,Wake up and take control of your life! From th...,3.953015
17206,17838,797,312851820,Vernor Vinge,940486,eng,7968,A Fire Upon the Deep,1992,21400,2087,4,50090,4.140625,Tor,"Vernor Vinge has shown, in books like True Nam...",3.940855
14794,34518,972,141185414,John Wyndham,826846,eng,18352,The Day of the Triffids,2001,29634,3484,2,86960,4.019531,Penguin Modern Classics,When Bill Masen wakes up blindfolded in hospit...,3.917527
7970,81669,10630,441000681,William Gibson,313982,eng,49821,"Neuromancer (Sprawl Trilogy, #1)",1994,91835,18238,7,252193,3.900391,Ace Hardcover,"Neuromancer (Sprawl Trilogy, #1)",3.873685
35172,40111,2750,440242398,Sophie Kinsella,3092108,eng,33804,Shopaholic and Baby,2009,34521,8611,28,119797,3.789062,Dell Publishing Company,Becky’s life is blooming! She’s working at Lon...,3.773554


# Content Based Recommendations

The goal here is to see if we can recommend books based on the similarity in the descriptions between books.


In [28]:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
# sample 20000 points so we don't hit kaggle's memory limit
#samples = full_dataset(20000)

samples = full_dataset[10000:30000] # use if you want the same books to be predicted on

In [30]:
tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0,
                     stop_words='english')
tfidf_matrix = tf.fit_transform(samples['Description'])

#### Cosine Similarity

I will be using the Cosine Similarity to calculate a numeric quantity that denotes the similarity between two books. Mathematically, it is defined as follows:

$cosine(x,y) = \frac{x. y^\intercal}{||x||.||y||} $

Since we have used the TF-IDF Vectorizer, calculating the Dot Product will directly give us the Cosine Similarity Score. Therefore, we will use sklearn's **linear_kernel** instead of cosine_similarities since it is much faster.

In [31]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [32]:
# write function to get recommendations based on cosine similarity
samples = samples.reset_index()
titles = samples[['Name', 'Authors', 'PublishYear', 'Publisher']]
indices = pd.Series(samples.index, index=samples['Name'])

In [33]:
def get_content_recommendations(title):
    try:
        # handle case in which book by same title is in dataset
        idx = indices[title][0]
    except IndexError:
        idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:15]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices]

In [34]:
index = 17136
title = samples.iloc[index].Name
desc = samples.iloc[index].Description
author = samples.iloc[index].Authors
year = samples.iloc[index].PublishYear
print("Title:", title, "\nDescription:", desc, "\nAuthor:", author, "\nYear:", year)

Title: Mathematics Made Simple 
Description: Brushing up on math has never been easier!Just about everyone can use some extra help improving or remembering basic math skills. Finally, all the information you need to master the basics, once and for all, is at your fingertips. Featuring several overviews of a multitude of mathematical concepts, as well as detailed learning plans, Mathematics Made Simple presents the information you need in clear, concise lessons that make math fun to study. Easy-to-use features include:* complete coverage of fractions, decimals, percents, algebra, linear equations, graphs, probability, geometry, and trigonometry* step-by-step solutions to every problem *multiple choice tests to help you monitor your progress * a final, comprehensive test that makes it easy to pinpoint your strengths and target areas that need work* glossaries of relevant mathematical terms* sidebars that introduce real-life applications of mathematical principlesVoid of lengthy explanati

In [35]:
get_content_recommendations(title)

Unnamed: 0,Name,Authors,PublishYear,Publisher
3828,Kaplan Math Power: Essential Guide for Math Su...,Robert Stanton,2001,Kaplan
8592,Mouse Moves House,Nick Sharratt,2000,Candlewick
13757,Introductory Mathematical Analysis for Busines...,Ernest F. Haeussler Jr.,2007,Pearson
13211,"The Complete Book of Math, Grades 3 - 4",American Education Publishing,2000,American Education Publishing
6807,Spectrum Math: Grade 2,Thomas Richards,2002,Frank Schaffer Publications
13389,Games,Ivan Bulloch,2002,Cooper Square Pub
6966,Walk on Maps,Mel Campbell,2006,Rourke Educational Media
2386,Math For All Seasons: Mind-Stretching Math Rid...,Greg Tang,2002,Scholastic Press
13597,Math: Grade 6 (Skill Sharpeners),Jo Ellen Moore,2005,Evan-Moor Educational Publishers
19323,Give Me Half! (Mathstart: Level 2),Stuart J. Murphy,1996,Turtleback Books


In [36]:
index = 18457
title = samples.iloc[index].Name
desc = samples.iloc[index].Description
author = samples.iloc[index].Authors
year = samples.iloc[index].PublishYear
print("Title:", title, "\nDescription:", desc, "\nAuthor:", author, "\nYear:", year)

Title: Superman/Batman, Vol. 7: The Search for Kryptonite 
Description: From Michael Green and Mike Johnson, two of the writers of the hit NBC TV series Heroes, comes this fast-paced new volume in the SUPERMAN/BATMAN series.The Man of Steel and the Dark Knight have decided that they must undertake the impossible task of recovering all the Kryptonite on Earth. But there are plenty of roadblocks in the way, as they realize that not everyone's willing to hand over the deadly substance. And one of those who's not ready to cooperate is none other than their fellow Justice League member, Aquaman.Collecting SUPERMAN/BATMAN #44-49. 
Author: Michael  Green 
Year: 2008


In [37]:
get_content_recommendations(title)

Unnamed: 0,Name,Authors,PublishYear,Publisher
14085,"World's Finest Comics Archives, Vol. 2",Dick Sprang,2002,DC Comics
10437,"The Supergirl Archives, Vol. 1",Jerry Siegel,2001,DC Comics
11748,Batman: The Dark Knight Returns,Frank Miller,2008,Paw Prints
8080,Batman and Other Animals,Linda M. Jennings,1992,Blackie Children's Books
2821,Out of Their League,Dave Meggyesy,1971,Paperback Library
16194,JLA/Avengers,Kurt Busiek,2008,DC Comics
17056,Batman: Harvest Breed,George Pratt,2000,DC Comics
14081,"JLA: Exterminators (Justice League of America,...",Christopher Golden,2008,Graphic Audio
11344,"Justice League of America, Vol. 4: Sanctuary",Dwayne McDuffie,2009,DC Comics
9712,Age of TV Heroes: The Live-Action Adventures o...,Jason Hofius,2010,Two Morrows Publishing


In [38]:
index = 10984
title = samples.iloc[index].Name
desc = samples.iloc[index].Description
author = samples.iloc[index].Authors
year = samples.iloc[index].PublishYear
print("Title:", title, "\nDescription:", desc, "\nAuthor:", author, "\nYear:", year)

Title: The Grave Tattoo 
Description: From bestselling author Val McDermid comes a modern thriller about an ancient murder set on the high seas…After summer rains uncover a corpse bearing tattoos like those of eighteenth-century seafarers, many residents of the English Lake District can’t help but wonder whether it’s the body of one of the town’s most legendary fugitives.Scholar and native Lakelander Jane Gresham feels compelled to finally discover the truth about the myths and buried secrets rooted in her hometown. What she never expected was to find herself at the heart of a 200-year-old mystery that still has the power to put lives on the line. And with each new lead she pursues, death follows hard on her heels…. 
Author: Val McDermid 
Year: 2008


In [39]:
get_content_recommendations(title)

Unnamed: 0,Name,Authors,PublishYear,Publisher
16117,Woman in Red,Eileen Goudge,2008,Vanguard Press
14,"The Norton Anthology of English Literature, Vo...",M.H. Abrams,2005,W.W. Norton
2606,Japanese High Seas Fleet (Ballantine's Illustr...,Richard Humble,1973,"Ballantine Books, Inc."
16607,Death Drop,B.M. Gill,1980,Scribner Book Company
6444,"Crack Down (Kate Brannigan, #3)",Val McDermid,2002,Spinsters Ink Books
12249,Red Navy at Sea: Soviet Naval Operations on th...,Bruce W. Watson,1982,Westview Press/Arms and Armour Press
2615,He Saves the Day,Marsha Hayles,2002,Putnam Juvenile
8865,The Harvest of the Hills: Rural Life in Northe...,Angus J. L. Winchester,2000,Edinburgh University Press
9578,Ancient Israel Myths & Legends 3 in 1 (Myths a...,Angelo Solomon Rappoport,1988,Random House Value Publishing
16210,Dead Not Buried,Martin Beales,1995,Hale


# Combined recommendation system (content + weighted rating)

So here were going to combine our content based recommendations with the weighted raiting scheme we cooked up earlier. 

In [40]:
titles = samples[['Name', 'Authors', 'PublishYear', 'Publisher', "Rating", "RatingDistTotal"]]

def get_weighted_content_recommendations(title):
    try:
        # handle case in which book by same title is in dataset
        idx = indices[title][0]
    except IndexError:
        idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # get the top 40 recommendations
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:40]

    book_indices = [i[0] for i in sim_scores]
    books = titles.iloc[book_indices]

    # now sort by weighted rating
    ratings = books["Rating"]
    rating_count = books["RatingDistTotal"]
    MR = ratings.convert_dtypes(np.float16).mean()
    print("Mean Rating (MR) accross all books: ", MR)

    # relax to 75% quantile
    q = rating_count.quantile(.75)
    print("Minimum number of required ratings: ", q)
    
    books["wRating"] = books.apply(weighted_rating, axis=1)
    return books.sort_values(by="wRating", ascending=False)


In [41]:
index = 17136
title = samples.iloc[index].Name
desc = samples.iloc[index].Description
author = samples.iloc[index].Authors
year = samples.iloc[index].PublishYear
print("Title:", title, "\nDescription:", desc, "\nAuthor:", author, "\nYear:", year)

Title: Mathematics Made Simple 
Description: Brushing up on math has never been easier!Just about everyone can use some extra help improving or remembering basic math skills. Finally, all the information you need to master the basics, once and for all, is at your fingertips. Featuring several overviews of a multitude of mathematical concepts, as well as detailed learning plans, Mathematics Made Simple presents the information you need in clear, concise lessons that make math fun to study. Easy-to-use features include:* complete coverage of fractions, decimals, percents, algebra, linear equations, graphs, probability, geometry, and trigonometry* step-by-step solutions to every problem *multiple choice tests to help you monitor your progress * a final, comprehensive test that makes it easy to pinpoint your strengths and target areas that need work* glossaries of relevant mathematical terms* sidebars that introduce real-life applications of mathematical principlesVoid of lengthy explanati

In [42]:
get_weighted_content_recommendations(title)[:10]

Mean Rating (MR) accross all books:  3.32607171474359
Minimum number of required ratings:  98.5


Unnamed: 0,Name,Authors,PublishYear,Publisher,Rating,RatingDistTotal,wRating
19866,Linear Algebra Done Right,Sheldon Axler,1997,Springer,4.308594,815,3.744844
2386,Math For All Seasons: Mind-Stretching Math Rid...,Greg Tang,2002,Scholastic Press,4.089844,421,3.738449
17023,The Unexpected Hanging and Other Mathematical ...,Martin Gardner,1986,Simon & Schuster,4.300781,112,3.736708
19323,Give Me Half! (Mathstart: Level 2),Stuart J. Murphy,1996,Turtleback Books,3.919922,319,3.736612
15085,Life of Fred: Decimals and Percents,Stanley F. Schmidt,2007,Polka Dot Publishing,4.269531,85,3.736342
9285,Missing Math,Loreen Leedy,2008,Two Lions,4.0,161,3.736283
7737,50 Mathematical Ideas You Really Need to Know,Tony Crilly,2007,Quercus,3.769531,977,3.736082
13757,Introductory Mathematical Analysis for Busines...,Ernest F. Haeussler Jr.,2007,Pearson,3.859375,147,3.735784
11869,The Mission of Addition,Brian P. Cleary,2005,Millbrook Press,3.869141,119,3.735737
534,Introduction to Stochastic Processes,Gregory F. Lawler,2006,CRC Press,4.328125,21,3.735667


In [43]:
index = 18457
title = samples.iloc[index].Name
desc = samples.iloc[index].Description
author = samples.iloc[index].Authors
year = samples.iloc[index].PublishYear
print("Title:", title, "\nDescription:", desc, "\nAuthor:", author, "\nYear:", year)

Title: Superman/Batman, Vol. 7: The Search for Kryptonite 
Description: From Michael Green and Mike Johnson, two of the writers of the hit NBC TV series Heroes, comes this fast-paced new volume in the SUPERMAN/BATMAN series.The Man of Steel and the Dark Knight have decided that they must undertake the impossible task of recovering all the Kryptonite on Earth. But there are plenty of roadblocks in the way, as they realize that not everyone's willing to hand over the deadly substance. And one of those who's not ready to cooperate is none other than their fellow Justice League member, Aquaman.Collecting SUPERMAN/BATMAN #44-49. 
Author: Michael  Green 
Year: 2008


In [44]:
get_weighted_content_recommendations(title)[:10]

Mean Rating (MR) accross all books:  3.726862980769231
Minimum number of required ratings:  529.0


Unnamed: 0,Name,Authors,PublishYear,Publisher,Rating,RatingDistTotal,wRating
11748,Batman: The Dark Knight Returns,Frank Miller,2008,Paw Prints,4.261719,160749,4.139332
7511,Batman: The Long Halloween,Jeph Loeb,1999,Turtleback Books,4.320312,75624,4.091177
7103,Robin Year One,Chuck Dixon,2008,DC Comics,4.039062,4566,3.761437
16194,JLA/Avengers,Kurt Busiek,2008,DC Comics,4.0,3607,3.753655
18201,Superman: Brainiac,Geoff Johns,2009,DC Comics,4.019531,3101,3.752418
14795,Batman: The Black Glove,Grant Morrison,2008,DC Comics,3.880859,4953,3.748837
8531,The Dark Knight (Dark Knight Trilogy #2),Dennis O'Neil,2008,Berkley,4.148438,1255,3.745786
6019,Justice League Of America Vol. 1: Tornado's Path,Brad Meltzer,2008,DC Comics,3.849609,2091,3.740112
11345,"Justice League International, Vol. 3",Keith Giffen,2008,DC Comics,4.019531,535,3.738498
15204,"Booster Gold, Vol. 2: Blue and Gold",Geoff Johns,2008,DC Comics,4.03125,474,3.738263


In [45]:
index = 10984
title = samples.iloc[index].Name
desc = samples.iloc[index].Description
author = samples.iloc[index].Authors
year = samples.iloc[index].PublishYear
print("Title:", title, "\nDescription:", desc, "\nAuthor:", author, "\nYear:", year)

Title: The Grave Tattoo 
Description: From bestselling author Val McDermid comes a modern thriller about an ancient murder set on the high seas…After summer rains uncover a corpse bearing tattoos like those of eighteenth-century seafarers, many residents of the English Lake District can’t help but wonder whether it’s the body of one of the town’s most legendary fugitives.Scholar and native Lakelander Jane Gresham feels compelled to finally discover the truth about the myths and buried secrets rooted in her hometown. What she never expected was to find herself at the heart of a 200-year-old mystery that still has the power to put lives on the line. And with each new lead she pursues, death follows hard on her heels…. 
Author: Val McDermid 
Year: 2008


In [46]:
get_weighted_content_recommendations(title)[:10]

Mean Rating (MR) accross all books:  3.6829427083333335
Minimum number of required ratings:  2418.5


Unnamed: 0,Name,Authors,PublishYear,Publisher,Rating,RatingDistTotal,wRating
5461,"Wanted (Pretty Little Liars, #8)",Sara Shepard,2010,HarperTeen,4.128906,41294,3.915954
10204,"Chasing Darkness (Elvis Cole, #11)",Robert Crais,2008,Wheeler Publishing,4.199219,15775,3.848879
14230,Beneath The Bleeding (Tony Hill & Carol Jordan...,Val McDermid,2008,HarperCollins Publishers,4.109375,6771,3.781053
2761,Old Filth,Jane Gardam,2004,Chatto & Windus,3.939453,13897,3.780705
3469,"A Thin Dark Line (Broussard and Fourcade #1, D...",Tami Hoag,1997,Bantam,3.990234,8108,3.771777
8310,"Ultimate Weapon (McClouds & Friends, #6)",Shannon McKenna,2008,Brava,4.148438,2220,3.753416
15453,"Runner (Jane Whitefield, #6)",Thomas Perry,2009,Houghton Mifflin Harcourt,4.050781,2617,3.751492
18274,One Day at a Time,Danielle Steel,2009,Delacorte Press,3.859375,6954,3.750899
19105,Kindred Crimes (Jeri Howard Mystery #1),Janet Dawson,1990,St. Martin's Press,4.199219,1088,3.745545
10055,Lake News,Barbara Delinsky,1999,Headline,3.849609,4523,3.745115


In [47]:
# comparison!
print("BASE CONTENT RECOMMENDATIONS")
print(get_content_recommendations(title)[:10])

print("\n\nWEIGHTED CONTENT RECOMMENDATIONS")
get_weighted_content_recommendations(title) # see all top 40 for ranking comparison

BASE CONTENT RECOMMENDATIONS
                                                    Name  \
16117                                       Woman in Red   
14     The Norton Anthology of English Literature, Vo...   
2606   Japanese High Seas Fleet (Ballantine's Illustr...   
16607                                         Death Drop   
6444                     Crack Down (Kate Brannigan, #3)   
12249  Red Navy at Sea: Soviet Naval Operations on th...   
2615                                    He Saves the Day   
8865   The Harvest of the Hills: Rural Life in Northe...   
9578   Ancient Israel Myths & Legends 3 in 1 (Myths a...   
16210                                    Dead Not Buried   

                        Authors  PublishYear  \
16117             Eileen Goudge         2008   
14                  M.H. Abrams         2005   
2606             Richard Humble         1973   
16607                 B.M. Gill         1980   
6444               Val McDermid         2002   
12249           Bruce 

Unnamed: 0,Name,Authors,PublishYear,Publisher,Rating,RatingDistTotal,wRating
5461,"Wanted (Pretty Little Liars, #8)",Sara Shepard,2010,HarperTeen,4.128906,41294,3.915954
10204,"Chasing Darkness (Elvis Cole, #11)",Robert Crais,2008,Wheeler Publishing,4.199219,15775,3.848879
14230,Beneath The Bleeding (Tony Hill & Carol Jordan...,Val McDermid,2008,HarperCollins Publishers,4.109375,6771,3.781053
2761,Old Filth,Jane Gardam,2004,Chatto & Windus,3.939453,13897,3.780705
3469,"A Thin Dark Line (Broussard and Fourcade #1, D...",Tami Hoag,1997,Bantam,3.990234,8108,3.771777
8310,"Ultimate Weapon (McClouds & Friends, #6)",Shannon McKenna,2008,Brava,4.148438,2220,3.753416
15453,"Runner (Jane Whitefield, #6)",Thomas Perry,2009,Houghton Mifflin Harcourt,4.050781,2617,3.751492
18274,One Day at a Time,Danielle Steel,2009,Delacorte Press,3.859375,6954,3.750899
19105,Kindred Crimes (Jeri Howard Mystery #1),Janet Dawson,1990,St. Martin's Press,4.199219,1088,3.745545
10055,Lake News,Barbara Delinsky,1999,Headline,3.849609,4523,3.745115


# Conclusion

So above we can qualitatively see that our recommendations that include the weighted rating are better on average as they aren't suggesting books that have zero ratings (which are probably books that no-one has read).

There is definetly a biased towards books that are most frequently reviewed, but that can be adjusted in the weighted rating if novelty is important for the recommenation system.

Futher work could be done to include user rating data directly on a per user basis, but I will leave that as an exercise for the user.

# Hope you enjoyed!! 

Please feel free to use but be sure to link to any notebooks I got inspiration from (that I linked to)!