# Goodreads Dataset Exploration

I like books and I like data. So I wanted to create fun and interesting visualisations where these two pleasures of mine intersect. I wanted to see if there were any immediately observable trends about popular books in the data.

The main questions I wanted to address are:
1. What does the average book look like?
2. Does the number of people who have read a book correlate with the rating of said book? i.e., does the popularity of a book suggest its “goodness”?
3. Which genres perform better?

### Importing Useful Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import math
import re
import statistics

In [None]:
os.getcwd()

### Gather:
Importing the dataset

In [None]:
books = pd.read_csv('goodreads_books.csv')

### Assess
Looking at the values of observations in the dataset

In [None]:
books.head()

### Clean
Removing columns that are not useful, for example, columns that contain web links

In [None]:
# Columns that add nothing to the analysis (web links, identifiers, long free text).
remove_col = [
    'link',
    'cover_link',
    'author_link',
    'isbn13',
    'isbn',
    'asin',
    'amazon_redirect_link',
    'worldcat_redirect_link',
    'recommended_books',
    'books_in_series',
    'description',
]
books = books.drop(remove_col, axis='columns')

In [None]:
books.head()

In [None]:
books.info()

Rows with no ratings were dropped: ratings indicate that people have read the book, so no conclusion can be drawn about the appeal or popularity of a book that no one has rated.  

In [None]:
# Keep only the books that have at least one rating. The explicit copy makes
# `books` an independent frame, so the column assignments in the following
# cells do not trigger pandas' SettingWithCopyWarning.
books = books.loc[~books['rating_count'].isin([0])].copy()

`number_of_pages` is the only numerical variable that has some missing values. The missing values were filled with the mean value. Books ideally will not have zero pages, so it would not be good to replace `NaN` with zero; nor is the variable central to understanding the popularity of a book, so there is no need to drop such rows. The mean value works well as an approximation of the book size. 

In [None]:
# Impute missing page counts with the mean page count.
mean_pages = books['number_of_pages'].mean()
books['number_of_pages'] = books['number_of_pages'].fillna(mean_pages)

The missing values in `genre_and_votes` were replaced with a label `Unknown`, to act as a categorical label for ease of analysis. 

In [None]:
# Label missing genre information as 'Unknown'. `fillna` is used instead of
# `replace(np.NaN, ...)` because the `np.NaN` alias was removed in NumPy 2.0.
books['genre_and_votes'] = books['genre_and_votes'].fillna('Unknown')

In order to simplify the `genre_and_votes` variable, a new variable was created that only listed the most voted for genre. 

In [None]:
# Reduce every `genre_and_votes` string to its leading genre label.
# NOTE(review): assumes the most-voted genre is listed first in the string - confirm against the data.
genres = []
for text in books['genre_and_votes']:
    # Keep what precedes the first digit, then what precedes the first hyphen.
    # Raw-string pattern and keyword `maxsplit` avoid the invalid-escape and
    # positional-argument deprecation warnings of recent Python versions.
    genre = re.split(r'\d', text, maxsplit=1)[0].rstrip()
    genre = re.split('-', genre, maxsplit=1)[0].rstrip()
    genres.append(genre)

print(genres[0:10])

In [None]:
books['genre'] = genres
print(len(np.unique(genres)))

I created a binary variable to classify a book as being part of a series or not. 

In [None]:
# Binary flag per book: 1 when the value in column position 2 is a string, else 0.
# NOTE(review): position 2 is presumably the series-name column - confirm, and
# consider selecting it by name so the flag does not depend on column order.
# One pass over the column replaces the per-row `.iloc` scalar lookups, and
# `isinstance` is the idiomatic type check.
is_series = [int(isinstance(value, str)) for value in books.iloc[:, 2]]

print(is_series[0:5])

In [None]:
books['is_series'] = is_series

Similarly, another binary variable was created to classify whether a book had been nominated for at least one award.

In [None]:
# Binary flag per book: 1 when the value in the third-from-last column is a string, else 0.
# NOTE(review): position -3 is presumably the awards column - confirm, and
# consider selecting it by name so the flag does not depend on column order.
# One pass over the column replaces the per-row `.iloc` scalar lookups, and
# `isinstance` is the idiomatic type check.
is_nominee = [int(isinstance(value, str)) for value in books.iloc[:, -3]]

print(is_nominee[0:5])
books['is_nominated'] = is_nominee

A dataframe that focuses on the numerical values only was created. 

In [None]:
# Numerical columns only. The explicit copy makes `books_num` an independent
# frame, so the derived columns added to it later do not trigger pandas'
# SettingWithCopyWarning.
books_num = books[['rating_count', 'review_count', 'average_rating', 'five_star_ratings', 'four_star_ratings','three_star_ratings', 
       'two_star_ratings', 'one_star_ratings', 'number_of_pages']].copy()

In [None]:
books_num.info()

### Visualise

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Check the pairwise correlation between the numerical variables and
# draw the correlation matrix as an annotated heatmap.
plt.figure(figsize=(15,4))
corr_matrix = books_num.corr()
sns.heatmap(corr_matrix, annot=True)


In [None]:
def distribution_plots(df, column_names, num_cols = 2, fig_size=(20,16), save = False, **kwarg):
    """
    Description: Draw one kernel-density plot per column on a shared grid of subplots.

    Arguments:
        df: the dataframe holding the columns to plot
        column_names: list, names of the columns whose distributions are plotted
        num_cols: int, number of columns in the subplot grid. Default = 2
        fig_size: tuple, size of the whole figure. Default = (20,16)
        save: bool, whether to save the figure as a PNG file. Default = False
        name: optional string, file name (without extension) to save the image as.
              Only used if `save` = True. Default = "distribution_plots"

    Returns:
        None

    Raises:
        ValueError: if `column_names` is empty
    """

    num_plots = len(column_names)
    if num_plots == 0:
        # Fail with a clear message instead of an opaque matplotlib error about a zero-row grid
        raise ValueError("column_names must contain at least one column")

    num_rows = math.ceil(num_plots/num_cols)
    # squeeze=False always returns a 2-D array of axes, so a 1x1 grid needs no special case
    fig, axes = plt.subplots(num_rows, num_cols, figsize=fig_size, squeeze=False)
    axes = axes.ravel()

    # Draw on the axes created above rather than re-creating them with plt.subplot
    for ax, col in zip(axes, column_names):
        sns.kdeplot(df[col], ax=ax)
        ax.set_title("Distribution of " + col)

    # Hide the grid cells left over when the plots do not fill the last row
    for ax in axes[num_plots:]:
        ax.set_visible(False)

    if save:
        # Fall back to a default file name instead of raising KeyError when `name` is omitted
        fig.savefig(kwarg.get('name', 'distribution_plots') + '.png')

In [None]:
# DISTRIBUTION PLOTS
column_list = ["rating_count", "average_rating", 'number_of_pages', 'five_star_ratings']
distribution_plots(books_num, column_names = column_list, num_cols = 2, fig_size=(20,16), save = True, name = "distribution_plots")

In [None]:
# A better look at the distributions of books with lengths less than 1000 pages, as the original plot is skewed. 

# Restrict the density plot to books shorter than 1000 pages.
under_1000_pages = books_num['number_of_pages'] < 1000
plt1 = sns.kdeplot(books_num.loc[under_1000_pages, 'number_of_pages'])
plt1.set_title("Distribution of number of pages of books with less than 1000pages")

### Analyse

#### The Average Book

In [None]:
# Descriptive statistics of the numerical variables
books.describe()

In [None]:
# Most common genre
top_genre = statistics.mode(books['genre'])
print(top_genre)

# Second most common genre: the mode once the top genre is excluded. It is
# derived from `top_genre` rather than a hard-coded label, so the cell stays
# correct if the data changes.
print(statistics.mode(books.loc[books['genre'] != top_genre, 'genre']))

In [None]:
# Book counts for the two most common genres
for genre_label in ('Fantasy', 'Fiction'):
    print(len(books[books['genre'] == genre_label]))

In [None]:
# How many books are part of a series
series_flags = books.loc[books['is_series'] == True, 'is_series']
print(len(series_flags))

# Is being part of a series the more frequent case across all books?
most_common_series_flag = statistics.mode(books['is_series'])
print(most_common_series_flag)

In [None]:
# How many books were nominated for any award
nominated_flags = books.loc[books['is_nominated'] == True, 'is_nominated']
print(len(nominated_flags))

# Is being award-nominated the more frequent case across all books?
most_common_nominated_flag = statistics.mode(books['is_nominated'])
print(most_common_nominated_flag)

In [None]:
# Share of each book's ratings that are five-star ratings
five_star_share = books_num['five_star_ratings'] / books_num['rating_count']
books_num.loc[:, 'proportional_five_stars'] = five_star_share

In [None]:
# Number of reviews per rating for each book
reviews_per_rating = books_num['review_count'] / books_num['rating_count']
books_num.loc[:, 'review_rating_ratio'] = reviews_per_rating

In [None]:
#Distribution plots of new variables
column_list = ["proportional_five_stars", "review_rating_ratio"]
distribution_plots(books_num, column_names = column_list, num_cols = 2, fig_size=(16,8), 
                   save = True, name = "distribution_plots_3")

In [None]:
books_num.head()

In [None]:
books_num.describe()

In [None]:
sns.pairplot(books_num)

#### Five Star Rated Books
Does the number of people who have read a book correlate with the rating of said book? i.e., does the popularity of a book suggest its “goodness”?

In [None]:
# Books whose average rating is exactly 5
has_perfect_rating = books_num['average_rating'].eq(5)
five_star_books = books_num[has_perfect_rating]

In [None]:
#average values of variables for 5-rated books
five_star_books.mean()

In [None]:
#highest values of variables for 5-rated books
five_star_books.max()

In [None]:
# Identifying five star book with most ratings.
# The maximum is computed from the data instead of being typed in by hand,
# so the lookup stays correct if the dataset or the cleaning steps change.
most_ratings = five_star_books['rating_count'].max()
five_star_books[five_star_books['rating_count'] == most_ratings]

In [None]:
# Title of the book stored under index label 44353.
# NOTE(review): the label looks hand-copied from an earlier cell's output - confirm it still matches the data.
books.loc[44353,"title"]

In [None]:
# Identifying five star book with most reviews.
# The maximum is computed from the data instead of being typed in by hand,
# so the lookup stays correct if the dataset or the cleaning steps change.
most_reviews = five_star_books['review_count'].max()
five_star_books[five_star_books['review_count'] == most_reviews]

In [None]:
# Title and author of the book stored under index label 12217.
# NOTE(review): the label looks hand-copied from an earlier cell's output - confirm it still matches the data.
books.loc[12217,["title","author"]]

In [None]:
#descriptive statistics for five star books
five_star_books.describe()

In [None]:
#Distribution plot of five star books' rating count
column_list = ["rating_count"]
distribution_plots(five_star_books, column_names = column_list, num_cols = 1, fig_size=(8,6), 
                   save = True, name = "distribution_plots_2")

#### Genre
Which genres perform better?

In [None]:
# Mean of selected variables per genre, best-rated genres first.
# The columns are selected before aggregating: pandas >= 2.0 no longer drops
# non-numeric columns silently, so `.mean()` over the whole frame raises a
# TypeError on the text columns (e.g. title, author).
genre_mean_cols = ['average_rating','number_of_pages', 'rating_count','review_count','is_series', 'is_nominated']
mean_by_genre = books.groupby('genre')[genre_mean_cols].mean().sort_values("average_rating", ascending=False)

In [None]:
# Total of selected variables per genre, most-rated genres first.
# The columns are selected before aggregating so the text columns are not
# summed as well, which is wasted work and can fail on mixed-type columns.
genre_sum_cols = ['rating_count','review_count','is_series', 'is_nominated']
sum_by_genre = books.groupby('genre')[genre_sum_cols].sum().sort_values('rating_count', ascending=False)

In [None]:
mean_by_genre.head(10)

In [None]:
top_10_genres = mean_by_genre.head(10)

In [None]:
# Bar chart of the mean rating for each of the ten best-rated genres, saved to disk.
fig = plt.subplots(nrows=1, ncols=1, figsize=(16, 8))
plt6 = sns.barplot(
    data=top_10_genres,
    x=top_10_genres.index,
    y='average_rating',
    orient=None,
)
plt6.set_title("Average Rating of Top 10 Genres")
plt.savefig('chart.png')

In [None]:
sum_by_genre.head()

In [None]:
# Combine the per-genre means and totals into one frame. Both frames are indexed
# by genre and share column names (e.g. `rating_count`), so suffixes are needed:
# without them `join` raises "columns overlap but no suffix specified".
by_genre = mean_by_genre.join(sum_by_genre, lsuffix='_mean', rsuffix='_sum')

In [None]:
by_genre.head()

In [None]:
#correlation matrix of variables grouped by genre
corr_matrix = by_genre.corr()
sns.heatmap(corr_matrix, annot=True)

#### Bonus: Publisher
A look at the impact a publishing house has on popularity of books

In [None]:
# Label books without a recorded publisher as "Unknown".
books['publisher'] = books['publisher'].fillna("Unknown")

In [None]:
len(np.unique(books.publisher))

In [None]:
# Totals per publisher, ranked by total rating count; the first ten rows are the
# most-rated publishers. The columns are selected before aggregating so the text
# columns are not summed as well, which is wasted work and can fail on
# mixed-type columns.
publisher_cols = ["rating_count","review_count","average_rating","number_of_pages","is_series","is_nominated"]
sum_by_publisher = books.groupby('publisher')[publisher_cols].sum().sort_values('rating_count', ascending=False)
top10_publishers = sum_by_publisher.head(10)

In [None]:
top10_publishers.index

In [None]:
# Means per publisher, ranked by mean rating count.
# The columns are selected before aggregating: pandas >= 2.0 no longer drops
# non-numeric columns silently, so `.mean()` over the whole frame raises a
# TypeError on the text columns (e.g. title, author).
publisher_cols = ["rating_count","review_count","average_rating","number_of_pages","is_series","is_nominated"]
mean_by_publisher = books.groupby('publisher')[publisher_cols].mean().sort_values('rating_count', ascending=False)


In [None]:
# Mean statistics restricted to the publishers listed in `top10_publishers`.
in_top10 = mean_by_publisher.index.isin(top10_publishers.index)
top10_publishers_mean = mean_by_publisher.loc[in_top10]

In [None]:
# Bar chart of the mean rating for each publisher in `top10_publishers_mean`.
fig = plt.subplots(nrows=1, ncols=1, figsize=(16, 8))
plt7 = sns.barplot(
    data=top10_publishers_mean,
    x=top10_publishers_mean.index,
    y='average_rating',
    orient=None,
)
plt7.set_title("Average Rating of Top 10 Publishers")
