# Introduction

This data was entirely scraped via the Goodreads API, so kudos to them for providing such a simple interface to scrape their database. The basic idea behind analysing the Goodreads dataset is to get a fair idea about the relationships between the multiple attributes a book might have.

**Our data includes:**
* bookID
* title
* authors
* average_rating - the average rating of the books, as decided by the Goodreads users
* isbn - unique number to identify the book, the International Standard Book Number
* isbn13 - a 13-digit ISBN to identify the book, instead of the standard 10-digit ISBN
* language_code
* num_pages - number of pages
* ratings_count - total number of ratings the book received
* text_reviews_count - total number of written text reviews the book received.
* publication_date 
* publisher

# Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.precision', 5)
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px
import plotly.offline as py
from plotly.subplots import make_subplots

# Warnings
import warnings
warnings.filterwarnings('ignore')

# Reading in the data, reshaping and cleaning

In [None]:
# Reading in the csv file.
# Not every line can be read correctly, so malformed lines are reported and skipped.
# `on_bad_lines='warn'` is the replacement for `error_bad_lines=False`, which was
# deprecated in pandas 1.3 and removed in pandas 2.0 (requires pandas >= 1.3).
data = pd.read_csv('../input/goodreadsbooks/books.csv', on_bad_lines='warn')

# The index should be the bookID (the bookID column itself is kept as well)
data.index = data['bookID']

In [None]:
# Checking the data's columns: data.info() prints a concise summary of the DataFrame
data.info()

In [None]:
# Report the dimensions of the data as a (rows, columns) tuple
row_count, column_count = data.shape
print('Data shape:', (row_count, column_count))

In [None]:
# The page-count column is looked up under the name '  num_pages' (with two
# leading spaces) and given the cleaner name 'total_pages'
data = data.rename(columns={'  num_pages': 'total_pages'})

# Show the first six rows as a quick sanity check
preview = data.head(6)
display(preview)

In [None]:
# Fixing J.K. Rowling rows: any cell holding the combined
# 'J.K. Rowling-Mary GrandPré' value becomes plain 'J.K. Rowling'
data.replace(to_replace='J.K. Rowling-Mary GrandPré', value='J.K. Rowling', inplace=True)

# Deriving the published year, month and day from publication_date (mm/dd/yyyy).
# The date string is split once and each component is picked by its position;
# the dict order fixes the order in which the new columns are appended.
date_parts = data['publication_date'].str.split('/')
component_positions = {'publish_year': 2, 'publish_month': 0, 'publish_day': 1}
for column_name, position in component_positions.items():
    data[column_name] = date_parts.apply(lambda parts: parts[position]).astype(int)

# Missing data table
# Total   - number of missing values per column
# Percent - missing values divided by the row count (a fraction between 0 and 1)
null_mask = data.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

In [None]:
# Describe columns with numerical datatypes (float and int):
# summary statistics such as count, mean, std, min, quartiles and max
data.describe()

In [None]:
# Summarising every feature: columns with fewer than 5 distinct values have
# those values listed; every other column is only classified as string or numeric
for col, column_values in data.items():
    unique_vals = column_values.unique()
    if len(unique_vals) < 5:
        print('Unique values for column {}: {}'.format(col, unique_vals))
        continue
    if is_string_dtype(column_values):
        print('Column {} has values string type'.format(col))
    elif is_numeric_dtype(column_values):
        print('Column {} is numerical'.format(col))

In [None]:
# Dropping both ISBN (International Standard Book Number) columns from the data set
data.drop(columns=['isbn', 'isbn13'], inplace=True)

# EDA

## NOT visualisation based

In [None]:
# Headline counts for the dataset
print('Number of unique authors in this dataset:', data['authors'].nunique())
print('Number of total books: {}'.format(data['title'].count()))
# nunique() is the direct way to count distinct titles
print('Number of unique books: {}'.format(data['title'].nunique()))

# ratings_count holds the number of ratings (written reviews live in
# text_reviews_count), so the message talks about ratings rather than reviews
zero_ratings = (data['ratings_count'] == 0).sum()
print('There are ' + str(zero_ratings) + ' books with 0 ratings.')

In [None]:
# The 15 books with the most ratings, shown with their average rating
popularity_columns = ['title', 'average_rating', 'ratings_count']
by_ratings_count = data.sort_values(by='ratings_count', ascending=False)
by_ratings_count[popularity_columns].head(15)

In [None]:
# The 15 longest books, shown with their average rating and number of ratings
length_columns = ['title', 'average_rating', 'ratings_count', 'total_pages']
by_page_count = data.sort_values(by='total_pages', ascending=False)
by_page_count[length_columns].head(15)

In [None]:
# Books whose average rating is exactly 5.0: print how many there are, then list them
has_top_rating = data['average_rating'] == 5.0
best_rating = data[has_top_rating]
print(f'Number of books: {best_rating.shape[0]}')
best_rating

In [None]:
# Books whose average rating is exactly 1.0: print how many there are, then list them
has_bottom_rating = data['average_rating'] == 1.0
worst_rating = data[has_bottom_rating]
print(f'Number of books: {worst_rating.shape[0]}')
worst_rating

In [None]:
# The 15 shortest books (entries listed with 0 pages are left out)
has_pages = data['total_pages'] > 0
data.loc[has_pages].sort_values(by='total_pages').head(15)

## Visualisation based

In [None]:
# Correlation heatmap
plt.figure(figsize=(20, 10))

# Every column is replaced by the integer codes from pd.factorize so that all
# columns, text ones included, can take part in the correlation matrix.
# NOTE(review): the codes are arbitrary labels rather than magnitudes, so the
# coefficients are rough indicators only - confirm this is the intended method.
encoded = data.apply(lambda column: pd.factorize(column)[0])
corr = encoded.corr()

# Hide the upper triangle (diagonal included)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Draw the masked heatmap on a fixed [-1, 1] colour scale
ax = sns.heatmap(
    corr,
    mask=mask,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    linewidths=.2,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
)

In [None]:
# Most books written by an author (TOP 15)
# Count the rows per author; the frame is sorted by author name before the
# 15 largest counts are taken
author_counts = data['authors'].value_counts().to_frame(name='authors')
plot_data = author_counts.sort_index(ascending=True).nlargest(15, 'authors')

fig = px.bar(
    plot_data,
    x=plot_data.index,
    y=plot_data['authors'],
    color='authors',
    labels={'index': "Author's name", 'authors': 'Number of books written'},
)

# Axis captions
fig.update_yaxes(title_text='Number of books written', row=1, col=1)
fig.update_xaxes(title_text="Author's name", row=1, col=1)

# Figure size and title (keyword order kept as-is: title_font before title)
fig.update_layout(
    autosize=True,
    width=850,
    height=600,
    title_font=dict(size=25, family='Courier'),
    title='Most books written by an author (TOP 15)',
)

fig.show()

In [None]:
# Most books published by publisher (TOP 15)
# Count the rows per publisher; the frame is sorted by publisher name before
# the 15 largest counts are taken
publisher_counts = data['publisher'].value_counts().to_frame(name='publisher')
plot_data = publisher_counts.sort_index(ascending=True).nlargest(15, 'publisher')

fig = px.bar(
    plot_data,
    x=plot_data.index,
    y=plot_data['publisher'],
    color='publisher',
    labels={'index': 'Publisher', 'publisher': 'Number of books published'},
)

# Axis captions
fig.update_yaxes(title_text='Number of books published', row=1, col=1)
fig.update_xaxes(title_text='Publisher', row=1, col=1)

# Figure size and title (keyword order kept as-is: title_font before title)
fig.update_layout(
    autosize=True,
    width=850,
    height=600,
    title_font=dict(size=25, family='Courier'),
    title='Most books published by publisher (TOP 15)',
)

fig.show()

In [None]:
# Books by language
# Count the rows per language code and order the bars by language code
language_counts = data['language_code'].value_counts().to_frame(name='language_code')
plot_data = language_counts.sort_index(ascending=True)

fig = px.bar(
    plot_data,
    x=plot_data.index,
    y=plot_data['language_code'],
    color='language_code',
    labels={'index': 'Language', 'language_code': 'Number of books'},
)

# Axis captions
fig.update_yaxes(title_text='Number of books', row=1, col=1)
fig.update_xaxes(title_text='Language', row=1, col=1)

# Figure size and title (keyword order kept as-is: title_font before title)
fig.update_layout(
    autosize=True,
    width=850,
    height=750,
    title_font=dict(size=25, family='Courier'),
    title='Books by language',
)

fig.show()

In [None]:
# Most reviewed books (TOP 10)
# text_reviews_count is summed over all rows sharing a title; the column is
# stored under the name 'most_rated' although it holds review counts
reviews_per_title = data.groupby('title')['text_reviews_count'].sum()
plot_data = reviews_per_title.sort_values(ascending=False).to_frame(name='most_rated')
plot_data = plot_data.nlargest(10, 'most_rated')

fig = px.bar(
    plot_data,
    x=plot_data.index,
    y=plot_data['most_rated'],
    color='most_rated',
    labels={'title': "Book's name", 'most_rated': 'Number of reviews'},
)

# Axis captions
fig.update_yaxes(title_text='Number of reviews', row=1, col=1)
fig.update_xaxes(title_text="Book's name", row=1, col=1)

# Figure size and title (keyword order kept as-is: title_font before title)
fig.update_layout(
    autosize=True,
    width=850,
    height=750,
    title_font=dict(size=25, family='Courier'),
    title='Most reviewed books (TOP 10)',
)

fig.show()

In [None]:
# Creating 3 scatter plots with average_rating on the X axis and each element of
# the stats list on the Y axis; the same column also drives the marker colour
stats = ['total_pages', 'ratings_count', 'text_reviews_count']

# Plain iteration: the positional index from enumerate() was never used
for st in stats:
    fig = px.scatter(data, x='average_rating', y=st, color=st, labels={'average_rating': 'Average rating'})

    # Update yaxis properties
    fig.update_yaxes(title_text=st, row=1, col=1)
    # Update xaxis properties
    fig.update_xaxes(title_text='Average rating', row=1, col=1)

    # Update size and title
    fig.update_layout(autosize=True, width=900, height=500,
        title_font=dict(size=25, family='Courier'),
        title='Average rating and '+ st,
    )

    fig.show()

In [None]:
# Average rating distribution: scatter of average_rating with a marginal
# histogram along x, points coloured by ratings_count
rating_labels = {'average_rating': 'Average rating', 'ratings_count': 'Number of ratings'}
fig = px.scatter(
    data,
    x='average_rating',
    marginal_x='histogram',
    color='ratings_count',
    labels=rating_labels,
)

# Axis captions
fig.update_yaxes(title_text='Number of books', row=1, col=1)
fig.update_xaxes(title_text='Rating', row=1, col=1)

# Figure size and title (keyword order kept as-is: title_font before title)
fig.update_layout(
    autosize=True,
    width=1200,
    height=500,
    title_font=dict(size=25, family='Courier'),
    title='Average rating distribution for all books',
)

fig.show()

In [None]:
# Page count distribution: scatter of total_pages with a marginal histogram
# along x, points coloured by ratings_count
pages_labels = {'total_pages': 'Number of pages', 'ratings_count': 'Number of ratings'}
fig = px.scatter(
    data,
    x='total_pages',
    marginal_x='histogram',
    color='ratings_count',
    labels=pages_labels,
)

# Axis captions
fig.update_yaxes(title_text='Number of books', row=1, col=1)
fig.update_xaxes(title_text='Pages', row=1, col=1)

# Figure size and title (keyword order kept as-is: title_font before title)
fig.update_layout(
    autosize=True,
    width=900,
    height=500,
    title_font=dict(size=25, family='Courier'),
    title='Average page number distribution for all books',
)

fig.show()

In [None]:
# Publication year distribution: scatter of publish_year with a marginal
# histogram along x, points coloured by ratings_count.
# The ratings_count label now reads 'Number of ratings', matching the other
# distribution plots (it previously mapped the column name onto itself).
fig = px.scatter(data, x='publish_year', marginal_x='histogram', color='ratings_count', labels={'publish_year': 'Year', 'ratings_count': 'Number of ratings'})

# Update yaxis properties
fig.update_yaxes(title_text='Number of books', row=1, col=1)
# Update xaxis properties
fig.update_xaxes(title_text='Year', row=1, col=1)

# Update size and title (typo "Publiaction" in the title corrected)
fig.update_layout(autosize=True, width=900, height=500,
    title_font=dict(size=25, family='Courier'),
    title='Publication date distribution (Yearly)',
)

fig.show()

In [None]:
# Publication month distribution: scatter of publish_month with a marginal
# histogram along x, points coloured by publish_year
fig = px.scatter(data, x='publish_month', marginal_x='histogram', color='publish_year', labels={'publish_month': 'Month', 'publish_year': 'Year'})

# Update yaxis properties
fig.update_yaxes(title_text='Number of books', row=1, col=1)
# Update xaxis properties: the x axis shows publish_month, so it is captioned
# 'Month' (it was mislabelled 'Year')
fig.update_xaxes(title_text='Month', row=1, col=1)

# Update size and title (typo "Publiaction" in the title corrected)
fig.update_layout(autosize=True, width=900, height=500,
    title_font=dict(size=25, family='Courier'),
    title='Publication date distribution (Monthly)',
)

fig.show()