In [1]:
from docx import Document
from docx.shared import Inches
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import table
from matplotlib.figure import Figure
import matplotlib
import os


# Matplotlib configuration settings, applied in a single batch so that all
# figure defaults used by the report are visible in one place.
matplotlib.rcParams.update({
    'figure.dpi': 150,
    'figure.autolayout': True,
    'savefig.transparent': True,
    'font.family': 'serif',
    'axes.spines.top': False,
    'axes.spines.right': False,
})
# Figures are only ever written to image files, so select the 'agg' backend.
matplotlib.use('agg')

# 1. Load a dataset 

In [2]:
# Load the 'mpg' example dataset through seaborn. The report functions
# defined in the later cells read this module-level `data` frame directly.
data = sns.load_dataset('mpg')  
# Show the column labels as the cell's output.
data.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin', 'name'],
      dtype='object')

In [3]:
# Customizable options
DOC_TITLE = 'Exploratory Data Analysis Report'  # top-level heading of the report
GRAPH_COLOR = 'cyan'                            # colour passed to every plot
TABLE_STYLE = 'Table Grid'                      # looked up in `document.styles`
OUTPUT_FILE = 'basic-report.docx'               # path the report is saved to


# Ensure 'images' folder exists. `exist_ok=True` does the check and the
# creation in one call, so re-running the cell is harmless and there is no
# window between "does it exist?" and "create it".
os.makedirs('images', exist_ok=True)

# 2. Descriptive Statistics

## 2.1 Title page

In [4]:
def get_intro_text():
    """Add an opening paragraph describing the dimensions of the data.

    Reads the module-level ``data`` frame and appends one paragraph to the
    module-level ``document`` stating the number of rows, the number of
    columns, and how many of those columns are numeric ("None" when there
    are no numeric columns).
    """
    n_obs, n_vars = data.shape
    numeric_count = data.select_dtypes(include='number').shape[1]
    # A count of zero is spelled out as "None" in the sentence.
    numeric_label = numeric_count if numeric_count else "None"

    sentence = (
        f"The data contains {n_obs} observations (rows) "
        f"and {n_vars} variables (columns), "
        f"{numeric_label} of which are numeric:"
    )
    document.add_paragraph(sentence)


def get_numerics_scatterplot():
    """Add a pair-plot of the data to the report.

    The grid is drawn with seaborn from the module-level ``data``, saved to
    ``images/pair_plot.png`` and then inserted into the module-level
    ``document``.
    """
    image_path = 'images/pair_plot.png'

    pair_grid = sns.pairplot(
        data,
        height=1.75,
        plot_kws=dict(color=GRAPH_COLOR),
        diag_kws=dict(color=GRAPH_COLOR),
    )
    # y > 1 places the title above the grid of axes.
    pair_grid.fig.suptitle('Scatter-plots of Numeric Columns', x=0.5, y=1.04, size=20)
    pair_grid.savefig(image_path)

    document.add_picture(image_path, width=Inches(6.5))

## 2.2 Individual Variables

In [5]:
def create_summary_statistics_table(summary_stats_df):
    """Add a two-column (label, value) table of statistics to the report.

    One table row is created per entry: the entry's label goes in the
    first cell and the string form of its value in the second.

    Parameters:
    ----------
    summary_stats_df: pandas.Series
        Statistics keyed by their display label. Despite the parameter
        name, the callers in this notebook pass a Series.
    """
    stats_table = document.add_table(rows=len(summary_stats_df), cols=2)
    stats_table.style = document.styles[TABLE_STYLE]

    label_column, value_column = stats_table.columns[0], stats_table.columns[1]
    label_column.width = Inches(2.5)
    value_column.width = Inches(2)

    labelled_values = list(summary_stats_df.items())
    table_rows = stats_table.rows
    for position in range(len(table_rows)):
        label, val = labelled_values[position]
        cells = table_rows[position].cells
        cells[0].text = label
        cells[1].text = str(val)


def create_most_frequent_table(data_series):
    """Add a "Most Frequent" table with the 5 most frequent values.

    Each row shows a value next to its count and the percentage of the
    series it accounts for. The percentage is taken over ``len(data_series)``,
    so missing entries count towards the denominator.

    Parameters:
    ----------
    data_series: pandas.Series
    """
    document.add_heading('Most Frequent', level=4)

    top5_items = data_series.value_counts().head()
    percentages = top5_items / len(data_series) * 100

    # Named `freq_table` so the `table` function imported from
    # pandas.plotting is not shadowed inside this function.
    freq_table = document.add_table(rows=len(top5_items), cols=2)
    freq_table.style = document.styles[TABLE_STYLE]
    freq_table.columns[0].width = Inches(2.5)
    freq_table.columns[1].width = Inches(2)

    # Walk values, counts and percentages in lock-step with the table rows.
    for row, item, freq, pct in zip(freq_table.rows, top5_items.index,
                                    top5_items, percentages):
        # Cell text has to be a string; an object column may hold values
        # that are not (numbers, booleans, ...), so convert explicitly.
        row.cells[0].text = str(item)
        row.cells[1].text = f"{freq:,} ({pct:.2f}%)"

        
def get_statistics_numeric(numeric_series):
    """Add a "Summary Statistics" heading and table for a numeric column.

    The table holds the ``describe()`` figures under readable labels,
    followed by skewness and kurtosis, all rounded to 4 decimal places.

    Parameters:
    ----------
    numeric_series: pandas.Series
        A series with numeric data.
    """
    document.add_heading('Summary Statistics', level=4)

    # Readable replacements for the eight labels produced by describe().
    readable_labels = [
        'Number of observations',
        'Average',
        'Standard Deviation',
        'Minimum',
        'Lower Quartile',
        'Median',
        'Upper Quartile',
        'Maximum',
    ]

    stats = numeric_series.describe()
    stats.index = readable_labels
    stats['Skewness'] = numeric_series.skew()
    stats['Kurtosis'] = numeric_series.kurt()

    create_summary_statistics_table(stats.round(4))


def get_statistics_categorical(categorical_series):
    """Add a "Summary" heading and table for a categorical column.

    The table lists the number of non-missing observations, the number of
    unique values and the most frequent value.

    Parameters:
    ----------
    categorical_series: pandas.Series
        A Series with categorical data.
    """
    document.add_heading('Summary', level=4)

    count = categorical_series.count()
    unique = categorical_series.nunique()

    frequencies = categorical_series.value_counts()
    # value_counts() leaves missing values out, so the result is empty for
    # an empty or all-missing column; idxmax() would raise ValueError on it.
    commonest_item = frequencies.idxmax() if not frequencies.empty else 'N/A'

    summary = pd.Series([count, unique, commonest_item],
                        index=['Number of observations', 'Unique values',
                               'Mode (Highest occurring value)'])

    create_summary_statistics_table(summary)

## 2.3 Boxplots, histograms and barplots

In [6]:
def plot_numeric_col(data_series):
    """Save a box-plot stacked above a histogram for a numeric column.

    The figure is written to ``images/<Name>.png`` where ``<Name>`` is the
    title-cased series name.

    Parameters:
    ----------
    data_series: pandas.Series
        A series with numeric data.
    """
    title_name = data_series.name.title()
    figure = Figure(figsize=(6, 6))
    box_ax, hist_ax = figure.subplots(2, 1)

    # Top panel: horizontal, notched box-plot of the non-missing values.
    box_ax.boxplot(data_series.dropna(), vert=False, notch=True)
    box_ax.set_yticklabels([''])
    box_ax.set_xlabel(title_name)
    box_ax.set_title(f'Box-plot of {title_name}', size=12)

    # Bottom panel: histogram with a kernel density estimate overlaid.
    hist_ax.set_title(f'Distribution plot of {title_name}', size=12)
    sns.histplot(x=data_series, kde=True, ax=hist_ax, color=GRAPH_COLOR)

    figure.savefig(f'images/{title_name}.png')


def plot_categorical_col(data_series):
    """Save a bar-plot of value counts for a categorical column.

    The figure is written to ``images/<Name>.png`` where ``<Name>`` is the
    title-cased series name.

    Parameters:
    ----------
    data_series: pandas.Series
        A series with categorical data.
    """
    title_name = data_series.name.title()
    output_path = f'images/{title_name}.png'

    figure = Figure(figsize=(6, 4))
    bar_ax = figure.subplots()
    sns.countplot(x=data_series, color=GRAPH_COLOR, ax=bar_ax)
    bar_ax.set_title(f'Bar-plot of {title_name}', size=12)

    figure.savefig(output_path)

## 2.4 Descriptive summary

In [7]:
def compute_missing(data_series):
    """Describe how many values of a series are missing.

    Parameters:
    ----------
    data_series: pandas.Series

    Returns:
    -------
    str
        "None" when nothing is missing, otherwise the count followed by
        its share of the series as a percentage, e.g. "2 (50.00%)".
    """
    missing_count = data_series.isna().sum()
    if missing_count:
        share = missing_count / len(data_series)
        return f"{missing_count} ({share:.2%})"
    return "None"


def get_variable_summary(idx, data_series):
    """Add a numbered heading and a one-paragraph overview of a variable.

    The paragraph reports the number of unique values and how many values
    are missing.

    Parameters:
    ----------
    idx: int
        Variable position(index). Ranges from 1 to number-of-columns.
    data_series: pandas.Series
    """
    label = data_series.name.capitalize()

    document.add_heading(f'{idx}. {label.title()}', level=2)

    overview = (
        f"{label} has {data_series.nunique()} unique values. "
        f"{compute_missing(data_series)} of its values are missing."
    )
    document.add_paragraph(overview)
    
    
def compile_variable_summaries():
    """Add a page of summaries, tables and graphs for every variable.

    Numeric columns come first, followed by object-typed columns; the
    heading numbers run continuously across both groups.
    """
    numeric_cols = data.select_dtypes(include='number')
    categorical_cols = data.select_dtypes(include='object')

    def _finish_page(col_name):
        # Insert the plot saved for this column, then start a new page.
        document.add_picture(f'images/{col_name.title()}.png', width=Inches(5))
        document.add_page_break()

    # Stays 0 when there are no numeric columns, so that the categorical
    # numbering below still starts at 1.
    position = 0
    for position, (col_name, series) in enumerate(numeric_cols.items(), start=1):
        get_variable_summary(position, series)

        get_statistics_numeric(series)
        document.add_paragraph()

        plot_numeric_col(series)
        _finish_page(col_name)

    # Continue numbering from where the numeric columns stopped.
    first_categorical = position + 1
    for number, (col_name, series) in enumerate(categorical_cols.items(),
                                                start=first_categorical):
        get_variable_summary(number, series)

        get_statistics_categorical(series)
        document.add_paragraph()

        create_most_frequent_table(series)
        document.add_paragraph()

        plot_categorical_col(series)
        _finish_page(col_name)

# 3. Bivariate Analysis

In [8]:
# Pairwise correlation between the numeric columns. The frame also holds
# text columns ('origin', 'name'); from pandas 2.0 `DataFrame.corr()` no
# longer drops those silently and raises instead, so keep only the numeric
# columns before computing the matrix.
correlation_df = data.select_dtypes(include='number').corr()


def plot_joint_correlation():
    """Plot a heatmap of the correlation in all numeric columns in the data.

    Only the lower triangle of the module-level ``correlation_df`` is drawn;
    the figure is saved to ``images/joint_corr.png``.
    """
    fig = Figure(figsize=(6, 6))
    ax = fig.subplots()

    # Hide the diagonal and everything above it. The mask is built from the
    # matrix shape rather than its values: masking with the coefficients
    # themselves would leave an upper-triangle cell visible whenever its
    # correlation is exactly 0 (a falsy mask entry).
    upper_triangle = np.triu(np.ones(correlation_df.shape, dtype=bool))

    sns.heatmap(correlation_df, annot=True, mask=upper_triangle, ax=ax,
                yticklabels=True, cmap=sns.light_palette(GRAPH_COLOR, as_cmap=True))
    ax.tick_params(rotation=45)
    fig.suptitle('Correlation in Numeric Columns', size=20)
    fig.savefig('images/joint_corr.png')

    
def bivariate_analysis_intro():
    """Open the bivariate section: heading, correlation heatmap, page break."""
    heatmap_path = 'images/joint_corr.png'

    document.add_heading('Bivariate Analysis (Correlation)\n', level=1)

    # The heatmap has to be written to disk before it can be inserted.
    plot_joint_correlation()
    document.add_picture(heatmap_path, width=Inches(6.7))
    document.add_page_break()


def get_var_pairs():
    """Return every unordered pair of distinct variables in ``correlation_df``.

    Pairs are (row label, column label) tuples taken from the strict upper
    triangle of the correlation matrix, in row-major order.
    """
    labels = correlation_df.index
    # k=1 starts one step above the diagonal, so no variable pairs with itself.
    row_positions, col_positions = np.triu_indices(len(labels), k=1)
    return [(labels[row], labels[col])
            for row, col in zip(row_positions, col_positions)]


def plot_regression(var1, var2):
    """Save side-by-side regression scatterplots for a pair of variables.

    The left panel plots ``var1`` on x against ``var2`` on y; the right panel
    swaps the axes. The figure is written to ``images/<var1>~<var2>.png``.

    Parameters:
    ----------
    var1, var2: string
        A pair of numeric column(variable) names.
    """
    figure = Figure(figsize=(8.2, 4))
    panels = figure.subplots(1, 2)
    orientations = [(var1, var2), (var2, var1)]

    for panel, (x_name, y_name) in zip(panels, orientations):
        sns.regplot(x=x_name, y=y_name, data=data, ax=panel, truncate=False,
                    color=GRAPH_COLOR)
        panel.set_title(f'Scatterplot - {x_name.title()} vs {y_name.title()}', size=9)

    figure.savefig(f'images/{var1}~{var2}.png')

    
def quantify_correlation(var1, var2):
    """Describe the strength and direction of a correlation in words.

    Parameters:
    ----------
    var1, var2: string
        A pair of numeric column(variable) names.

    Returns:
    -------
    str
        A phrase such as "strong negative correlation (-0.78)". When the
        magnitude is below 0.1 the direction is omitted and the phrase
        starts with "virtually no".
    """
    coefficient = correlation_df.loc[var1, var2]
    magnitude = abs(coefficient)

    # Lower bound of each band, strongest first; the first match wins.
    bands = (
        (0.9, 'very strong'),
        (0.7, 'strong'),
        (0.5, 'moderate'),
        (0.3, 'weak'),
        (0.1, 'very weak'),
    )
    for lower_bound, strength in bands:
        if magnitude >= lower_bound:
            direction = ' positive' if coefficient > 0 else ' negative'
            return f'{strength}{direction} correlation ({coefficient:.2f})'

    return f'virtually no correlation ({coefficient:.2f})'


def compare_variable_pairs():
    """
    Add a numbered sub-section for every pair of numeric variables: a heading,
    a sentence describing their correlation, and a regression plot.
    """
    for number, (first, second) in enumerate(get_var_pairs(), start=1):
        document.add_heading(f'{number}. {first.title()} vs {second.title()}', level=2)

        description = quantify_correlation(first, second)
        document.add_paragraph(
            f'\n{first.capitalize()} and {second.capitalize()} have {description}.'
        )

        # The plot is saved under the same name it is read back from below.
        plot_regression(first, second)
        document.add_picture(f'images/{first}~{second}.png', width=Inches(6))
        document.add_paragraph()

# 4. Compile & export the report

In [9]:
# Create the document. The functions defined in the cells above all write
# to this module-level `document`, so it must exist before any is called.
document = Document()
document.add_heading(f'{DOC_TITLE}', level=0)

# title page content: shape summary paragraph, then the pair-plot
get_intro_text()
get_numerics_scatterplot()
document.add_page_break()

# summaries and graphs, one page per variable
compile_variable_summaries()

# joint correlation plot and pairwise correlation 
bivariate_analysis_intro()
compare_variable_pairs()

# Save as a Word .docx file
document.save(OUTPUT_FILE)