In [1]:
from pathlib import Path

import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from docx import Document
from docx.shared import Inches
from matplotlib.figure import Figure
from pandas.plotting import table


# Report-wide matplotlib defaults, applied in a single call.
matplotlib.rcParams.update({
    'figure.dpi': 150,
    'figure.autolayout': True,
    'savefig.transparent': True,
    'font.family': 'serif',
    'axes.spines.top': False,
    'axes.spines.right': False,
})
# Figures in this notebook are only ever written to PNG files, so the
# 'agg' backend is selected.
matplotlib.use('agg')

In [2]:
# Load the dataset the report is built from. The bare `df.columns`
# expression on the last line makes the column index the cell's output.
df = sns.load_dataset('penguins')
df.columns

Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex'],
      dtype='object')

In [3]:
# Dataset name interpolated into the report's top-level heading.
DATA_NAME = 'Penguin Data'
# Colour passed to the seaborn plotting calls (pair-plot, histogram, count-plot).
GRAPH_COLOR = 'cyan'



def populate_title_page(data):
    """Write the overview sentence and, when possible, the pair-plot.

    Adds a paragraph to the module-level ``document`` stating how many
    columns ``data`` has and how many of them are numeric. When at least one
    numeric column exists, a seaborn pair-plot is saved to
    ``images/pair_plot.png`` and inserted below that paragraph.

    Parameters
    ----------
    data : pandas.DataFrame
        The full dataset the report describes.
    """
    num_of_cols = f"There are {data.shape[1]} variables in the data,"
    num_numeric = len(data.select_dtypes(include='number').columns)

    if not num_numeric:
        # sns.pairplot raises when it finds no numeric variables to plot, so
        # end the sentence with a full stop (nothing follows it) and skip
        # the figure instead of aborting the whole report.
        document.add_paragraph(
            ' '.join([num_of_cols, "None of which are numeric."]))
        return

    numeric_cols = f"{num_numeric} of which are numeric:"
    document.add_paragraph(' '.join([num_of_cols, numeric_cols]))

    fig = sns.pairplot(data, height=1.75, plot_kws={'color': GRAPH_COLOR},
                       diag_kws={'color': GRAPH_COLOR})
    # y > 1 places the title above the grid of axes.
    fig.fig.suptitle('Scatter-plots of Numeric Columns', x=0.5, y=1.04, size=20)
    fig.savefig('images/pair_plot.png')
    document.add_picture('images/pair_plot.png', width=Inches(6.5))


def compute_missing(data):
    """Describe how many values of ``data`` are missing.

    Parameters
    ----------
    data : pandas.Series
        The column to inspect.

    Returns
    -------
    str
        ``"None"`` when no value is missing, otherwise
        ``"<count> (<percentage>%)"`` with the percentage of all rows shown
        to two decimal places.
    """
    missing = data.isna().sum()
    if missing == 0:
        return "None"
    # The "f" matters: a bare ".2" means two *significant digits*, which
    # renders 12.5 as "1.2e+01" rather than "12.50".
    return f"{missing} ({missing / len(data) * 100:.2f}%)"

    
def summary_statistics_numeric(data):
    """Build the summary-statistics listing for a numeric column.

    Takes the output of ``data.describe()``, replaces its index with
    report-friendly labels, adds skewness and kurtosis, and rounds every
    value to four decimal places.

    Parameters
    ----------
    data : pandas.Series
        Numeric column to summarise.

    Returns
    -------
    pandas.Series
        The labelled, rounded statistics.
    """
    readable_labels = [
        'Number of observations', 'Average', 'Standard Deviation',
        'Minimum', 'Lower Quartile', 'Median', 'Upper Quartile',
        'Maximum',
    ]
    stats = data.describe()
    stats.index = readable_labels

    # Shape statistics are not part of describe(); add them after it.
    shape_stats = (('Skewness', data.skew()), ('Kurtosis', data.kurt()))
    for label, value in shape_stats:
        stats[label] = value

    return stats.round(4)



def summary_statistics_categorical(data):
    """Build the summary listing for a categorical column.

    Parameters
    ----------
    data : pandas.Series
        Categorical column to summarise.

    Returns
    -------
    pandas.Series
        Three entries: the number of non-missing observations, the number of
        unique values, and the most frequent value. The most frequent value
        is the string ``"None"`` when the column has no non-missing values.
    """
    frequencies = data.value_counts()
    # value_counts() leaves out missing values, so it is empty for a column
    # that is entirely missing, and idxmax() raises ValueError on an empty
    # Series. Report "None" for that case instead of aborting.
    commonest_item = "None" if frequencies.empty else frequencies.idxmax()
    return pd.Series([data.count(), data.nunique(), commonest_item],
                     index=['Number of observations', 'Unique values',
                            'Mode (Highest occurring value)'])


def summary_statistics_table(summary):
    """Append ``summary`` to the report as a two-column grid table.

    One table row is written per entry of ``summary``: the entry's label in
    the first cell and ``str()`` of its value in the second. The table is
    added to the module-level ``document``.

    Parameters
    ----------
    summary : pandas.Series
        Label -> value pairs to render.
    """
    grid = document.add_table(rows=len(summary), cols=2)
    grid.style = document.styles['Table Grid']

    label_column, value_column = grid.columns[0], grid.columns[1]
    label_column.width = Inches(2.5)
    value_column.width = Inches(2)

    # Values are stringified up front; labels are written as they are.
    pairs = [(name, str(stat)) for name, stat in summary.items()]

    # The table was created with exactly len(summary) rows, so rows and
    # pairs line up one-to-one.
    for table_row, (name, text) in zip(grid.rows, pairs):
        table_row.cells[0].text = name
        table_row.cells[1].text = text

        
def most_frequent_table(data):
    """Append a table of the most frequent values of ``data`` to the report.

    Lists up to five values (``value_counts().head()``), each with its count
    and its share of ``len(data)`` as a percentage. ``len(data)`` counts
    every row, so rows with missing values are part of the denominator. The
    table is added to the module-level ``document``.

    Parameters
    ----------
    data : pandas.Series
        Categorical column to tabulate.
    """
    top5 = data.value_counts().head()
    percentage = top5 / len(data) * 100
    top5 = pd.concat([top5, percentage], axis=1, keys=['count', 'pct'])
    # Each tuple is (value, count, pct); the value comes from the index.
    top5 = list(top5.itertuples())

    table = document.add_table(rows=len(top5), cols=2)
    table.style = document.styles['Table Grid']
    table.columns[0].width = Inches(2.5)
    table.columns[1].width = Inches(2)

    for row, (item, freq, pct) in zip(table.rows, top5):
        # An object column can hold non-string values (numbers, booleans);
        # cell text has to be a string, so convert explicitly, the same way
        # summary_statistics_table stringifies its values.
        row.cells[0].text = str(item)
        row.cells[1].text = f"{freq:,} ({pct:.2f}%)"


        
def plot_numeric(var_name, data):
    """Save a box-plot and a distribution plot of a numeric column.

    Draws two stacked panels on one figure and writes it to
    ``images/<var_name>.png``.

    Parameters
    ----------
    var_name : str
        Column name; used for the axis label, the titles and the file name.
    data : pandas.Series
        Values to plot.
    """
    pretty_name = var_name.title()
    figure = Figure(figsize=(6, 6))
    box_ax, dist_ax = figure.subplots(2, 1)

    # Upper panel: horizontal notched box-plot of the non-missing values.
    box_ax.boxplot(data.dropna(), vert=False, notch=True)
    box_ax.set_yticklabels([''])
    box_ax.set_xlabel(f'{var_name}')
    box_ax.set_title(f'Box-plot of {pretty_name}', size=12)

    # Lower panel: histogram with a KDE curve overlaid.
    sns.histplot(x=data, kde=True, ax=dist_ax, color=GRAPH_COLOR)
    dist_ax.set_title(f'Distribution plot of {pretty_name}', size=12)

    figure.savefig(f'images/{var_name}.png')


def plot_categorical(var_name, data):
    """Save a count-plot of a categorical column.

    The figure is written to ``images/<var_name>.png``.

    Parameters
    ----------
    var_name : str
        Column name; used for the title and the file name.
    data : pandas.Series
        Values whose frequencies are plotted.
    """
    figure = Figure(figsize=(6, 4))
    axes = figure.subplots()

    sns.countplot(x=data, color=GRAPH_COLOR, ax=axes)
    heading = f'Bar-plot of {var_name.title()}'
    axes.set_title(heading, size=12)

    destination = f'images/{var_name}.png'
    figure.savefig(destination)

    
def describe_variable(idx, var_name, data):
    """Write the heading and introductory sentence for one variable.

    Adds a level-2 heading ``"<idx>. <Var_Name>"`` to the module-level
    ``document``, followed by a paragraph giving the number of unique values
    and the missing-value summary from ``compute_missing``.

    Parameters
    ----------
    idx : int
        Running number shown in the heading.
    var_name : str
        Column name.
    data : pandas.Series
        The column's values.
    """
    document.add_heading(f'{idx}. {var_name.title()}', level=2)

    # Kept on a single line on purpose: a triple-quoted string that starts on
    # the next line begins with a newline plus indentation, and python-docx
    # turns that into a line break and leading spaces in the paragraph.
    unique_values = f"{var_name.title()} has {data.nunique()} unique values."
    missing_values = f"{compute_missing(data)} of its values are missing."

    document.add_paragraph(' '.join([unique_values, missing_values]))
    
    
def analyse_data(data):
    """Write one report section per column of ``data``.

    Numeric columns are handled first, then categorical ones; the heading
    number runs continuously across both groups. Every section is added to
    the module-level ``document`` and ends with a page break.

    * Numeric section: intro, summary-statistics table, box/distribution
      figure.
    * Categorical section: intro, summary table, most-frequent table,
      count-plot figure.

    Parameters
    ----------
    data : pandas.DataFrame
        The full dataset the report describes.
    """
    numeric = data.select_dtypes(include='number')
    # 'category' is selected alongside 'object' so that columns stored with
    # the pandas categorical dtype are not silently left out of the report.
    categorical = data.select_dtypes(include=['object', 'category'])
    idx = 1

    # Iterating an empty selection is a no-op, so no emptiness guard is needed.
    for col, series in numeric.items():
        describe_variable(idx, col, series)

        document.add_heading('Summary Statistics', level=4)
        summary = summary_statistics_numeric(series)
        summary_statistics_table(summary)
        document.add_paragraph()

        plot_numeric(col, series)
        document.add_picture(f'images/{col}.png', width=Inches(5))
        document.add_page_break()
        idx += 1

    for col, series in categorical.items():
        describe_variable(idx, col, series)

        document.add_heading('Summary', level=4)
        summary = summary_statistics_categorical(series)
        summary_statistics_table(summary)
        document.add_paragraph()

        document.add_heading('Most Frequent', level=4)
        most_frequent_table(series)
        document.add_paragraph()

        plot_categorical(col, series)
        document.add_picture(f'images/{col}.png', width=Inches(5))
        document.add_page_break()
        idx += 1

In [4]:
# Build the report: title page first, then one section per variable.
document = Document()

# The plotting helpers save their figures under images/, and savefig does
# not create missing directories, so make sure the folder exists before any
# figure is written (a fresh checkout will not have it).
Path('images').mkdir(parents=True, exist_ok=True)

# Top level heading
document.add_heading(f'Exploratory Data Analysis Report - {DATA_NAME.title()}', level=0)

populate_title_page(df)

document.add_page_break()

# Per-variable reports
analyse_data(df)


document.save('basic-report.docx')