In [7]:
# If this notebook runs on IBM Cloud Pak, uncomment the next line (a cell magic must be the first line of its cell)
# %%writefile EDA.py

import os
import warnings
from timeit import default_timer as timer

import numpy as np
import pandas as pd
from loguru import logger
from sklearn.impute import KNNImputer
warnings.filterwarnings('ignore')

# data visualizations
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

# global plot styling: ggplot theme and a 15x15 default figure size
plt.style.use("ggplot")
plt.rcParams.update({"figure.figsize": [15, 15]})

In [9]:
class EDA:
    """Exploratory data analysis helper that renders summary plots to PNG files.

    ``explore`` reads ``self.cols_group``, which is not assigned anywhere in this
    class; it has to be set on the instance (or provided by a subclass) before
    ``explore`` is called. It is indexed with the keys ``"cols_num"`` and
    ``"cols_categ"``.

    All figures are written to the current working directory and closed after
    saving so that repeated runs do not accumulate open matplotlib figures.
    """

    # Absolute correlation at or above which two features count as strongly
    # correlated. The original author's note attributes this value to the
    # IBM SPSS Modeler default settings.
    CORRELATION_THRESHOLD = 0.666

    def explore(self, df, feature):
        """Run every EDA plot for ``df`` and save the results as PNG files.

        Args:
            df: DataFrame to analyse.
            feature: Column (or columns) used as the y-variable(s) of the pairplot.

        Raises:
            AttributeError: if ``self.cols_group`` has not been set.
        """
        try:
            cols_group = self.cols_group
        except AttributeError:
            raise AttributeError(
                "EDA.explore requires 'cols_group' to be set on the instance: a mapping "
                "with the keys 'cols_num' and 'cols_categ'."
            ) from None

        cols_num = cols_group["cols_num"]
        cols_categ = cols_group["cols_categ"]

        logger.info('Started Exploratory Data Analysis...')

        EDA._plothistwithsubplots(df, cols_num)
        EDA._plotbarwithsubplots(df, cols_categ)
        EDA._correlationmat(df, cols_num)
        EDA._pairplot(df, feature)

        logger.info('Completed Exploratory Data Analysis. All visualizations are saved to .png files.')

    @staticmethod
    def _grid_shape(n_plots):
        """Return ``(rows, cols)`` of the smallest near-square grid holding ``n_plots`` cells.

        Returns ``(0, 0)`` when there is nothing to plot.
        """
        if n_plots <= 0:
            return 0, 0
        col_no = int(np.ceil(np.sqrt(n_plots)))
        row_no = col_no
        # Drop the last row when the remaining rows already provide enough cells.
        if col_no * (row_no - 1) >= n_plots:
            row_no -= 1
        return row_no, col_no

    @staticmethod
    def _plot_grid(columns, draw, filename):
        """Draw one subplot per column on a shared grid and save it as ``filename``.

        Args:
            columns: Column names, plotted in row-major order.
            draw: Callable ``draw(column, ax)`` that renders one column onto ``ax``.
            filename: Name of the PNG file created in the current working directory.
        """
        columns = list(columns)
        if not columns:
            # plt.subplots cannot build an empty grid; skip instead of crashing.
            logger.warning('No columns to plot; skipping {}.', filename)
            return

        row_no, col_no = EDA._grid_shape(len(columns))
        # squeeze=False keeps `axes` two-dimensional even for 1x1 and 1xN grids,
        # which would otherwise come back as a bare Axes or a 1-D array.
        figure, axes = plt.subplots(row_no, col_no, figsize=(15, 15), squeeze=False)
        flat_axes = axes.ravel()

        for ax, column in zip(flat_axes, columns):
            draw(column, ax)
        # Hide the grid cells that have no column assigned to them.
        for ax in flat_axes[len(columns):]:
            ax.set_visible(False)

        figure.tight_layout()
        figure.savefig(os.path.join(os.getcwd(), filename))   # save the figure to file
        plt.close(figure)

    @staticmethod
    def _plothistwithsubplots(df, cols_num):
        """Save histograms of all numerical columns, arranged in subplots."""
        EDA._plot_grid(
            cols_num,
            lambda column, ax: df[column].plot(ax=ax, kind='hist', title=column, grid=True),
            'numerical_columns_histograms.png',
        )

    @staticmethod
    def _plotbarwithsubplots(df, cols_categ):
        """Save bar plots of the value counts of all categorical columns, arranged in subplots."""
        EDA._plot_grid(
            cols_categ,
            lambda column, ax: df[column].value_counts().plot(ax=ax, kind='bar', title=column, grid=True),
            'categorical_columns_barplots.png',
        )

    @staticmethod
    def _pairplot(df, colm):
        """Save pairplots of every column of ``df`` against the given column(s)."""
        g = sns.PairGrid(df, x_vars=df.columns, y_vars=colm)
        g.map_diag(plt.hist)
        g.map_offdiag(sns.scatterplot)
        g.tight_layout()

        # Save through the grid itself instead of pyplot's implicit current figure,
        # then close it like the other plotting helpers do.
        g.savefig(os.path.join(os.getcwd(), 'pairplots.png'))    # save the figure to file
        plt.close(g.figure)

    @staticmethod
    def _correlationmat(df, cols_num):
        """Save a heatmap of the strongly correlated numerical features.

        Only coefficients whose absolute value reaches ``CORRELATION_THRESHOLD``
        are kept; everything else, including each feature's correlation with
        itself, is replaced by NaN.

        Returns:
            The filtered correlation matrix as a DataFrame.
        """
        threshold = EDA.CORRELATION_THRESHOLD

        corrmat = df[cols_num].corr()
        strong = corrmat.abs() >= threshold
        # Mask the diagonal by position rather than by comparing floats to 1.0:
        # this keeps perfectly correlated *distinct* features and is immune to
        # rounding in the self-correlation entries.
        off_diagonal = ~np.eye(len(corrmat), dtype=bool)
        df_filtered = corrmat[strong & off_diagonal]

        if df_filtered.empty:
            logger.warning('No numerical columns available; skipping the correlation heatmap.')
            return df_filtered

        figure, ax = plt.subplots(figsize=(30, 10))
        sns.heatmap(df_filtered, annot=True, cmap="Reds", ax=ax)

        figure.savefig(os.path.join(os.getcwd(), 'high_correlation_heatmap.png'))
        plt.close(figure)
        return df_filtered
