In [1]:
import pandas as pd
import numpy as np
import altair as alt
from scipy import stats
from IPython.display import display
from sklearn.datasets import make_regression
from tabulate import tabulate

In [2]:
test_data_empty = pd.DataFrame( { 'cat_empty': [], 'num_empty': []})

test_data = pd.DataFrame(
    {
        'cat_1': [ 1, 1, 1, 1, 1, 1],
        'cat_2': [ 1, 1, 1, 2, 2, 2],
        'cat_3': [ 1, 1, 2, 2, 3, 3],
        'cat_4': [ 'one', 'one', 'two', 'two', 'three', 'three'],
        'cat_5': [ 'one', 'one', 'one', 'two', 'two', 'two'],
        'num_constant': [ 20, 20, 20, 20, 20, 20],
        'num_variance': [ 20, 20, 21, 22, 23, 24],
        'num_variable': [ 10, 20, 50, 100, 200, 300],
        'num_na': [ None, 4, None, 8, None, 12 ]
    }
)

np.random.seed(123)
num_var1 = []
num_list = []
for i in np.arange(0, 1000, 100):
    num_list.append(np.arange(i, i+100))

for j in num_list:
    num_var1 = np.append(num_var1, np.random.choice(j, 100))

test_data2 = pd.DataFrame(
    {
        'num_var1': num_var1,
        'num_var2': np.arange(0, 1000)
    }
)

X, y = make_regression(n_samples = 2, n_features = 100, random_state=123)

test_data3 = pd.DataFrame(
    {
        'num_var1': X[0],
        'num_var2': X[1]
    }
)

In [3]:
def num_dist_scatter(num1, num2, data, title = '', stat = False, trend = None):
    '''
    Creates a scatter plot given two numerical features. Plot can provide regression trendline as linear, polynomial, or loess.
    Statistics such as number of NaNs, mean, median, and standard deviations will be returned as options.
    Spearman and Pearson's correlation will also be returned to aid the user to determining feature relationship.

    Parameter
    ---------
    num1: string
        Name of the column for the first numeric feature.
    num2: string
        Name of the column for the second numeric feature.
    data: pandas.DataFrame
        Target data frame for visualization.
    title: string, default ''
        Title for the chart.
    stat: bool, default False
        Boolean to provide simple statistics.
    trend: string, default None
        Type of trendline. Options are: 'None', lin', 'poly'.
    
    Return
    ------
    altair.Chart
        A chart consists of a scatterplot with out without trendlines.
    string
        Spearman and Pearson's correlation numbers.
    '''
    data1 = data.copy()
    # check if feature is numeric
    assert data[num1].dtype.kind in 'iufc', 'num1 column must be numeric!'
    assert data[num2].dtype.kind in 'iufc', 'num2 column must be numeric!'
    assert data[num1].nunique() != 1, 'num1 column is constant, consider using functions for categorical variables'
    assert data[num2].nunique() != 1, 'num1 column is constant, consider using functions for categorical variables'

    # feature statistics
    stats_df = pd.DataFrame()
    feat_list = [num1, num2]

    for i in feat_list:
        output = []
        output.append(data[i].isna().sum())
        output.append(round(np.mean(data[i]), 3))
        output.append(np.median(data[i]))
        output.append(round(np.std(data[i], ddof=1), 3))
        stats_df[i] = output

    stats_df = stats_df.T.rename(columns={0: 'Num NaN', 1:'Mean', 2: 'median', 3:'Stdev'})
    if stat == True:
        print(stats_df)

    # replace NaN (if any) with mean column value
    if stats_df.iloc[0,0] != 0:
        data1[num1] = data1[num1].fillna(stats_df.iloc[0,1])
        print(f'**num1 NaN replaced with mean {stats_df.iloc[0,1]:.2f}**')
    if stats_df.iloc[1,0] != 0:
        data1[num2] = data1[num2].fillna(stats_df.iloc[1,1])
        print(f'**num2 NaN replaced with mean {stats_df.iloc[1,1]:.2f}**')

    # correlation statistics
    pear = stats.pearsonr(data1[num1], data1[num2])[0]
    pear_p = stats.pearsonr(data1[num1], data1[num2])[1]
    spear = stats.spearmanr(data1[num1], data1[num2]).correlation
    spear_p = stats.spearmanr(data1[num1], data1[num2]).pvalue
    table = [ [ 'Pearson\'s', f'{pear:.2f}', f'{pear_p:.4f}'], [ 'Spearman\'s', f'{spear:.2f}', f'{spear_p:.4f}']]
    print(f"The Pearson's correlation is {pear:.2f} with p-value of {pear_p:.4f}.")
    print(f"The Spearman's correlation is {spear:.2f} with p-value of {spear_p:.4f}.")
    print( tabulate( table, headers = [ '', 'Correlation', 'p']))


    # scatter plot
    scatter = alt.Chart(data1).mark_point(opacity=0.8).encode(
    alt.X(num1, title=num1),
    alt.Y(num2, title=num2)
    ).properties(
        height = 500,
        width = 500,
        title = title
    )
    
    # linear regression line
    lr = scatter.mark_line(size=2).transform_regression(
    num1, num2)

    # polynomial line
    poly = scatter.mark_line(size=3).transform_regression(
    num1, num2, method='poly')

    # loess line, 'locally estimated scatterplot smoothing'
    loess = scatter.mark_line(size=3).transform_loess(
    num1, num2)

    if trend == 'lin':
        plot = scatter + lr
    elif trend =='poly':
        plot = scatter + poly
    elif trend == 'loess':
        plot = scatter + loess
    else: 
        plot = scatter

    return plot

In [4]:
num_dist_scatter('num_variable', 'num_na', test_data, title='test', stat=True, trend = 'poly')
#type( num_dist_by_cat( v_cat = 'cat_2', v_num = 'num_variable', data = test_data)).__name__

              Num NaN     Mean  median    Stdev
num_variable      0.0  113.333    75.0  114.833
num_na            3.0    8.000     NaN    4.000
**num2 NaN replaced with mean 8.00**
The Pearson's correlation is 0.77 with p-value of 0.0726.
The Spearman's correlation is 0.68 with p-value of 0.1404.
              Correlation       p
----------  -------------  ------
Pearson's            0.77  0.0726
Spearman's           0.68  0.1404


In [5]:
import sys
sys.path.append('src/prelim_eda_helper')
from prelim_eda_helper import *

In [7]:
num_dist_scatter('num_variable', 'num_na', test_data, title='test', stat=True, trend = 'poly')

              Num NaN     Mean  median    Stdev
num_variable      0.0  113.333    75.0  114.833
num_na            3.0    8.000     NaN    4.000
**num2 NaN replaced with mean 8.00**
              Correlation       p
----------  -------------  ------
Pearson's            0.77  0.0726
Spearman's           0.68  0.1404


In [18]:
from io import StringIO
captured_output = StringIO()
sys.stdout = captured_output
num_dist_scatter('num_variable', 'num_na', test_data, title='test', stat=True)
captured_output.getvalue().strip()[-6:]

'0.1404'