In [None]:
import pandas as pd
import ppscore as pps
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pointbiserialr, chi2_contingency
import warnings
warnings.filterwarnings('ignore')

Checking the importance of the new feature<br>
We use the following methodology to check whether a newly engineered feature is meaningful:<br>
1> Measure the dependence of all the features on each other (Spearman correlation and Predictive Power Score matrices)<br>
2> Compute the Predictive Power Score (PPS) of every feature with respect to the target<br>
3> Measure the dependence between every feature and the target (chi-square test for categorical features, point-biserial correlation for numeric features)


In [None]:
def get_pps(df, target):
    """
    Scores every feature against the target with the Predictive Power Score (PPS)
    ------
    Parameters:
    df(pandas dataframe) : The complete dataset, new feature included
    target(string) : Column name of the target inside df
    ------
    Returns:
    list : One entry per non-target column, in column order. Each entry is whatever
    pps.score(df, feature, target) returned for that feature
    """

    # every column except the target is treated as a feature
    feature_names = df.drop(columns = [target]).columns

    return [pps.score(df, name, target) for name in feature_names]

    

In [None]:
def _show_heatmap(matrix, title):
    """
    Prints the title and draws one annotated heatmap of the given square matrix
    """
    plt.figure(figsize = (16, 8))
    print(title)
    sns.heatmap(matrix, robust = True, linewidth = 2, annot = True)
    plt.show()


def get_dependence_heatmap(df, target):
    """
    Calculates and plots the Spearman correlation matrix and the PPS matrix between all the features only
    ------
    Parameters:
    df(pandas dataframe) : The dataframe consisting the whole dataset along with the new feature
    target(string) : Name of the target, as in the dataframe. This column is dropped before any
    dependence is measured
    ------
    Returns:
    tuple of pandas dataframes : (spearman correlation matrix, pps score matrix), IN THAT ORDER.
    The correlation matrix only covers numeric and boolean features, because a rank correlation is
    not defined for string/object columns. The pps matrix covers every feature and is indexed by the
    target feature ('y') with the predicting feature ('x') as columns.
    """

    features = df.drop(columns = [target])

    # spearman correlation b/w all the numeric features
    # Non-numeric columns are filtered out explicitly: pandas >= 2.0 no longer drops them silently
    # inside .corr() and raises instead.
    numeric_features = features.select_dtypes(include = ['number', 'bool'])
    spearman_feat_2_feat = numeric_features.corr(method = 'spearman')

    # pps score b/w all the features
    pps_feat_2_feat = pps.matrix(features)

    # ppscore 1.0+ returns a long-format frame (one row per feature pair with 'x', 'y', 'ppscore'
    # and metadata columns). Reshape it into the square matrix that the heatmap and the callers
    # expect; a frame that is already square (older ppscore releases) is left untouched.
    if {'x', 'y', 'ppscore'}.issubset(pps_feat_2_feat.columns):
        pps_feat_2_feat = pps_feat_2_feat.pivot(index = 'y', columns = 'x', values = 'ppscore')

    _show_heatmap(spearman_feat_2_feat, "Correlation heatmap")
    _show_heatmap(pps_feat_2_feat, "PPS Score heatmap")

    return spearman_feat_2_feat, pps_feat_2_feat

In [None]:
def correlation_test(df, target):
    """
    Measures the dependence b/w all the features and the target.
    ------
    Parameters:
    df(pandas dataframe) : The dataframe consisting the whole dataset along with the new feature
    target(string) : Name of the target, as in the dataframe
    ------
    Returns:
    dict : A dictionary in which the keys are the features and the values are a tuple of the form
    (statistic, p-value). The statistic is the point-biserial correlation coefficient if the feature
    is an integer or float (of any width), or the chi-square statistic of the feature-vs-target
    contingency table if the feature is a boolean, string/object or categorical.
    Features of any other dtype are reported with a printed message and left out of the dictionary.
    """

    types = pd.api.types
    results = {}

    for feature in df.drop(columns = [target]).columns:
        dtype = df[feature].dtype

        is_categorical = (
            types.is_bool_dtype(dtype)
            or types.is_object_dtype(dtype)
            or types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        # Nullable extension dtypes (Int64, Float64, ...) stay unsupported, as before: they may
        # hold pd.NA, which is not guaranteed to convert to the plain float array scipy works on.
        is_numeric = (
            (types.is_integer_dtype(dtype) or types.is_float_dtype(dtype))
            and not types.is_extension_array_dtype(dtype)
        )

        if is_categorical:
            # run the test once and reuse the result for both statistic and p-value
            outcome = chi2_contingency(pd.crosstab(df[feature], df[target]))
        elif is_numeric:
            outcome = pointbiserialr(df[feature], df[target])
        else:
            print("check your feature data type---'{}' has dtype {}, should be numeric, bool, "
                  "object/string or category".format(feature, dtype))
            continue

        results[feature] = (outcome[0], outcome[1])

    return results

In [None]:
# PPS of every feature with respect to the 'IsFirstDefault' target
get_pps(df = dataframe, target = 'IsFirstDefault')

In [None]:
# get_dependence_heatmap returns (spearman correlation matrix, PPS matrix), in that order,
# so the correlation result must be unpacked first.
corr_value_feat, ppscore_feat = get_dependence_heatmap(dataframe, 'IsFirstDefault')

In [None]:
# (statistic, p-value) of every feature against the 'IsFirstDefault' target
correlation_test(df = dataframe, target = 'IsFirstDefault')