In [None]:
import warnings
# Install a blanket "ignore" filter so that Python warnings raised by later
# cells are not displayed. Note that this also hides deprecation warnings.
warnings.filterwarnings('ignore')

Checking the importance of the new feature<br>
We will use the following methodology to check whether a newly engineered feature is meaningful: <br>
1> Measure the dependence of all the features on each other (Spearman correlation and Predictive Power Score)<br> 
2> Check the Predictive Power Score (PPS) between each feature and the target<br>
3> Measure the dependence between each feature and the target (chi-square test for categorical features, point-biserial correlation / t-test for numeric features)


In [None]:
def get_pps(dataframe, target, echo = True):
    """
    Calculates the PPS score between all the features and the target
    ------
    Parameters:
    dataframe(pandas df) : The dataframe consisting the whole dataset along with the new feature
    target(string) : Name of the target, as in the dataframe
    echo(bool) : whether to print the results or not. default : True
    ------
    Returns:
    dict : A dictionary in which the keys are the features and the values are their corresponding 
    PPS wrt target (i.e. how well the feature predicts the target)
    """
    
    # Imported inside the function, like the other helpers of this notebook,
    # so the cell works on a fresh kernel without relying on a global `pps`.
    import ppscore as pps
    
    df = dataframe
    
    # pps score
    pps_score = pps.matrix(df)
    
    # ppscore >= 1.0 returns a long table (one row per x/y pair) instead of
    # the wide matrix of ppscore 0.x. Bring it back to the wide layout used
    # below: rows = predicted column (y), columns = predicting feature (x).
    if {'x', 'y', 'ppscore'}.issubset(pps_score.columns):
        pps_score = pps_score.pivot(index = 'y', columns = 'x', values = 'ppscore')
    
    pps_feat_tar = {
        feature: pps_score.loc[target, feature]
        for feature in df.drop([target], axis = 1).columns
    }
        
    if echo:
        print("PPS score details\n", pps_feat_tar)
        
    return pps_feat_tar

    

In [None]:
def get_dependence_heatmap(dataframe, target, echo = True,
                           return_corr_matrix = True, return_pps_matrix = True):
    """
    Calculates the spearman correlation matrix and the PPS matrix between all the
    features only (the target column is left out of both)
    ------
    Parameters:
    dataframe(pandas df) : The dataframe consisting the whole dataset along with the new feature
    target(string) : Name of the target, as in the dataframe
    echo(bool) : whether to draw the heatmaps or not. default : True
    return_corr_matrix(bool) : whether to compute the spearman correlation matrix. default : True
    return_pps_matrix(bool) : whether to compute the PPS matrix. default : True
    ------
    Returns:
    tuple : (spearman_feat_2_feat, pps_feat_2_feat), both pandas dataframes in wide
    (feature x feature) form. A matrix that was switched off is returned as None.
    When echo is True every computed matrix is also drawn as a seaborn heatmap.
    """
    
    features = dataframe.drop([target], axis = 1)
    
    # spearman correlation b/w all the features
    spearman_feat_2_feat = None
    if return_corr_matrix:
        # Correlation is only defined for numeric/bool columns. Older pandas
        # dropped the others silently, pandas >= 2.0 raises on them, so they
        # are filtered out explicitly here.
        numeric_features = features.select_dtypes(include = ['number', 'bool'])
        spearman_feat_2_feat = numeric_features.corr(method = 'spearman')
    
    # pps score b/w all the features
    pps_feat_2_feat = None
    if return_pps_matrix:
        import ppscore as pps
        
        pps_feat_2_feat = pps.matrix(features)
        # ppscore >= 1.0 returns a long table (x, y, ppscore, ...); a heatmap
        # needs the wide layout: rows = predicted (y), columns = predictor (x).
        if {'x', 'y', 'ppscore'}.issubset(pps_feat_2_feat.columns):
            pps_feat_2_feat = pps_feat_2_feat.pivot(index = 'y', columns = 'x', values = 'ppscore')
    
    if echo:
        # Plotting libraries are only needed when something is drawn.
        import matplotlib.pyplot as plt
        import seaborn as sns
        
        if spearman_feat_2_feat is not None:
            plt.figure(figsize = (16, 8))
            print("Correlation heatmap")
            sns.heatmap(spearman_feat_2_feat, robust = 1, linewidth = 2, annot = True)
            plt.show()

        if pps_feat_2_feat is not None:
            plt.figure(figsize = (16, 8))
            print("PPS Score heatmap")
            sns.heatmap(pps_feat_2_feat, robust = 1, linewidth = 2, annot = True)
            plt.show()
    
    return spearman_feat_2_feat, pps_feat_2_feat

In [None]:
def correlation_test(dataframe, target, echo = True):
    """
    Tests the dependence b/w every feature and the target.
    ------
    Parameters:
    dataframe(pandas df) : The dataframe consisting the whole dataset along with the new feature
    target(string) : Name of the target, as in the dataframe
    echo(bool) : whether to print the results or not. default : True
    ------
    Returns:
    dict : A dictionary in which the keys are the features and the values are a tuple of the form
    (statistic, p-value). The statistic is the point-biserial correlation coefficient if the
    feature is numeric (any integer or float dtype), or the chi-square statistic of the
    feature/target contingency table if the feature is boolean, object/string or categorical.
    Features of any other dtype are reported and left out of the dictionary.
    """
    
    import pandas as pd
    from pandas.api.types import is_bool_dtype, is_numeric_dtype, is_object_dtype, is_string_dtype
    from scipy.stats import pointbiserialr, chi2_contingency
    
    # correlation testing
    
    df = dataframe
    
    r = {}
    
    for new_feature in df.drop([target], axis = 1).columns:
        column = df[new_feature]
        dtype = column.dtype
        
        # bool must be checked before the numeric branch: pandas counts bool
        # as a numeric dtype, but it is treated as a category here.
        is_categorical = (
            is_bool_dtype(dtype)
            or is_object_dtype(dtype)
            or is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        
        if is_categorical:
            result = chi2_contingency(pd.crosstab(column, df[target]))
        elif is_numeric_dtype(dtype):
            # covers every int/float width, not only int64/float64
            result = pointbiserialr(column, df[target])
        else:
            print("skipping feature '{}': unsupported data type {} "
                  "(expected numeric, bool, object/string or category)".format(new_feature, dtype))
            continue
        
        # each test is run once; the first two fields are (statistic, p-value)
        r[new_feature] = (result[0], result[1])
    
    if echo:
        print("statistics-value & p-value of features wrt target(if feature was of boolean or object type, chi-square value shown) : ", r)
    
    return r