In [None]:
import warnings
warnings.filterwarnings('ignore')
import ppscore as pps
import pandas as pd
import seaborn as sns
from scipy.stats import pointbiserialr, spearmanr, chi2_contingency
import matplotlib.pyplot as plt

Checking the importance of the new feature  

We will use the following methodology to check whether a newly engineered feature is meaningful:  

1> Measure the dependence of the target on each feature (correlation, chi-square test, etc.)  

  
2> Check the Predictive Power Score (PPS) between each feature and the target  


In [None]:
def feature_importance(dataframe, new_feature, target, threshold=0.9,
                       return_corr_matrix=True, return_pps_matrix=True):
    """
    Checks how important new_feature is w.r.t. the target on the basis of the PPS and statistical dependence tests.
    Uses the PPS module to calculate Predictive Power Score.
    ----------
    Parameters:
       dataframe(pandas df) : The dataframe consisting the whole dataset along with the new feature
       new_feature(string) : Name of the new feature, as in the dataframe
       target(string) : Name of the target, as in the dataframe
       threshold(float) : default = 0.9; Multicolinearity threshold applied to the feature-to-feature pps.
       return_corr_matrix : default = True; Include the spearman correlation matrix of the feature-space(not including the target) in the result.
       return_pps_matrix : default = True; Include the pps matrix of the feature-space(not including the target) in the result.
    ----------
    Returns:
       dict with the following keys:
          "pps_with_target" : dict {feature: pps of the feature as a predictor of the target}.
          "dependence_with_target" : dict {feature: (statistic, p-value)}; chi-square test for
                                     bool/object features, point-biserial correlation for
                                     int64/float64 features. Features of any other dtype are skipped.
          "new_feature_pps" : pps of new_feature with the target (None if new_feature is not a feature column).
          "new_feature_dependence" : (statistic, p-value) of new_feature with the target (None if it was skipped).
          "high_pps_pairs" : list of (predicted feature, predictor feature) tuples whose pps is above the threshold.
          "spearman_matrix" : only if return_corr_matrix; spearman correlation b/w the numeric/bool features.
          "pps_matrix" : only if return_pps_matrix; pps b/w all the features.
    The pps and dependence details are also printed.
    """
    features = dataframe.drop([target], axis=1)

    # pps score of every feature as a predictor of the target
    pps_all = _square_pps_matrix(dataframe)
    pps_feat_tar = {
        feature: pps_all.loc[target, feature] for feature in features.columns
    }

    # pps score b/w all the features; always computed (once) because the
    # multicolinearity scan below needs it regardless of the return_* flags
    pps_feat_2_feat = _square_pps_matrix(features)

    # dependence testing of every feature against the target
    r = _dependence_with_target(dataframe, target)

    # feature pairs whose pps exceeds the threshold (self-pairs skipped)
    high_pps_pairs = [
        (predicted, predictor)
        for predicted, row in pps_feat_2_feat.iterrows()
        for predictor, score in row.items()
        if predicted != predictor and abs(score) > threshold
    ]

    print("PPS score details\n", pps_feat_tar)
    print("\n")

    print("statistics-value & p-value of features wrt target(if feature was of boolean or object type, chi-square value shown) : ", r)
    print("\n")

    results = {
        "pps_with_target": pps_feat_tar,
        "dependence_with_target": r,
        "new_feature_pps": pps_feat_tar.get(new_feature),
        "new_feature_dependence": r.get(new_feature),
        "high_pps_pairs": high_pps_pairs,
    }
    if return_corr_matrix:
        # restrict to numeric/bool columns explicitly: recent pandas versions
        # raise on object columns instead of silently dropping them
        numeric_features = features.select_dtypes(include=["number", "bool"])
        results["spearman_matrix"] = numeric_features.corr(method="spearman")
    if return_pps_matrix:
        results["pps_matrix"] = pps_feat_2_feat
    return results


def _square_pps_matrix(frame):
    """Return the pps of every column pair of frame as a square dataframe (rows = predicted column, columns = predictor)."""
    scores = pps.matrix(frame)
    # NOTE(review): ppscore >= 1.0 is documented to return a long-format frame
    # with "x", "y" and "ppscore" columns, while earlier releases returned the
    # square matrix directly -- confirm against the installed ppscore version.
    if {"x", "y", "ppscore"}.issubset(scores.columns):
        scores = scores.pivot(index="y", columns="x", values="ppscore")
    return scores


def _dependence_with_target(dataframe, target):
    """Return {feature: (statistic, p-value)} from a dtype-appropriate dependence test of each feature against the target."""
    results = {}
    for feature in dataframe.drop([target], axis=1).columns:
        dtype = dataframe[feature].dtype
        if dtype == 'bool' or dtype == 'O':
            test = chi2_contingency(pd.crosstab(dataframe[feature], dataframe[target]))
        elif dtype == 'int64' or dtype == 'float64':
            test = pointbiserialr(dataframe[feature], dataframe[target])
        else:
            print("check you new feature data type---should be one among [int64, bool, object]")
            continue
        # both scipy results expose the statistic at position 0 and the p-value at position 1
        results[feature] = (test[0], test[1])
    return results
    
    

In [None]:
def get_pps():
    """Placeholder that is not implemented yet; takes no arguments and returns None."""
    return None

    

In [None]:
def get_dependence():
    """Placeholder that is not implemented yet; takes no arguments and returns None."""
    return None