### Cramér's V Correlation
Correlation between categorical features and a binary target

In [12]:
# Option 1
def calculate_crammer_coef_corr(confusion_matrix):
    """
    Bias-corrected Cramer's V computed from a contingency table.

    confusion_matrix: DataFrame
        Contingency table of observed counts (for example the output of pd.crosstab)

    The chi-square statistic comes from chi2_contingency called with its default
    arguments. The statistic divided by the number of observations is reduced by
    a bias term (clipped at 0), both table dimensions are shrunk by their own
    bias terms, and the square root of the ratio is returned.
    """
    chi2_stat = sts.chi2_contingency(confusion_matrix)[0]
    rows, cols = confusion_matrix.shape
    total = confusion_matrix.sum().sum()

    # Bias correction of chi2 / n and of the number of rows and columns
    bias = ((cols - 1) * (rows - 1)) / (total - 1)
    phi2_adj = max(0, chi2_stat / total - bias)
    rows_adj = rows - ((rows - 1) ** 2) / (total - 1)
    cols_adj = cols - ((cols - 1) ** 2) / (total - 1)

    denominator = min((cols_adj - 1), (rows_adj - 1))
    return np.sqrt(phi2_adj / denominator)

# Option 2 (preferred)
def calculate_crammer_coef(feature_df, target, correction=True, return_p_value=True, ascending=False):
    """
    Cramer's V between every categorical feature and the target.

    feature_df: DataFrame
        Main DataFrame with only categorical features and without target feature
    target: Series
        Target feature
    correction: bool
        Forwarded to chi2_contingency (Yates' continuity correction switch)
    return_p_value: bool
        If True the result has the columns (Feature, p-value, V_Crammer_Value),
        otherwise only (Feature, V_Crammer_Value)
    ascending: bool
        Sort order of the result by V_Crammer_Value

    Returns a DataFrame with one row per feature that was evaluated.

    A feature is excluded from the result when
      * its contingency table has fewer than two rows or two columns
        (the coefficient is undefined for such a table), or
      * more than 20% of the observed cell counts are below 5.
    """
    if return_p_value:
        result_columns = ['Feature', 'p-value', 'V_Crammer_Value']
    else:
        result_columns = ['Feature', 'V_Crammer_Value']

    crammer_corrs = []  # for storing the calculated correlations

    for feature in feature_df.columns:
        confusion_matrix = pd.crosstab(feature_df[feature], target)
        smaller_dim = min(confusion_matrix.shape)

        # Empty or single-row/single-column tables would lead to a division by zero below
        if smaller_dim < 2:
            continue

        counts = confusion_matrix.to_numpy()

        # Share of sparse cells over the whole table, independent of how the target is encoded
        ratio = (counts < 5).sum() / counts.size
        if ratio > 0.2:
            continue

        # crosstab drops rows with missing values, so take n from the table itself
        n_observations = counts.sum()

        chi2_res = sts.chi2_contingency(confusion_matrix, correction=correction)
        v_value = np.sqrt(chi2_res[0] / (n_observations * (smaller_dim - 1)))

        if return_p_value:
            crammer_corrs.append((feature, chi2_res[1], v_value))
        else:
            crammer_corrs.append((feature, v_value))

    result = pd.DataFrame(crammer_corrs, columns=result_columns)
    return result.sort_values(by='V_Crammer_Value', ascending=ascending)

### Mathematical Expectation Difference
Correlation between numerical features and a binary target

In [10]:
def math_exp_differences(feat_df, target, ascending=False):
    """
    Absolute difference between the per-class means of every feature.

    feat_df: DataFrame
        Main DataFrame with only numerical features and without target feature

    target: Series
        Target feature; the means of its first two groups (in groupby order) are compared

    ascending: bool
        Sort order of the result

    Returns a DataFrame indexed by feature name with a single column
    'Corr_ME_diffs', sorted by that column.
    """
    label = target.name

    # Rows with a NaN in any column are removed before the means are computed
    combined = pd.concat([feat_df, target], axis=1).dropna()

    def _mean_gap(column):
        per_class = combined[[column, label]].groupby(by=label).mean()
        first_mean = per_class.iloc[0, 0]
        second_mean = per_class.iloc[1, 0]
        return first_mean - second_mean

    gaps = [_mean_gap(column) for column in feat_df.columns]

    result = pd.DataFrame({'Corr_ME_diffs': gaps}, index=feat_df.columns).abs()
    return result.sort_values(by='Corr_ME_diffs', ascending=ascending)

### Chi2
Features must not contain NaN values and must be non-negative (e.g. scaled to the [0, 1] range)

In [1]:
# from sklearn.feature_selection import SelectKBest, chi2

# feat_selector = SelectKBest(score_func=chi2, k=5)
# feat_selector.fit(num_feat_testing, y_train)

# num_feat_chi_2_df = pd.DataFrame({'chi_2_score': feat_selector.scores_,
#                                   'p_value': feat_selector.pvalues_}, index=num_features.columns)

# num_feat_chi_2_df = num_feat_chi_2_df.sort_values(['chi_2_score'] , ascending=False)
# num_feat_chi_2_df