In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [60]:
# load the dataset from the CSV file in the working directory
ckd = pd.read_csv("final.csv")

In [61]:
# the 'Class' column is the target
y = ckd.loc[:, 'Class']

In [62]:
# feature matrix: everything except the target ('Class') and 'Age'
new_ckd = ckd.drop(columns=['Class', 'Age'])

In [63]:
from sklearn.model_selection import train_test_split

# hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    new_ckd,
    y,
    test_size=0.3,
    random_state=0,
)

In [64]:
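# The following function identifies highly correlated features.
# For every pair of columns whose absolute correlation exceeds the threshold,
# it records the name of the column that comes first in the dataset, without
# any further insight. It does not remove anything from the dataset itself.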

def correlation(dataset, threshold, verbose=True):
    """Return the names of columns that are highly correlated with a later column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``dataset.corr()`` and every pair of distinct columns is examined once
    (lower triangle only). When the absolute value of the coefficient is
    strictly greater than ``threshold``, the name of the column that appears
    earlier in the matrix is recorded.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data whose columns are compared with each other.
    threshold : float
        Absolute correlation above which a pair counts as correlated.
    verbose : bool, default True
        When True, print the absolute coefficient and both column names for
        every pair above the threshold. Pass False to get the result silently.

    Returns
    -------
    set
        Names of the recorded columns (empty if no pair exceeds the threshold).
    """
    corr_matrix = dataset.corr()
    columns = corr_matrix.columns

    # take the absolute values once; only the strength of the relation
    # matters here, not its sign
    abs_corr = corr_matrix.abs().to_numpy()

    col_corr = set()

    # rows and columns of the matrix carry the same labels, so walking the
    # lower triangle (j < i) visits every pair of distinct features once
    for i in range(len(columns)):
        for j in range(i):
            coeff = abs_corr[i, j]

            # NaN coefficients compare False and are therefore skipped
            if coeff > threshold:
                if verbose:
                    print(coeff, columns[i], columns[j])

                # record the earlier column of the pair
                col_corr.add(columns[j])

    return col_corr

In [65]:
# flag every feature whose absolute correlation with a later column exceeds 0.4
corr_features = correlation(X_train, 0.4)

# number of distinct flagged features (the result is already a set)
len(corr_features)

0.42225050639410183 Al Sg
0.5302125231851449 Pc Al
0.5630393335453477 Pcc Pc
0.5788447929259859 Bgr Su
0.626350292551759 Sc Bu
0.40332596995983666 Sod Al
0.4302954473570444 Pot Bu
0.47015563589440296 Hemo Sg
0.5165706282656113 Hemo Al
0.4760122018498381 Hemo Pc
0.48722683905101927 Hemo Bu
0.4556593416684063 Hemo Sod
0.48593073423694966 Pcv Sg
0.4894360366460531 Pcv Al
0.4840984567298823 Pcv Pc
0.47866590850341684 Pcv Bu
0.4531332812629797 Pcv Sod
0.8569472440924638 Pcv Hemo
0.4144629728382449 Rbcc Sg
0.4042174120421352 Rbcc Al
0.4374611864482033 Rbcc Pc
0.4110886193308692 Rbcc Bu
0.6765013738696887 Rbcc Hemo
0.6912689412553678 Rbcc Pcv
0.4952952077907445 Htn Al
0.4147026136465112 Htn Sod
0.5461684116622173 Htn Hemo
0.5325485378248022 Htn Pcv
0.47384021402252824 Htn Rbcc
0.47401814806091236 Dm Su
0.48878597494759346 Dm Bgr
0.4644143775221832 Dm Hemo
0.44890242359180205 Dm Pcv
0.4184071797113837 Dm Rbcc
0.6404568468299996 Dm Htn
0.4191885997384054 pe Al
0.4422514194088801 pe Appet
0.5262

12

In [66]:
# display the names of the features flagged by correlation()
corr_features

{'Al',
 'Appet',
 'Bgr',
 'Bu',
 'Hemo',
 'Htn',
 'Pc',
 'Pcv',
 'Rbcc',
 'Sg',
 'Sod',
 'Su'}

In [67]:
# build a dataframe with the absolute correlation between every pair of
# features; the magnitude of the coefficient matters here, not its sign

corrmat = X_train.corr().abs().unstack()  # one entry per (column, row) pair
corrmat = corrmat.sort_values(ascending=False)
corrmat = corrmat.reset_index()
corrmat.columns = ['feature1', 'feature2', 'corr']

# drop each feature's correlation with itself by comparing the names rather
# than testing `corr < 1`: a value test would also discard genuinely
# perfectly correlated pairs (e.g. duplicated columns) and would let a
# diagonal entry through if it came out marginally below 1.0
is_self_pair = corrmat['feature1'] == corrmat['feature2']
corrmat = corrmat[~is_self_pair & (corrmat['corr'] >= 0.4)]
corrmat = corrmat.reset_index(drop=True)
corrmat.head()

Unnamed: 0,feature1,feature2,corr
0,Pcv,Hemo,0.856947
1,Hemo,Pcv,0.856947
2,Pcv,Rbcc,0.691269
3,Rbcc,Pcv,0.691269
4,Rbcc,Hemo,0.676501


In [68]:
grouped_feature_ls = []
correlated_groups = []

for feature in corrmat['feature1'].unique():

    # a feature already placed in an earlier group does not start a new one
    if feature in grouped_feature_ls:
        continue

    # every row of corrmat whose first member is this feature
    correlated_block = corrmat.loc[corrmat['feature1'] == feature]

    # mark the feature and all of its partners as grouped
    grouped_feature_ls.extend(correlated_block['feature2'].unique())
    grouped_feature_ls.append(feature)

    # keep the rows of this group
    correlated_groups.append(correlated_block)

print('found {} correlated groups'.format(len(correlated_groups)))
print('out of {} total features'.format(X_train.shape[1]))

found 6 correlated groups
out of 23 total features


In [69]:
from sklearn.ensemble import RandomForestClassifier

# rank the members of every correlated group with a shallow random forest
for group in correlated_groups:
    # group members: the correlated partners first, then the anchor feature
    features = [*group['feature2'].unique(), *group['feature1'].unique()]

    # fit the forest on this group's columns only (missing values replaced by 0)
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train[features].fillna(0), y_train)

    # pair each feature name with the importance the forest assigned to it
    importance = pd.DataFrame({
        'feature': features,
        'importance': rf.feature_importances_,
    })

    # most important feature first
    print(importance.sort_values(by='importance', ascending=False))
    

   feature  importance
0     Hemo    0.257815
10     Pcv    0.200480
3       Al    0.166865
4       Sg    0.109635
2      Htn    0.080327
1     Rbcc    0.073722
9       Dm    0.047071
8      Sod    0.033362
7       Bu    0.016663
5       Pc    0.012749
6      Ane    0.001312
  feature  importance
1      Sc    0.645964
0      Bu    0.354036
  feature  importance
2     Bgr    0.446639
1      Dm    0.361457
0      Su    0.191904
  feature  importance
0      Pc    0.809021
1     Pcc    0.190979
  feature  importance
1      Al    0.730380
0   Appet    0.150398
2      pe    0.119222
  feature  importance
0      Bu     0.59036
1     Pot     0.40964
