# Feature Extraction using Decision Tree and Random Forest 

In [1]:
import pandas as pd

data = pd.read_csv("heart.csv")
df = pd.DataFrame(data)

df.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284,0,Normal,120,N,0.0,Up,0


In [16]:
categorical_columns = ['ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

df.loc[df.Sex == 'M', 'Sex'] = 1
df.loc[df.Sex == 'F', 'Sex'] = 0

for feature in categorical_columns:
    onehot = pd.get_dummies(df[feature], prefix=feature)
    df = df.drop(feature, axis=1)
    df = df.join(onehot)

df.head(10)

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0.0,0,0,1,0,0,0,1,0,1,0,0,0,1
1,49,0,160,180,0,156,1.0,1,0,0,1,0,0,1,0,1,0,0,1,0
2,37,1,130,283,0,98,0.0,0,0,1,0,0,0,0,1,1,0,0,0,1
3,48,0,138,214,0,108,1.5,1,1,0,0,0,0,1,0,0,1,0,1,0
4,54,1,150,195,0,122,0.0,0,0,0,1,0,0,1,0,1,0,0,0,1
5,39,1,120,339,0,170,0.0,0,0,0,1,0,0,1,0,1,0,0,0,1
6,45,0,130,237,0,170,0.0,0,0,1,0,0,0,1,0,1,0,0,0,1
7,54,1,110,208,0,142,0.0,0,0,1,0,0,0,1,0,1,0,0,0,1
8,37,1,140,207,0,130,1.5,1,1,0,0,0,0,1,0,0,1,0,1,0
9,48,0,120,284,0,120,0.0,0,0,1,0,0,0,1,0,1,0,0,0,1


In [17]:
from sklearn.model_selection import train_test_split

y = df['HeartDisease'] 
x = df.drop(columns='HeartDisease') 

columns = x.columns.tolist()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state = 1)

In [8]:
'''Random Forest Classifier''' 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
 
param_dist = {
    'n_estimators': [1, 50, 100, 150, 200, 250],
    'max_features': range(1, 19)
}

forest = RandomForestClassifier(random_state = 1)
cross_validated_forest = GridSearchCV(forest, param_dist)
cross_validated_forest = cross_validated_forest.fit(x_train, y_train)

print(cross_validated_forest.best_params_)

{'max_features': 3, 'n_estimators': 100}


In [12]:
from sklearn import metrics

forest = RandomForestClassifier(n_estimators = 100, max_features = 3, random_state = 1)
forest = forest.fit(x_train, y_train) 
y_pred = forest.predict(x_test)
mean_squared_error_forest = metrics.mean_squared_error(y_test, y_pred)
print("MSE for cross validated forest is ", mean_squared_error_forest)

forest_coefficients_list = forest.feature_importances_.tolist()
forest_coefficients = {} 
for i in range(0, len(forest_coefficients_list)):
    forest_coefficients[forest_coefficients_list[i]] = columns[i]

forest_coefficients_list.sort(reverse=True) 
sorted_dictionary = {}
keys = list(forest_coefficients.keys())

for i in range(0, len(forest_coefficients_list)): 
    ind = keys.index(forest_coefficients_list[i]) 
    sorted_dictionary[keys[ind]] = forest_coefficients[keys[ind]] 

print('Sorted coefficients: ', sorted_dictionary) 


MSE for cross validated forest is  0.09782608695652174
Sorted coefficients:  {0.12498510876057764: 'ST_Slope_Up', 0.10654155569859607: 'MaxHR', 0.10408510235304479: 'Cholesterol', 0.09788363894387297: 'Oldpeak', 0.08892801388004942: 'ST_Slope_Flat', 0.07663953915839193: 'Age', 0.07507535104443296: 'ChestPainType_ASY', 0.0742126068369538: 'RestingBP', 0.06407421152187907: 'ExerciseAngina_N', 0.03810285743451118: 'Sex', 0.03698234394953545: 'ExerciseAngina_Y', 0.027433346992961495: 'ChestPainType_ATA', 0.021394702035775176: 'FastingBS', 0.01619782975392105: 'ChestPainType_NAP', 0.014295486836176416: 'RestingECG_LVH', 0.011773134387439165: 'RestingECG_ST', 0.011068334966560253: 'RestingECG_Normal', 0.006650015376864288: 'ChestPainType_TA', 0.003676820068456983: 'ST_Slope_Down'}


In [18]:
'''Decision Tree Classifier'''
from sklearn.tree import DecisionTreeClassifier
param_dist = {"max_depth": range(2, 19, 2),
              "min_samples_split": range(2, 157, 5),
 
             "min_samples_leaf": range(2, 157, 5)}
decision_tree = DecisionTreeClassifier(random_state = 1)
cross_validated_tree = GridSearchCV(decision_tree, param_dist)
cross_validated_tree = cross_validated_tree.fit(x_train, y_train)

y_pred = cross_validated_tree.predict(x_test)
mean_squared_error = metrics.mean_squared_error(y_test, y_pred)
print("CV", mean_squared_error)

print(cross_validated_tree.best_params_)

CV 0.125
{'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 57}


In [20]:
cv_decision_tree = DecisionTreeClassifier(max_depth=4, min_samples_leaf=2, min_samples_split=57, random_state =1) 
cv_decision_tree = cv_decision_tree.fit(x_train, y_train)
y_pred = cv_decision_tree.predict(x_test)
mean_squared_error = metrics.mean_squared_error(y_test, y_pred)
print("CV Tree", mean_squared_error)

tree_coefficients_list = cv_decision_tree.feature_importances_.tolist()
tree_coefficients = {} 
for i in range(0, len(tree_coefficients_list)):
    tree_coefficients[tree_coefficients_list[i]] = columns[i]

tree_coefficients_list.sort(reverse=True) 
sorted_dictionary = {}
keys = list(tree_coefficients.keys())

for i in range(0, len(tree_coefficients_list)): 
    ind = keys.index(tree_coefficients_list[i]) 
    sorted_dictionary[keys[ind]] = tree_coefficients[keys[ind]] 

print('Sorted coefficients: ', sorted_dictionary) 

CV Tree 0.125
Sorted coefficients:  {0.5990476882678307: 'ST_Slope_Up', 0.13728601391345224: 'Cholesterol', 0.0907641624847736: 'ChestPainType_ASY', 0.08009825115791523: 'MaxHR', 0.04144996814385697: 'ExerciseAngina_N', 0.033798545528448194: 'Sex', 0.01755537050372298: 'Oldpeak', 0.0: 'ST_Slope_Flat'}
