# import and prepare the data

In [7]:
# step 1 import necessary packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
# step 2 obtain a data set heart disease data set from UCI 
# repository and out it into a data frame
url =  'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat'

col_names = ['age', 'sex', 'cheay_pain_type', 'resting_blood_pressure', 'serum_cholestoral', 
            'fasting_blood_sugar', 'resting_electro_results', 'max_heart_rate', 'exercise_induced_angina', 
            'oldpeak', 'slop', 'num_major_vessels', 'thal', 'target']

heart_df = pd.read_csv(url, header=None, skiprows=0, names=col_names, sep='\s+')

In [9]:
# step 3 investigate the data set
# look at the data
heart_df.head()

Unnamed: 0,age,sex,cheay_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_electro_results,max_heart_rate,exercise_induced_angina,oldpeak,slop,num_major_vessels,thal,target
0,70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,2
1,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,1
2,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,2
3,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,1
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,1


In [10]:
# show the dimension of shape of the data set
heart_df.shape

(270, 14)

In [11]:
# examine the class distribution
#มีกี่ค่าที่แตกต่างกัน และครอบลุมข้อมูลกี่แถว (แตกต่าง 2 ค่า ค่าละ 150 และ 120 แถว)
heart_df['target'].value_counts()

1    150
2    120
Name: target, dtype: int64

In [12]:
# check the data type of each column
heart_df.dtypes

age                        float64
sex                        float64
cheay_pain_type            float64
resting_blood_pressure     float64
serum_cholestoral          float64
fasting_blood_sugar        float64
resting_electro_results    float64
max_heart_rate             float64
exercise_induced_angina    float64
oldpeak                    float64
slop                       float64
num_major_vessels          float64
thal                       float64
target                       int64
dtype: object

In [13]:
# check whether any column contains NULL data
# use .any(axis=1)row, any(axis=0)column
heart_df.isnull().any()

age                        False
sex                        False
cheay_pain_type            False
resting_blood_pressure     False
serum_cholestoral          False
fasting_blood_sugar        False
resting_electro_results    False
max_heart_rate             False
exercise_induced_angina    False
oldpeak                    False
slop                       False
num_major_vessels          False
thal                       False
target                     False
dtype: bool

In [14]:
# step 4 separate the data set into two parts -data and -target
# by slicing iloc[integer locate] [row,column[start:stop:step]]
x = heart_df.iloc[:,0:-1]
x.shape

(270, 13)

In [15]:
y = heart_df.iloc[:,-1]
y.shape

(270,)

In [16]:
# step 5 split the data set into two training set and the
# test set with 80% 20% ratio
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, 
                                                   random_state=2019)

In [17]:
x_train.shape
x_train

Unnamed: 0,age,sex,cheay_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_electro_results,max_heart_rate,exercise_induced_angina,oldpeak,slop,num_major_vessels,thal
94,52.0,1.0,4.0,125.0,212.0,0.0,0.0,168.0,0.0,1.0,1.0,2.0,7.0
184,53.0,1.0,3.0,130.0,246.0,1.0,2.0,173.0,0.0,0.0,1.0,3.0,3.0
42,44.0,1.0,3.0,130.0,233.0,0.0,0.0,179.0,1.0,0.4,1.0,0.0,3.0
38,42.0,1.0,3.0,130.0,180.0,0.0,0.0,150.0,0.0,0.0,1.0,0.0,3.0
19,40.0,1.0,1.0,140.0,199.0,0.0,0.0,178.0,1.0,1.4,1.0,0.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,65.0,1.0,4.0,110.0,248.0,0.0,2.0,158.0,0.0,0.6,1.0,2.0,6.0
190,54.0,1.0,4.0,140.0,239.0,0.0,0.0,160.0,0.0,1.2,1.0,0.0,3.0
216,63.0,0.0,3.0,135.0,252.0,0.0,2.0,172.0,0.0,0.0,1.0,0.0,3.0
159,66.0,0.0,4.0,178.0,228.0,1.0,0.0,165.0,1.0,1.0,2.0,2.0,7.0


# Filter method

In [18]:
# import packahes of different statistical scores
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [19]:
# identify type of the statistic measure and the desired number of features
# เลือกมา 10 feature
selector = SelectKBest(score_func=chi2, k=10)

In [20]:
# fit the selector to the training data set.
selector_model = selector.fit(x_train,y_train)

In [21]:
# show statistical scores of all features
print(selector_model.scores_)

[ 16.61835585   7.93340075  10.28102693   8.29839367  32.32880736
   0.61177897   7.86668681 124.41632671  19.56094603  42.79585285
   5.38114598  59.98518073  51.43991979]


In [22]:
# show the indices of the top 10 features
selector_model.get_support(indices=True)

array([ 0,  1,  2,  3,  4,  7,  8,  9, 11, 12], dtype=int64)

In [23]:
# show names of the top 10 features
x_train.columns[selector_model.get_support(indices=True)]

Index(['age', 'sex', 'cheay_pain_type', 'resting_blood_pressure',
       'serum_cholestoral', 'max_heart_rate', 'exercise_induced_angina',
       'oldpeak', 'num_major_vessels', 'thal'],
      dtype='object')

In [24]:
# show statistacal scores of the top 10 features
print(selector_model.scores_[selector_model.get_support(indices=True)])

[ 16.61835585   7.93340075  10.28102693   8.29839367  32.32880736
 124.41632671  19.56094603  42.79585285  59.98518073  51.43991979]


In [25]:
# select a subset of x_train data based on the ten most relevant feature using selector_model.
x_train_model = selector_model.transform(x_train)
x_train_model

array([[52. ,  1. ,  4. , ...,  1. ,  2. ,  7. ],
       [53. ,  1. ,  3. , ...,  0. ,  3. ,  3. ],
       [44. ,  1. ,  3. , ...,  0.4,  0. ,  3. ],
       ...,
       [63. ,  0. ,  3. , ...,  0. ,  0. ,  3. ],
       [66. ,  0. ,  4. , ...,  1. ,  2. ,  7. ],
       [70. ,  1. ,  2. , ...,  0. ,  0. ,  3. ]])

In [26]:
x_train_model.shape

(216, 10)

In [27]:
# select a subset of x_train data based on the top ten most relevant features using seletor_model created from x_train
# the top ten most relevant features used for x_test must be the same set of features as those for x_train
x_test_model = selector_model.transform(x_test)
x_test_model

array([[6.50e+01, 1.00e+00, 4.00e+00, 1.20e+02, 1.77e+02, 1.40e+02,
        0.00e+00, 4.00e-01, 0.00e+00, 7.00e+00],
       [7.40e+01, 0.00e+00, 2.00e+00, 1.20e+02, 2.69e+02, 1.21e+02,
        1.00e+00, 2.00e-01, 1.00e+00, 3.00e+00],
       [5.30e+01, 0.00e+00, 4.00e+00, 1.30e+02, 2.64e+02, 1.43e+02,
        0.00e+00, 4.00e-01, 0.00e+00, 3.00e+00],
       [5.50e+01, 1.00e+00, 2.00e+00, 1.30e+02, 2.62e+02, 1.55e+02,
        0.00e+00, 0.00e+00, 0.00e+00, 3.00e+00],
       [6.20e+01, 0.00e+00, 4.00e+00, 1.38e+02, 2.94e+02, 1.06e+02,
        0.00e+00, 1.90e+00, 3.00e+00, 3.00e+00],
       [4.20e+01, 1.00e+00, 1.00e+00, 1.48e+02, 2.44e+02, 1.78e+02,
        0.00e+00, 8.00e-01, 2.00e+00, 3.00e+00],
       [6.70e+01, 1.00e+00, 4.00e+00, 1.60e+02, 2.86e+02, 1.08e+02,
        1.00e+00, 1.50e+00, 3.00e+00, 3.00e+00],
       [4.30e+01, 0.00e+00, 3.00e+00, 1.22e+02, 2.13e+02, 1.65e+02,
        0.00e+00, 2.00e-01, 0.00e+00, 3.00e+00],
       [4.10e+01, 1.00e+00, 2.00e+00, 1.10e+02, 2.35e+02, 1.53e+

In [28]:
x_test_model.shape

(54, 10)

In [29]:
# instantiate a logisticRegression learning model, train the model and evaluate it on the test set 
# x_train_model = the top 10 features

from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression(solver='lbfgs', max_iter=1000)

In [30]:
logisticRegr.fit(x_train_model, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
predictions = logisticRegr.predict(x_test_model)

In [32]:
# compute preformance metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

In [33]:
# report the performance
print('Performance')
print('- accuracy =' +str(accuracy))
print('- presision =' +str(precision))
print('- recall= ' +str(recall))
print('- f1=' +str(f1))

Performance
- accuracy =0.8888888888888888
- presision =0.9629629629629629
- recall= 0.8387096774193549
- f1=0.896551724137931


# Wrapper Method (Forward)

In [39]:
pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.19.0-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.19.0
Note: you may need to restart the kernel to use updated packages.


In [228]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [229]:
# create a forwear feature
from sklearn.linear_model import LogisticRegression

In [231]:
logisticRegr = LogisticRegression(solver='lbfgs',max_iter=1000)
sffs = SFS(logisticRegr, 
          k_features=10, 
          forward=True, #For backward selection, set to False
          floating=False, 
          verbose=2, 
          scoring='accuracy', 
          cv=10, #cross validstion
          n_jobs=-1)  #using all available CPU core

In [232]:
sffs = sffs.fit(x_train.values,y_train.values, custom_feature_names=x_train.columns)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  13 | elapsed:    0.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:    0.3s finished

[2021-09-23 22:48:46] Features: 1/10 -- score: 0.7744588744588744[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:    0.1s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.2s finished

[2021-09-23 22:48:46] Features: 2/10 -- score: 0.8023809523809524[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:    0.1s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    0.2s finished

[2021-09-23 22:48:46] Features: 3/10 -- score: 0.8166666666666668[Parallel(n_jobs=-1)]: Using backend 

In [233]:
sffs.k_feature_names_

('sex',
 'cheay_pain_type',
 'resting_blood_pressure',
 'fasting_blood_sugar',
 'resting_electro_results',
 'max_heart_rate',
 'oldpeak',
 'slop',
 'num_major_vessels',
 'thal')

In [234]:
sffs.k_feature_idx_

(1, 2, 3, 5, 6, 7, 9, 10, 11, 12)

In [235]:
x_train_model = sffs.transform(x_train)
x_train_model

array([[  1.,   4., 125., ...,   1.,   2.,   7.],
       [  1.,   3., 130., ...,   1.,   3.,   3.],
       [  1.,   3., 130., ...,   1.,   0.,   3.],
       ...,
       [  0.,   3., 135., ...,   1.,   0.,   3.],
       [  0.,   4., 178., ...,   2.,   2.,   7.],
       [  1.,   2., 156., ...,   1.,   0.,   3.]])

In [236]:
x_test_mpdel = sffs.transform(x_test)
x_test_model

array([[6.50e+01, 1.00e+00, 4.00e+00, 1.20e+02, 1.77e+02, 1.40e+02,
        4.00e-01, 1.00e+00, 0.00e+00, 7.00e+00],
       [7.40e+01, 0.00e+00, 2.00e+00, 1.20e+02, 2.69e+02, 1.21e+02,
        2.00e-01, 1.00e+00, 1.00e+00, 3.00e+00],
       [5.30e+01, 0.00e+00, 4.00e+00, 1.30e+02, 2.64e+02, 1.43e+02,
        4.00e-01, 2.00e+00, 0.00e+00, 3.00e+00],
       [5.50e+01, 1.00e+00, 2.00e+00, 1.30e+02, 2.62e+02, 1.55e+02,
        0.00e+00, 1.00e+00, 0.00e+00, 3.00e+00],
       [6.20e+01, 0.00e+00, 4.00e+00, 1.38e+02, 2.94e+02, 1.06e+02,
        1.90e+00, 2.00e+00, 3.00e+00, 3.00e+00],
       [4.20e+01, 1.00e+00, 1.00e+00, 1.48e+02, 2.44e+02, 1.78e+02,
        8.00e-01, 1.00e+00, 2.00e+00, 3.00e+00],
       [6.70e+01, 1.00e+00, 4.00e+00, 1.60e+02, 2.86e+02, 1.08e+02,
        1.50e+00, 2.00e+00, 3.00e+00, 3.00e+00],
       [4.30e+01, 0.00e+00, 3.00e+00, 1.22e+02, 2.13e+02, 1.65e+02,
        2.00e-01, 2.00e+00, 0.00e+00, 3.00e+00],
       [4.10e+01, 1.00e+00, 2.00e+00, 1.10e+02, 2.35e+02, 1.53e+

In [237]:
logisticRegr = LogisticRegression(solver='lbfgs',max_iter=1000)
logisticRegr.fit(x_train_model, y_train)
predictions = logisticRegr.predict(x_test_model)

In [238]:
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print('Performance')
print('- accuracy = ' + str(accuracy))
print('- precision = ' + str(precision))
print('- recall = ' + str(recall))
print('- f1 = ' + str(f1))

Performance
- accuracy = 0.46296296296296297
- precision = 0.5714285714285714
- recall = 0.25806451612903225
- f1 = 0.35555555555555557


# Wrapper Method (Backward)

In [196]:
logisticRegr = LogisticRegression(solver='lbfgs',max_iter=1000)
sbfs = SFS(logisticRegr, 
           k_features=10, 
           forward=False, #For backward selection, set to False 
           floating=False, 
           verbose=2, 
           scoring='accuracy', 
           cv=10, #cross validstion 
           n_jobs=-1)  #using all available CPU core
sbfs = sbfs.fit(x_train.values,y_train.values, custom_feature_names=x_train.columns)

AttributeError: 'str' object has no attribute 'decode'

AttributeError: 'str' object has no attribute 'decode'

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  13 | elapsed:    5.2s remaining:    8.4s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:    7.1s finished

[2021-09-23 22:40:15] Features: 12/10 -- score: 0.8523809523809524[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:    2.1s remaining:    4.3s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    3.7s finished

[2021-09-23 22:40:19] Features: 11/10 -- score: 0.8614718614718615[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:    1.6s remaining:    7.7s
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    2.3s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    3.1

In [197]:
sbfs.k_feature_names_

('sex',
 'cheay_pain_type',
 'resting_blood_pressure',
 'fasting_blood_sugar',
 'resting_electro_results',
 'max_heart_rate',
 'oldpeak',
 'slop',
 'num_major_vessels',
 'thal')

In [198]:
sbfs.k_feature_idx_

(1, 2, 3, 5, 6, 7, 9, 10, 11, 12)

In [199]:
x_train_model = sbfs.transform(x_train)
_test_mpdel = sbfs.transform(x_test)

In [200]:
logisticRegr = LogisticRegression(solver='lbfgs',max_iter=1000)
logisticRegr.fit(x_train_model, y_train)
predictions = logisticRegr.predict(x_test_model)

In [201]:
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print('Performance')
print('- accuracy = ' + str(accuracy))
print('- precision = ' + str(precision))
print('- recall = ' + str(recall))
print('- f1 = ' + str(f1))

Performance
- accuracy = 0.8333333333333334
- precision = 1.0
- recall = 0.7096774193548387
- f1 = 0.8301886792452831


# Wrapper Methods (Recursive)

In [99]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [100]:
logisticRegr = LogisticRegression(solver='lbfgs', max_iter=1000)
rfe = RFE(estimator=logisticRegr, n_features_to_select=10)

In [106]:
rfe = rfe.fit(x_train, y_train)

In [107]:
x_train.columns[rfe.support_]

Index(['sex', 'cheay_pain_type', 'resting_blood_pressure',
       'fasting_blood_sugar', 'resting_electro_results',
       'exercise_induced_angina', 'oldpeak', 'slop', 'num_major_vessels',
       'thal'],
      dtype='object')

In [109]:
rfe.get_support(indices=True)

array([ 1,  2,  3,  5,  6,  8,  9, 10, 11, 12], dtype=int64)

In [110]:
x_train.columns[rfe.get_support(indices=True)]

Index(['sex', 'cheay_pain_type', 'resting_blood_pressure',
       'fasting_blood_sugar', 'resting_electro_results',
       'exercise_induced_angina', 'oldpeak', 'slop', 'num_major_vessels',
       'thal'],
      dtype='object')

In [111]:
print(rfe.estimator_.coef_)

[[ 0.98936163  0.69960957  0.01951325 -1.02327918  0.3825781   0.34533801
   0.35050871  0.60614109  1.24039576  0.39050175]]


In [239]:
x_train_model = rfe.transform(x_train)
x_test_model = rfe.transform(x_test)

In [240]:
logiticRegr = LogisticRegression(solver='lbfgs', max_iter=1000)
logiticRegr.fit(x_train_model,y_train)
predictions = logisticRegr.predict(x_test_model)

In [241]:
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print('Performance')
print('- accuracy = ' + str(accuracy))
print('- precision = ' + str(precision))
print('- recall = ' + str(recall))
print('- f1 = ' + str(f1))

Performance
- accuracy = 0.5370370370370371
- precision = 1.0
- recall = 0.1935483870967742
- f1 = 0.3243243243243243


# Wrapper Methods (Exhaustive)

In [120]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.linear_model import LogisticRegression

logisticRegr = LogisticRegression(solver='lbfgs',max_iter=1000)

In [130]:
efs = EFS(logisticRegr, 
          min_features=10, 
          max_features=10,  
          scoring='accuracy', 
          print_progress=True, 
          cv=10, #cross validstion
          n_jobs=-1)  #using all available CPU core

In [132]:
efs = efs.fit(x_train.values, y_train.values, custom_feature_names=x_train.columns)
efs.subsets_

Features: 286/286

{0: {'feature_idx': (0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
  'cv_scores': array([0.63636364, 0.72727273, 0.86363636, 0.86363636, 0.63636364,
         0.77272727, 0.80952381, 0.80952381, 0.71428571, 0.71428571]),
  'avg_score': 0.7547619047619047,
  'feature_names': ('age',
   'sex',
   'cheay_pain_type',
   'resting_blood_pressure',
   'serum_cholestoral',
   'fasting_blood_sugar',
   'resting_electro_results',
   'max_heart_rate',
   'exercise_induced_angina',
   'oldpeak')},
 1: {'feature_idx': (0, 1, 2, 3, 4, 5, 6, 7, 8, 10),
  'cv_scores': array([0.68181818, 0.68181818, 0.86363636, 0.86363636, 0.68181818,
         0.77272727, 0.76190476, 0.85714286, 0.61904762, 0.76190476]),
  'avg_score': 0.7545454545454546,
  'feature_names': ('age',
   'sex',
   'cheay_pain_type',
   'resting_blood_pressure',
   'serum_cholestoral',
   'fasting_blood_sugar',
   'resting_electro_results',
   'max_heart_rate',
   'exercise_induced_angina',
   'slop')},
 2: {'feature_idx': (0, 1, 2, 3, 4, 5, 6, 7, 8, 11),

In [133]:
efs.best_score_

0.8614718614718615

In [134]:
efs.best_idx_

(1, 2, 3, 5, 6, 7, 9, 10, 11, 12)

In [135]:
efs.best_feature_names_

('sex',
 'cheay_pain_type',
 'resting_blood_pressure',
 'fasting_blood_sugar',
 'resting_electro_results',
 'max_heart_rate',
 'oldpeak',
 'slop',
 'num_major_vessels',
 'thal')

In [136]:
x_train_model = efs.transform(x_train)
x_test_model = efs.transform(x_test)

In [140]:
logiticRegr = LogisticRegression(solver='lbfgs', max_iter=1000)
logiticRegr.fit(x_train_model,y_train)
predictions = logisticRegr.predict(x_test_model)

In [141]:
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print('Performance')
print('- accuracy = ' + str(accuracy))
print('- precision = ' + str(precision))
print('- recall = ' + str(recall))
print('- f1 = ' + str(f1))

Performance
- accuracy = 0.8333333333333334
- precision = 1.0
- recall = 0.7096774193548387
- f1 = 0.8301886792452831


# Embedded Methods (Tree-Based Method)

In [210]:
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [211]:
rf = RandomForestClassifier(n_estimators = 100, random_state=2019)
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=2019,
                       verbose=0, warm_start=False)

In [212]:
importances = rf.feature_importances_
print(importances)

[0.08265099 0.04576638 0.14405997 0.08344625 0.09224165 0.01148403
 0.02367749 0.10086596 0.03103675 0.11598161 0.04262228 0.10421902
 0.12194762]


In [213]:
indices = np.argsort(importances)[::-1]
for f in range(x_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, x_train.columns[indices[f]], 
                            importances[indices[f]]))

 1) cheay_pain_type                0.144060
 2) thal                           0.121948
 3) oldpeak                        0.115982
 4) num_major_vessels              0.104219
 5) max_heart_rate                 0.100866
 6) serum_cholestoral              0.092242
 7) resting_blood_pressure         0.083446
 8) age                            0.082651
 9) sex                            0.045766
10) slop                           0.042622
11) exercise_induced_angina        0.031037
12) resting_electro_results        0.023677
13) fasting_blood_sugar            0.011484


In [226]:
sfm = SelectFromModel(rf, threshold=0.04)
sfm.fit(x_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                                 class_weight=None,
                                                 criterion='gini',
                                                 max_depth=None,
                                                 max_features='auto',
                                                 max_leaf_nodes=None,
                                                 max_samples=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100, n_jobs=None,
                                                 oob_score=False,

In [227]:
rf_feature_boolean_selector = sfm.get_support()
rf_feature_boolean_selector

array([ True,  True,  True,  True,  True, False, False,  True, False,
        True,  True,  True,  True])

In [224]:
rf_feature_set = x_train.columns[rf_feature_boolean_selector]
rf_feature_set

Index(['age', 'sex', 'cheay_pain_type', 'resting_blood_pressure',
       'serum_cholestoral', 'max_heart_rate', 'oldpeak', 'slop',
       'num_major_vessels', 'thal'],
      dtype='object')

In [225]:
x_train_model = sfm.transform(x_train)
x_train_model
x_test_model = sfm.transform(x_test)
x_test_model

array([[6.50e+01, 1.00e+00, 4.00e+00, 1.20e+02, 1.77e+02, 1.40e+02,
        4.00e-01, 1.00e+00, 0.00e+00, 7.00e+00],
       [7.40e+01, 0.00e+00, 2.00e+00, 1.20e+02, 2.69e+02, 1.21e+02,
        2.00e-01, 1.00e+00, 1.00e+00, 3.00e+00],
       [5.30e+01, 0.00e+00, 4.00e+00, 1.30e+02, 2.64e+02, 1.43e+02,
        4.00e-01, 2.00e+00, 0.00e+00, 3.00e+00],
       [5.50e+01, 1.00e+00, 2.00e+00, 1.30e+02, 2.62e+02, 1.55e+02,
        0.00e+00, 1.00e+00, 0.00e+00, 3.00e+00],
       [6.20e+01, 0.00e+00, 4.00e+00, 1.38e+02, 2.94e+02, 1.06e+02,
        1.90e+00, 2.00e+00, 3.00e+00, 3.00e+00],
       [4.20e+01, 1.00e+00, 1.00e+00, 1.48e+02, 2.44e+02, 1.78e+02,
        8.00e-01, 1.00e+00, 2.00e+00, 3.00e+00],
       [6.70e+01, 1.00e+00, 4.00e+00, 1.60e+02, 2.86e+02, 1.08e+02,
        1.50e+00, 2.00e+00, 3.00e+00, 3.00e+00],
       [4.30e+01, 0.00e+00, 3.00e+00, 1.22e+02, 2.13e+02, 1.65e+02,
        2.00e-01, 2.00e+00, 0.00e+00, 3.00e+00],
       [4.10e+01, 1.00e+00, 2.00e+00, 1.10e+02, 2.35e+02, 1.53e+

In [222]:
logiticRegr = LogisticRegression(solver='lbfgs', max_iter=1000)
logiticRegr.fit(x_train_model,y_train)
predictions = logisticRegr.predict(sfm.transform(x_test))

In [223]:
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print('Performance')
print('- accuracy = ' + str(accuracy))
print('- precision = ' + str(precision))
print('- recall = ' + str(recall))
print('- f1 = ' + str(f1))

Performance
- accuracy = 0.46296296296296297
- precision = 0.5714285714285714
- recall = 0.25806451612903225
- f1 = 0.35555555555555557
