In [1]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.19.0-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.19.0


In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('datasets/diabetes.csv')
df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [4]:
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [5]:
from sklearn.feature_selection import RFE # Recursive Feature Elimination
from sklearn.linear_model import LogisticRegression

In [6]:
model = LogisticRegression(solver='liblinear')
rfe = RFE(model, n_features_to_select=4)

In [7]:
fit = rfe.fit(X, y)

In [8]:
print(f'Num Features: {fit.n_features_} \n Selected Features: {fit.support_} \n Feature Ranking: {fit.ranking_}')

Num Features: 4 
 Selected Features: [ True  True False False False  True  True False] 
 Feature Ranking: [1 1 2 4 5 1 1 3]


In [10]:
feature_rank = pd.DataFrame({'columns': X.columns, 'ranking': fit.ranking_, 'selected': fit.support_})
feature_rank.sort_values(by='ranking', inplace=True)
feature_rank

Unnamed: 0,columns,ranking,selected
0,Pregnancies,1,True
1,Glucose,1,True
5,BMI,1,True
6,DiabetesPedigreeFunction,1,True
2,BloodPressure,2,False
7,Age,3,False
3,SkinThickness,4,False
4,Insulin,5,False


In [11]:
recursive_feature_names = X.columns[fit.support_]
recursive_feature_names

Index(['Pregnancies', 'Glucose', 'BMI', 'DiabetesPedigreeFunction'], dtype='object')

In [12]:
recursive_features = X[recursive_feature_names]
recursive_features.head(10)

Unnamed: 0,Pregnancies,Glucose,BMI,DiabetesPedigreeFunction
0,6,148,33.6,0.627
1,1,85,26.6,0.351
2,8,183,23.3,0.672
3,1,89,28.1,0.167
4,0,137,43.1,2.288
5,5,116,25.6,0.201
6,3,78,31.0,0.248
7,10,115,35.3,0.134
8,2,197,30.5,0.158
9,8,125,0.0,0.232


In [13]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier

In [14]:
feature_selector = SFS(RandomForestClassifier(n_estimators=10),
                        k_features=4,
                        forward=True,
                        scoring='accuracy',
                        cv=4)
features = feature_selector.fit(X, y)

In [17]:
forward_elimination_features = X[X.columns[list(features.k_feature_idx_)]]
forward_elimination_features.head(10)

Unnamed: 0,Glucose,BloodPressure,BMI,Age
0,148,72,33.6,50
1,85,66,26.6,31
2,183,64,23.3,32
3,89,66,28.1,21
4,137,40,43.1,33
5,116,74,25.6,30
6,78,50,31.0,26
7,115,0,35.3,29
8,197,70,30.5,53
9,125,96,0.0,54


In [19]:
feature_selector = SFS(RandomForestClassifier(n_estimators=10),
                        k_features=4,
                        forward=False,
                        scoring='accuracy',
                        cv=4)
features = feature_selector.fit(X, y)

In [20]:
backward_elimination_features = X[X.columns[list(features.k_feature_idx_)]]
backward_elimination_features.head(10)

Unnamed: 0,Glucose,SkinThickness,DiabetesPedigreeFunction,Age
0,148,35,0.627,50
1,85,29,0.351,31
2,183,0,0.672,32
3,89,23,0.167,21
4,137,35,2.288,33
5,116,0,0.201,30
6,78,32,0.248,26
7,115,0,0.134,29
8,197,45,0.158,53
9,125,0,0.232,54


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [22]:
def build_model(X, y, test_frac):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_frac)
    model = LogisticRegression(solver='liblinear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

In [24]:
build_model(X, y, 0.2)
build_model(recursive_features, y, 0.2)
build_model(forward_elimination_features, y, 0.2)
build_model(backward_elimination_features, y, 0.2)

Accuracy: 0.7792207792207793
Accuracy: 0.7727272727272727
Accuracy: 0.7337662337662337
Accuracy: 0.7532467532467533
