In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
import pandas as pd


## Forward Selection with RFE

In [35]:
# No DA
df = pd.read_csv('./processed_datasets/dataset_preprocessed.csv')
df.isnull().sum()

index                   0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
BMI Category            0
Diabetes_012            0
dtype: int64

In [36]:
X = df.drop('Diabetes_012', axis=1)
y = df['Diabetes_012']

In [37]:
# Define the model
model = RandomForestClassifier()

# Perform RFE
rfe = RFE(estimator=model, n_features_to_select=11)  # Keep  11 features
X_selected = rfe.fit_transform(X, y)

# Display selected features
print("Selected Features: ", X_selected)

Selected Features:  [[ 0.00000000e+00  0.00000000e+00 -2.12724693e+00 ...  1.10000000e+01
   4.00000000e+00  5.00000000e+00]
 [ 1.00000000e+00  1.00000000e+00 -1.70425254e-01 ...  1.10000000e+01
   4.00000000e+00  3.00000000e+00]
 [ 2.00000000e+00  1.00000000e+00  5.82198469e-01 ...  9.00000000e+00
   4.00000000e+00  7.00000000e+00]
 ...
 [ 5.53480000e+05  1.00000000e+00 -1.30461110e+00 ...  1.00000000e+01
   2.00000000e+00  3.00000000e+00]
 [ 5.53482000e+05  1.00000000e+00 -1.65327735e+00 ...  1.00000000e+01
   2.00000000e+00  3.00000000e+00]
 [ 5.53483000e+05  1.00000000e+00 -9.72121218e-01 ...  1.00000000e+01
   2.00000000e+00  2.00000000e+00]]


In [38]:
df = pd.DataFrame(X_selected, columns=[f"FS{i}" for i in range(1,12)])
df["Diabetes_012"] = y
df.to_csv(f"./processed_datasets/dataset_without_da_rfe.csv", index=False)

In [18]:
# SMOTE
df = pd.read_csv('./processed_datasets/dataset_preprocessed_smote.csv')
df.isnull().sum()

HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
BMI Category            0
Diabetes_012            0
dtype: int64

In [19]:
X = df.drop('Diabetes_012', axis=1)
y = df['Diabetes_012']

In [20]:
# Define the model
model = RandomForestClassifier()

# Perform RFE
rfe = RFE(estimator=model, n_features_to_select=11)  # Keep 11 features
X_selected = rfe.fit_transform(X, y)

# Display selected features
print("Selected Features: ", X_selected)

Selected Features:  [[ 0.          1.         -2.12724693 ...  4.          5.
   0.        ]
 [ 1.          0.         -0.17042525 ...  4.          3.
   2.        ]
 [ 1.          1.          0.58219847 ...  4.          7.
   3.        ]
 ...
 [ 1.          1.          0.43167372 ...  6.          9.
   3.        ]
 [ 1.          1.          0.2294433  ...  5.          5.
   3.        ]
 [ 1.          1.          1.35422925 ...  3.          1.
   4.        ]]


In [21]:
df = pd.DataFrame(X_selected, columns=[f"FS{i}" for i in range(1,12)])
df["Diabetes_012"] = y
df.to_csv(f"./processed_datasets/dataset_smote_rfe.csv", index=False)

In [22]:
# Adasyn
df = pd.read_csv('./processed_datasets/dataset_preprocessed_adasyn.csv')
df.isnull().sum()

HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
BMI Category            0
Diabetes_012            0
dtype: int64

In [23]:
X = df.drop('Diabetes_012', axis=1)
y = df['Diabetes_012']

In [24]:
# Define the model
model = RandomForestClassifier()

# Perform RFE
rfe = RFE(estimator=model, n_features_to_select=11)  # Keep  11 features
X_selected = rfe.fit_transform(X, y)

# Display selected features
print("Selected Features: ", X_selected)

Selected Features:  [[ 1.         -2.12724693  1.         ...  4.          5.
   0.        ]
 [ 0.         -0.17042525  1.         ...  4.          3.
   2.        ]
 [ 1.          0.58219847  1.         ...  4.          7.
   3.        ]
 ...
 [ 0.         -1.22409846  1.         ...  2.          3.
   1.        ]
 [ 1.         -1.65327735  1.         ...  2.          3.
   0.        ]
 [ 1.         -0.97212122  1.         ...  2.          2.
   1.        ]]


In [25]:
df = pd.DataFrame(X_selected, columns=[f"FS{i}" for i in range(1,12)])
df["Diabetes_012"] = y
df.to_csv(f"./processed_datasets/dataset_adasyn_rfe.csv", index=False)

## Hybrid FS (PCA + rfe)

In [40]:
# PCA only
df = pd.read_csv('./processed_datasets/dataset_without_da_pca.csv')
df.isnull().sum()

PC1             0
PC2             0
PC3             0
PC4             0
PC5             0
PC6             0
PC7             0
PC8             0
PC9             0
PC10            0
PC11            0
Diabetes_012    0
dtype: int64

In [41]:
X = df.drop('Diabetes_012', axis=1)
y = df['Diabetes_012']

In [42]:
# Define the model
model = RandomForestClassifier()

# Perform RFE
rfe = RFE(estimator=model, n_features_to_select=11)  # Keep  11 features
X_selected = rfe.fit_transform(X, y)

# Display selected features
print("Selected Features: ", X_selected)

Selected Features:  [[-3.19513537e+05 -6.50714623e+03 -8.72957629e-01 ... -2.63580436e-02
  -1.55140710e-02 -3.73377095e-02]
 [-3.19512313e+05 -6.50709177e+03 -2.28756459e-02 ... -3.60749864e-02
  -2.81744355e-02  2.40137281e-02]
 [-3.19511090e+05 -6.50703731e+03  3.12003705e-01 ... -2.46156232e-02
   3.83564186e-02  2.62729087e-02]
 ...
 [ 3.31775576e+05 -4.71216855e+03 -8.00859600e-01 ... -3.26681902e-02
   6.80200800e-02  1.83074300e-02]
 [ 3.31777439e+05 -4.71269865e+03 -9.55509753e-01 ... -3.31171810e-02
   6.72811976e-02  1.97130483e-02]
 [ 3.31778662e+05 -4.71264419e+03 -6.51843002e-01 ... -1.70056102e-02
   6.77543678e-02  8.91062358e-03]]


In [43]:
df = pd.DataFrame(X_selected, columns=[f"FS{i}" for i in range(1,12)])
df["Diabetes_012"] = y
df.to_csv(f"./processed_datasets/dataset_without_da_pca_rfe.csv", index=False)

In [26]:
# ADASYN + PCA
df = pd.read_csv('./processed_datasets/dataset_adasyn_pca.csv')
df.isnull().sum()

PC1             0
PC2             0
PC3             0
PC4             0
PC5             0
PC6             0
PC7             0
PC8             0
PC9             0
PC10            0
PC11            0
Diabetes_012    0
dtype: int64

In [27]:
X = df.drop('Diabetes_012', axis=1)
y = df['Diabetes_012']

In [28]:
# Define the model
model = RandomForestClassifier()

# Perform RFE
rfe = RFE(estimator=model, n_features_to_select=11)  # Keep  11 features
X_selected = rfe.fit_transform(X, y)

# Display selected features
print("Selected Features: ", X_selected)

Selected Features:  [[-2.31317420e+05 -8.95870634e-01  3.80038479e-01 ... -1.24873335e-02
  -3.41902858e-02  2.56566415e-02]
 [-2.31316554e+05 -4.71607750e-02 -3.52738298e-02 ... -2.69958192e-02
   2.46677171e-02  4.74026271e-03]
 [-2.31315688e+05  2.87714496e-01 -6.57549533e-02 ...  3.92764508e-02
   3.00772987e-02  1.82401538e-02]
 ...
 [ 2.47934474e+05 -8.19468345e-01 -7.43034337e-02 ...  6.98636596e-02
   2.16193859e-02  2.46094583e-02]
 [ 2.47936206e+05 -9.73551323e-01 -6.67499909e-02 ...  6.91264862e-02
   2.33077121e-02  2.45706370e-02]
 [ 2.47937072e+05 -6.69343554e-01 -7.03335840e-02 ...  6.86673138e-02
   1.29698206e-02  2.49300922e-02]]


In [29]:
df = pd.DataFrame(X_selected, columns=[f"FS{i}" for i in range(1,12)])
df["Diabetes_012"] = y
df.to_csv(f"./processed_datasets/dataset_adasyn_pca_rfe.csv", index=False)

In [30]:
# SMOTE + PCA
df = pd.read_csv('./processed_datasets/dataset_smote_pca.csv')
df.isnull().sum()

PC1             0
PC2             0
PC3             0
PC4             0
PC5             0
PC6             0
PC7             0
PC8             0
PC9             0
PC10            0
PC11            0
Diabetes_012    0
dtype: int64

In [31]:
X = df.drop('Diabetes_012', axis=1)
y = df['Diabetes_012']

In [32]:
# Define the model
model = RandomForestClassifier()

# Perform RFE
rfe = RFE(estimator=model, n_features_to_select=11)  # Keep  11 features
X_selected = rfe.fit_transform(X, y)

# Display selected features
print("Selected Features: ", X_selected)

Selected Features:  [[-2.30518601e+05 -8.80951203e-01  4.52828188e-01 ... -4.11766380e-02
  -1.04331269e-02  2.48466667e-02]
 [-2.30517735e+05 -3.78041561e-02 -4.38703066e-02 ...  2.28884373e-02
  -3.83675418e-02  2.92467322e-03]
 [-2.30516869e+05  3.00361329e-01 -7.49678266e-02 ...  4.17494827e-02
   2.89870420e-02  1.70224721e-02]
 ...
 [ 2.48719072e+05 -3.46209364e-01 -1.87470408e-01 ... -2.49856545e-03
  -5.86874845e-04  1.86848139e-02]
 [ 2.48720804e+05 -1.56957791e-01  1.44658445e-01 ...  1.79587067e-02
  -3.15859357e-02  3.18421597e-02]
 [ 2.48721669e+05  3.73029047e-01  4.97216436e-01 ...  1.71908836e-02
   1.48861274e-02  6.89441148e-03]]


In [33]:
df = pd.DataFrame(X_selected, columns=[f"FS{i}" for i in range(1,12)])
df["Diabetes_012"] = y
df.to_csv(f"./processed_datasets/dataset_smote_pca_rfe.csv", index=False)