### Import libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.datasets import load_breast_cancer

In [5]:
# load data
data = load_breast_cancer()['data']
cols = load_breast_cancer()['feature_names']

df = pd.DataFrame(data=data,columns=cols)
df['Target'] = load_breast_cancer()['target']

In [6]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [7]:
df.shape

(569, 31)

In [8]:
# check missing data
df.isna().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
Target                     0
dtype: int64

##### The dataset has 30 features and 1 target column

30 is very huge as far as dimensionality is concerned. Unnecessary features can act as NOISE

In [10]:
# descriptive statistics

df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mean radius,569.0,14.127292,3.524049,6.981,11.7,13.37,15.78,28.11
mean texture,569.0,19.289649,4.301036,9.71,16.17,18.84,21.8,39.28
mean perimeter,569.0,91.969033,24.298981,43.79,75.17,86.24,104.1,188.5
mean area,569.0,654.889104,351.914129,143.5,420.3,551.1,782.7,2501.0
mean smoothness,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634
mean compactness,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454
mean concavity,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268
mean concave points,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012
mean symmetry,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304
mean fractal dimension,569.0,0.062798,0.00706,0.04996,0.0577,0.06154,0.06612,0.09744


#### Lets set feature and target set

In [11]:
# X = feature
# Y= target

X = df.drop('Target',axis=1)
Y =df['Target']

In [12]:
X.shape

(569, 30)

In [13]:
Y.shape

(569,)

### Train Test Split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.3,random_state=42)

#### Standardize the dataset

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

#### Define RandomForest Classifier to be used by Boruta

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
model = RandomForestClassifier() # For Boruta

### Boruta 

In [20]:
from boruta import BorutaPy

In [21]:
# define Boruta feature selection method
feature_selector = BorutaPy(model,n_estimators='auto',verbose=2,random_state=1,max_iter=100)

In [22]:
feature_selector.fit(np.array(x_train),np.array(y_train))

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	19
Tentative: 	11
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	19
Tentative: 	11
Rejected: 	0
Iteration: 	10 / 100
Confirmed: 	19
Tentative: 	10
Rejected: 	1
Iteration: 	11 / 100
Confirmed: 	19
Tentative: 	10
Rejected: 	1
Iteration: 	12 / 100
Confirmed: 	19
Tentative: 	8
Rejected: 	3
Iteration: 	13 / 100
Confirmed: 	19
Tentative: 	8
Rejected: 	3
Iteration: 	14 / 100
Confirmed: 	19
Tentative: 	8
Rejected: 	3
Iteration: 	15 / 100
Confirmed: 	19
Tentative: 	8
Rejected: 	3
Iteration: 	16 / 100
Confirmed: 	19
Tentative: 	8
Rejected: 

BorutaPy(estimator=RandomForestClassifier(n_estimators=69,
                                          random_state=RandomState(MT19937) at 0x1B8B3014B40),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x1B8B3014B40, verbose=2)

Print the decisions made by the feature_selector

In [23]:
print('This array has on wheter to keep a feature or not!\n')
print(feature_selector.support_)

This array has on wheter to keep a feature or not!

[ True  True  True  True  True  True  True  True False False  True False
  True  True False False  True False False False  True  True  True  True
  True  True  True  True  True  True]


In [24]:
np.sum(feature_selector.support_)

22

In [25]:
# print the array has feature ranking

print(feature_selector.ranking_)

[1 1 1 1 1 1 1 1 5 3 1 8 1 1 7 5 1 2 5 8 1 1 1 1 1 1 1 1 1 1]


In [26]:
x_filtered_train = feature_selector.transform(np.array(x_train))
x_filtered_test = feature_selector.transform(np.array(x_test))

### Lets Compare Performances


In [27]:
# model without BorutaPy
model1 = RandomForestClassifier()

In [28]:
#train the model

model1.fit(x_train_scaled,y_train)

RandomForestClassifier()

In [29]:
## Test the model1

y_pred = model1.predict(x_test_scaled)

In [32]:
from sklearn.metrics import accuracy_score, classification_report

In [31]:
print(accuracy_score(y_test,y_pred))

0.9707602339181286


In [33]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96        63
           1       0.96      0.99      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.96      0.97       171
weighted avg       0.97      0.97      0.97       171



In [39]:
pd.crosstab(y_test,y_pred)

col_0,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,59,4
1,1,107


#### Lets check the model with BorutaPy

In [34]:
#train the model
model.fit(x_filtered_train,y_train)

RandomForestClassifier(n_estimators=69,
                       random_state=RandomState(MT19937) at 0x1B8B3014B40)

In [35]:
# test the model
yb_pred = model.predict(x_filtered_test)

In [36]:
print(classification_report(y_test,yb_pred))

              precision    recall  f1-score   support

           0       0.97      0.92      0.94        63
           1       0.95      0.98      0.97       108

    accuracy                           0.96       171
   macro avg       0.96      0.95      0.96       171
weighted avg       0.96      0.96      0.96       171



In [37]:
print(accuracy_score(y_test,yb_pred))

0.9590643274853801


In [38]:
pd.crosstab(y_test,yb_pred)

col_0,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,58,5
1,2,106


The model with BorutaPy gave an accuracy score of ~96%.
And the model without BorutaPy gave an accuracy score of 97%