## Import the relevant libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.datasets import load_breast_cancer
import graphviz
from IPython.display import display, Image
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [2]:
cancer_data = load_breast_cancer()
cancer_data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [3]:
df = pd.DataFrame(cancer_data.data, columns=cancer_data.feature_names)
df['target'] = cancer_data.target
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


## Split the data into target and inputs

In [4]:
x = df[cancer_data.feature_names].values
y = df['target'].values

## Split into training and testing

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=365)

## Set and tune the hyperparameter

In [6]:
param_grid = {'n_estimators': [10, 25, 50, 75, 100],
              'criterion' : ['gini', 'entropy', 'log_loss'],
              'max_depth': [None, 5, 15, 25],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4],
              'max_features': ['auto', 'sqrt', 'log2'],
              'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=365)

gs = GridSearchCV(rf, param_grid, scoring='f1', cv=5)
gs.fit(x_train, y_train)

In [7]:
best_params = gs.best_params_
best_params

{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [8]:
gs.best_score_

0.9724019972650162

In [9]:
gs.cv_results_

{'mean_fit_time': array([0.00312519, 0.        , 0.        , ..., 0.57787709, 0.79548063,
        0.82717052]),
 'std_fit_time': array([0.00625038, 0.        , 0.        , ..., 0.14206027, 0.14263971,
        0.16810135]),
 'mean_score_time': array([0.        , 0.        , 0.        , ..., 0.07421246, 0.057795  ,
        0.04194069]),
 'std_score_time': array([0.        , 0.        , 0.        , ..., 0.07154686, 0.04016717,
        0.01294159]),
 'param_bootstrap': masked_array(data=[True, True, True, ..., False, False, False],
              mask=[False, False, False, ..., False, False, False],
        fill_value='?',
             dtype=object),
 'param_criterion': masked_array(data=['gini', 'gini', 'gini', ..., 'log_loss', 'log_loss',
                    'log_loss'],
              mask=[False, False, False, ..., False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[None, None, None, ..., 25, 25, 25],
              mask=[False,

In [10]:
scores = gs.cv_results_['mean_test_score']
scores

array([       nan,        nan,        nan, ..., 0.96036365, 0.96192729,
       0.96530806])

## Train the model

In [11]:
rf_best = RandomForestClassifier(random_state=365, **best_params)
rf_best.fit(x_train, y_train)

## Test the model

In [12]:
y_pred = rf_best.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy score : {accuracy*100:.2f}%')

Accuracy score : 94.74%


## Feature importance

In [13]:
ft_imp = pd.Series(rf_best.feature_importances_, index=cancer_data.feature_names).sort_values(ascending=False)
ft_imp

worst perimeter            0.146620
worst radius               0.123762
mean concave points        0.109424
worst area                 0.106789
worst concave points       0.098797
mean concavity             0.055849
mean perimeter             0.051863
mean area                  0.045260
worst concavity            0.036758
worst texture              0.031891
area error                 0.029338
mean radius                0.024533
mean texture               0.017964
radius error               0.017373
mean compactness           0.016195
worst smoothness           0.015177
worst symmetry             0.012468
worst compactness          0.010917
worst fractal dimension    0.008322
smoothness error           0.006193
mean smoothness            0.005174
fractal dimension error    0.005141
symmetry error             0.004483
concave points error       0.003369
mean fractal dimension     0.003336
texture error              0.003311
compactness error          0.002904
perimeter error            0

## Feature selection

In [14]:
selector = SelectFromModel(rf_best, threshold=0.01)

selector.fit(x_train, y_train)

x_train_selected = selector.transform(x_train)
x_test_selected = selector.transform(x_test)

print(f"Selected feature indices: {selector.get_support(indices=True)}")

Selected feature indices: [ 0  1  2  3  5  6  7 10 13 20 21 22 23 24 25 26 27 28]


In [15]:
selected_feature_names = cancer_data.feature_names[selector.get_support(indices=True)]
selected_feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean compactness', 'mean concavity', 'mean concave points',
       'radius error', 'area error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry'], dtype='<U23')

## Train new model on selected features

In [16]:
rf_selected = RandomForestClassifier(random_state=365, **best_params)
rf_selected.fit(x_train_selected , y_train)

## Evaluating the new model

In [17]:
y_new_pred = rf_selected.predict(x_test_selected)

accuracy = accuracy_score(y_test, y_new_pred)

print(f'Accuracy score : {accuracy*100:.2f}%')

Accuracy score : 95.61%


In [18]:
rf_selected.predict_proba(x_test_selected)

array([[0.84, 0.16],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.96, 0.04],
       [0.38, 0.62],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.6 , 0.4 ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.74, 0.26],
       [0.53, 0.47],
       [0.17, 0.83],
       [0.02, 0.98],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.23, 0.77],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.91, 0.09],
       [0.99, 0.01],
       [0.  , 1.  ],
       [0.94, 0.06],
       [0.99, 0.01],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.99, 0.01],
       [0.25, 0.75],
       [1.  , 0.  ],
       [0.96, 0.04],
       [0.92, 0.08],
       [0.33, 0.67],
       [0.  , 1.  ],
       [0.98, 0.02],
       [1.  ,