# Best Model Selection For Breast Cancer Dataset

In [1]:
import pandas as pd
from sklearn import datasets
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
df=datasets.load_breast_cancer()

In [14]:
x=df.data
y=df.target
x

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [4]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=1)

# check for KNN

In [5]:
model=KNeighborsClassifier(n_neighbors=6).fit(x_train,y_train) #checking for different neighbors 5,4,3,2
model.score(x_test,y_test)#check accuracy here using Hyper parameter Tuning for KNN

0.9370629370629371

# Check for Decision Tree

In [6]:
model=DecisionTreeClassifier(max_depth=2).fit(x_train,y_train)#check for different max_depth= 2,3,4,5,6
model.score(x_test,y_test)

0.8881118881118881

#  Check for Random Forest

In [7]:
model=RandomForestClassifier(max_depth=2).fit(x_train,y_train)#check for different max_depth= 2,3,4,5,6 
model.score(x_test,y_test)#you can check accuracy for other algorithms also and check accuracy 

0.9440559440559441

# Hyper Parameter Tuning For Random Forest

In [8]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
Random_search = RandomizedSearchCV(estimator = rf, param_distributions = param_grid,cv = 5, n_jobs = -1, verbose = 2).fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   26.7s finished


In [9]:
Random_search.best_estimator_

RandomForestClassifier(max_depth=110, max_features=2, min_samples_leaf=3,
                       min_samples_split=12)

In [10]:
Random_search.best_params_

{'n_estimators': 100,
 'min_samples_split': 12,
 'min_samples_leaf': 3,
 'max_features': 2,
 'max_depth': 110,
 'bootstrap': True}

In [11]:
Random_search.best_score_

0.9601367989056089

In [12]:
model=RandomForestClassifier(n_estimators=1000,min_samples_split=12,min_samples_leaf=3,max_features=3,max_depth=80,bootstrap=True).fit(x_train,y_train)

In [13]:
model.score(x_test,y_test)

0.9370629370629371