# Sklearn - Breast Cancer - Random Search

- toc: true
- badges: False
- comments: true
- author: Sam Treacy
- categories: [sklearn, pandas, random_search, classification, python]

In [155]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [156]:
df = pd.read_csv('DATA/breast.cancer.data.csv')
df = df.iloc[:,:-1].drop(columns='id')

In [157]:
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Define Target and Features

In [158]:
y = df['diagnosis']

X = df.iloc[:,1:]

## Train Test Split

In [159]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape

((455, 30), (114, 30), (455,))

## Scale Data

In [160]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

## Create model

In [161]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

model = RandomForestClassifier()

model.fit(X_train, y_train)

RandomForestClassifier()

## Evaluate predictions

In [162]:
predictions = model.predict(X_test)

In [163]:
from sklearn.metrics import confusion_matrix, classification_report

print(classification_report(y_test, predictions))
print('Confusion Matrix\n')
print(confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

           B       0.96      0.99      0.97        71
           M       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

Confusion Matrix

[[70  1]
 [ 3 40]]


## Balance Data

In [164]:
df.diagnosis.value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [165]:
from imblearn.over_sampling import RandomOverSampler

over_sample = RandomOverSampler(sampling_strategy='minority')

X_train_over, y_train_over = over_sample.fit_sample(X_train, y_train)

In [166]:
y_train_over.value_counts()

B    286
M    286
Name: diagnosis, dtype: int64

In [167]:

model = AdaBoostClassifier()

model.fit(X_train_over, y_train_over)

predictions = model.predict(X_test)

print(classification_report(y_test, predictions))
print('Confusion Matrix\n')
print(confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

           B       0.99      0.99      0.99        71
           M       0.98      0.98      0.98        43

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

Confusion Matrix

[[70  1]
 [ 1 42]]


#### Using oversampling has reduced the F1 accuracy.

In [168]:

model = AdaBoostClassifier(n_estimators=100, learning_rate=0.2, 
                           algorithm='SAMME')

model.fit(X_train, y_train)

predictions = model.predict(X_test)

In [169]:
print(classification_report(y_test, predictions))
print('Confusion Matrix\n')
print(confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

           B       0.96      0.99      0.97        71
           M       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

Confusion Matrix

[[70  1]
 [ 3 40]]


## Random Search to tune hyperparameters

In [170]:
from sklearn.model_selection import RandomizedSearchCV

model = RandomForestClassifier()

                            
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 2000, num = 100)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 7, 10, 15]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3,4,5]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
                            
search_model = RandomizedSearchCV(model, random_grid, n_iter=20, cv=5, random_state=0, verbose=0)                            

In [171]:
search_model.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=20,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [2, 12, 23, 34, 45, 56, 66,
                                                      77, 88, 99, 110, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'min_samples_split': [2, 5, 7, 10, 15],
                                        'n_estimators': [50, 69, 89, 109, 128,
                                                         148, 168, 187, 207,
                                                         227, 246, 266, 286,
                                                         306, 325, 345, 365,
                                                         384, 404, 424, 443,
                                                         463, 483, 503, 522,
      

In [172]:
predictions = search_model.predict(X_test)

In [176]:
search_model.best_estimator_

RandomForestClassifier(bootstrap=False, max_depth=12, max_features='sqrt',
                       min_samples_leaf=2, min_samples_split=7,
                       n_estimators=1389)

In [177]:
search_model.best_params_

{'n_estimators': 1389,
 'min_samples_split': 7,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 12,
 'bootstrap': False}

In [180]:
search_model.classes_

array(['B', 'M'], dtype=object)

In [189]:
search_model.best_score_

0.9604395604395604

In [202]:
X_test.shape 

(114, 30)

In [205]:
search_model.predict(X_test[17].reshape(1,30))

array(['B'], dtype=object)

In [173]:
print(classification_report(y_test, predictions))
print('Confusion Matrix\n')
print(confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

           B       0.96      0.99      0.97        71
           M       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

Confusion Matrix

[[70  1]
 [ 3 40]]
