# Wisconsin Breast Cancer Dataset

In [1]:
import datetime
from sklearn import datasets
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE
import joblib # used to dump/load sklearn models

## Load the data

In [3]:
# Load the bundled dataset; the returned Bunch exposes its fields both as
# dictionary keys and as attributes.
bunch = datasets.load_breast_cancer()
features = bunch.feature_names
features

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [4]:
# Wrap the raw arrays in pandas containers: one labelled column per feature,
# and the class labels as a Series.
X_all = pd.DataFrame(data=bunch.data, columns=features)
y_all = pd.Series(data=bunch.target)
X_all.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [5]:
# Tally each class before printing: label 0 is reported as malignant and
# every non-zero label as benign.
malignant_count = np.count_nonzero(y_all == 0)
benign_count = np.count_nonzero(y_all)
print('Count zero', malignant_count, 'Malignant')
print('Count ones', benign_count, 'Benign')

Count zero 212 Malignant
Count ones 357 Benign


## Partition the data: train vs test

In [6]:
print(datetime.datetime.now())
# Seed for the shuffle performed by train_test_split. None (the default) draws
# a fresh random partition on every run; set it to an int such as 42 to get
# the same partition each time.
SPLIT_SEED = None
# Series.to_numpy() yields the same 1-D label array that Series.ravel() did;
# Series.ravel() is deprecated in recent pandas releases (2.2+).
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X_all, y_all.to_numpy(), random_state=SPLIT_SEED)
# Report the shape of each partition and how many of its labels are non-zero.
print('Xtrain',Xtrain.shape,'ytrain',ytrain.shape,'non-zero:',np.count_nonzero(ytrain))
print('Xtest',Xtest.shape,'ytest',ytest.shape,'non-zero:',np.count_nonzero(ytest))

2022-06-06 09:18:28.980685
Xtrain (426, 30) ytrain (426,) non-zero: 271
Xtest (143, 30) ytest (143,) non-zero: 86


## Get cross-validation accuracy
Partition the training data into 5 roughly equal-size groups (folds).
For i = 1 to 5, hold group i for validation,
train the model on the other groups,
and evaluate the model on group i.

In [7]:
print(datetime.datetime.now())
model1 = RandomForestClassifier()
# cv=5 requests five folds; cv_scores receives one score per fold.
cv_scores = cross_val_score(estimator=model1, X=Xtrain, y=ytrain, cv=5)
print(cv_scores)
# Summarise the per-fold scores as mean and standard deviation.
score_summary = (np.mean(cv_scores), np.std(cv_scores))
print('mean %.4f +/- %.4f' % score_summary)

2022-06-06 09:18:28.985512
[0.95348837 0.91764706 0.91764706 0.96470588 0.98823529]
mean 0.9483 +/- 0.0275


## Re-train the model. Evaluate on test data.
Whereas cross-validation trained on 80% of the training data (5 times),
now train on 100% of the training data.
Test on the test data.
Warning: Never use the test results to improve the model.

In [8]:
print(datetime.datetime.now())
# fit() returns the estimator itself, so construction and training can be
# chained; the bare name on the last line displays the fitted model.
model2 = RandomForestClassifier().fit(Xtrain, ytrain)
model2

2022-06-06 09:18:29.388855


RandomForestClassifier()

In [9]:
print(datetime.datetime.now())
ypred = model2.predict(Xtest)
# Number of test rows whose predicted label equals the true label.
matches = int(np.sum(ytest == ypred))
# Accuracy as a percentage of all test predictions.
accuracy = 100.0 * matches / len(ypred)
accuracy

2022-06-06 09:18:29.477368


97.9020979020979

In [10]:
print(datetime.datetime.now())
# Rows correspond to the true labels, columns to the predicted labels.
confusion = confusion_matrix(y_true=ytest, y_pred=ypred)
confusion

2022-06-06 09:18:29.487110


array([[56,  1],
       [ 2, 84]])

## Rank features by importance
Warning: Random forests use random sampling, so the feature importances are not deterministic.
Unless `random_state` is fixed, we may get a different ranking every time we build a new classifier.

In [11]:
def important_features(model):
    """Rank a fitted model's input features by importance, highest first.

    Parameters
    ----------
    model : object
        Must expose ``feature_names_in_`` and ``feature_importances_`` as
        equal-length one-dimensional sequences; both are read directly.

    Returns
    -------
    pandas.DataFrame
        One row per feature, sorted by descending importance. Column ``0``
        holds the importance and column ``1`` holds the feature name.

    Raises
    ------
    ValueError
        If the two attributes differ in length.
    """
    names = np.asarray(model.feature_names_in_).tolist()
    importances = np.asarray(model.feature_importances_, dtype=float).tolist()
    if len(names) != len(importances):
        raise ValueError(
            'feature_names_in_ and feature_importances_ differ in length')
    # Pair the values in plain Python rather than with np.column_stack:
    # stacking a string-dtype name array with floats coerces the importances
    # to strings, and the sort would then be lexicographic instead of numeric.
    pairs = zip(importances, names)
    # sorted() is stable, so features with tied importances keep their
    # original relative order.
    top_list = sorted(pairs, key=lambda pair: pair[0], reverse=True)
    return pd.DataFrame(top_list)
print(datetime.datetime.now())
# Rank the features of the fully trained model and show the ten strongest.
top_features = important_features(model=model2)
top_features.head(10)

2022-06-06 09:18:29.491265


Unnamed: 0,0,1
0,0.150718,worst perimeter
1,0.12941,worst concave points
2,0.108985,worst radius
3,0.106031,worst area
4,0.080374,mean concave points
5,0.064665,mean area
6,0.051304,worst concavity
7,0.037349,mean perimeter
8,0.032951,mean concavity
9,0.032639,mean radius
