In [1]:
import collections  # for Python < 3.6

In [2]:
import numpy as np
import pandas as pd
import scipy

In [3]:
import bokeh.plotting
from bokeh.palettes import Category10_10 as palette, viridis
# Direct Bokeh output to the notebook so `show(...)` renders figures inline.
bokeh.plotting.output_notebook()

In [4]:
import sklearn.decomposition
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.svm

Set a random state for reproducibility.

In [5]:
# Seed passed as `random_state` to the splitters and estimators below.
rs = 1

Let's import the Wisconsin Breast Cancer dataset.

In [6]:
# Download the raw WDBC table from the UCI repository; the file has no header
# row, so pandas assigns integer column labels.
url = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
       'breast-cancer-wisconsin/wdbc.data')
df = pd.read_csv(url, header=None)

In [7]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,0,2,3,4,5,6,7,8,9,10,...,22,23,24,25,26,27,28,29,30,31
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,125020600.0,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [8]:
# Peek at the first rows to see the raw layout (id, label, features).
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


Not too descriptive, but these numbers mean something to someone.  The first column looks like an identifier and the second column looks like the class labels, malignant (M) or benign (B).  From this, we can use the other 30 columns to build our machine learning predictor.

In [9]:
# Column 1 holds the class label; every column from label 2 onward is used as a
# feature (column 0 is skipped -- the notebook text treats it as an identifier).
y = df[1].values
X = df[df.columns[2:]].values

In [10]:
# First 50 labels, still as the original 'M' / 'B' strings.
y[:50]

array(['M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'B', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'B', 'M', 'B', 'B'], dtype=object)

Let's transform the labels into integers.

In [11]:
# Learn the label -> integer mapping from the string labels, then apply it so
# `y` holds integer class codes from here on.
le = sklearn.preprocessing.LabelEncoder().fit(y)
y = le.transform(y)

In [12]:
# Same 50 labels after encoding to integers.
y[:50]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0])

In [13]:
# Show which integer the fitted encoder assigns to each class label.
# `transform` is used instead of `fit_transform` so the encoder that was fitted
# on `y` is queried as-is rather than silently refitted on this two-item list.
labels = ['M', 'B']
encoded = le.transform(labels)
for label, encode in zip(labels, encoded):
    print('{}: {}'.format(label, encode))

M: 1
B: 0


So 1 is malignant, 0 is benign.

Let's proceed to split the data, 80/20, into training and test sets.

In [14]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=rs, test_size=0.2,
)

# Pipeline

Now let's create a pipeline to process and fit this data using the following steps:
    1. normalize the data to have a mean of zero and standard deviation of 1
    2. perform a PCA analysis to simplify the data to only 2 descriptive features
    3. fit the data using logistic regression

In [15]:
# Standardize -> keep 2 principal components -> logistic regression.
# The step names matter: they become parameter prefixes such as `clf__C`.
lr_pipe0 = sklearn.pipeline.Pipeline(steps=[
    ('scl', sklearn.preprocessing.StandardScaler()),
    ('pca', sklearn.decomposition.PCA(n_components=2)),
    ('clf', sklearn.linear_model.LogisticRegression(random_state=rs)),
])

In [16]:
# Fit every pipeline step on the training split only.
lr_pipe0.fit(X_train, y_train)

Pipeline(steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [17]:
# Accuracy of the fitted pipeline on the held-out test split.
print('Testing Accuracy: %0.3f' % lr_pipe0.score(X_test, y_test))

Testing Accuracy: 0.947


While this result looks great, it is only a single estimate from one train/test split.  Also, reducing the data to 2 PCA components likely discards useful information, which risks underfitting the data.

Let's now use k-fold cross-validation to evaluate model performance.

In [18]:
# Stratified 10-fold splitter over the training data.
# `random_state` is intentionally not passed: it only has an effect together
# with `shuffle=True`, and recent scikit-learn raises ValueError when it is
# given while shuffling is off. The unshuffled folds are deterministic anyway.
skf = sklearn.model_selection.StratifiedKFold(n_splits=10)
# NOTE: `split` returns a one-shot generator of (train_idx, test_idx) pairs.
kfold = skf.split(X_train, y_train)

We will first do this explicitly to demonstrate what is going on.

In [19]:
# Manual 10-fold cross-validation of the PCA pipeline on the training split.
# A fresh `skf.split(...)` is iterated instead of the `kfold` generator: a
# generator is exhausted after one pass, so re-running this cell would
# otherwise loop zero times and report NaN for the average accuracy.
scores = []
for k, (train, test) in enumerate(skf.split(X_train, y_train)):
    # refit on this fold's training part, score on its held-out part
    lr_pipe0.fit(X_train[train], y_train[train])
    score = lr_pipe0.score(X_train[test], y_train[test])
    scores.append(score)
    # np.bincount yields the per-class sample counts (class 0, class 1)
    print('fold: {:03}, label distribution: ({}, {}), accuracy: {:.3}'.format(
        k+1, *np.bincount(y_train[train]), score))
print('average accuracy: {:.3} +/- {:.3}'.format(np.mean(scores), np.std(scores)))

fold: 001, label distribution: (256, 153), accuracy: 0.891
fold: 002, label distribution: (256, 153), accuracy: 0.978
fold: 003, label distribution: (256, 153), accuracy: 0.978
fold: 004, label distribution: (256, 153), accuracy: 0.913
fold: 005, label distribution: (256, 153), accuracy: 0.935
fold: 006, label distribution: (257, 153), accuracy: 0.978
fold: 007, label distribution: (257, 153), accuracy: 0.933
fold: 008, label distribution: (257, 153), accuracy: 0.956
fold: 009, label distribution: (257, 153), accuracy: 0.978
fold: 010, label distribution: (257, 153), accuracy: 0.956
average accuracy: 0.95 +/- 0.0292


We can do this more simply using `sklearn.model_selection.cross_val_score`. Note that with `n_jobs=-1` the calculation will use all available CPU cores.

In [20]:
# Same 10-fold evaluation in a single call, parallelised over all cores.
scores = sklearn.model_selection.cross_val_score(
    lr_pipe0, X_train, y_train, cv=10, n_jobs=-1)

In [21]:
# Mean and spread of the per-fold accuracies.
print('average accuracy: {:.3} +/- {:.3}'.format(
    *[stat(scores) for stat in (np.mean, np.std)]))

average accuracy: 0.95 +/- 0.0292


Not surprisingly, we get the same result.

# Learning Curve

Now let's look at learning curves to see how well our model represents the data.  A learning curve helps diagnose the bias and variance of the model.  High-bias (underfitting) situations can often be corrected by including additional features, either by measurement or feature engineering, or by decreasing the degree of regularization.  High-variance (overfitting) situations can often be corrected by increasing the number of data samples or by reducing the data dimensionality via feature selection or PCA.

In [22]:
# Learning curve of the PCA pipeline: 50 training-set fractions from 10% to
# 100%, each evaluated with 10-fold cross-validation.
train_sizes, train_scores, test_scores = sklearn.model_selection.learning_curve(
    lr_pipe0, X_train, y_train,
    train_sizes=np.linspace(0.1, 1, num=50),
    cv=10,
    n_jobs=-1,
)

In [23]:
# Collapse the per-fold scores (axis 1) into a mean and a spread per point.
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

In [24]:
# Learning-curve plot: mean accuracy lines with a shaded +/- 1 std band.
p = bokeh.plotting.figure(width=500, height=300,
                          x_range=list(train_sizes[[0, -1]]),
                          y_range=[.80, 1],
                          x_axis_label='Number of training samples',
                          y_axis_label='Accuracy')

# Each band segment spans from the midpoint with the previous x value to the
# midpoint with the next one; the first/last segments stop at the end points.
# The builtin `float` replaces the `np.float` alias (removed in NumPy 1.24).
left = train_sizes.astype(float)
left[1:] -= np.diff(train_sizes)/2
right = train_sizes.astype(float)
right[:-1] += np.diff(train_sizes)/2

p.quad(left=left, right=right,
       bottom=train_mean-train_std,
       top=train_mean+train_std,
       alpha=0.2, color=palette[0])
p.line(train_sizes, train_mean, color=palette[0], legend='training accuracy')

p.quad(left=left, right=right,
       bottom=test_mean-test_std,
       top=test_mean+test_std,
       alpha=0.2, color=palette[1])
p.line(train_sizes, test_mean, color=palette[1], line_dash='dashed', legend='x-validation accuracy')

p.legend.location = 'bottom_right'

bokeh.plotting.show(p)

This seems to have reasonable consistency between the training and cross-validation accuracy curves, but we should be able to increase the accuracy by using more of the features.  Rather than using only the top 2 PCA components, let's now try applying the L2-penalized logistic regression to all of the standardized features and generate a new learning curve.

In [25]:
# Same as the first pipeline but without the PCA step: standardize, then fit an
# L2-penalised logistic regression on all features.
lr_pipe1 = sklearn.pipeline.Pipeline(steps=[
    ('scl', sklearn.preprocessing.StandardScaler()),
    ('clf', sklearn.linear_model.LogisticRegression(random_state=rs, penalty='l2')),
])

In [26]:
# Learning curve of the no-PCA pipeline, on the same grid of training sizes.
train_sizes, train_scores, test_scores = sklearn.model_selection.learning_curve(
    lr_pipe1, X_train, y_train,
    train_sizes=np.linspace(0.1, 1, num=50),
    cv=10,
    n_jobs=-1,
)

In [27]:
# Per-point mean and standard deviation across the cross-validation folds.
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

In [28]:
# Learning-curve plot for the no-PCA pipeline: mean lines with +/- 1 std bands.
p = bokeh.plotting.figure(width=500, height=300,
                          x_range=list(train_sizes[[0, -1]]),
                          y_range=[.80, 1],
                          x_axis_label='Number of training samples',
                          y_axis_label='Accuracy')

# Band segment edges sit halfway between neighbouring x values.
# The builtin `float` replaces the `np.float` alias (removed in NumPy 1.24).
left = train_sizes.astype(float)
left[1:] -= np.diff(train_sizes)/2
right = train_sizes.astype(float)
right[:-1] += np.diff(train_sizes)/2

p.quad(left=left, right=right,
       bottom=train_mean-train_std,
       top=train_mean+train_std,
       alpha=0.2, color=palette[0])
p.line(train_sizes, train_mean, color=palette[0], legend='training accuracy')

p.quad(left=left, right=right,
       bottom=test_mean-test_std,
       top=test_mean+test_std,
       alpha=0.2, color=palette[1])
p.line(train_sizes, test_mean, color=palette[1], line_dash='dashed', legend='validation accuracy')

p.legend.location = 'bottom_right'

bokeh.plotting.show(p)

# Validation Curves

Now let's analyze the accuracy as we vary the model parameters.  Validation curves provide a valuable way to tune model hyperparameters, i.e., settings that are chosen before training rather than learned from the data.  For the logistic regression, let's vary the inverse regularization parameter, C (`clf__C`), first for the 2-component PCA pipeline, then for the pipeline applying the L2 penalty to all of the features.

In [29]:
# Sweep the classifier's C over 15 log-spaced values in [1e-3, 1e2] for the PCA
# pipeline, scoring each value with 10-fold cross-validation.
n_points = 15
param_range = np.logspace(-3, 2, num=n_points)
train_scores, test_scores = sklearn.model_selection.validation_curve(
    lr_pipe0, X_train, y_train,
    param_name='clf__C',
    param_range=param_range,
    cv=10,
    n_jobs=-1,
)

In [30]:
# Mean and standard deviation over folds for every value of C.
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

In [31]:
# Validation-curve plot (log-scaled C axis) for the PCA pipeline.
p = bokeh.plotting.figure(width=500, height=300,
                          x_range=list(param_range[[0, -1]]),
                          y_range=[.80, 1],
                          x_axis_label='Parameter C',
                          y_axis_label='Accuracy',
                          x_axis_type='log',
                         )

# Band segment edges sit at the arithmetic midpoint between neighbouring C
# values; the first/last segments stop at the end points.
# The builtin `float` replaces the `np.float` alias (removed in NumPy 1.24).
left = param_range.astype(float)
left[1:] -= np.diff(param_range)/2
right = param_range.astype(float)
right[:-1] += np.diff(param_range)/2

p.quad(left=left, right=right,
       bottom=train_mean-train_std,
       top=train_mean+train_std,
       alpha=0.2, color=palette[0])
p.line(param_range, train_mean, color=palette[0], legend='training accuracy')

p.quad(left=left, right=right,
       bottom=test_mean-test_std,
       top=test_mean+test_std,
       alpha=0.2, color=palette[1])
p.line(param_range, test_mean, color=palette[1], line_dash='dashed', legend='validation accuracy')

p.legend.location = 'bottom_right'

bokeh.plotting.show(p)

In [32]:
# Quick check of a power of ten, for reading values off the log-scaled C axis.
10 ** -2 

0.01

This seems to do reasonably well for the parameter range $.001 < C < 1$.

In [33]:
# Same sweep of C (15 log-spaced values in [1e-3, 1e2]) for the no-PCA pipeline.
n_points = 15
param_range = np.logspace(-3, 2, num=n_points)
train_scores, test_scores = sklearn.model_selection.validation_curve(
    lr_pipe1, X_train, y_train,
    param_name='clf__C',
    param_range=param_range,
    cv=10,
    n_jobs=-1,
)

In [34]:
# Fold-wise mean and standard deviation of accuracy for each C.
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

In [35]:
# Validation-curve plot (log-scaled C axis) for the no-PCA pipeline.
p = bokeh.plotting.figure(width=500, height=300,
                          x_range=list(param_range[[0, -1]]),
                          y_range=[.80, 1],
                          x_axis_label='Parameter C',
                          y_axis_label='Accuracy',
                          x_axis_type='log',
                         )

# Band segment edges sit at the arithmetic midpoint between neighbouring C
# values; the first/last segments stop at the end points.
# The builtin `float` replaces the `np.float` alias (removed in NumPy 1.24).
left = param_range.astype(float)
left[1:] -= np.diff(param_range)/2
right = param_range.astype(float)
right[:-1] += np.diff(param_range)/2

p.quad(left=left, right=right,
       bottom=train_mean-train_std,
       top=train_mean+train_std,
       alpha=0.2, color=palette[0])
p.line(param_range, train_mean, color=palette[0], legend='training accuracy')

p.quad(left=left, right=right,
       bottom=test_mean-test_std,
       top=test_mean+test_std,
       alpha=0.2, color=palette[1])
p.line(param_range, test_mean, color=palette[1], line_dash='dashed', legend='validation accuracy')

p.legend.location = 'bottom_right'

bokeh.plotting.show(p)

This seems to do best for $C\approx 0.3$.

Note that [Bokeh PR #6177](https://github.com/bokeh/bokeh/pull/6177) will soon add `bokeh.models.FilledArea` plot option to improve these plots.

# Confusion Matrix
Let's evaluate the performance of a support vector machine pipeline.

In [36]:
# Reminder of the integer codes the fitted encoder assigns to 'M' and 'B'.
le.transform(['M', 'B'])

array([1, 0])

In [37]:
# Standardize the features, fit an SVC on the training split and predict the
# held-out test split.
svc_pipe = sklearn.pipeline.Pipeline(steps=[
    ('scl', sklearn.preprocessing.StandardScaler()),
    ('clf', sklearn.svm.SVC(random_state=rs)),
])
y_pred = svc_pipe.fit(X_train, y_train).predict(X_test)

# Passing the labels in 'M', 'B' order puts the 'M' class in the first
# row/column of the matrix.
labels = le.transform(['M', 'B'])
conf_mat = sklearn.metrics.confusion_matrix(y_test, y_pred, labels=labels)
print(conf_mat)

[[40  2]
 [ 1 71]]


In [38]:
# Flatten the 2x2 confusion matrix row by row into parallel columns for Bokeh:
# true label, predicted label, cell count, and a fill opacity that grows with
# the cell's share of all samples (offset by 0.2).
actual = list(np.repeat(['1', '0'], 2))      # '1', '1', '0', '0'
predicted = list(np.tile(['1', '0'], 2))     # '1', '0', '1', '0'

count = list(conf_mat.ravel())

alpha = list(conf_mat.ravel() / conf_mat.sum() + 0.2)

source = bokeh.plotting.ColumnDataSource(data={
    'predicted': predicted,
    'actual': actual,
    'count': count,
    'alpha': alpha,
})

In [39]:
# Draw the confusion matrix as a 2x2 grid of squares on categorical axes.
p = bokeh.plotting.figure(
    title='Confusion Matrix', tools="hover,save",
    x_range=['1', '0'], y_range=['0', '1'],
    x_axis_label='predicted label',
    x_axis_location='above',
    y_axis_label='true label',
    width=200, height=200,
)

# One square per matrix cell; the opacity comes from the 'alpha' column.
rectwidth = 0.9
p.rect(x='predicted', y='actual', width=rectwidth, height=rectwidth,
       source=source, color=palette[0], alpha='alpha', line_width=1)

# Hover tooltip shows the three data columns, in this order.
hover = p.select(dict(type=bokeh.models.HoverTool))
hover.tooltips = collections.OrderedDict(
    (field, '@' + field) for field in ('predicted', 'actual', 'count'))

bokeh.plotting.show(p)

In [40]:
# Row-major unpacking of the 2x2 matrix into its four cells.
tp, fn, fp, tn = conf_mat.ravel()

For this situation we have 40 true positives (TP), 71 true negatives (TN), 2 false negatives (FN), and 1
false positive (FP).  We calculate the error (ERR) to be
$$ ERR = \frac{FP + FN}{FP + FN + TP + TN}$$

In [41]:
# Error rate: misclassified samples over all samples.
err = (fn + fp) / (tp + tn + fp + fn)
print('ERR = ' + format(err, '.3'))

ERR = 0.0263


and the accuracy (ACC) to be
$$ ACC = \frac{TP + TN}{FP + FN + TP + TN} = 1 - ERR $$

In [42]:
# Accuracy is the complement of the error rate.
acc = 1 - err
print('ACC = ' + format(acc, '.3'))

ACC = 0.974


We can also calculate the true positive rate (TPR) 
$$ TPR = \frac{TP}{P} = \frac{TP}{TP + FN} $$

In [43]:
# P = all samples whose true label is positive.
# NOTE(review): `p` is also the name used for Bokeh figures in this notebook.
p = fn + tp
tpr = tp / p
print('TPR = ' + format(tpr, '.3'))

TPR = 0.952


and false positive rate (FPR)
$$ FPR = \frac{FP}{N} = \frac{FP}{FP + TN} $$

In [44]:
# N = all samples whose true label is negative.
n = tn + fp
fpr = fp / n
print('FPR = ' + format(fpr, '.3'))

FPR = 0.0139


Two additional metrics are precision (PRE) and recall (REC).  Precision is the ratio of positive predictions that are true positives (the denominator is the left side of the confusion matrix)
$$ PRE = \frac{TP}{TP + FP}\,. $$

In [45]:
# Precision: share of positive predictions that are correct.
pre = tp / (fp + tp)
print('PRE = ' + format(pre, '.3'))

PRE = 0.976


Recall is the ratio of positive labels that were predicted correctly, TP; note that this is identical to the true positive rate
$$ REC = TPR = \frac{TP}{P} = \frac{TP}{TP + FN} $$

In [46]:
# Recall: share of actual positives that were predicted positive.
# Computed from `tp` and `fn` directly instead of the global `p`, because `p`
# is also the name every plotting cell assigns its Bokeh figure to; relying on
# it makes this cell break when run after any plot cell.
rec = tp / (tp + fn)
print('REC = {:.3}'.format(rec))

REC = 0.952


It is common to use a combination of precision and recall known as the F1-score
$$ F1 = 2 \frac{PRE \times REC}{PRE + REC} = 2 \frac{0.976 \times 0.952}{0.976 + 0.952} = 0.964\,. $$

In [47]:
# F1: harmonic mean of precision and recall.
f1 = 2 * pre * rec / (pre + rec)
print('F1 = ' + format(f1, '.3'))

F1 = 0.964


We can get these metrics directly from `sklearn.metrics`.

In [48]:
# Cross-check the hand-computed precision against scikit-learn.
print('Precision: {:.3}'.format(
    sklearn.metrics.precision_score(y_test, y_pred)))

Precision: 0.976


In [49]:
# Cross-check the hand-computed recall against scikit-learn.
print('Recall: {:.3}'.format(
    sklearn.metrics.recall_score(y_test, y_pred)))

Recall: 0.952


In [50]:
# Cross-check the hand-computed F1 score against scikit-learn.
print('F1: {:.3}'.format(
    sklearn.metrics.f1_score(y_test, y_pred)))

F1: 0.964


# Receiver Operator Characteristic
The receiver operator characteristic (ROC) graph is useful to evaluate model classification based on the true/false positive rate.  This is calculated by shifting the decision threshold of the classifier.

We need to enable `probability=True` for the SVC.

In [51]:
# Rebuild the SVC pipeline with probability estimates enabled so that
# `predict_proba` is available for the ROC analysis.
svc_pipe = sklearn.pipeline.Pipeline(steps=[
    ('scl', sklearn.preprocessing.StandardScaler()),
    ('clf', sklearn.svm.SVC(probability=True, random_state=rs)),
])

Let's reduce the number of folds for simpler visualization.

In [52]:
# First pass at per-fold ROC curves plus their mean. This version prints the
# running value of mean_tpr[0] to show that it does NOT start at zero.
n_splits = 3
# `random_state` is omitted: it has no effect without `shuffle=True`, and
# recent scikit-learn raises ValueError for that combination.
skf3 = sklearn.model_selection.StratifiedKFold(n_splits=n_splits)
kfold3 = skf3.split(X_train, y_train)

mean_tpr = 0.0  # becomes an array after the first `+=` below
mean_fpr = np.linspace(0, 1, 100)  # the x-values for the curves

p = bokeh.plotting.figure(width=500, height=500,
                          x_axis_label='False Positive Rate',
                          y_axis_label='True Positive Rate',
                          title='Receiver Operator Characteristic')

for k, (train, test) in enumerate(kfold3):
    print('k: {}'.format(k))
    proba = svc_pipe.fit(X_train[train], y_train[train]).predict_proba(X_train[test])

    fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_train[test], proba[:, 1], pos_label=1)

    # Resample this fold's curve onto the shared FPR grid and accumulate it.
    # `np.interp` replaces the `scipy.interp` alias, which newer SciPy
    # releases no longer provide.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    # Deliberately not reset to 0 here, so the running total is visible.
    print(mean_tpr[0])

    roc_auc = sklearn.metrics.auc(fpr, tpr)
    p.line(fpr, tpr, legend='ROC fold {}, AUC = {:.3}'.format(k+1, roc_auc), color=palette[k])


# Folds use palette[0 .. n_splits-1]; the reference curves take the next
# colours, indexed by n_splits rather than by the leaked loop variable.
p.line([0, 1], [0, 1], line_dash='dashed', legend='random guessing', color=palette[n_splits])

mean_tpr /= n_splits
print(mean_tpr[-1])
mean_tpr[-1] = 1.0
mean_auc = sklearn.metrics.auc(mean_fpr, mean_tpr)
p.line(mean_fpr, mean_tpr, legend='mean ROC, AUC = {:.3}'.format(mean_auc),
       color=palette[n_splits+1], line_width=2)

p.line([0, 0, 1], [0, 1, 1], line_dash='dotted', legend='perfect performance', color=palette[n_splits+2])


p.legend.location = 'bottom_right'


bokeh.plotting.show(p)

k: 0
0.771929824561
k: 1
1.71929824561
k: 2
2.64786967419
1.0


In [53]:
# Last fold's TPR resampled onto the common FPR grid. `np.interp` replaces the
# `scipy.interp` alias, which newer SciPy releases no longer provide.
np.interp(mean_fpr, fpr, tpr)

array([ 0.92857143,  0.92857143,  0.96428571,  0.96428571,  0.96428571,
        0.98214286,  0.98214286,  0.98214286,  0.98214286,  0.98214286,
        0.98214286,  0.98214286,  0.98214286,  0.98214286,  0.98214286,
        0.98214286,  0.98214286,  0.98214286,  0.98214286,  0.98214286,
        0.98214286,  0.98214286,  0.98214286,  0.98214286,  0.98214286,
        0.98214286,  0.98214286,  0.98214286,  0.98214286,  0.98214286,
        0.98214286,  0.98214286,  0.98214286,  0.98214286,  0.98214286,
        0.98214286,  0.98214286,  0.98214286,  0.98214286,  0.98214286,
        0.98214286,  0.98214286,  0.98214286,  0.98214286,  0.98214286,
        0.98214286,  0.98214286,  0.98214286,  0.98214286,  0.98214286,
        0.98214286,  0.98214286,  0.98214286,  0.98214286,  0.98214286,
        0.98214286,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.  

In [54]:
# False positive rates returned by roc_curve for the last fold.
fpr

array([ 0.        ,  0.        ,  0.01052632,  0.01052632,  0.04210526,
        0.04210526,  0.55789474,  0.55789474,  1.        ])

In [55]:
# Matching true positive rates for the last fold.
tpr

array([ 0.01785714,  0.92857143,  0.92857143,  0.96428571,  0.96428571,
        0.98214286,  0.98214286,  1.        ,  1.        ])

In [56]:
# Per-fold ROC curves and their mean, with the mean curve pinned to (0, 0)
# and (1, 1).
n_splits = 3
# `random_state` is omitted: it has no effect without `shuffle=True`, and
# recent scikit-learn raises ValueError for that combination.
skf3 = sklearn.model_selection.StratifiedKFold(n_splits=n_splits)
kfold3 = skf3.split(X_train, y_train)

mean_tpr = 0.0  # becomes an array after the first `+=` below
mean_fpr = np.linspace(0, 1, 100)  # the x-values for the curves

p = bokeh.plotting.figure(width=500, height=500,
                          x_axis_label='False Positive Rate',
                          y_axis_label='True Positive Rate',
                          title='Receiver Operator Characteristic')

for k, (train, test) in enumerate(kfold3):
    proba = svc_pipe.fit(X_train[train], y_train[train]).predict_proba(X_train[test])

    fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_train[test], proba[:, 1], pos_label=1)

    # Resample onto the shared FPR grid; `np.interp` replaces the
    # `scipy.interp` alias, which newer SciPy releases no longer provide.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)

    roc_auc = sklearn.metrics.auc(fpr, tpr)
    p.line(fpr, tpr, legend='ROC fold {}, AUC = {:.3}'.format(k+1, roc_auc), color=palette[k])


# Reference curves take colours from the end of the palette so they do not
# reuse the fold colours palette[0 .. n_splits-1].
p.line([0, 1], [0, 1], line_dash='dashed', legend='random guessing', color=palette[-1])

mean_tpr /= n_splits
# When roc_curve returns several points at fpr == 0, interpolation at 0 yields
# the TPR already reached there rather than 0, so pin the curve's start.
mean_tpr[0] = 0.0
mean_tpr[-1] = 1.0  # make sure end-point is 1
mean_auc = sklearn.metrics.auc(mean_fpr, mean_tpr)
p.line(mean_fpr, mean_tpr, legend='mean ROC, AUC = {:.3}'.format(mean_auc),
       color=palette[-2], line_width=2)

p.line([0, 0, 1], [0, 1, 1], line_dash='dotted', legend='perfect performance', color=palette[-3])

p.legend.location = 'bottom_right'

bokeh.plotting.show(p)

In [57]:
# Per-fold ROC curves and their mean; additionally keeps each fold's raw
# fpr/tpr arrays so their shapes can be inspected afterwards.
n_splits = 3
# `random_state` is omitted: it has no effect without `shuffle=True`, and
# recent scikit-learn raises ValueError for that combination.
skf3 = sklearn.model_selection.StratifiedKFold(n_splits=n_splits)
kfold3 = skf3.split(X_train, y_train)

mean_tpr = 0.0  # becomes an array after the first `+=` below
mean_fpr = np.linspace(0, 1, 100)  # the x-values for the curves
all_fpr = []
all_tpr = []

p = bokeh.plotting.figure(width=500, height=500,
                          x_axis_label='False Positive Rate',
                          y_axis_label='True Positive Rate',
                          title='Receiver Operator Characteristic')

for k, (train, test) in enumerate(kfold3):
    proba = svc_pipe.fit(X_train[train], y_train[train]).predict_proba(X_train[test])

    fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_train[test], proba[:, 1], pos_label=1)

    all_fpr.append(fpr)
    all_tpr.append(tpr)
    # Resample onto the shared FPR grid; `np.interp` replaces the
    # `scipy.interp` alias, which newer SciPy releases no longer provide.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)

    roc_auc = sklearn.metrics.auc(fpr, tpr)
    p.line(fpr, tpr, legend='ROC fold {}, AUC = {:.3}'.format(k+1, roc_auc), color=palette[k])


p.line([0, 1], [0, 1], line_dash='dashed', legend='random guessing', color=palette[-1])

mean_tpr /= n_splits
# When roc_curve returns several points at fpr == 0, interpolation at 0 yields
# the TPR already reached there rather than 0, so pin the curve's start.
mean_tpr[0] = 0.0
mean_tpr[-1] = 1.0  # make sure end-point is 1
mean_auc = sklearn.metrics.auc(mean_fpr, mean_tpr)
p.line(mean_fpr, mean_tpr, legend='mean ROC, AUC = {:.3}'.format(mean_auc),
       color=palette[-2], line_width=2)

p.line([0, 0, 1], [0, 1, 1], line_dash='dotted', legend='perfect performance', color=palette[-3])

p.legend.location = 'bottom_right'

bokeh.plotting.show(p)

In [58]:
# Number of ROC points returned for each of the three folds (TPR arrays).
all_tpr[0].shape, all_tpr[1].shape, all_tpr[2].shape

((19,), (9,), (9,))

In [59]:
# Number of ROC points returned for each of the three folds (FPR arrays).
all_fpr[0].shape, all_fpr[1].shape, all_fpr[2].shape

((19,), (9,), (9,))

It is interesting that the output from `sklearn.metrics.roc_curve` is not a consistent shape; the number of points it returns varies from fold to fold.

We see this model performs quite well, as we can tune the prediction probability threshold so that the false positive rate is low but the true positive rate is almost 100%, e.g., where FPR=0.13 the TPR=0.994, or where FPR=0.08 the TPR=0.982 (depending on what FPR you are willing to accept).