In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Streamlining Workflows with Pipelines

In [None]:
# Read the breast-cancer-wisconsin `wdbc.data` file straight from the UCI
# repository. header=None: the first row is data, so columns get integer names.
wdbc_url = (
    'https://archive.ics.uci.edu/ml/'
    'machine-learning-databases'
    '/breast-cancer-wisconsin/wdbc.data'
)
df = pd.read_csv(wdbc_url, header=None)

In [None]:
# Columns 2 onward form the feature matrix; column 1 holds the class labels.
X = df.loc[:, 2:].values

# Convert the raw labels to integer codes, then display the class order the
# encoder learned (position in `classes_` == integer code).
le = LabelEncoder()
y = le.fit_transform(df.loc[:, 1].values)
le.classes_

In [None]:
# Sanity check: show the integer codes the fitted encoder assigns to the
# labels 'M' and 'B' (in that order).
le.transform(['M', 'B'])

In [None]:
# Hold out 20% of the samples as the test set. Stratifying on y keeps the
# class proportions the same in both parts; the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.20, stratify=y, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
from sklearn.pipeline import make_pipeline

# Three chained stages: standardize the features, project them onto two
# principal components, then classify with logistic regression.
lr_stages = [
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(random_state=1, solver='liblinear'),
]
pipe_lr = make_pipeline(*lr_stages)

# Fit all stages on the training data only.
pipe_lr.fit(X_train, y_train)

# Predicted labels for the held-out test set.
y_pred = pipe_lr.predict(X_test)

# Accuracy on the held-out test set.
print(f'Test Accuracy: {pipe_lr.score(X_test, y_test):.3f}')

# K-Fold Cross-Validation to Assess Model Performance

In [None]:
from sklearn.model_selection import StratifiedKFold

# Build the 10 stratified folds over the training set.
# No random_state is passed: the folds are not shuffled, so a seed has no
# effect on them, and recent scikit-learn releases raise ValueError when
# random_state is set while shuffle is left at its default of False.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)
scores = []

# For every fold, refit the pipeline on the fold's training indices and score
# it on the fold's held-out indices.
for k, (train, test) in enumerate(kfold):
    pipe_lr.fit(X_train[train], y_train[train])
    score = pipe_lr.score(X_train[test], y_train[test])
    scores.append(score)
    # np.bincount reports how many samples of each class are in the training part
    print(f'Fold: {k+1}, Class dist.: {np.bincount(y_train[train])}, Acc: {score:.3f}')

# Mean and standard deviation of the per-fold accuracies.
print(f'\nCV accuracy: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')

In [None]:
from sklearn.model_selection import cross_val_score

# Let scikit-learn run the 10-fold cross-validation of the pipeline on the
# training set in a single call (one worker process).
scores = cross_val_score(pipe_lr, X_train, y_train, cv=10, n_jobs=1)

# Per-fold accuracies, followed by their mean and standard deviation.
print(f'CV accuracy scores: {scores}')
print(f'\nCV accuracy: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')

# Debugging Algorithms with Learning and Validation Curves

## Diagnosing Bias and Variance Problems with Learning Curves

In [None]:
from sklearn.model_selection import learning_curve

# Standardize, then fit an L2-penalized logistic regression (no PCA this time).
pipe_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(penalty='l2', random_state=1, solver='liblinear'),
)

# Evaluate the pipeline with 10-fold CV at ten evenly spaced training-set
# fractions, from 10% up to 100% of the available training data.
train_sizes, train_scores, test_scores = learning_curve(
    pipe_lr, X_train, y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    n_jobs=1,
)

# Collapse axis 1 of the score arrays into a mean and a standard deviation
# per training-set size.
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)

In [None]:
# Learning curve: mean accuracy per training-set size with a +/- 1 std band.
plt.figure(figsize=(15, 8))

# (mean, std, line styling) for the training curve and the validation curve.
learning_curves = [
    (train_mean, train_std,
     dict(color='blue', marker='o', markersize=5, label='training accuracy')),
    (test_mean, test_std,
     dict(color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')),
]
for curve_mean, curve_std, line_style in learning_curves:
    plt.plot(train_sizes, curve_mean, **line_style)
    # shaded band shares the line's color
    plt.fill_between(train_sizes, curve_mean + curve_std, curve_mean - curve_std,
                     alpha=0.15, color=line_style['color'])

plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.ylim([0.8, 1.0])
plt.grid()
plt.legend(loc='lower right')
plt.show()

## Addressing Over- and Underfitting with Validation Curves

In [None]:
from sklearn.model_selection import validation_curve

# Candidate values for the logistic regression's C parameter.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

# Score the pipeline with 10-fold CV once per candidate C.
# 'logisticregression__C' addresses the C parameter of the pipeline's
# LogisticRegression step.
train_scores, test_scores = validation_curve(
    pipe_lr, X_train, y_train,
    param_name='logisticregression__C',
    param_range=param_range,
    cv=10,
)

# Collapse axis 1 of the score arrays into a mean and a standard deviation
# per candidate C.
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)

In [None]:
# Validation curve: mean accuracy per value of C with a +/- 1 std band.
plt.figure(figsize=(15, 8))

# (mean, std, line styling) for the training curve and the validation curve.
validation_curves = [
    (train_mean, train_std,
     dict(color='blue', marker='o', markersize=5, label='training accuracy')),
    (test_mean, test_std,
     dict(color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')),
]
for curve_mean, curve_std, line_style in validation_curves:
    plt.plot(param_range, curve_mean, **line_style)
    # shaded band shares the line's color
    plt.fill_between(param_range, curve_mean + curve_std, curve_mean - curve_std,
                     alpha=0.15, color=line_style['color'])

# C spans several orders of magnitude, hence the logarithmic x axis.
plt.xscale('log')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.8, 1.03])
plt.grid()
plt.legend(loc='lower right')
plt.show()

# Fine-Tuning Machine Learning Models via Grid Search

## Tuning Hyperparameters via Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Standardize the features, then classify with a support vector machine.
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))

# Candidate values shared by C and gamma.
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Two sub-grids: the linear kernel only tunes C, the RBF kernel tunes C and gamma.
linear_grid = {'svc__C': param_range, 'svc__kernel': ['linear']}
rbf_grid = {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']}
param_grid = [linear_grid, rbf_grid]

# Exhaustive search with 10-fold CV on accuracy, using all available cores.
gs = GridSearchCV(
    pipe_svc,
    param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

# Best cross-validated accuracy and the parameter combination that achieved it.
print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Estimate the generalization performance of the best model found by the grid
# search by scoring it on the held-out test set.
# NOTE(review): GridSearchCV refits best_estimator_ on the data passed to
# fit() when `refit` is left at its default, so the explicit fit below is
# presumably redundant — confirm before removing.
clf = gs.best_estimator_
clf.fit(X_train, y_train)
print(f'Test accuracy: {clf.score(X_test, y_test):.3f}')

## Algorithm Selection with Nested Cross-Validation

In [None]:
# Inner loop: 2-fold grid search over the SVC parameter grid.
gs = GridSearchCV(pipe_svc, param_grid, scoring='accuracy', cv=2)

# Outer loop: 5-fold CV in which every outer training fold runs its own
# grid search (nested cross-validation).
scores = cross_val_score(
    estimator=gs,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

# Mean and standard deviation over the outer folds.
print(f'CV accuracy: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Inner loop: 2-fold grid search over the tree depth (None = no depth limit).
tree_depths = [1, 2, 3, 4, 5, 6, 7, None]
gs = GridSearchCV(
    DecisionTreeClassifier(random_state=0),
    [{'max_depth': tree_depths}],
    scoring='accuracy',
    cv=2,
)

# Outer loop: 5-fold CV around the grid search, same protocol as for the SVC
# so the two algorithms can be compared.
scores = cross_val_score(
    estimator=gs,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print(f'CV accuracy: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')

# Performance Evaluation Metrics

## Reading a Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Fit the SVC pipeline on the training set and predict the test set in one chain.
y_pred = pipe_svc.fit(X_train, y_train).predict(X_test)

# Cross-tabulate the true test labels against the predicted labels.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

In [None]:
# Draw the confusion matrix as a shaded grid with the count written in each cell.
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)

# Walk the matrix in row-major order; row index i goes on the y axis,
# column index j on the x axis.
for (i, j), cell_count in np.ndenumerate(confmat):
    ax.text(x=j, y=i, s=cell_count, va='center', ha='center')

plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()

## Optimizing Precision and Recall of Classification Models

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Precision, recall and F1 of the current test-set predictions in y_pred.
print(f'Precision: {precision_score(y_true=y_test, y_pred=y_pred):.3f}')
print(f'Recall: {recall_score(y_true=y_test, y_pred=y_pred):.3f}')
print(f'F1: {f1_score(y_true=y_test, y_pred=y_pred):.3f}')

In [None]:
from sklearn.metrics import make_scorer, f1_score

# Custom scorer: F1 score computed with class 0 treated as the positive class.
scorer = make_scorer(f1_score, pos_label=0)

# Same SVC grid search as before, but model selection is driven by the
# custom F1 scorer instead of accuracy.
gs = GridSearchCV(
    pipe_svc,
    param_grid,
    scoring=scorer,
    cv=10,
).fit(X_train, y_train)

# Best cross-validated score and the parameters that produced it.
print(gs.best_score_)
print(gs.best_params_)

## Plotting a Receiver Operating Characteristic

In [None]:
from sklearn.metrics import roc_curve, auc
# `scipy.interp` was only an alias of `numpy.interp` and is no longer available
# in current SciPy releases; import the NumPy function under the same name.
from numpy import interp

# Standardize, project onto two principal components, then classify with an
# L2-penalized logistic regression (C=100.0).
pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        LogisticRegression(penalty='l2', solver='liblinear', random_state=1, C=100.0))

# Use only feature columns 4 and 14 for the ROC analysis.
X_train2 = X_train[:, [4, 14]]

# Materialize the 3 stratified folds as a list so they can be iterated more
# than once. No random_state is passed: the folds are not shuffled, so a seed
# has no effect, and recent scikit-learn releases raise ValueError when
# random_state is set while shuffle is left at its default of False.
cv = list(StratifiedKFold(n_splits=3).split(X_train, y_train))

# Shared false-positive-rate grid on which every fold's ROC curve is resampled,
# and the matching accumulator for the summed true positive rates.
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
all_tpr = []

In [None]:
fig = plt.figure(figsize=(10, 10))

# Start the accumulator from zero inside this cell, so re-running the cell
# does not add a second round of folds on top of the previous sum.
mean_tpr = np.zeros_like(mean_fpr)

# for each fold ...
for i, (train, test) in enumerate(cv):
    # ... fit logreg model on the fold's training part and predict class
    # probabilities for its held-out part
    probas = pipe_lr.fit(X_train2[train], y_train[train]).predict_proba(X_train2[test])
    # ... calculate ROC curve from the predicted probability of class 1
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # ... resample this fold's curve onto the shared FPR grid so the folds can
    # be averaged point by point (np.interp replaces the removed scipy.interp)
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    # ... calculate area under curve
    roc_auc = auc(fpr, tpr)
    # ... plot curve
    plt.plot(fpr, tpr, label=f'ROC fold {i+1} (area = {roc_auc:.2f})')

# calculate mean ROC; pin the last point to 1.0 so the curve ends at (1, 1)
mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)

# plot random guessing, mean ROC, and perfect performance
plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6), label='random guessing')
# the legend reports the AUC of the mean curve, not the last fold's roc_auc
plt.plot(mean_fpr, mean_tpr, 'k--', label=f'mean ROC (area = {mean_auc:.2f})', lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='black', label='perfect performance')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(loc="lower right")
plt.show()

## Scoring Metrics for Multiclass Classification

In [None]:
# Custom scorer built on precision_score with micro-averaging.
# NOTE(review): scikit-learn documents pos_label as applying to
# average='binary'; with average='micro' it likely has no effect — confirm.
pre_scorer = make_scorer(
    score_func=precision_score,
    pos_label=1,
    greater_is_better=True,
    average='micro',
)

In [None]:
# SVC grid search with 10-fold CV, selecting the model by the micro-averaged
# precision scorer.
gs = GridSearchCV(
    pipe_svc,
    param_grid,
    scoring=pre_scorer,
    cv=10,
).fit(X_train, y_train)

# Best cross-validated score and the parameters that produced it.
print(gs.best_score_)
print(gs.best_params_)

# Class Imbalance

In [None]:
# Build an imbalanced data set: keep every class-0 sample but only the
# first 40 class-1 samples.
class0_mask = (y == 0)
class1_mask = (y == 1)
X_imb = np.vstack((X[class0_mask], X[class1_mask][:40]))
y_imb = np.hstack((y[class0_mask], y[class1_mask][:40]))

In [None]:
# Baseline on the imbalanced set: predict `0` (benign) for every sample and
# report the resulting accuracy in percent.
y_pred = np.zeros(len(y_imb))
(y_pred == y_imb).mean() * 100

In [None]:
from sklearn.utils import resample

# Number of class-1 rows in the imbalanced set, before any upsampling.
print('Number of class 1 samples before:', X_imb[y_imb == 1].shape[0])

In [None]:
# Upsample class `1` (malignant): draw with replacement from its rows until
# there are as many of them as there are class-0 rows. The fixed seed makes
# the draw repeatable.
minority_mask = (y_imb == 1)
n_majority = X_imb[y_imb == 0].shape[0]
X_upsampled, y_upsampled = resample(
    X_imb[minority_mask],
    y_imb[minority_mask],
    replace=True,
    n_samples=n_majority,
    random_state=123,
)
print('Number of class 1 samples after:', X_upsampled.shape[0])

In [None]:
# Stack the class-`0` rows of the imbalanced set with the upsampled class-`1`
# rows. The class-0 rows are taken from X_imb / y_imb (the set being balanced)
# rather than from the full X / y, so this cell stays correct if the
# imbalanced set is ever built differently.
majority_mask = (y_imb == 0)
X_bal = np.vstack((X_imb[majority_mask], X_upsampled))
y_bal = np.hstack((y_imb[majority_mask], y_upsampled))

In [None]:
# Same baseline on the balanced set: predict `0` (benign) for every sample
# and report the resulting accuracy in percent.
y_pred = np.zeros(len(y_bal))
(y_pred == y_bal).mean() * 100