In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used further down - consider narrowing the filter.
warnings.filterwarnings("ignore")

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input
# directory, then print them one per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install sklearn --upgrade --quiet --disable-pip-version-check

# Introduction
The dataset used in this notebook is publicly available and was created by Dr. William H. Wolberg, physician at the University Of Wisconsin Hospital at Madison, Wisconsin, USA. The original source, UCI Machine Learning Repository for breast cancer dataset can be found [here](http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29).

To create the dataset, Dr. Wolberg used digitized images of fluid samples taken from patients with solid breast masses, together with a computer program capable of analysing the cytological features of the cell nuclei in such digital scans. The program uses a curve-fitting algorithm to compute ten features for each of the cells in the sample, then calculates the mean value, standard error and extreme (worst) value of each feature for the image, returning a 30-dimensional real-valued vector.

More advanced information on the mathematical principles and the algorithm implementation can be found in the literature below:

[1] K. P. Bennett, "Decision Tree Construction Via Linear Programming" Proceedings of the 4th Midwest Artificial Intelligence and Cognitive Science Society, 1992

[2] K. P. Bennett and O. L. Mangasarian: "Robust Linear Programming Discrimination of Two Linearly Inseparable Sets", Optimization Methods and Software 1, 1992

# Dataset
Attribute Information:

1) ID number

2) Diagnosis (M = malignant, B = benign)

3-32) Ten real-valued features are computed for each cell nucleus:

* a) radius (mean of distances from center to points on the perimeter)
* b) texture (standard deviation of gray-scale values)
* c) perimeter
* d) area
* e) smoothness (local variation in radius lengths)
* f) compactness (perimeter^2 / area - 1.0)
* g) concavity (severity of concave portions of the contour)
* h) concave points (number of concave portions of the contour)
* i) symmetry
* j) fractal dimension ("coastline approximation" - 1)

The mean, standard error and "worst" or largest (mean of the three largest values) of these features were computed for each image, resulting in 30 features.
For instance, field 3 is Mean Radius, field 13 is Radius Standard Error, field 23 is Worst Radius.

In [None]:
# location of the breast-cancer CSV inside the Kaggle input directory
file_path = '../input/breast-cancer-wisconsin-data/data.csv'

# load the CSV and keep only the columns that contain no missing values
data = pd.read_csv(file_path).dropna(axis=1)

# column dtypes and non-null counts
data.info()

In [None]:
# show the first five rows of the loaded dataset (the full frame, before any train/test split)
data.head()

In [None]:
# count the missing values left in each column
# (columns containing NaN were already dropped when the file was loaded)
data.isna().sum()

Right away we notice that no further data cleaning is needed: once the columns containing missing values have been dropped at load time, this carefully curated dataset has no missing values left.

In [None]:
# number of rows per diagnosis label
data['diagnosis'].value_counts()

Furthermore, of the 569 entries in the dataset, 357 are classified as benign while 212 as malignant. This means a 1.68 : 1 proportion that we must keep in mind for later, namely when creating the different splits that will be used, so that our results are not influenced by sampling bias. 

# Data Visualization

In [None]:
# upgrade seaborn in the running environment (unpinned, so the installed
# version depends on when this cell is executed)
!pip install seaborn --upgrade --quiet --disable-pip-version-check

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# histograms for numerical variable 1: Radius (mean, standard error, worst)
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# sns.distplot is deprecated; a density-normalised histplot with a KDE overlay
# (cut=3 mirrors distplot's KDE extent) is its replacement.
for ax, col in zip(axes, (2, 12, 22)):
    sns.histplot(data.iloc[:, col], kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)

In [None]:
# per-diagnosis density estimates for numerical variable 1: Radius
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

radius_columns = ("radius_mean", "radius_se", "radius_worst")
for axis, column in zip((ax1, ax2, ax3), radius_columns):
    sns.kdeplot(data=data, x=column, hue="diagnosis", ax=axis)


In [None]:
# histograms for numerical variable 2: Texture (mean, standard error, worst)
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# sns.distplot is deprecated; a density-normalised histplot with a KDE overlay
# (cut=3 mirrors distplot's KDE extent) is its replacement.
for ax, col in zip(axes, (3, 13, 23)):
    sns.histplot(data.iloc[:, col], kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)

In [None]:
# per-diagnosis density estimates for numerical variable 2: Texture
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

texture_columns = ("texture_mean", "texture_se", "texture_worst")
for axis, column in zip((ax1, ax2, ax3), texture_columns):
    sns.kdeplot(data=data, x=column, hue="diagnosis", ax=axis)

In [None]:
# histograms for numerical variable 3: Perimeter (mean, standard error, worst)
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# sns.distplot is deprecated; a density-normalised histplot with a KDE overlay
# (cut=3 mirrors distplot's KDE extent) is its replacement.
for ax, col in zip(axes, (4, 14, 24)):
    sns.histplot(data.iloc[:, col], kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)

In [None]:
# histograms for numerical variable 4: Area (mean, standard error, worst)
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# sns.distplot is deprecated; a density-normalised histplot with a KDE overlay
# (cut=3 mirrors distplot's KDE extent) is its replacement.
for ax, col in zip(axes, (5, 15, 25)):
    sns.histplot(data.iloc[:, col], kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)

In [None]:
# histograms for numerical variable 5: Smoothness (mean, standard error, worst)
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# sns.distplot is deprecated; a density-normalised histplot with a KDE overlay
# (cut=3 mirrors distplot's KDE extent) is its replacement.
for ax, col in zip(axes, (6, 16, 26)):
    sns.histplot(data.iloc[:, col], kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)

In [None]:
# histograms for numerical variable 6: Compactness (mean, standard error, worst)
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# sns.distplot is deprecated; a density-normalised histplot with a KDE overlay
# (cut=3 mirrors distplot's KDE extent) is its replacement.
for ax, col in zip(axes, (7, 17, 27)):
    sns.histplot(data.iloc[:, col], kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)

In [None]:
# histograms for numerical variable 7: Concavity (mean, standard error, worst)
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# sns.distplot is deprecated; a density-normalised histplot with a KDE overlay
# (cut=3 mirrors distplot's KDE extent) is its replacement.
for ax, col in zip(axes, (8, 18, 28)):
    sns.histplot(data.iloc[:, col], kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)

In [None]:
# histograms for numerical variable 8: Concave Points (mean, standard error, worst)
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# sns.distplot is deprecated; a density-normalised histplot with a KDE overlay
# (cut=3 mirrors distplot's KDE extent) is its replacement.
for ax, col in zip(axes, (9, 19, 29)):
    sns.histplot(data.iloc[:, col], kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)

In [None]:
# histograms for numerical variable 9: Symmetry (mean, standard error, worst)
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# sns.distplot is deprecated; a density-normalised histplot with a KDE overlay
# (cut=3 mirrors distplot's KDE extent) is its replacement.
for ax, col in zip(axes, (10, 20, 30)):
    sns.histplot(data.iloc[:, col], kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)

In [None]:
# histograms for numerical variable 10: Fractal Dimension (mean, standard error, worst)
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# sns.distplot is deprecated; a density-normalised histplot with a KDE overlay
# (cut=3 mirrors distplot's KDE extent) is its replacement.
for ax, col in zip(axes, (11, 21, 31)):
    sns.histplot(data.iloc[:, col], kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)

The distributions are approximately normal (Gaussian-like) for many of the numerical fields, for instance **symmetry_mean**. However, some of them, like **area_se**, are skewed towards the origin of the axis, so we may consider normalizing them later before proceeding further with our model.

In [None]:
# summary statistics for the numeric columns of the dataset
data.describe()

One thing that is for sure needed is scaling: our dataset contains features highly varying in magnitudes and range but since most of the machine learning algorithms use some type of distance between two data points in their computations we need to bring all features to the same level of magnitude. In practice, this means we must transform the available data so that it fits within a specific scale, like 0–1.

The best idea is to try a distance-based algorithm, like for instance SVM, and see which type of processing yields the best result.

In [None]:
# pairwise correlations between the ten "mean" attributes (columns 2-11)
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Correlation heatmap for mean attributes')
mean_corr = data.iloc[:, 2:12].corr()
sns.heatmap(mean_corr, annot=True, ax=ax);

This matrix is a table showing the correlation coefficients between variables; we can see that pairs of columns with high absolute values are indeed logically linked, just as we would expect.

For example, columns **radius_mean**, **perimeter_mean** and **area_mean** have correlations close to 1, as expected since both perimeter and area are functions of the radius. The same goes for columns **concavity_mean** and **concave points_mean**, which represent the severity and the count of the concave portions of the contour respectively.

In [None]:
# pairwise correlations between the ten "standard error" attributes (columns 12-21)
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Correlation heatmap for se attributes')
se_corr = data.iloc[:, 12:22].corr()
sns.heatmap(se_corr, annot=True, ax=ax);

In [None]:
# pairwise correlations between the ten "worst" attributes (columns 22-31)
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Correlation heatmap for worst attributes')
worst_corr = data.iloc[:, 22:32].corr()
sns.heatmap(worst_corr, annot=True, ax=ax);

The same patterns are also present in the standard error and worst correlation heatmaps. Ultimately, if we consider all 30 real-valued features of the dataset, the correlations between features are unremarkable, except for the ones that are directly derived from each other (like **radius**, **perimeter** and **area**).

In [None]:
# Correlation matrix of all 30 real-valued features
corr = data.iloc[:, 2:32].corr()

# Hide the upper triangle (diagonal included): it only mirrors the lower one
mask = np.triu(np.ones_like(corr, dtype=bool))

# Colormap used for the cells
cmap = sns.color_palette('mako', as_cmap=True)

# One wide figure; draw the masked heatmap on its axes with square cells
f, ax = plt.subplots(figsize=(24, 12))
ax.set_title('Correlation heatmap')
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
);

In [None]:
# pair plot of the diagnosis column (position 1) plus the ten "worst" features
# (positions 22-31), colored by diagnosis
pairplot_columns = np.r_[1, 22:32]
sns.pairplot(data=data.iloc[:, pairplot_columns], hue="diagnosis");

Intuitively, features that show noticeable differences in characteristics between the two target classification categories (malignant and benign) are to be kept while features that do not show such differentiation should be eliminated. 

The above plot is colored by the classification of each entry. For some of the features in the pair plot, but not all of them, the values clearly differ between the two target classes. Intuitively we can say that **area_worst** is a more interesting property than **compactness_worst**, but objectively there is no way to tell for sure from the plot alone.

# Data Preprocessing

Before proceeding further with our analysis, it is best to transform our data into a form that the mathematical models can work with more easily later on. 
Let us begin by transforming the target column **diagnosis** to a boolean representation, benign entries will be mapped to 0 (False) and malignant entries to 1 (True).

We will use Label Encoder to label the categorical data. Label Encoder is used to convert categorical data, or object fields, into numbers, which our predictive models can better understand.

In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
# Replace the column by name rather than with `data.iloc[:, 1] = ...`:
# on recent pandas the positional form writes into the existing object-dtype
# column, so the encoded labels may stay `object` instead of becoming integers
# and scikit-learn classifiers can then reject the target.
data['diagnosis'] = encoder.fit_transform(data['diagnosis'])

Moreover, as suggested before, we are now going to scale and normalize all of the real-valued features. In this manner we should be able to avoid issues between different features in distance-based algorithms, and all of them will contribute equally to the result of our models.

It is crucial that we split our dataset now, before performing any kind of scaling; otherwise data leakage will happen and make our performance estimates unreliable. Train-test contamination leads to high performance on the training set (and possibly even the validation data), but the model will perform poorly in production.

In other words, leakage causes a model to look accurate until you start making decisions with the model, and then the model becomes very inaccurate.

In [None]:
from sklearn.model_selection import train_test_split

# features: columns 2-31 (the 30 real-valued measurements); target: column 1
X, y = data.iloc[:, 2:32], data.iloc[:, 1]

# hold out 20% of the rows, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

X_train.head()

We use the `stratify` parameter to maintain nearly the same target class proportion in both sets.


In [None]:
# put the training features and their labels side by side, then count the classes
train_parts = (X_train, y_train)
train = pd.concat(train_parts, axis="columns", sort=False)
train['diagnosis'].value_counts()

In [None]:
# put the test features and their labels side by side, then count the classes
test_parts = (X_test, y_test)
test = pd.concat(test_parts, axis="columns", sort=False)
test['diagnosis'].value_counts()

Standardization is another scaling technique where the values are centered around the mean with a unit standard deviation. This means that the mean of the attribute becomes zero and the resultant distribution has a unit standard deviation.

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Fit on the training split only and reuse that fit for the test split.
# The scaler returns bare arrays, so the row index is passed back explicitly:
# without it the scaled frames get a fresh 0..n-1 index and no longer align
# by label with y_train / y_test.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

X_train_scaled.head()

Normalization is a scaling technique in which values are shifted and rescaled so that they end up ranging between 0 and 1, for this purpose we will use MinMaxScaler. Usually, it is good to use when you know that the distribution of your data does not follow a Gaussian distribution.

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Fit on the training split only and reuse that fit for the test split.
# The scaler returns bare arrays, so the row index is passed back explicitly:
# without it the normalized frames get a fresh 0..n-1 index and no longer
# align by label with y_train / y_test.
X_train_norm = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_norm = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

X_train_norm.head()

In [None]:
from sklearn import metrics

def print_scores(title, y_hat, y_pred):
    """Print accuracy, F1 and ROC AUC (as percentages) for a set of predictions.

    Parameters
    ----------
    title : str
        Heading printed before the three scores.
    y_hat : array-like
        Labels predicted by the model.
    y_pred : array-like
        Reference labels the predictions are scored against. The parameter
        name is kept for backward compatibility; every call in this notebook
        passes ``y_test`` here.

    Notes
    -----
    The ROC AUC is computed from the hard labels in ``y_hat``, not from
    probabilities or decision scores.
    """
    # Score against the argument that was passed in instead of silently
    # reading the global `y_test`.
    y_true = y_pred
    print(title)
    print(f"Accuracy: {metrics.accuracy_score(y_true, y_hat)*100}")
    print(f"F1: {metrics.f1_score(y_true, y_hat)*100}")
    print(f"ROC AUC: {metrics.roc_auc_score(y_true, y_hat)*100}")

# Feature Selection

It is now quite clear that some features will not be useful to the machine learning model. Therefore, we should now select the most useful of them. Either we straight up eliminate features that are highly correlated with each other or we find a way to select them.

Since we are working with only 569 entries blindly eliminating some information based only on feature correlation could be a serious loss for the performance of our models. 
Let us instead try to filter features using the _feature_importances_ attribute of a baseline tree-based model, and see if it ultimately boosts the accuracy of the final model. 

To create such a filter, train a default LightGBM binary classifier on the existing feature set and determine the importance of each of the features by referencing the _feature_importances_ attribute of the tree based model. Setting a threshold for features with low importance, all features with an importance less than the importance threshold will be eliminated from the data set.

LightGBM is a gradient boosting framework that uses tree-based learning algorithms. In this model trees grow vertically while in other algorithms trees grow horizontally, meaning that LightGBM grows trees leaf-wise while other algorithms grow them level-wise. It is exceptionally fast and accurate, matching state-of-the-art machine learning algorithms like XGBoost and CatBoost.

In [None]:
import lightgbm as lgb

# default LightGBM classifier trained on the raw (unscaled) training features
gbm = lgb.LGBMClassifier()
gbm.fit(X=X_train, y=y_train)

# score the held-out split, then plot the importance of each feature
y_hat = gbm.predict(X_test)
print_scores(
    'LightLGB baseline scores',
    y_hat,
    y_test,
)
lgb.plot_importance(gbm.booster_);

Let us now use recursive feature elimination to repeat the above computation and take the top 10 important attributes over 20 separate steps.

In [None]:
from sklearn.feature_selection import RFE

# n_features_to_select must be passed by keyword: scikit-learn 1.0 and later
# reject it as a positional argument.
rfe = RFE(gbm, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# names of the features that recursive feature elimination kept
important_features = X_train.columns[rfe.support_].tolist()
print(important_features)

We can immediately see that our LightGBM gradient-boosting model regards **texture** and **concave points** as far and away the most important features, making up 5 of our top 10 values. 

In [None]:
# pairwise correlations between the features kept by the selection step
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Correlation heatmap for important attributes')
important_corr = data[important_features].corr()
sns.heatmap(important_corr, annot=True, ax=ax);

In [None]:
# retrain the default LightGBM classifier on the selected features only
X_train_selected = X_train[important_features]
X_test_selected = X_test[important_features]

gbm = lgb.LGBMClassifier()
gbm.fit(X_train_selected, y_train)

y_hat = gbm.predict(X_test_selected)
print_scores(
    'LightLGB scores with feature selection',
    y_hat,
    y_test,
)

Interestingly, we did not improve our score by filtering out 20 of the 30 real-valued features, but at the same time we did not lose any accuracy at all.
We can keep going forward with both and see which kind of model thrives with more or fewer attributes.

# Model Building

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Fixed seed: without it every run grows different forests, so the six
# variants below could not be compared reproducibly.
rf = RandomForestClassifier(random_state=42)

# training-set variants under comparison, keyed by the label used in the report
rf_inputs = {
    'X_train': X_train,
    'X_train_scaled': X_train_scaled,
    'X_train_norm': X_train_norm,
    'X_train_imp_feat': X_train[important_features],
    'X_train_scaled_imp_feat': X_train_scaled[important_features],
    'X_train_norm_imp_feat': X_train_norm[important_features],
}

# 5-fold cross-validation scores for each variant
rf_scores = {
    label: cross_val_score(rf, features, y_train, cv=5)
    for label, features in rf_inputs.items()
}

scores = pd.DataFrame(rf_scores)
print('Random Forest Classifier cross validation scores:')
print(scores.mean(axis = 0)*100)

As expected the Random Forest Classifier does not care about scaled or normalized features, reducing the feature set is also not relevant.

In [None]:
from sklearn.svm import SVC

sv = SVC()

# training-set variants under comparison, keyed by the label used in the report
sv_inputs = {
    'X_train': X_train,
    'X_train_scaled': X_train_scaled,
    'X_train_norm': X_train_norm,
    'X_train_imp_feat': X_train[important_features],
    'X_train_scaled_imp_feat': X_train_scaled[important_features],
    'X_train_norm_imp_feat': X_train_norm[important_features],
}

# 5-fold cross-validation scores for each variant
sv_scores = {}
for label, features in sv_inputs.items():
    sv_scores[label] = cross_val_score(sv, features, y_train, cv=5)

scores = pd.DataFrame(sv_scores)
print('Support Vector Classifier cross validation scores:')
print(scores.mean(axis = 0)*100)

Support Vector Classifier, our first distance-based algorithm, clearly benefits from scaled and normalized data, with an average increase in accuracy of 5 percentage points. The reduced feature set does not benefit the model this time, losing a percentage point.

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

# training-set variants under comparison, keyed by the label used in the report
lr_inputs = {
    'X_train': X_train,
    'X_train_scaled': X_train_scaled,
    'X_train_norm': X_train_norm,
    'X_train_imp_feat': X_train[important_features],
    'X_train_scaled_imp_feat': X_train_scaled[important_features],
    'X_train_norm_imp_feat': X_train_norm[important_features],
}

# 5-fold cross-validation scores for each variant
lr_scores = {}
for label, features in lr_inputs.items():
    lr_scores[label] = cross_val_score(lr, features, y_train, cv=5)

scores = pd.DataFrame(lr_scores)
print('Logistic Regression Classifier cross validation scores:')
print(scores.mean(axis = 0)*100)

Logistic Regression Classifier presents a less marked difference: the best result is with scaled data, but only slightly above the others.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

kn = KNeighborsClassifier()

# training-set variants under comparison, keyed by the label used in the report
kn_inputs = {
    'X_train': X_train,
    'X_train_scaled': X_train_scaled,
    'X_train_norm': X_train_norm,
    'X_train_imp_feat': X_train[important_features],
    'X_train_scaled_imp_feat': X_train_scaled[important_features],
    'X_train_norm_imp_feat': X_train_norm[important_features],
}

# 5-fold cross-validation scores for each variant
kn_scores = {}
for label, features in kn_inputs.items():
    kn_scores[label] = cross_val_score(kn, features, y_train, cv=5)

scores = pd.DataFrame(kn_scores)
print('KNeighbors Classifier cross validation scores:')
print(scores.mean(axis = 0)*100)

KNeighbors Classifier benefits from scaled (and normalized) data as much as SVC. 

As a final consideration, we can now say for sure that feature selection was not very useful, since all 5 different models perform equally well or better when working with all 30 features. For this reason we will continue from now on with the entire feature set.

# Model Tuning
The top scorer up to now is the Support Vector Classifier on _X\_train\_scaled_ with regard to cross validation score. Let's now see if some parameter changes can help us reach the accuracy of the more powerful LightGBM Classifier on the test set.

In [None]:
# Untuned reference model. The seed (same value as the train/test split) makes
# the baseline and the grid search below reproducible from run to run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_hat = rf.predict(X_test)
print_scores('Random Forest Classifier baseline scores', y_hat, y_test)

rf = RandomForestClassifier(random_state=42)
param_grid =  {
    'n_estimators': [50, 100, 150],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True],
    'max_depth': [15, 20, 25],
    # 'auto' was only an alias of 'sqrt' for classifiers (it doubled the work
    # for identical models) and was removed in scikit-learn 1.3.
    'max_features': ['sqrt', 10],
    'min_samples_split': [2, 3] }

# exhaustive 5-fold grid search; fit() returns the fitted search object itself
clf_rf = GridSearchCV(rf, param_grid=param_grid, cv=5, n_jobs=-1)
best_rf = clf_rf.fit(X_train, y_train)
print(f'Best Parameters: {str(best_rf.best_params_)}')

y_hat = best_rf.predict(X_test)
print_scores('Random Forest Classifier tuned scores', y_hat, y_test)

In [None]:
# Confusion matrix of the tuned Random Forest on the test split: raw counts
# (left) and rates normalized per true class (right).
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator replaces it.
fig, [ax_1, ax_2] = plt.subplots(1, 2, figsize=(13, 5))
metrics.ConfusionMatrixDisplay.from_estimator(best_rf, X_test, y_test, ax=ax_1, cmap=plt.cm.Blues)
metrics.ConfusionMatrixDisplay.from_estimator(best_rf, X_test, y_test, ax=ax_2, cmap=plt.cm.Blues, normalize='true');

In [None]:
# untuned reference: default SVC (with probability estimates) on the standardized features
sv = SVC(probability=True)
sv.fit(X_train_scaled, y_train)
y_hat = sv.predict(X_test_scaled)
print_scores('Support Vector Classifier baseline scores', y_hat, y_test)

# one sub-grid per kernel, each sharing the same set of C values
sv = SVC(probability=True)
c_values = [.1, 1, 10, 100, 1000]
param_grid = [
    {'kernel': ['rbf'], 'gamma': [.1, .5, 1, 2, 5, 10], 'C': list(c_values)},
    {'kernel': ['linear'], 'C': list(c_values)},
    {'kernel': ['poly'], 'degree' : [2, 3, 4, 5], 'C': list(c_values)},
]

# exhaustive 5-fold grid search; fit() returns the fitted search object itself
clf_sv = GridSearchCV(sv, param_grid=param_grid, cv=5, n_jobs=-1)
best_sv = clf_sv.fit(X_train_scaled, y_train)
print(f'Best Parameters: {best_sv.best_params_!s}')

y_hat = best_sv.predict(X_test_scaled)
print_scores('Support Vector Classifier tuned scores', y_hat, y_test)

In [None]:
# Confusion matrix of the tuned SVC on the standardized test split: raw counts
# (left) and rates normalized per true class (right).
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator replaces it.
fig, [ax_1, ax_2] = plt.subplots(1, 2, figsize=(13, 5))
metrics.ConfusionMatrixDisplay.from_estimator(best_sv, X_test_scaled, y_test, ax=ax_1, cmap=plt.cm.Blues)
metrics.ConfusionMatrixDisplay.from_estimator(best_sv, X_test_scaled, y_test, ax=ax_2, cmap=plt.cm.Blues, normalize='true');

In [None]:
# untuned reference: default logistic regression on the standardized features
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)
y_hat = lr.predict(X_test_scaled)
print_scores('Logistic Regression Classifier baseline scores', y_hat, y_test)

lr = LogisticRegression()
c_values = [.1, 1, 10, 100, 1000]
# The grid is split by solver capability: 'lbfgs' does not support the l1
# penalty, so a single flat grid made every l1 + lbfgs candidate fail to fit
# (silently, since warnings are suppressed in this notebook).
param_grid = [
    {'penalty': ['l1', 'l2'], 'C': c_values, 'solver': ['liblinear', 'saga']},
    {'penalty': ['l2'], 'C': c_values, 'solver': ['lbfgs']},
]

# exhaustive 5-fold grid search; fit() returns the fitted search object itself
clf_lr = GridSearchCV(lr, param_grid=param_grid, cv=5, n_jobs=-1)
best_lr = clf_lr.fit(X_train_scaled, y_train)
print(f'Best Parameters: {str(best_lr.best_params_)}')

y_hat = clf_lr.predict(X_test_scaled)
print_scores('Logistic Regression Classifier tuned scores', y_hat, y_test)

In [None]:
# Confusion matrix of the tuned Logistic Regression on the standardized test
# split: raw counts (left) and rates normalized per true class (right).
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator replaces it.
fig, [ax_1, ax_2] = plt.subplots(1, 2, figsize=(13, 5))
metrics.ConfusionMatrixDisplay.from_estimator(best_lr, X_test_scaled, y_test, ax=ax_1, cmap=plt.cm.Blues)
metrics.ConfusionMatrixDisplay.from_estimator(best_lr, X_test_scaled, y_test, ax=ax_2, cmap=plt.cm.Blues, normalize='true');

In [None]:
# untuned reference: default k-nearest-neighbours on the min-max normalized features
kn = KNeighborsClassifier()
kn.fit(X_train_norm, y_train)
y_hat = kn.predict(X_test_norm)
print_scores('KNeighbors Classifier baseline scores', y_hat, y_test)

# hyper-parameter candidates explored by the grid search
kn = KNeighborsClassifier()
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree','kd_tree'],
    p=[1, 2],
)

# exhaustive 5-fold grid search; fit() returns the fitted search object itself
clf_kn = GridSearchCV(kn, param_grid=param_grid, cv=5, n_jobs=-1)
best_kn = clf_kn.fit(X_train_norm, y_train)
print(f'Best Parameters: {best_kn.best_params_!s}')

y_hat = best_kn.predict(X_test_norm)
print_scores('KNeighbors Classifier tuned scores', y_hat, y_test)

In [None]:
# Confusion matrix of the tuned KNeighbors model on the normalized test split:
# raw counts (left) and rates normalized per true class (right).
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator replaces it.
fig, [ax_1, ax_2] = plt.subplots(1, 2, figsize=(13, 5))
metrics.ConfusionMatrixDisplay.from_estimator(best_kn, X_test_norm, y_test, ax=ax_1, cmap=plt.cm.Blues)
metrics.ConfusionMatrixDisplay.from_estimator(best_kn, X_test_norm, y_test, ax=ax_2, cmap=plt.cm.Blues, normalize='true');

# Visualizations

In [None]:
fig, [ax_roc, ax_det] = plt.subplots(1, 2, figsize=(12, 5))

# each tuned model together with the test features it was trained to expect
tuned_models = [
    ('Random Forest', best_rf, X_test),
    ('Support Vector', best_sv, X_test_scaled),
    ('Logistic Regression', best_lr, X_test_scaled),
    ('KNeighbors', best_kn, X_test_norm),
]

# metrics.plot_roc_curve was removed in scikit-learn 1.2; the *Display
# classes replace it. The DET panel, previously left empty, is filled the
# same way so that both axes show one curve per model.
for name, clf, features in tuned_models:
    metrics.RocCurveDisplay.from_estimator(clf, features, y_test, ax=ax_roc, name=name)
    metrics.DetCurveDisplay.from_estimator(clf, features, y_test, ax=ax_det, name=name)

ax_roc.set_title('Receiver Operating Characteristic (ROC) curves')
ax_det.set_title('Detection Error Tradeoff (DET) curves')

ax_roc.grid(linestyle='--')
ax_det.grid(linestyle='--')

# a bare plt.legend() only targets the most recently used axes, so each
# panel gets its legend explicitly
ax_roc.legend()
ax_det.legend()
plt.show()

In [None]:
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Draw three diagnostic plots for an estimator.

    The panels are, in order: training and cross-validation score against
    the number of training examples, fit time against the number of training
    examples, and cross-validation score against fit time. Every curve is the
    mean over the cross-validation splits, with a shaded band of one standard
    deviation on either side.

    Parameters
    ----------
    estimator : estimator instance
        Passed unchanged to ``learning_curve``.

    title : str
        Title of the first panel.

    X : array-like of shape (n_samples, n_features)
        Training data handed to ``learning_curve``.

    y : array-like of shape (n_samples,)
        Target values handed to ``learning_curve``.

    axes : indexable of three matplotlib axes, default=None
        Axes to draw on (indices 0, 1 and 2 are used). When None, a new
        1 x 3 figure of size (20, 5) is created.

    ylim : tuple (ymin, ymax), default=None
        Y-axis limits for the first panel; left untouched when None.

    cv : default=None
        Cross-validation strategy, forwarded as ``cv`` to ``learning_curve``.

    n_jobs : int or None, default=None
        Parallelism setting, forwarded as ``n_jobs`` to ``learning_curve``.

    train_sizes : array-like, default=np.linspace(0.1, 1.0, 5)
        Training-set sizes to evaluate, forwarded to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module (``plt``).
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    # --- panel 0: labels and limits are set before the curves are computed
    score_ax = axes[0]
    score_ax.set_title(title)
    if ylim is not None:
        score_ax.set_ylim(*ylim)
    score_ax.set_xlabel("Training examples")
    score_ax.set_ylabel("Score")

    # scores and fit times for every (training size, split) pair
    sizes, fit_scores, cv_scores, durations, _ = learning_curve(
        estimator, X, y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        return_times=True,
    )

    # aggregate over the splits (axis 1)
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    cv_mean, cv_std = np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1)
    time_mean, time_std = np.mean(durations, axis=1), np.std(durations, axis=1)

    # --- panel 0: learning curve (training in red, cross-validation in green)
    score_ax.grid()
    score_ax.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                          alpha=0.1, color="r")
    score_ax.fill_between(sizes, cv_mean - cv_std, cv_mean + cv_std,
                          alpha=0.1, color="g")
    score_ax.plot(sizes, fit_mean, 'o-', color="r", label="Training score")
    score_ax.plot(sizes, cv_mean, 'o-', color="g", label="Cross-validation score")
    score_ax.legend(loc="best")

    # --- panel 1: number of training examples vs fit time
    scaling_ax = axes[1]
    scaling_ax.grid()
    scaling_ax.plot(sizes, time_mean, 'o-')
    scaling_ax.fill_between(sizes, time_mean - time_std, time_mean + time_std,
                            alpha=0.1)
    scaling_ax.set_xlabel("Training examples")
    scaling_ax.set_ylabel("fit_times")
    scaling_ax.set_title("Scalability of the model")

    # --- panel 2: fit time vs cross-validation score
    perf_ax = axes[2]
    perf_ax.grid()
    perf_ax.plot(time_mean, cv_mean, 'o-')
    perf_ax.fill_between(time_mean, cv_mean - cv_std, cv_mean + cv_std,
                         alpha=0.1)
    perf_ax.set_xlabel("fit_times")
    perf_ax.set_ylabel("Score")
    perf_ax.set_title("Performance of the model")

    return plt

In [None]:
# 3 rows (one per diagnostic plot) x 4 columns (one per tuned model)
fig, axes = plt.subplots(3, 4, figsize=(24, 15))

# (panel title, fresh estimator built from the best grid-search parameters,
#  training features that estimator was tuned on)
curve_specs = [
    ("Learning Curves (Random Forest)",
     RandomForestClassifier(**best_rf.best_params_), X_train),
    ("Learning Curves (SVC)",
     SVC(**best_sv.best_params_), X_train_scaled),
    ("Learning Curves (Logistic Regression)",
     LogisticRegression(**best_lr.best_params_), X_train_scaled),
    ("Learning Curves (KNeighbors)",
     KNeighborsClassifier(**best_kn.best_params_), X_train_norm),
]

for column, (title, estimator, features) in enumerate(curve_specs):
    # a new 5-split shuffle (80/20) for every model
    cv = ShuffleSplit(n_splits=5, test_size=0.2)
    plot_learning_curve(estimator, title, features, y_train,
                        axes=axes[:, column], ylim=(0.7, 1.01), cv=cv, n_jobs=4)

plt.show()