# Dataset: load_breast_cancer

## 1. Dataset analysis

In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pandas
import seaborn as seaborn
import matplotlib.pyplot as matplot
import numpy as np

In [3]:
# Load the dataset bundle once; every later cell reads X, y and fn from it.
bc = load_breast_cancer()

# Human-readable description shipped with the dataset.
print("BRIEF DESCRIPTION OF THE DATASET: ")
print(bc.DESCR)
print()

# Names available on the returned bundle object.
print("ATTRIBUTES: ")
print(dir(bc))
print()

# Feature matrix, target vector and feature labels used by the model cells.
X, y, fn = bc.data, bc.target, bc.feature_names

print("DATA:")
# One column per feature plus a trailing `type` column holding the target.
show_data = pandas.DataFrame(data=X, columns=fn).assign(type=pandas.Series(y))
show_data

BRIEF DESCRIPTION OF THE DATASET: 
.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features. 

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,type
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


**Further data analysis**

In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max),
# covering the feature columns and the `type` label column.
show_data.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,type
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,1.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


## 2. Model fitting

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the
# shuffled split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=23,
)

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

**PERCEPTRON**

In [7]:
def perceptron(X, y, margin=0.1, learning_rate=1.0, max_iters=200):
    """Train a multi-class perceptron with a margin.

    Parameters
    ----------
    X : 2-D array of shape (N, D)
        Training samples, one per row.
    y : sequence indexable by 0..N-1
        Class label of each sample.
    margin : float
        A rival class triggers an update when its score plus `margin`
        reaches the score of the true class.
    learning_rate : float
        Step size applied to the augmented sample on each update.
    max_iters : int
        Maximum number of passes over the training samples.

    Returns
    -------
    numpy.ndarray of shape (1 + D, C)
        One weight column per class, ordered like ``np.unique(y)``; the
        first row multiplies the constant 1 prepended to each sample.
    """
    num_samples, num_features = X.shape
    labels = np.unique(y)
    num_classes = labels.size
    weights = np.zeros((num_features + 1, num_classes))

    for _ in range(max_iters):
        mistakes = 0

        for n, sample in enumerate(X):
            # Prepend the constant 1 so the first weight row acts as a bias.
            augmented = np.array([1, *sample])
            true_idx = np.squeeze(np.where(labels == y[n]))
            true_score = weights[:, true_idx] @ augmented
            violated = False

            for rival in range(num_classes):
                if rival == true_idx:
                    continue
                # Push down every rival that is within `margin` of the true class.
                if weights[:, rival] @ augmented + margin >= true_score:
                    weights[:, rival] -= learning_rate * augmented
                    violated = True

            if violated:
                # Pull the true class towards the sample once per mistaken sample.
                weights[:, true_idx] += learning_rate * augmented
                mistakes += 1

        # A full pass without updates means every sample meets the margin.
        if mistakes == 0:
            break

    return weights

In [8]:
%%timeit -n1 -r1

best_precision = 0
best_params = [0,0,0]

for margin in (0.01, 0.1, 0.3, 0.5, 0.7):
    for learning_rate in (0.1, 0.5, 1, 2):
        for max_iters in (100, 200, 500, 1000):
            W = perceptron(X_train, y_train, margin, learning_rate, max_iters)

            X_testh = np.hstack([np.ones((len(X_test), 1)), X_test])
            y_test_pred  = np.argmax(X_testh @ W, axis=1).reshape(-1, 1)
            err_test = np.count_nonzero(y_test_pred != y_test) / len(X_test)
            precision = 100 - err_test
            
            if precision > best_precision:
                best_precision = precision
                best_params = [margin, learning_rate, max_iters]

print(f'Accuracy: {best_precision:.1f}% with margin {best_params[0]}, learning rate {best_params[1]} and {best_params[2]} maximum iterations')

Accuracy: 49.3% with margin 0.01, learning rate 0.1 and 1000 maximum iterations
6min 54s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**NAIVE BAYES** 

In [9]:
%%timeit -n1 -r1
nb = GaussianNB()

Gnb = {"var_smoothing": [1e-9, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9]}
GSnb = GridSearchCV(nb, Gnb, scoring='accuracy', refit=True, cv=5)

acc = GSnb.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSnb.best_params_}')

Accuracy: 95.6% con {'var_smoothing': 1e-09}
122 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**LINEAR DISCRIMINANT ANALYSIS**

In [12]:
%%timeit -n1 -r1
import warnings; warnings.filterwarnings('ignore')

lda = LinearDiscriminantAnalysis()

Glda = {"solver": ['svd', 'lsqr', 'eigen'], "n_components": [1,2], "tol": [1e-5, 1e-4, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9]}
GSlda = GridSearchCV(lda, Glda, scoring='accuracy', refit=True, cv=5)

acc = GSlda.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSlda.best_params_}')

Accuracy: 93.9% con {'n_components': 1, 'solver': 'svd', 'tol': 0.3}
1.17 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**QUADRATIC DISCRIMINANT ANALYSIS**

In [13]:
%%timeit -n1 -r1

qda = QuadraticDiscriminantAnalysis()

Gqda = {"tol": [1e-5, 1e-4, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9]}
GSqda = GridSearchCV(qda, Gqda, scoring='accuracy', refit=True, cv=5)

acc = GSqda.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSqda.best_params_}')

Accuracy: 98.2% con {'tol': 1e-05}
128 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**LOGISTIC REGRESSION**

In [14]:
%%timeit -n1 -r1

log_reg = LogisticRegression(random_state=23)

Glogreg = {"penalty": ['l1', 'l2', None], "tol": [0.001, 0.01, 0.1], "solver": ['lbfgs', 'liblinear', 'newton-cg'], "multi_class": ['auto', 'multinomial'], "max_iter": [10, 50, 100], "n_jobs": [1,2,4], "C": [0.001, 0.1, 1, 10, 100]}
GSlogreg = GridSearchCV(log_reg, Glogreg, scoring='accuracy', refit=True, cv=5)

acc = GSlogreg.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSlogreg.best_params_}')

Accuracy: 96.5% con {'C': 100, 'max_iter': 50, 'multi_class': 'auto', 'n_jobs': 1, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.01}
7min 47s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**DECISION TREE CLASSIFIER**

In [15]:
%%timeit -n1 -r1

dtc = DecisionTreeClassifier(random_state=23)

Gdtc = {"criterion": ['gini', 'entropy', 'log_loss'], "max_depth": [1,3,5,7,10,20], "min_samples_split": [2,3,4,5]}
GSdtc = GridSearchCV(dtc, Gdtc, scoring='accuracy', refit=True, cv=5)

acc = GSdtc.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSdtc.best_params_}')

Accuracy: 95.6% con {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2}
5.65 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**RANDOM FOREST**

In [16]:
%%timeit -n1 -r1

rfc = RandomForestClassifier(random_state=23)

Grfc = {"n_estimators": [1,5,10,25,50], "criterion": ['gini', 'entropy', 'log_loss'], "bootstrap": [True,False], "max_depth": [1,3,5,7,10,20], "min_samples_split": [2,3,4,5]}
GSrfc = GridSearchCV(rfc, Grfc, scoring='accuracy', refit=True, cv=5)

acc = GSrfc.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSrfc.best_params_}')

Accuracy: 98.2% con {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 3, 'n_estimators': 25}
3min 11s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**ADABOOST CLASSIFIER**

In [17]:
%%timeit -n1 -r1

abc = AdaBoostClassifier(random_state=23)

Gabc = {"n_estimators": [2,5,10,20,50,100], "learning_rate": [0.01,0.1,0.3,0.5,0.7,0.9,0.99,1]}
GSabc = GridSearchCV(abc, Gabc, scoring='accuracy', refit=True, cv=5)

acc = (GSabc.fit(X_train, y_train).score(X_test, y_test))
print(f'Accuracy: {acc:.1%} con {GSabc.best_params_}')

Accuracy: 97.4% con {'learning_rate': 1, 'n_estimators': 100}
39.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**GRADIENT BOOSTING CLASSIFIER**

In [18]:
%%timeit -n1 -r1

gbc = GradientBoostingClassifier(random_state=23)

Ggbc = {"n_estimators": [2,5,10,20,50], "learning_rate": [0.01,0.1,0.3,0.5,0.7,0.9], "criterion": ['friedman_mse', 'squared_error'], "max_depth": [1,3,5,7,10], "min_samples_split": [2,3,4]}
GSgbc = GridSearchCV(gbc, Ggbc, scoring='accuracy', refit=True, cv=5)

acc = (GSgbc.fit(X_train, y_train).score(X_test, y_test))
print(f'Accuracy: {acc:.1%} con {GSgbc.best_params_}')

Accuracy: 94.7% con {'criterion': 'friedman_mse', 'learning_rate': 0.5, 'max_depth': 1, 'min_samples_split': 2, 'n_estimators': 50}
16min 57s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**KNEIGHBORS CLASSIFIER**

In [19]:
%%timeit -n1 -r1

knc = KNeighborsClassifier()

Gknc = {'n_neighbors': [1,2,5,10,20], 'weights':['uniform', 'distance'], 'leaf_size': [5,10,20,30,50], 'p': [1,2,3,4,5]}
GSknc = GridSearchCV(knc, Gknc, scoring='accuracy', refit=True, cv=5)

acc = (GSknc.fit(X_train, y_train).score(X_test, y_test))
print(f'Accuracy: {acc:.1%} con {GSknc.best_params_}')

Accuracy: 96.5% con {'leaf_size': 5, 'n_neighbors': 10, 'p': 1, 'weights': 'uniform'}
26.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
