# Dataset: load_diabetes

## 1. Dataset analysis

In [1]:
from sklearn.datasets import load_diabetes
import pandas as pandas
import seaborn as seaborn
import matplotlib.pyplot as matplot
import numpy as np

In [2]:
diabetes = load_diabetes()
print("BRIEF DESCRIPTION OF THE DATASET: ")
print(diabetes.DESCR)
print()

print("ATTRIBUTES: ")
print(dir(diabetes))
print()

X = diabetes.data
y = diabetes.target
fn = diabetes.feature_names

print("DATA:")
show_data = pandas.DataFrame(data=X, columns=fn)
show_data['progression'] = pandas.Series(diabetes.target)
show_data

BRIEF DESCRIPTION OF THE DATASET: 
.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,progression
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930,220.0


**Further data analysis**

In [14]:
show_data.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,result
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17,3.9184340000000004e-17,-5.777179e-18,-9.04254e-18,9.293722000000001e-17,1.130318e-17,152.133484
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,77.093005
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672,25.0
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665608,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324559,-0.03317903,87.0
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698,140.5
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801,0.02984439,0.0293115,0.03430886,0.03243232,0.02791705,211.5
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118,346.0


# 2. Model fitting

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=23)

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

**PERCEPTRON**

In [18]:
def perceptron(X, y, margin=0.1, learning_rate=1.0, max_iters=200):
    N, D = X.shape
    Y = np.unique(y)
    C = Y.size
    weights = np.zeros((1+D, C))
    
    for iteration in range(1, max_iters + 1):
        errors = 0
        
        for n in range(N):
            xn = np.array([1, *X[n, :]])
            cn = np.squeeze(np.where(Y==y[n]))
            gn = weights[:,cn].T @ xn
            err = False
            
            for c in np.arange(C):
                if c != cn and weights[:,c].T @ xn + margin >= gn:
                    weights[:, c] = weights[:, c] - learning_rate*xn; err = True
            if err:
                weights[:, cn] = weights[:, cn] + learning_rate*xn
                errors = errors + 1
                
        if errors == 0:
            break

    return weights

In [21]:
%%timeit -n1 -r1

best_precision = 0
best_params = [0,0,0]

for margin in (0.1, 0.3, 0.5):
    for learning_rate in (0.1, 0.5, 1):
        for max_iters in (100, 200):
            W = perceptron(X_train, y_train, margin, learning_rate, max_iters)

            X_testh = np.hstack([np.ones((len(X_test), 1)), X_test])
            y_test_pred  = np.argmax(X_testh @ W, axis=1).reshape(-1, 1)
            err_test = np.count_nonzero(y_test_pred != y_test) / len(X_test)
            precision = 100 - err_test
            
            if precision > best_precision:
                best_precision = precision
                best_params = [margin, learning_rate, max_iters]

print(f'Accuracy: {best_precision:.1f}% with margin {best_params[0]}, learning rate {best_params[1]} and {best_params[2]} maximum iterations')

Accuracy: 12.0% with margin 0.1, learning rate 0.5 and 200 maximum iterations
24min 25s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**NAIVE BAYES** 

In [29]:
%%timeit -n1 -r1

import warnings; warnings.filterwarnings('ignore')

nb = GaussianNB()

Gnb = {"var_smoothing": [1e-9, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9]}
GSnb = GridSearchCV(nb, Gnb, scoring='accuracy', refit=True, cv=5)

acc = GSnb.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSnb.best_params_}')

Accuracy: 1.1% con {'var_smoothing': 0.1}
1.52 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**LINEAR DISCRIMINANT ANALYSIS**

In [30]:
%%timeit -n1 -r1
lda = LinearDiscriminantAnalysis()

Glda = {"solver": ['svd', 'lsqr', 'eigen'], "n_components": [1,2], "tol": [1e-5, 1e-4, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9]}
GSlda = GridSearchCV(lda, Glda, scoring='accuracy', refit=True, cv=5)

acc = GSlda.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSlda.best_params_}')

Accuracy: 1.1% con {'n_components': 1, 'solver': 'svd', 'tol': 0.9}
7.96 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**LOGISTIC REGRESSION**

In [33]:
%%timeit -n1 -r1

log_reg = LogisticRegression(random_state=23)

Glogreg = {"penalty": ['l1', 'l2', None], "tol": [0.001, 0.01, 0.1], "solver": ['lbfgs', 'liblinear', 'newton-cg'], "multi_class": ['auto', 'multinomial'], "max_iter": [10, 25], "n_jobs": [1,2,4], "C": [0.1, 1, 10, 100]}
GSlogreg = GridSearchCV(log_reg, Glogreg, scoring='accuracy', refit=True, cv=5)

acc = GSlogreg.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSlogreg.best_params_}')

Accuracy: 0.0% con {'C': 100, 'max_iter': 10, 'multi_class': 'auto', 'n_jobs': 1, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 0.001}
12min 42s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**DECISION TREE CLASSIFIER**

In [34]:
%%timeit -n1 -r1

dtc = DecisionTreeClassifier(random_state=23)

Gdtc = {"criterion": ['gini', 'entropy', 'log_loss'], "max_depth": [1,3,5,7,10,20], "min_samples_split": [2,3,4,5]}
GSdtc = GridSearchCV(dtc, Gdtc, scoring='accuracy', refit=True, cv=5)

acc = GSdtc.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSdtc.best_params_}')

Accuracy: 0.0% con {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 3}
8.24 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**RANDOM FOREST**

In [35]:
%%timeit -n1 -r1

rfc = RandomForestClassifier(random_state=23)

Grfc = {"n_estimators": [1,5,10,25,50], "criterion": ['gini', 'entropy', 'log_loss'], "bootstrap": [True,False], "max_depth": [1,3,5,7,10,20], "min_samples_split": [2,3,4,5]}
GSrfc = GridSearchCV(rfc, Grfc, scoring='accuracy', refit=True, cv=5)

acc = GSrfc.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSrfc.best_params_}')

Accuracy: 0.0% con {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 4, 'n_estimators': 50}
6min 14s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**ADABOOST CLASSIFIER**

In [36]:
%%timeit -n1 -r1

abc = AdaBoostClassifier(random_state=23)

Gabc = {"n_estimators": [2,5,10,20,50,100], "learning_rate": [0.01,0.1,0.3,0.5,0.7,0.9,0.99,1]}
GSabc = GridSearchCV(abc, Gabc, scoring='accuracy', refit=True, cv=5)

acc = (GSabc.fit(X_train, y_train).score(X_test, y_test))
print(f'Accuracy: {acc:.1%} con {GSabc.best_params_}')

Accuracy: 0.0% con {'learning_rate': 0.01, 'n_estimators': 100}
43.8 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**GRADIENT BOOSTING CLASSIFIER**

In [6]:
%%timeit -n1 -r1

import warnings; warnings.filterwarnings('ignore')

gbc = GradientBoostingClassifier(random_state=23)

Ggbc = {"n_estimators": [2,5,10,20], "learning_rate": [0.25,0.5,0.75], "criterion": ['friedman_mse', 'squared_error'], "max_depth": [1,5,10], "min_samples_split": [2,3,4]}
GSgbc = GridSearchCV(gbc, Ggbc, scoring='accuracy', refit=True, cv=5)

acc = (GSgbc.fit(X_train, y_train).score(X_test, y_test))
print(f'Accuracy: {acc:.1%} con {GSgbc.best_params_}')

Accuracy: 0.0% con {'criterion': 'friedman_mse', 'learning_rate': 0.25, 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 2}
55min 32s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**KNEIGHBORS CLASSIFIER**

In [7]:
%%timeit -n1 -r1

knc = KNeighborsClassifier()

Gknc = {'n_neighbors': [1,2,5,10,20], 'weights':['uniform', 'distance'], 'leaf_size': [5,10,20,30,50], 'p': [1,2,3,4,5]}
GSknc = GridSearchCV(knc, Gknc, scoring='accuracy', refit=True, cv=5)

acc = (GSknc.fit(X_train, y_train).score(X_test, y_test))
print(f'Accuracy: {acc:.1%} con {GSknc.best_params_}')

Accuracy: 0.0% con {'leaf_size': 5, 'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
10.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
