In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline

import pandas as pd
import numpy as np

In [10]:
data = load_breast_cancer()
x = pd.DataFrame( data = data.data, columns = data.feature_names )
y = np.array(data.target)

# Breast Cancer

Dado que se trabaja con un caso tan sensible como es la detección de tumores malignos, es preferible maximizar el número de tumores malignos predichos de forma correcta, por lo que se usará la métrica ´recall´. Aún así, tambien se desea mentener un balance entre presicion y recall, por lo que se buscará un valor de recall tal que el valor de $f_1$ no se reduzca en gran medida. 


In [22]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

while True:
    vif = pd.DataFrame()
    vif['Variable'] = x.columns
    vif['Value'] = [ variance_inflation_factor( x.values, i ) for i in range(x.shape[1]) ]
    if vif.max()['Value'] < 5: break
    x = x.drop( columns = vif.max()['Variable'] )

x_train, x_test, y_train, y_test = train_test_split( x, y )
x

Unnamed: 0,area error,compactness error
0,153.40,0.04904
1,74.08,0.01308
2,94.03,0.04006
3,27.23,0.07458
4,94.44,0.02461
...,...,...
564,158.70,0.02891
565,99.04,0.02423
566,48.55,0.03731
567,86.22,0.06158


In [27]:
pipeline = Pipeline([('classifier', KNeighborsClassifier())])

param_grid = [
    {
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': [ i for i in range( 10, (len(x_train)*9)//10, 10 ) ]
    },
    {
        'classifier': [LogisticRegression()],
        'classifier__C': [ 10**i for i in range(-10,10) ]
    }
]

gs = GridSearchCV( pipeline, param_grid, cv = 10, scoring = { 'recall': 'recall', 'f1': 'f1' }, refit = 'recall' )
gs.fit( x_train, y_train )

print(gs.best_params_)
print(gs.best_score_)
print(f'Score: {gs.score( x_test, y_test )}')

{'classifier': KNeighborsClassifier(), 'classifier__n_neighbors': 280}
1.0
Score: 1.0


Podemos ver de antemano que el mejor modelo es un 280-NN con un recall de aproximadamente el 100%, por posibles errores de aproximación.

Ahora, con ambos modelos manualmente tenemos:

In [31]:
Results = pd.DataFrame( columns=['recall','f1'] )
for i in range( 10, (len(x_train)*9)//10, 10 ):
    row = []
    for scorer in ['recall','f1']:
        scores = cross_val_score( KNeighborsClassifier( n_neighbors = i ), x_train, y_train, cv = 10, scoring = scorer )
        row.append( scores.mean() )
    Results.loc[i] = row
Results

Unnamed: 0,recall,f1
10,0.895077,0.888159
20,0.902769,0.883701
30,0.910462,0.891141
40,0.926308,0.891266
50,0.937846,0.891011
60,0.945692,0.892019
70,0.949538,0.892449
80,0.949538,0.889215
90,0.949538,0.884343
100,0.961231,0.890831


Como se puede ver, a partir de 280 es el punto donde el recall es muy cercano a 1 sin reducir el valor de $f_1$, por lo que podemos tomar estos como los mejores parametros del modelo

In [20]:
Results = pd.DataFrame( columns=['recall','f1'] )
for i in range(-10,10):
    row = []
    for scorer in ['recall','f1']:
        scores = cross_val_score( LogisticRegression( C = 10**i ), x_train, y_train, cv = 10, scoring = scorer )
        row.append( scores.mean() )
    Results.loc[i] = row
Results

Unnamed: 0,recall,f1
-10,1.0,0.765178
-9,1.0,0.765178
-8,1.0,0.765178
-7,1.0,0.765178
-6,1.0,0.773084
-5,0.996296,0.856169
-4,0.966239,0.888001
-3,0.954986,0.899507
-2,0.95114,0.897502
-1,0.95114,0.897502


En este caso podemos ver que a pesar de conseguir un recall tan alto como en el modelo anterior, su $f_1$ se reduce en mayor medida, por lo que se considera el modelo 280-NN como superior.

In [37]:
nn_classifier = KNeighborsClassifier(n_neighbors=280)
logit_classifier = LogisticRegression(C=1e-6)

nn_classifier.fit(x_train, y_train)
logit_classifier.fit(x_train, y_train)

nn_predictions = nn_classifier.predict(x_test)
logit_predictions = logit_classifier.predict(x_test)

nn_recall = recall_score(y_test, nn_predictions)
nn_precision = precision_score(y_test, nn_predictions)
nn_f1 = f1_score(y_test, nn_predictions)
nn_auc = roc_auc_score(y_test, nn_predictions)

logit_recall = recall_score(y_test, logit_predictions)
logit_precision = precision_score(y_test, logit_predictions)
logit_f1 = f1_score(y_test, logit_predictions)
logit_auc = roc_auc_score(y_test, logit_predictions)

results = {
    '280-NN': [nn_recall, nn_precision, nn_f1, nn_auc],
    'Logistic Regression': [logit_recall, logit_precision, logit_f1, logit_auc]
}

pd.DataFrame(results, index=['Recall', 'Precision', 'F1', 'AUC']).T


Unnamed: 0,Recall,Precision,F1,AUC
280-NN,1.0,0.804878,0.891892,0.727273
Logistic Regression,1.0,0.702128,0.825,0.522727


En este caso, se confirma que el modelo 280-NN es superior a la regresión logistica para la base de datos.

## Boston Housing

In [40]:
import mglearn
x, y = mglearn.datasets.load_extended_boston()

In [None]:
pipeline = Pipeline([('classifier', Linear())])

param_grid = [
    {
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': [ i for i in range( 10, (len(x_train)*9)//10, 10 ) ]
    },
    {
        'classifier': [LogisticRegression()],
        'classifier__C': [ 10**i for i in range(-10,10) ]
    }
]

gs = GridSearchCV( pipeline, param_grid, cv = 10, scoring = { 'recall': 'recall', 'f1': 'f1' }, refit = 'recall' )
gs.fit( x_train, y_train )

print(gs.best_params_)
print(gs.best_score_)
print(f'Score: {gs.score( x_test, y_test )}')