In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline

import pandas as pd
import numpy as np

In [10]:
data = load_breast_cancer()
x = pd.DataFrame( data = data.data, columns = data.feature_names )
y = np.array(data.target)

# Breast Cancer

Dado que se trabaja con un caso tan sensible como es la detección de tumores malignos, es preferible maximizar el número de tumores malignos predichos de forma correcta, por lo que se usará la métrica ´recall´. Aún así, tambien se desea mentener un balance entre presicion y recall, por lo que se buscará un valor de recall tal que el valor de $f_1$ no se reduzca en gran medida. 


In [22]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

while True:
    vif = pd.DataFrame()
    vif['Variable'] = x.columns
    vif['Value'] = [ variance_inflation_factor( x.values, i ) for i in range(x.shape[1]) ]
    if vif.max()['Value'] < 5: break
    x = x.drop( columns = vif.max()['Variable'] )

x_train, x_test, y_train, y_test = train_test_split( x, y )
x

Unnamed: 0,area error,compactness error
0,153.40,0.04904
1,74.08,0.01308
2,94.03,0.04006
3,27.23,0.07458
4,94.44,0.02461
...,...,...
564,158.70,0.02891
565,99.04,0.02423
566,48.55,0.03731
567,86.22,0.06158


In [27]:
pipeline = Pipeline([('classifier', KNeighborsClassifier())])

param_grid = [
    {
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': [ i for i in range( 10, (len(x_train)*9)//10, 10 ) ]
    },
    {
        'classifier': [LogisticRegression()],
        'classifier__C': [ 10**i for i in range(-10,10) ]
    }
]

gs = GridSearchCV( pipeline, param_grid, cv = 10, scoring = { 'recall': 'recall', 'f1': 'f1' }, refit = 'recall' )
gs.fit( x_train, y_train )

print(gs.best_params_)
print(gs.best_score_)
print(f'Score: {gs.score( x_test, y_test )}')

{'classifier': KNeighborsClassifier(), 'classifier__n_neighbors': 280}
1.0
Score: 1.0


Podemos ver de antemano que el mejor modelo es un 280-NN con un recall de aproximadamente el 100%, por posibles errores de aproximación.

Ahora, con ambos modelos manualmente tenemos:

In [31]:
Results = pd.DataFrame( columns=['recall','f1'] )
for i in range( 10, (len(x_train)*9)//10, 10 ):
    row = []
    for scorer in ['recall','f1']:
        scores = cross_val_score( KNeighborsClassifier( n_neighbors = i ), x_train, y_train, cv = 10, scoring = scorer )
        row.append( scores.mean() )
    Results.loc[i] = row
Results

Unnamed: 0,recall,f1
10,0.895077,0.888159
20,0.902769,0.883701
30,0.910462,0.891141
40,0.926308,0.891266
50,0.937846,0.891011
60,0.945692,0.892019
70,0.949538,0.892449
80,0.949538,0.889215
90,0.949538,0.884343
100,0.961231,0.890831


Como se puede ver, a partir de 280 es el punto donde el recall es muy cercano a 1 sin reducir el valor de $f_1$, por lo que podemos tomar estos como los mejores parametros del modelo

In [20]:
Results = pd.DataFrame( columns=['recall','f1'] )
for i in range(-10,10):
    row = []
    for scorer in ['recall','f1']:
        scores = cross_val_score( LogisticRegression( C = 10**i ), x_train, y_train, cv = 10, scoring = scorer )
        row.append( scores.mean() )
    Results.loc[i] = row
Results

Unnamed: 0,recall,f1
-10,1.0,0.765178
-9,1.0,0.765178
-8,1.0,0.765178
-7,1.0,0.765178
-6,1.0,0.773084
-5,0.996296,0.856169
-4,0.966239,0.888001
-3,0.954986,0.899507
-2,0.95114,0.897502
-1,0.95114,0.897502


En este caso podemos ver que a pesar de conseguir un recall tan alto como en el modelo anterior, su $f_1$ se reduce en mayor medida, por lo que se considera el modelo 280-NN como superior.

In [37]:
nn_classifier = KNeighborsClassifier(n_neighbors=280)
logit_classifier = LogisticRegression(C=1e-6)

nn_classifier.fit(x_train, y_train)
logit_classifier.fit(x_train, y_train)

nn_predictions = nn_classifier.predict(x_test)
logit_predictions = logit_classifier.predict(x_test)

nn_recall = recall_score(y_test, nn_predictions)
nn_precision = precision_score(y_test, nn_predictions)
nn_f1 = f1_score(y_test, nn_predictions)
nn_auc = roc_auc_score(y_test, nn_predictions)

logit_recall = recall_score(y_test, logit_predictions)
logit_precision = precision_score(y_test, logit_predictions)
logit_f1 = f1_score(y_test, logit_predictions)
logit_auc = roc_auc_score(y_test, logit_predictions)

results = {
    '280-NN': [nn_recall, nn_precision, nn_f1, nn_auc],
    'Logistic Regression': [logit_recall, logit_precision, logit_f1, logit_auc]
}

pd.DataFrame(results, index=['Recall', 'Precision', 'F1', 'AUC']).T


Unnamed: 0,Recall,Precision,F1,AUC
280-NN,1.0,0.804878,0.891892,0.727273
Logistic Regression,1.0,0.702128,0.825,0.522727


En este caso, se confirma que el modelo 280-NN es superior a la regresión logistica para la base de datos.

## Boston Housing

En este caso, se evaluarán 3 modelos de regresión: Lineal, Ridge y Lasso

In [41]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [63]:
import mglearn
import warnings
warnings.filterwarnings("ignore")

x, y = mglearn.datasets.load_extended_boston()
x = pd.DataFrame(x)
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
0,0.000000,0.18,0.067815,0.0,0.314815,0.577505,0.641607,0.269203,0.000000,0.208015,...,0.043270,0.059749,0.208015,0.018655,0.082503,0.287234,0.025759,1.000000,0.089680,0.008042
1,0.000236,0.00,0.242302,0.0,0.172840,0.547998,0.782698,0.348962,0.043478,0.104962,...,0.011017,0.058064,0.104962,0.021462,0.306021,0.553191,0.113111,1.000000,0.204470,0.041808
2,0.000236,0.00,0.242302,0.0,0.172840,0.694386,0.599382,0.348962,0.043478,0.104962,...,0.011017,0.058064,0.103885,0.006661,0.306021,0.547514,0.035109,0.979580,0.062814,0.004028
3,0.000293,0.00,0.063050,0.0,0.150206,0.658555,0.441813,0.448545,0.086957,0.066794,...,0.004461,0.043345,0.066412,0.002230,0.421118,0.645222,0.021667,0.988585,0.033197,0.001115
4,0.000705,0.00,0.063050,0.0,0.150206,0.687105,0.528321,0.448545,0.086957,0.066794,...,0.004461,0.043345,0.066794,0.006635,0.421118,0.648936,0.064464,1.000000,0.099338,0.009868
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.000633,0.00,0.420455,0.0,0.386831,0.580954,0.681771,0.122671,0.000000,0.164122,...,0.026936,0.146662,0.162090,0.035958,0.798551,0.882553,0.195787,0.975392,0.216382,0.048003
502,0.000438,0.00,0.420455,0.0,0.386831,0.490324,0.760041,0.105293,0.000000,0.164122,...,0.026936,0.146662,0.164122,0.033286,0.798551,0.893617,0.181239,1.000000,0.202815,0.041134
503,0.000612,0.00,0.420455,0.0,0.386831,0.654340,0.907312,0.094381,0.000000,0.164122,...,0.026936,0.146662,0.164122,0.017707,0.798551,0.893617,0.096414,1.000000,0.107892,0.011641
504,0.001161,0.00,0.420455,0.0,0.386831,0.619467,0.889804,0.114514,0.000000,0.164122,...,0.026936,0.146662,0.162694,0.021512,0.798551,0.885843,0.117127,0.982677,0.129930,0.017180


Dada la gran cantidad de variables, se eliminarán según su VIF como en la base anterior.

In [64]:
while True:
    vif = pd.DataFrame()
    vif['Variable'] = x.columns
    vif['Value'] = [ variance_inflation_factor( x.values, i ) for i in range(x.shape[1]) ]
    if vif.max()['Value'] < 5: break
    x = x.drop( columns = vif.max()['Variable'] )

x_train, x_test, y_train, y_test = train_test_split( x, y )
x

Unnamed: 0,0,1,2,3
0,0.000000,0.18,0.067815,0.0
1,0.000236,0.00,0.242302,0.0
2,0.000236,0.00,0.242302,0.0
3,0.000293,0.00,0.063050,0.0
4,0.000705,0.00,0.063050,0.0
...,...,...,...,...
501,0.000633,0.00,0.420455,0.0
502,0.000438,0.00,0.420455,0.0
503,0.000612,0.00,0.420455,0.0
504,0.001161,0.00,0.420455,0.0


In [65]:
pipeline = Pipeline([('classifier', LinearRegression())])

param_grid = [
    {
        'classifier': [LinearRegression()],
    },
    {
        'classifier': [Ridge()],
        'classifier__alpha': [ 10**i for i in range(-20, 20) ]
    },
    {
        'classifier': [Lasso()],
        'classifier__alpha': [ 10**i for i in range(-20, 20) ],
        'classifier__max_iter': [1000000]
    }
]

gs = GridSearchCV( pipeline, param_grid, cv = 10, scoring = { 'r2': 'r2', 'neg_root_mean_squared_error': 'neg_root_mean_squared_error' }, refit = 'r2')
gs.fit( x_train, y_train )

print(gs.best_params_)
print(f'Mejor puntaje en el entrenamiento: {gs.best_score_}')
print(f'Puntaje en la prueba: {gs.score( x_test, y_test )}')

{'classifier': Ridge(), 'classifier__alpha': 1}
Mejor puntaje en el entrenamiento: 0.30328396475476327
Puntaje en la prueba: 0.29369160112228676


Realizando los tres modelos manualmente tenemos:

In [68]:
Results = pd.DataFrame( columns=['r2','neg_root_mean_squared_error'] )
row = []
for scorer in ['r2','neg_root_mean_squared_error']:
    scores = cross_val_score( LinearRegression(), x_train, y_train, cv = 10, scoring = scorer )
    row.append( scores.mean() )
Results.loc['Linear'] = row
Results

Unnamed: 0,r2,neg_root_mean_squared_error
Linear,0.301532,-7.639614


Recordemos que se calcula el RMSE negativo para seguir la tendencia de maximizar los scores, ya que el RMSE es medida de error.

In [72]:
Results = pd.DataFrame( columns=['r2','neg_root_mean_squared_error'] )
for i in range(-20, 20):
    row = []
    for scorer in ['r2','neg_root_mean_squared_error']:
        scores = cross_val_score( Ridge( alpha = 10**i ), x_train, y_train, cv = 10, scoring = scorer )
        row.append( scores.mean() )
    Results.loc[f'10^{i}'] = row
Results

Unnamed: 0,r2,neg_root_mean_squared_error
10^-20,0.301532,-7.639614
10^-19,0.301532,-7.639614
10^-18,0.301532,-7.639614
10^-17,0.301532,-7.639614
10^-16,0.301532,-7.639614
10^-15,0.301532,-7.639614
10^-14,0.301532,-7.639614
10^-13,0.301532,-7.639614
10^-12,0.301532,-7.639614
10^-11,0.301532,-7.639614


En este caso podemos ver como se maximizana ambos scores cuando se llega a $\alpha=1$, con un $R^2$ de 0.303 y un RMSE de 7.64. Estos puntajes son un poco mejores que la regresión lineal, pero aún así, no ubican al modelo como un buen predictor.

In [73]:
Results = pd.DataFrame( columns=['r2','neg_root_mean_squared_error'] )
for i in range(-20, 20):
    row = []
    for scorer in ['r2','neg_root_mean_squared_error']:
        scores = cross_val_score( Lasso( alpha = 10**i, max_iter = 1000000 ), x_train, y_train, cv = 10, scoring = scorer )
        row.append( scores.mean() )
    Results.loc[f'10^{i}'] = row
Results

Unnamed: 0,r2,neg_root_mean_squared_error
10^-20,0.301532,-7.639614
10^-19,0.301532,-7.639614
10^-18,0.301532,-7.639614
10^-17,0.301532,-7.639614
10^-16,0.301532,-7.639614
10^-15,0.301532,-7.639614
10^-14,0.301532,-7.639614
10^-13,0.301532,-7.639614
10^-12,0.301532,-7.639614
10^-11,0.301532,-7.639614


En este caso, ambos puntajes se maximizan al llegar a un $\alpha = 10^{-2}$, cuyos valores son inferiores a los obtenidos previamente en la regresión Ridge.

In [75]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score

linear_classifier = LinearRegression()
ridge_classifier = Ridge( alpha = 1 )
lasso_classifier = Lasso( alpha = 1e-2, max_iter = 1000000 )

linear_classifier.fit(x_train, y_train)
ridge_classifier.fit(x_train, y_train)
lasso_classifier.fit(x_train, y_train)

linear_predictions = linear_classifier.predict(x_test)
ridge_predictions = ridge_classifier.predict(x_test)
lasso_predictions = lasso_classifier.predict(x_test)

linear_scores = [
    mean_absolute_percentage_error(y_test,linear_predictions),
    mean_squared_error(y_test,linear_predictions),
    r2_score(y_test,linear_predictions)
]
ridge_scores = [
    mean_absolute_percentage_error(y_test,ridge_predictions),
    mean_squared_error(y_test,ridge_predictions),
    r2_score(y_test,ridge_predictions)
]
lasso_scores = [
    mean_absolute_percentage_error(y_test,lasso_predictions),
    mean_squared_error(y_test,lasso_predictions),
    r2_score(y_test,lasso_predictions)
]

results = {
    'Linear': linear_scores,
    'Ridge (alpha=1)': ridge_scores,
    'Lasso (alpha=0.01)': lasso_scores
}

pd.DataFrame(results, index=['MAPE', 'MSE', 'R2']).T

Unnamed: 0,MAPE,MSE,R2
Linear,0.239675,54.232849,0.290654
Ridge (alpha=1),0.24235,54.000642,0.293692
Lasso (alpha=0.01),0.239666,54.077926,0.292681


Por RMSE y $R^2$ se confirma que el mejor modelo es una regresión ridge con $\alpha=1$, a pesar que este sea un terrible modelo para predecir la variable respuesta.