# Breast Cancer

Como se mencionó en el trabajo anterior, es preferible usar la métrica recall, puesto que minimiza el número de tumores maligos predichos, por lo que es menos probable que catalogue un tumor maligno como benigno y ponga la vida de alguien en riesgo por falta de detección

In [1]:
from cleaning import reduce_vif

Importación de la base de datos

In [2]:
from sklearn.datasets import load_breast_cancer
from pandas import DataFrame
from numpy import array

data = load_breast_cancer()

X = DataFrame( data = data.data, columns = data.feature_names )
y = array( data.target )

In [3]:
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
X_clean = reduce_vif( X )

Dropped col worst texture with vif 63306.17203588469


Dropped col worst symmetry with vif 63220.51620336962


Dropped col worst smoothness with vif 63065.70576575069


Dropped col worst radius with vif 61649.81043724271


Dropped col worst perimeter with vif 54345.98850641613


Dropped col worst fractal dimension with vif 50272.811021865375


Dropped col worst concavity with vif 50212.61746969545


Dropped col worst concave points with vif 50206.924195754895


Dropped col worst compactness with vif 50074.052594314235


Dropped col worst area with vif 45851.514134748824


Dropped col texture error with vif 45050.18204804375


Dropped col symmetry error with vif 45006.932477719776


In [11]:
X_clean

Unnamed: 0,area error,compactness error,concavity error,concave points error
0,153.40,0.04904,0.05373,0.01587
1,74.08,0.01308,0.01860,0.01340
2,94.03,0.04006,0.03832,0.02058
3,27.23,0.07458,0.05661,0.01867
4,94.44,0.02461,0.05688,0.01885
...,...,...,...,...
564,158.70,0.02891,0.05198,0.02454
565,99.04,0.02423,0.03950,0.01678
566,48.55,0.03731,0.04730,0.01557
567,86.22,0.06158,0.07117,0.01664


División de set de entrenamiento y prueba

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split( X_clean, y )

## Clasificación Bayesiana

Versión manual

In [39]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform( X_clean )

scores = cross_val_score(BernoulliNB(), X_scaled, y, cv=10, scoring='recall')

scores.mean()

0.8234920634920636

Versión con Pipeline

In [40]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', BernoulliNB())
])

scores = cross_val_score(pipeline, X_clean, y, cv=10, scoring='recall')

scores.mean()

0.8262698412698413

La diferencia entre los scores se puede deber a la fuga de datos generados por el escalamiento de datos previo a la generación del modelo.

## Arboles de decisión

Versión manual

In [43]:
from sklearn.tree import DecisionTreeClassifier

results = DataFrame( columns = ['Depth', 'recall'] )
for i in range( 4 , 9 ):
    scores = cross_val_score(DecisionTreeClassifier( max_depth=i , random_state=1 ), X_scaled, y, cv=10, scoring='recall')
    results.loc[ len(results) ] = { 'Depth': i, 'recall': scores.mean() }

results

Unnamed: 0,Depth,recall
0,4,0.924286
1,5,0.913175
2,6,0.888175
3,7,0.874286
4,8,0.871508


In [45]:
from sklearn.model_selection import GridSearchCV

classifier = DecisionTreeClassifier(random_state=1)

param_grid = {
    'max_depth': [ i for i in range( 4 , 9 ) ]
}

grid_search = GridSearchCV(classifier, param_grid, cv=10, scoring='recall')
grid_search.fit(X_clean, y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Parámetros:", best_params)
print("Recall:", best_score)

Parámetros: {'max_depth': 4}
Recall: 0.9242857142857142


## Random Forest

Versión manual

In [59]:
from sklearn.ensemble  import RandomForestClassifier

results = DataFrame( columns = ['Depth', 'Trees', 'recall'] )
for i in range( 4 , 9 ):
    for j in range( 10 ):
        scores = cross_val_score(RandomForestClassifier( max_depth=i, n_estimators=(j + 1)*5, random_state=1 ), X_scaled, y, cv=10, scoring='recall')
        results.loc[ len(results) ] = { 'Depth': i, 'Trees': (j + 1)*5, 'recall': scores.mean() }

results.head()

Unnamed: 0,Depth,Trees,recall
0,4,5,0.938175
1,4,10,0.943968
2,4,15,0.946746
3,4,20,0.946746
4,4,25,0.943968


In [55]:
results.max()['recall']

0.9467460317460317

In [57]:
classifier = RandomForestClassifier(random_state=1)

param_grid = {
    'max_depth': [ i for i in range( 4, 9 ) ],
    'n_estimators': [ (i + 1)*5 for i in range( 10 ) ]
}

grid_search = GridSearchCV(classifier, param_grid, cv=10, scoring='recall')
grid_search.fit(X_clean, y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Parámetros:", best_params)
print("Recall:", best_score)

Parámetros: {'max_depth': 4, 'n_estimators': 15}
Recall: 0.9467460317460317


## XGBoost

In [68]:
from xgboost  import XGBClassifier

results = DataFrame( columns = ['Depth', 'Trees', 'L. rate', 'recall'] )
for i in range( 4 , 9 ):
    for j in range( 10 ):
        for k in range( 5 ):
            scores = cross_val_score(XGBClassifier( max_depth=i, n_estimators=(j + 1)*5, learning_rate = 10**(-k-1), random_state=1 ), X_scaled, y, cv=10, scoring='recall', n_jobs = -1)
            results.loc[ len(results) ] = { 'Depth': i, 'Trees': (j + 1)*5, 'L. rate': 10**(-k-1), 'recall': scores.mean() }

results.head()

Unnamed: 0,Depth,Trees,L. rate,recall
0,4,5,0.1,0.949524
1,4,5,0.01,1.0
2,4,5,0.001,1.0
3,4,5,0.0001,1.0
4,4,5,1e-05,1.0


In [63]:
results.max()['recall']

1.0

In [65]:
param_grid = {
    'max_depth': [ i for i in range( 4, 9 ) ],
    'n_estimators': [ (i + 1)*5 for i in range( 10 ) ],
    'learning_rate': [ 10**(-k-1) for k in range( 5 ) ]
}

grid_search = GridSearchCV(XGBClassifier(random_state=1), param_grid, cv=10, scoring='recall')
grid_search.fit(X_clean, y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Parámetros:", best_params)
print("Recall:", best_score)

Parámetros: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 5}
Recall: 1.0


Los resultados 

In [67]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd

models = [
    ("BernoulliNB", BernoulliNB(), {}),
    ("DecisionTree", DecisionTreeClassifier(max_depth=4), {}),
    ("RandomForest", RandomForestClassifier(max_depth=4, n_estimators=15), {}),
    ("XGBoost", XGBClassifier(learning_rate=0.01, max_depth=4, n_estimators=5), {})
]

results = []

for name, model, params in models:
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None
    
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None
    
    results.append([name, precision, recall, f1, auc])

results_df = pd.DataFrame(results, columns=["Model", "Precision", "Recall", "F1 Score", "AUC"])
results_df


Unnamed: 0,Model,Precision,Recall,F1 Score,AUC
0,BernoulliNB,0.75,0.984076,0.85124,0.858352
1,DecisionTree,0.75827,0.949045,0.842999,0.807749
2,RandomForest,0.756039,0.996815,0.85989,0.863273
3,XGBoost,0.612086,1.0,0.759371,0.792401


Por los resultados, podemos ver que el modelo que maximiza el recall, es un modelo XGBoost con profundidad 4, $\nu = 0.01$ y 5 estimadores. Aún así, es el que peor desempeño obtuvo en otras métricas.

# Boston Housing

In [71]:
import mglearn
import warnings
warnings.filterwarnings("ignore")

X, y = mglearn.datasets.load_extended_boston()
X = pd.DataFrame(X)

In [72]:
X_clean = reduce_vif( X )

Dropped col 103.0 with vif inf
Dropped col 102.0 with vif inf
Dropped col 101.0 with vif inf
Dropped col 100.0 with vif inf
Dropped col 99.0 with vif inf
Dropped col 98.0 with vif inf
Dropped col 97.0 with vif inf
Dropped col 96.0 with vif inf
Dropped col 95.0 with vif inf
Dropped col 94.0 with vif inf
Dropped col 93.0 with vif inf
Dropped col 92.0 with vif inf
Dropped col 91.0 with vif inf
Dropped col 90.0 with vif inf
Dropped col 89.0 with vif inf
Dropped col 88.0 with vif inf
Dropped col 87.0 with vif inf
Dropped col 86.0 with vif inf
Dropped col 85.0 with vif inf
Dropped col 84.0 with vif inf
Dropped col 83.0 with vif inf
Dropped col 82.0 with vif inf
Dropped col 81.0 with vif inf
Dropped col 80.0 with vif inf
Dropped col 79.0 with vif inf
Dropped col 78.0 with vif inf
Dropped col 77.0 with vif inf
Dropped col 76.0 with vif inf
Dropped col 75.0 with vif inf
Dropped col 74.0 with vif inf
Dropped col 73.0 with vif inf
Dropped col 72.0 with vif inf
Dropped col 71.0 with vif inf
Droppe

In [73]:
X_clean

Unnamed: 0,0,1,2,3,4,5
0,0.000000,0.18,0.067815,0.0,0.314815,0.577505
1,0.000236,0.00,0.242302,0.0,0.172840,0.547998
2,0.000236,0.00,0.242302,0.0,0.172840,0.694386
3,0.000293,0.00,0.063050,0.0,0.150206,0.658555
4,0.000705,0.00,0.063050,0.0,0.150206,0.687105
...,...,...,...,...,...,...
501,0.000633,0.00,0.420455,0.0,0.386831,0.580954
502,0.000438,0.00,0.420455,0.0,0.386831,0.490324
503,0.000612,0.00,0.420455,0.0,0.386831,0.654340
504,0.001161,0.00,0.420455,0.0,0.386831,0.619467


In [74]:
X_train, X_test, y_train, y_test = train_test_split( X_clean, y )

Dado que esta regresión no requiere hiperparámetros ni escalado de datos, bastará con la generación del modelo.

In [76]:
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error, r2_score

model = BayesianRidge()

model.fit( X_train, y_train )
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("RMSE:", mse**(1/2))
print("R^2:", r2)

RMSE: 5.72422243657148
R^2: 0.5820983222977625


## Árbol de decisión

In [78]:
from sklearn.tree import DecisionTreeRegressor
import pandas as pd

# Assuming X_scaled and y are your features and labels

results = pd.DataFrame(columns=['Depth', 'MSE', 'R^2'])

for i in range(4, 9):
    regressor = DecisionTreeRegressor(max_depth=i, random_state=1)
    scores_mse = -cross_val_score(regressor, X_clean, y, cv=10, scoring='neg_mean_squared_error')
    scores_r2 = cross_val_score(regressor, X_clean, y, cv=10, scoring='r2')
    
    results.loc[len(results)] = {'Depth': i, 'MSE': scores_mse.mean(), 'R^2': scores_r2.mean()}

results

Unnamed: 0,Depth,MSE,R^2
0,4,47.881421,-1.104367
1,5,58.197278,-2.476104
2,6,70.85849,-3.419201
3,7,75.141397,-4.801361
4,8,79.066248,-4.883838


La explicación del $R^2$ se puede encontrar aquí: https://towardsdatascience.com/explaining-negative-r-squared-17894ca26321, pero en escencia, se dan valores negativos cuando el modelo predice la variable respuesta peor que un modelo constante que retorna siempre la media.

In [79]:
regressor = DecisionTreeRegressor(random_state=1)

param_grid = {
    'max_depth': [i for i in range(4, 9)]
}

grid_search = GridSearchCV(regressor, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_clean, y)

best_params = grid_search.best_params_
best_score = -grid_search.best_score_  # Take the negative to get MSE

print("Best Parameters:", best_params)
print("Best MSE:", best_score)

Best Parameters: {'max_depth': 4}
Best MSE: 47.8814214874713


In [81]:
regressor = DecisionTreeRegressor(random_state=1)

param_grid = {
    'max_depth': [i for i in range(4, 9)]
}

grid_search = GridSearchCV(regressor, param_grid, cv=10, scoring='r2')
grid_search.fit(X_clean, y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_  # Take the negative to get MSE

print("Best Parameters:", best_params)
print("Best R^2:", best_score)

Best Parameters: {'max_depth': 4}
Best R^2: -1.104367077003224


## Random Forest

In [83]:
from sklearn.ensemble import RandomForestRegressor

results = pd.DataFrame(columns=['Depth', 'Trees', 'MSE', 'R^2'])

for i in range(4, 9):
    for j in range(10):
        regressor = RandomForestRegressor(max_depth=i, n_estimators=(j + 1) * 5, random_state=1)
        scores_mse = -cross_val_score(regressor, X_clean, y, cv=10, scoring='neg_mean_squared_error')
        scores_r2 = cross_val_score(regressor, X_clean, y, cv=10, scoring='r2')
        
        results.loc[len(results)] = {'Depth': i, 'Trees': (j + 1) * 5, 'MSE': scores_mse.mean(), 'R^2': scores_r2.mean()}

results.head()

Unnamed: 0,Depth,Trees,MSE,R^2
0,4,5,34.766108,0.010018
1,4,10,32.571282,0.08202
2,4,15,32.494404,0.07248
3,4,20,31.86786,0.075231
4,4,25,31.488796,0.076304


In [84]:
results

Unnamed: 0,Depth,Trees,MSE,R^2
0,4,5,34.766108,0.010018
1,4,10,32.571282,0.08202
2,4,15,32.494404,0.07248
3,4,20,31.86786,0.075231
4,4,25,31.488796,0.076304
5,4,30,32.102933,0.045308
6,4,35,31.615452,0.076428
7,4,40,31.226574,0.071318
8,4,45,31.468311,0.04801
9,4,50,31.315379,0.061626


Dado que el $R^2$ es mínimo, se buscará reducir el MSE lo más que se pueda, en este caso es en el índice 7, con 40 árboles y 4 de profundidad máxima.

In [87]:
regressor = RandomForestRegressor(random_state=1)

param_grid = {
    'max_depth': [i for i in range(4, 9)],
    'n_estimators': [(i + 1) * 5 for i in range(10)]
}

grid_search = GridSearchCV(regressor, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_clean, y)

best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print("Best Parameters:", best_params)
print("Best MSE:", best_score)

Best Parameters: {'max_depth': 4, 'n_estimators': 40}
Best MSE: 31.226574256687353


## XGBoost

In [88]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
import pandas as pd

results = pd.DataFrame(columns=['Depth', 'Trees', 'Learning Rate', 'MSE', 'R^2'])

for i in range(4, 9):
    for j in range(10):
        for k in range(5):
            regressor = XGBRegressor(max_depth=i, n_estimators=(j + 1) * 5, learning_rate=10 ** (-k - 1), random_state=1)
            scores_mse = -cross_val_score(regressor, X_clean, y, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
            scores_r2 = cross_val_score(regressor, X_clean, y, cv=10, scoring='r2', n_jobs=-1)
            
            results.loc[len(results)] = {'Depth': i, 'Trees': (j + 1) * 5, 'Learning Rate': 10 ** (-k - 1),
                                          'MSE': scores_mse.mean(), 'R^2': scores_r2.mean()}

results.head()

Unnamed: 0,Depth,Trees,Learning Rate,MSE,R^2
0,4,5,0.1,53.990609,-0.299513
1,4,5,0.01,86.529409,-1.146103
2,4,5,0.001,91.521877,-1.271545
3,4,5,0.0001,92.043544,-1.284624
4,4,5,1e-05,92.095905,-1.285937


In [90]:
results.max()

Depth             8.000000
Trees            50.000000
Learning Rate     0.100000
MSE              92.095905
R^2               0.267295
dtype: float64

In [91]:
param_grid = {
    'max_depth': [i for i in range(4, 9)],
    'n_estimators': [(i + 1) * 5 for i in range(10)],
    'learning_rate': [10 ** (-k - 1) for k in range(5)]
}

grid_search = GridSearchCV(XGBRegressor(random_state=1), param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_clean, y)

best_params = grid_search.best_params_
best_score = -grid_search.best_score_  # Take the negative to get the actual MSE

print("Best Parameters:", best_params)
print("Best MSE:", best_score)

Best Parameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 25}
Best MSE: 32.61838071368801


## Resultados finales

In [96]:
from numpy import mean, abs, sqrt

models = [
    ("Bayesian Ridge", BayesianRidge()),
    ("Decision Tree", DecisionTreeRegressor(max_depth=4)),
    ("Random Forest", RandomForestRegressor(max_depth=4, n_estimators=40)),
    ("XGBoost", XGBRegressor(learning_rate=0.1, max_depth=4, n_estimators=25))
]

results = DataFrame(columns=["Model", "RMSE", "R2", "MAPE"])

def mean_absolute_percentage_error(y_true, y_pred):
    return mean(abs((y_true - y_pred) / y_true)) * 100

results = []
for name, model in models:
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    
    results.append({"Model": name, "RMSE": rmse, "R2": r2, "MAPE": mape})

DataFrame(results)

Unnamed: 0,Model,RMSE,R2,MAPE
0,Bayesian Ridge,5.724222,0.582098,21.214606
1,Decision Tree,5.467618,0.618726,15.424934
2,Random Forest,5.246252,0.648974,15.361242
3,XGBoost,4.9158,0.691802,14.60867


Curiosamente, los problemas vistos en el $R^2$ desaparecen con el conjunto de prueba. Aún así vemos que ningún modelo es ideal para la predicción.