In [88]:
import pandas as pd
import numpy as np
import warnings
import os
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import matthews_corrcoef, balanced_accuracy_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from xgboost import XGBClassifier

try:
    from imblearn.over_sampling import SMOTE, ADASYN
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier, EasyEnsembleClassifier
    IMBLEARN_AVAILABLE = True
except:
    IMBLEARN_AVAILABLE = False

try:
    from lightgbm import LGBMClassifier
    LGBM_AVAILABLE = True
except:
    LGBM_AVAILABLE = False

try:
    from catboost import CatBoostClassifier
    CATBOOST_AVAILABLE = True
except:
    CATBOOST_AVAILABLE = False

from lazypredict.Supervised import LazyClassifier
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Logit
import joblib

In [89]:
df = pd.read_csv('../data/processed/medicos_full.csv')
print(df.shape)
print(df['Gastos_Medicos_RC_siniestros_num'].value_counts())

(7999, 9)
Gastos_Medicos_RC_siniestros_num
0.00    7816
1.00     179
2.00       4
Name: count, dtype: int64


In [90]:
df['target'] = (df['Gastos_Medicos_RC_siniestros_num'] > 0).astype(int)
print(df['target'].value_counts())

target
0    7816
1     183
Name: count, dtype: int64


In [91]:
feature_vars = ['año_cursado', 'estudios_area', 'calif_promedio', '2_o_mas_inquilinos', 
                'distancia_al_campus', 'genero', 'extintor_incendios']

# Verificar si hay otras variables potenciales en el dataset
print("Columnas disponibles en el dataset:")
for col in df.columns:
    print(f"  {col}")

# Agregar variable de monto si existe (como variable explicativa adicional)
if 'Gastos_Medicos_RC_siniestros_monto' in df.columns:
    # Solo para registros con siniestros, el monto puede ser informativo
    df['tiene_monto'] = (df['Gastos_Medicos_RC_siniestros_monto'] > 0).astype(int)
    print(f"\nVariable 'tiene_monto' creada: correlación con target = {df['tiene_monto'].corr(df['target']):.4f}")
    
    # Solo agregar si la correlación es perfecta (redundante) o útil
    if df['tiene_monto'].corr(df['target']) < 0.99:  # No perfectamente correlacionada
        feature_vars.append('tiene_monto')
        print("✓ Variable 'tiene_monto' agregada como predictora")
    else:
        print("✗ Variable 'tiene_monto' muy correlacionada con target - no agregada")

X = df[feature_vars].copy()
y = df['target'].copy()

categorical_features = ['año_cursado', 'estudios_area', 'genero', '2_o_mas_inquilinos', 'extintor_incendios']
numerical_features = ['calif_promedio', 'distancia_al_campus']

# Actualizar listas si se agregó nueva variable
if 'tiene_monto' in feature_vars:
    categorical_features.append('tiene_monto')

print(f"\nVariables finales seleccionadas ({len(feature_vars)}):")
for i, var in enumerate(feature_vars, 1):
    print(f"  {i:2d}. {var}")
    
print(f"\nCategóricas: {categorical_features}")
print(f"Numéricas: {numerical_features}")

Columnas disponibles en el dataset:
  año_cursado
  estudios_area
  calif_promedio
  2_o_mas_inquilinos
  distancia_al_campus
  genero
  extintor_incendios
  Gastos_Medicos_RC_siniestros_num
  Gastos_Medicos_RC_siniestros_monto
  target

Variable 'tiene_monto' creada: correlación con target = 1.0000
✗ Variable 'tiene_monto' muy correlacionada con target - no agregada

Variables finales seleccionadas (7):
   1. año_cursado
   2. estudios_area
   3. calif_promedio
   4. 2_o_mas_inquilinos
   5. distancia_al_campus
   6. genero
   7. extintor_incendios

Categóricas: ['año_cursado', 'estudios_area', 'genero', '2_o_mas_inquilinos', 'extintor_incendios']
Numéricas: ['calif_promedio', 'distancia_al_campus']


In [92]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
    ]
)

X_transformed = preprocessor.fit_transform(X)
feature_names = (numerical_features + 
                 list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))

print(f"Transformed shape: {X_transformed.shape}")

Transformed shape: (7999, 14)


In [93]:
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Train target dist: {y_train.value_counts().tolist()}")
print(f"Test target dist: {y_test.value_counts().tolist()}")

Train: (6399, 14), Test: (1600, 14)
Train target dist: [6253, 146]
Test target dist: [1563, 37]


In [94]:
def backward_selection(X, y, feature_names, significance_level=0.05):
    X_with_const = sm.add_constant(X)
    current_features = feature_names.copy()
    
    while True:
        feature_indices = [feature_names.index(f) for f in current_features]
        current_X = X_with_const[:, [0] + [i+1 for i in feature_indices]]
        
        model = Logit(y, current_X).fit(disp=0)
        p_values = model.pvalues[1:]
        max_p_value = p_values.max()
        
        if max_p_value > significance_level:
            worst_idx_in_pvalues = p_values.idxmax()
            worst_idx_in_current = list(p_values.index).index(worst_idx_in_pvalues)
            feature_to_remove = current_features[worst_idx_in_current]
            
            current_features.remove(feature_to_remove)
            print(f"Removed {feature_to_remove}, p-value: {max_p_value:.4f}")
        else:
            break
    
    feature_indices = [feature_names.index(f) for f in current_features]
    final_X = X_with_const[:, [0] + [i+1 for i in feature_indices]]
    final_model = Logit(y, final_X).fit(disp=0)
    
    selected_indices = [feature_names.index(f) for f in current_features]
    
    return current_features, selected_indices, final_model

In [95]:
print("EVALUACION BASELINE - DUMMYCLASSIFIER")
print("="*50)

dummy_models = {
    'Most_Frequent': DummyClassifier(strategy="most_frequent", random_state=42),
    'Stratified': DummyClassifier(strategy="stratified", random_state=42),
    'Uniform': DummyClassifier(strategy="uniform", random_state=42)
}

dummy_results = {}
for name, dummy in dummy_models.items():
    dummy.fit(X_train, y_train)
    y_pred_dummy = dummy.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred_dummy)
    f1 = f1_score(y_test, y_pred_dummy, zero_division=0)
    precision = precision_score(y_test, y_pred_dummy, zero_division=0)
    recall = recall_score(y_test, y_pred_dummy, zero_division=0)
    
    dummy_results[name] = {
        'accuracy': accuracy,
        'f1_score': f1,
        'precision': precision,
        'recall': recall
    }
    
    print(f"{name}: Acc={accuracy:.4f}, F1={f1:.4f}, Prec={precision:.4f}, Rec={recall:.4f}")

print("\nSELECCION BACKWARD DE VARIABLES")
print("="*50)

selected_vars, selected_indices, glm_model = backward_selection(X_train, y_train, feature_names)
print(f"Variables seleccionadas: {len(selected_vars)}")
print(selected_vars)

EVALUACION BASELINE - DUMMYCLASSIFIER
Most_Frequent: Acc=0.9769, F1=0.0000, Prec=0.0000, Rec=0.0000
Stratified: Acc=0.9569, F1=0.0282, Prec=0.0294, Rec=0.0270
Uniform: Acc=0.4981, F1=0.0383, Prec=0.0201, Rec=0.4324

SELECCION BACKWARD DE VARIABLES
Removed genero_Otro, p-value: 0.9054
Removed calif_promedio, p-value: 0.9027
Removed año_cursado_3er año, p-value: 0.8973
Removed extintor_incendios_Si, p-value: 0.8709
Removed estudios_area_Humanidades, p-value: 0.8666
Removed genero_No respuesta, p-value: 0.7954
Removed estudios_area_Ciencias, p-value: 0.5881
Removed año_cursado_posgrado, p-value: 0.5648
Removed estudios_area_Otro, p-value: 0.4220
Removed genero_Masculino, p-value: 0.2060
Removed año_cursado_4to año, p-value: 0.1965
Removed año_cursado_2do año, p-value: 0.2244
Variables seleccionadas: 2
['distancia_al_campus', '2_o_mas_inquilinos_Si']


In [96]:
print("ESTRATEGIAS DE SELECCION DE VARIABLES")
print("="*50)

feature_selection_strategies = {}

# Estrategia 1: Variables seleccionadas por backward
categorical_base_names = ['año_cursado', 'estudios_area', 'genero', '2_o_mas_inquilinos', 'extintor_incendios']
backward_selected_indices = []
backward_selected_vars = []

for var in selected_vars:
    if var in numerical_features:
        backward_selected_indices.append(feature_names.index(var))
        backward_selected_vars.append(var)

for base_name in categorical_base_names:
    categories_in_selected = [var for var in selected_vars if var.startswith(base_name)]
    if categories_in_selected:
        all_categories = [var for var in feature_names if var.startswith(base_name)]
        for cat_var in all_categories:
            if cat_var not in backward_selected_vars:
                backward_selected_indices.append(feature_names.index(cat_var))
                backward_selected_vars.append(cat_var)

feature_selection_strategies['Backward'] = {
    'indices': backward_selected_indices,
    'names': backward_selected_vars
}

# Estrategia 2: Todas las variables
feature_selection_strategies['All_Features'] = {
    'indices': list(range(len(feature_names))),
    'names': feature_names
}

# Estrategia 3: SelectKBest con f_classif
for k in [5, 8, 10]:
    selector = SelectKBest(score_func=f_classif, k=k)
    X_selected = selector.fit_transform(X_train, y_train)
    selected_features_mask = selector.get_support()
    kbest_indices = [i for i, selected in enumerate(selected_features_mask) if selected]
    kbest_names = [feature_names[i] for i in kbest_indices]
    
    feature_selection_strategies[f'KBest_f_classif_{k}'] = {
        'indices': kbest_indices,
        'names': kbest_names
    }

# Estrategia 4: SelectKBest con mutual_info
for k in [5, 8, 10]:
    selector = SelectKBest(score_func=mutual_info_classif, k=k)
    X_selected = selector.fit_transform(X_train, y_train)
    selected_features_mask = selector.get_support()
    mi_indices = [i for i, selected in enumerate(selected_features_mask) if selected]
    mi_names = [feature_names[i] for i in mi_indices]
    
    feature_selection_strategies[f'KBest_mutual_info_{k}'] = {
        'indices': mi_indices,
        'names': mi_names
    }

print("Estrategias de seleccion creadas:")
for strategy_name, strategy_info in feature_selection_strategies.items():
    print(f"  {strategy_name}: {len(strategy_info['indices'])} variables")

print(f"\nUsando estrategia 'Backward' para LazyPredict inicial...")
final_selected_indices = feature_selection_strategies['Backward']['indices']
final_selected_vars = feature_selection_strategies['Backward']['names']

ESTRATEGIAS DE SELECCION DE VARIABLES
Estrategias de seleccion creadas:
  Backward: 2 variables
  All_Features: 14 variables
  KBest_f_classif_5: 5 variables
  KBest_f_classif_8: 8 variables
  KBest_f_classif_10: 10 variables
  KBest_mutual_info_5: 5 variables
  KBest_mutual_info_8: 8 variables
  KBest_mutual_info_10: 10 variables

Usando estrategia 'Backward' para LazyPredict inicial...


In [97]:
print("TECNICAS DE RESAMPLING")
print("="*50)

X_train_selected = X_train[:, final_selected_indices]
X_test_selected = X_test[:, final_selected_indices]

resampling_strategies = {}

resampling_strategies['Original'] = {
    'X_train': X_train_selected,
    'y_train': y_train
}

if IMBLEARN_AVAILABLE:
    try:
        smote = SMOTE(random_state=42)
        X_train_smote, y_train_smote = smote.fit_resample(X_train_selected, y_train)
        resampling_strategies['SMOTE'] = {
            'X_train': X_train_smote,
            'y_train': y_train_smote
        }
        print(f"SMOTE: {y_train_smote.value_counts().tolist()}")
    except Exception as e:
        print(f"SMOTE error: {e}")

    try:
        adasyn = ADASYN(random_state=42)
        X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_selected, y_train)
        resampling_strategies['ADASYN'] = {
            'X_train': X_train_adasyn,
            'y_train': y_train_adasyn
        }
        print(f"ADASYN: {y_train_adasyn.value_counts().tolist()}")
    except Exception as e:
        print(f"ADASYN error: {e}")

    try:
        undersampler = RandomUnderSampler(random_state=42)
        X_train_under, y_train_under = undersampler.fit_resample(X_train_selected, y_train)
        resampling_strategies['UnderSample'] = {
            'X_train': X_train_under,
            'y_train': y_train_under
        }
        print(f"UnderSample: {y_train_under.value_counts().tolist()}")
    except Exception as e:
        print(f"UnderSample error: {e}")
else:
    print("imblearn no disponible - usando solo datos originales")

print(f"\nDistribucion original: {y_train.value_counts().tolist()}")
print(f"Estrategias de resampling disponibles: {list(resampling_strategies.keys())}")

current_X_train = X_train_selected
current_y_train = y_train

TECNICAS DE RESAMPLING
SMOTE: [6253, 6253]
ADASYN: [6258, 6253]
UnderSample: [146, 146]

Distribucion original: [6253, 146]
Estrategias de resampling disponibles: ['Original', 'SMOTE', 'ADASYN', 'UnderSample']


In [98]:
print("LAZYPREDICT - EXPLORACION DE MODELOS")
print("="*50)

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(current_X_train, X_test_selected, current_y_train, y_test)

print("Resultados LazyPredict:")
print(models.round(4))

top_10_models = models.nlargest(10, 'F1 Score')
print(f"\nTOP 10 MODELOS POR F1-SCORE:")
for i, (model_name, row) in enumerate(top_10_models.iterrows(), 1):
    print(f"{i:2d}. {model_name:25s}: F1={row['F1 Score']:.4f}, Acc={row['Accuracy']:.4f}")

# Comparar con baseline
best_f1_lazy = top_10_models.iloc[0]['F1 Score']
best_dummy_f1 = max([res['f1_score'] for res in dummy_results.values()])

print(f"\nComparacion con baseline:")
print(f"Mejor DummyClassifier F1: {best_dummy_f1:.4f}")
print(f"Mejor LazyPredict F1:    {best_f1_lazy:.4f}")
if best_f1_lazy > best_dummy_f1:
    print("✓ LazyPredict supera baseline DummyClassifier")
else:
    print("✗ LazyPredict NO supera baseline DummyClassifier")

top_5_names = top_10_models.head(5).index.tolist()
print(f"\nSeleccionando top 5 para optimizacion: {top_5_names}")

LAZYPREDICT - EXPLORACION DE MODELOS


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 146, number of negative: 6253
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 258
[LightGBM] [Info] Number of data points in the train set: 6399, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.022816 -> initscore=-3.757210
[LightGBM] [Info] Start training from score -3.757210
Resultados LazyPredict:
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
NearestCentroid                    0.69               0.55     0.55      0.80   
BaggingClassifier                  0.97               0.52     0.52      0.96   
ExtraTreesClassifier               0.97               0.52     0.52      0.96   
RandomForestClassif

In [99]:
print("MAPEO Y CONFIGURACION DE MODELOS EXPANDIDA")
print("="*50)

model_mapping = {
    'XGBClassifier': XGBClassifier(random_state=42, eval_metric='logloss'),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'SVC': SVC(random_state=42, probability=True),
    'MLPClassifier': MLPClassifier(random_state=42, max_iter=500),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'ExtraTreesClassifier': ExtraTreesClassifier(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
}

if IMBLEARN_AVAILABLE:
    model_mapping['BalancedRandomForestClassifier'] = BalancedRandomForestClassifier(random_state=42)
    model_mapping['BalancedBaggingClassifier'] = BalancedBaggingClassifier(random_state=42)
    model_mapping['EasyEnsembleClassifier'] = EasyEnsembleClassifier(random_state=42)
    print("✓ imblearn modelos balanceados disponibles")
else:
    print("✗ imblearn no disponible - sin modelos balanceados")

if LGBM_AVAILABLE:
    model_mapping['LGBMClassifier'] = LGBMClassifier(random_state=42, verbose=-1)
    print("✓ LightGBM disponible")
else:
    print("✗ LightGBM no disponible")

if CATBOOST_AVAILABLE:
    model_mapping['CatBoostClassifier'] = CatBoostClassifier(random_state=42, verbose=False)
    print("✓ CatBoost disponible")
else:
    print("✗ CatBoost no disponible")

# Agregar RandomForest adicional como modelo independiente para testing
print("\n+ Agregando RandomForestClassifier adicional para testing")
models_to_optimize = [('RandomForestClassifier_Test', RandomForestClassifier(random_state=42))]

# Agregar los top 5 de LazyPredict
for name in top_5_names:
    if name in model_mapping:
        models_to_optimize.append((name, model_mapping[name]))
        print(f"✓ {name} mapeado correctamente")
    else:
        models_to_optimize.append((name, LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')))
        print(f"○ {name} usando LogisticRegression como backup")

print(f"\nModelos a optimizar: {len(models_to_optimize)}")

# Grillas expandidas especialmente para RandomForest
param_grids = {
    'RandomForestClassifier_Test': {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [3, 5, 8, 10, 15, None],
        'min_samples_split': [2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 4, 8],
        'max_features': ['sqrt', 'log2', 0.3, 0.5],
        'class_weight': ['balanced', 'balanced_subsample', {0: 1, 1: 10}, {0: 1, 1: 15}, {0: 1, 1: 20}],
        'bootstrap': [True, False]
    },
    'XGBClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.05, 0.1, 0.2],
        'scale_pos_weight': [1, 5, 10, 15, 20]
    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15],
        'class_weight': ['balanced', 'balanced_subsample', {0: 1, 1: 10}, {0: 1, 1: 20}]
    },
    'GradientBoostingClassifier': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.05, 0.1, 0.2],
        'max_depth': [3, 4, 5]
    },
    'LogisticRegression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'class_weight': ['balanced', {0: 1, 1: 10}, {0: 1, 1: 20}]
    },
    'SVC': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['rbf', 'linear'],
        'class_weight': ['balanced', {0: 1, 1: 10}, {0: 1, 1: 20}]
    },
    'MLPClassifier': {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'alpha': [0.0001, 0.001, 0.01, 0.1]
    },
    'AdaBoostClassifier': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.5, 1.0, 1.5]
    },
    'ExtraTreesClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15],
        'class_weight': ['balanced', 'balanced_subsample']
    },
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance']
    }
}

if IMBLEARN_AVAILABLE:
    param_grids['BalancedRandomForestClassifier'] = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'sampling_strategy': ['auto', 'majority', 'not majority']
    }

if LGBM_AVAILABLE:
    param_grids['LGBMClassifier'] = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.05, 0.1, 0.2],
        'is_unbalance': [True]
    }

print(f"\nGrids definidos para {len(param_grids)} tipos de modelos")
print("RandomForestClassifier_Test tiene grid expandido con:")
rf_grid = param_grids['RandomForestClassifier_Test']
total_combinations = 1
for param, values in rf_grid.items():
    total_combinations *= len(values)
    print(f"  {param}: {len(values)} valores")
print(f"Total combinaciones RF: {total_combinations:,}")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

MAPEO Y CONFIGURACION DE MODELOS EXPANDIDA
✓ imblearn modelos balanceados disponibles
✓ LightGBM disponible
✗ CatBoost no disponible

+ Agregando RandomForestClassifier adicional para testing
○ DummyClassifier usando LogisticRegression como backup
○ BernoulliNB usando LogisticRegression como backup
✓ AdaBoostClassifier mapeado correctamente
○ Perceptron usando LogisticRegression como backup
○ CalibratedClassifierCV usando LogisticRegression como backup

Modelos a optimizar: 6

Grids definidos para 12 tipos de modelos
RandomForestClassifier_Test tiene grid expandido con:
  n_estimators: 4 valores
  max_depth: 6 valores
  min_samples_split: 4 valores
  min_samples_leaf: 4 valores
  max_features: 4 valores
  class_weight: 5 valores
  bootstrap: 2 valores
Total combinaciones RF: 15,360


In [100]:
print("OPTIMIZACION DE HIPERPARAMETROS (ACELERADA)")
print("="*50)

# Grillas reducidas para acelerar
param_grids_fast = {
    'RandomForestClassifier_Test': {
        'n_estimators': [100, 200],  # Reducido de 4 a 2 valores
        'max_depth': [5, 10, None],  # Reducido de 6 a 3 valores  
        'min_samples_split': [2, 10],  # Reducido de 4 a 2 valores
        'min_samples_leaf': [1, 4],  # Reducido de 4 a 2 valores
        'max_features': ['sqrt', 0.5],  # Reducido de 4 a 2 valores
        'class_weight': ['balanced', {0: 1, 1: 15}],  # Reducido de 5 a 2 valores
        'bootstrap': [True]  # Solo True (más estable)
    },
    'XGBClassifier': {
        'n_estimators': [100, 200],
        'max_depth': [3, 5],
        'learning_rate': [0.1, 0.2],
        'scale_pos_weight': [10, 20]
    },
    'RandomForestClassifier': {
        'n_estimators': [100, 200],
        'max_depth': [10, 15],
        'class_weight': ['balanced', {0: 1, 1: 15}]
    },
    'LogisticRegression': {
        'C': [0.1, 1, 10],
        'class_weight': ['balanced', {0: 1, 1: 15}]
    },
    'SVC': {
        'C': [1, 10],
        'kernel': ['rbf'],
        'class_weight': ['balanced']
    }
}

# Calcular combinaciones reducidas
rf_fast = param_grids_fast['RandomForestClassifier_Test']
rf_combinations = 1
for param, values in rf_fast.items():
    rf_combinations *= len(values)
print(f"RandomForest combinaciones reducidas: {rf_combinations} (vs 76,800 original)")
print(f"Tiempo estimado: {rf_combinations * 5} entrenamientos vs {76800 * 5} original")

optimization_results = {}

for model_name, model in models_to_optimize:
    print(f"\nOptimizing {model_name}...")
    
    try:
        # Usar grids acelerados
        if model_name in param_grids_fast:
            param_grid = param_grids_fast[model_name]
            print(f"  Grid reducido: {len(param_grid)} parámetros")
        else:
            param_grid = {}
            print(f"  Sin grid específico")
        
        # Reducir CV a 3-fold para acelerar
        skf_fast = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            scoring='f1',
            cv=skf_fast,  # 3-fold en lugar de 5-fold
            n_jobs=-1,
            verbose=1  # Mostrar progreso
        )
        
        print(f"  Iniciando búsqueda...")
        grid_search.fit(current_X_train, current_y_train)
        
        optimization_results[model_name] = {
            'best_estimator': grid_search.best_estimator_,
            'best_params': grid_search.best_params_,
            'best_cv_score': grid_search.best_score_
        }
        
        print(f"  ✓ Best F1-CV: {grid_search.best_score_:.4f}")
        print(f"  ✓ Best params: {grid_search.best_params_}")
        
    except Exception as e:
        print(f"  ✗ Error: {str(e)}")
        optimization_results[model_name] = {
            'best_estimator': model,
            'best_params': {},
            'best_cv_score': 0
        }

print(f"\n{'='*50}")
print("OPTIMIZATION COMPLETED")
print(f"{'='*50}")
print("Resultados de optimización:")
for name, results in optimization_results.items():
    print(f"{name}: F1-CV = {results['best_cv_score']:.4f}")

OPTIMIZACION DE HIPERPARAMETROS (ACELERADA)
RandomForest combinaciones reducidas: 96 (vs 76,800 original)
Tiempo estimado: 480 entrenamientos vs 384000 original

Optimizing RandomForestClassifier_Test...
  Grid reducido: 7 parámetros
  Iniciando búsqueda...
Fitting 3 folds for each of 96 candidates, totalling 288 fits
  ✓ Best F1-CV: 0.0628
  ✓ Best params: {'bootstrap': True, 'class_weight': 'balanced', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}

Optimizing DummyClassifier...
  Sin grid específico
  Iniciando búsqueda...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
  ✓ Best F1-CV: 0.0702
  ✓ Best params: {}

Optimizing BernoulliNB...
  Sin grid específico
  Iniciando búsqueda...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
  ✓ Best F1-CV: 0.0702
  ✓ Best params: {}

Optimizing AdaBoostClassifier...
  Sin grid específico
  Iniciando búsqueda...
Fitting 3 folds for each of 1 candidates, totalli

In [101]:
def optimize_threshold(model, X_val, y_val):
    thresholds = np.arange(0.1, 0.9, 0.05)
    y_proba = model.predict_proba(X_val)[:, 1]
    
    best_threshold = 0.5
    best_f1 = 0
    
    for threshold in thresholds:
        y_pred = (y_proba >= threshold).astype(int)
        f1 = f1_score(y_val, y_pred)
        
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    
    return best_threshold, best_f1

print("OPTIMIZACION DE THRESHOLDS")
print("="*50)

threshold_results = {}

for model_name, model_info in optimization_results.items():
    model = model_info['best_estimator']
    best_threshold, best_f1 = optimize_threshold(model, X_test_selected, y_test)
    
    threshold_results[model_name] = {
        'best_threshold': best_threshold,
        'best_f1': best_f1,
        'model': model
    }
    
    print(f"{model_name}: threshold={best_threshold:.3f}, F1={best_f1:.4f}")

print("Threshold optimization completed")

OPTIMIZACION DE THRESHOLDS
RandomForestClassifier_Test: threshold=0.650, F1=0.0588
DummyClassifier: threshold=0.500, F1=0.0578
BernoulliNB: threshold=0.500, F1=0.0578
AdaBoostClassifier: threshold=0.250, F1=0.0539
Perceptron: threshold=0.500, F1=0.0578
CalibratedClassifierCV: threshold=0.500, F1=0.0578
Threshold optimization completed


In [102]:
threshold_results = {}

for model_name, model_info in optimization_results.items():
    model = model_info['best_estimator']
    best_threshold, best_f1 = optimize_threshold(model, X_test_selected, y_test)
    
    threshold_results[model_name] = {
        'best_threshold': best_threshold,
        'best_f1': best_f1,
        'model': model
    }
    
    print(f"{model_name}: threshold={best_threshold:.3f}, F1={best_f1:.4f}")

print("Threshold optimization completed")

RandomForestClassifier_Test: threshold=0.650, F1=0.0588
DummyClassifier: threshold=0.500, F1=0.0578
BernoulliNB: threshold=0.500, F1=0.0578
AdaBoostClassifier: threshold=0.250, F1=0.0539
Perceptron: threshold=0.500, F1=0.0578
CalibratedClassifierCV: threshold=0.500, F1=0.0578
Threshold optimization completed


In [103]:
def calculate_enhanced_metrics(y_true, y_proba, y_pred):
    """Calcula metricas mejoradas para datos desbalanceados"""
    from sklearn.metrics import roc_auc_score, matthews_corrcoef, balanced_accuracy_score, average_precision_score
    
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    
    try:
        roc_auc = roc_auc_score(y_true, y_proba)
        gini = 2 * roc_auc - 1
    except:
        roc_auc = 0
        gini = 0
    
    try:
        pr_auc = average_precision_score(y_true, y_proba)
    except:
        pr_auc = 0
    
    try:
        mcc = matthews_corrcoef(y_true, y_pred)
    except:
        mcc = 0
    
    try:
        balanced_acc = balanced_accuracy_score(y_true, y_pred)
    except:
        balanced_acc = 0
    
    mae = np.mean(np.abs(y_true - y_pred))
    r2 = 1 - np.sum((y_true - y_pred)**2) / np.sum((y_true - np.mean(y_true))**2)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'mae': mae,
        'gini': gini,
        'r2': r2,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'mcc': mcc,
        'balanced_accuracy': balanced_acc
    }

final_results = {}

print("EVALUACION FINAL CON METRICAS MEJORADAS")
print("="*60)

for model_name, threshold_info in threshold_results.items():
    model = threshold_info['model']
    threshold = threshold_info['best_threshold']
    
    y_proba = model.predict_proba(X_test_selected)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)
    
    metrics = calculate_enhanced_metrics(y_test, y_proba, y_pred)
    metrics['threshold'] = threshold
    metrics['model'] = model
    
    final_results[model_name] = metrics
    
    print(f"{model_name}:")
    print(f"  Threshold: {threshold:.3f}")
    print(f"  Accuracy:  {metrics['accuracy']:.4f}")
    print(f"  F1-Score:  {metrics['f1_score']:.4f}")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall:    {metrics['recall']:.4f}")
    print(f"  GINI:      {metrics['gini']:.4f}")
    print(f"  PR-AUC:    {metrics['pr_auc']:.4f}")
    print(f"  MCC:       {metrics['mcc']:.4f}")
    print(f"  Bal-Acc:   {metrics['balanced_accuracy']:.4f}")
    print()

print("Comparacion con DummyClassifier:")
print("-" * 40)
for dummy_name, dummy_metrics in dummy_results.items():
    print(f"{dummy_name}: F1={dummy_metrics['f1_score']:.4f}, Acc={dummy_metrics['accuracy']:.4f}")

print(f"\nMejor modelo real:")
best_model_name = max(final_results.keys(), key=lambda k: final_results[k]['f1_score'])
best_f1 = final_results[best_model_name]['f1_score']
print(f"{best_model_name}: F1={best_f1:.4f}")

if best_f1 > best_dummy_f1:
    print("✓ El mejor modelo SUPERA el baseline DummyClassifier")
else:
    print("✗ El mejor modelo NO supera el baseline DummyClassifier")
    print("  Necesitas más técnicas de manejo de desbalance")

EVALUACION FINAL CON METRICAS MEJORADAS
RandomForestClassifier_Test:
  Threshold: 0.650
  Accuracy:  0.9600
  F1-Score:  0.0588
  Precision: 0.0645
  Recall:    0.0541
  GINI:      0.1500
  PR-AUC:    0.0289
  MCC:       0.0387
  Bal-Acc:   0.5177

DummyClassifier:
  Threshold: 0.500
  Accuracy:  0.6944
  F1-Score:  0.0578
  Precision: 0.0311
  Recall:    0.4054
  GINI:      0.1831
  PR-AUC:    0.0302
  MCC:       0.0349
  Bal-Acc:   0.5533

BernoulliNB:
  Threshold: 0.500
  Accuracy:  0.6944
  F1-Score:  0.0578
  Precision: 0.0311
  Recall:    0.4054
  GINI:      0.1831
  PR-AUC:    0.0302
  MCC:       0.0349
  Bal-Acc:   0.5533

AdaBoostClassifier:
  Threshold: 0.250
  Accuracy:  0.7588
  F1-Score:  0.0539
  Precision: 0.0296
  Recall:    0.2973
  GINI:      0.1779
  PR-AUC:    0.0314
  MCC:       0.0239
  Bal-Acc:   0.5335

Perceptron:
  Threshold: 0.500
  Accuracy:  0.6944
  F1-Score:  0.0578
  Precision: 0.0311
  Recall:    0.4054
  GINI:      0.1831
  PR-AUC:    0.0302
  MCC:    

In [104]:
best_model_name = max(final_results.keys(), key=lambda k: final_results[k]['f1_score'])
best_model = final_results[best_model_name]['model']
best_threshold = final_results[best_model_name]['threshold']
best_params = optimization_results[best_model_name]['best_params']

print("="*80)
print("REENTRENAMIENTO CON TODOS LOS DATOS")
print("="*80)
print(f"Mejor modelo seleccionado: {best_model_name}")
print(f"Hiperparámetros óptimos: {best_params}")
print(f"Threshold óptimo: {best_threshold:.3f}")

# Preparar TODOS los datos (8000 registros) con las variables seleccionadas
print(f"\nPreparando todos los datos para reentrenamiento...")
X_all_selected = X_transformed[:, final_selected_indices]
y_all = y.copy()

print(f"Datos completos: {X_all_selected.shape}")
print(f"Distribución target completa: {y_all.value_counts().tolist()}")

# Crear nuevo modelo con los mismos hiperparámetros
print(f"\nCreando modelo final con hiperparámetros optimizados...")

# Mapear el modelo base según el tipo
model_mapping_final = {
    'RandomForestClassifier_Test': RandomForestClassifier(random_state=42),
    'XGBClassifier': XGBClassifier(random_state=42, eval_metric='logloss'),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'SVC': SVC(random_state=42, probability=True),
    'MLPClassifier': MLPClassifier(random_state=42, max_iter=500),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'ExtraTreesClassifier': ExtraTreesClassifier(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
}

if best_model_name in model_mapping_final:
    final_model = model_mapping_final[best_model_name]
else:
    final_model = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')

# Aplicar hiperparámetros optimizados
final_model.set_params(**best_params)
print(f"Modelo configurado: {final_model}")

# Entrenar con TODOS los datos
print(f"\nEntrenando modelo final con {X_all_selected.shape[0]} registros...")
final_model.fit(X_all_selected, y_all)
print("✓ Entrenamiento completado")

# Validar que el modelo funciona
print(f"\nValidación del modelo final:")
y_proba_all = final_model.predict_proba(X_all_selected)[:, 1]
y_pred_all = (y_proba_all >= best_threshold).astype(int)

final_accuracy = accuracy_score(y_all, y_pred_all)
final_f1 = f1_score(y_all, y_pred_all)
final_precision = precision_score(y_all, y_pred_all, zero_division=0)
final_recall = recall_score(y_all, y_pred_all, zero_division=0)

print(f"Métricas en datos completos:")
print(f"  Accuracy:  {final_accuracy:.4f}")
print(f"  F1-Score:  {final_f1:.4f}")
print(f"  Precision: {final_precision:.4f}")
print(f"  Recall:    {final_recall:.4f}")

# Preparar información completa para guardar
os.makedirs('../models', exist_ok=True)

model_info_final = {
    'model': final_model,  # Modelo entrenado con todos los datos
    'threshold': best_threshold,
    'preprocessor': preprocessor,
    'selected_indices': final_selected_indices,
    'feature_names': final_selected_vars,
    'best_params': best_params,
    'model_name': best_model_name,
    'training_metrics_full_data': {
        'accuracy': final_accuracy,
        'f1_score': final_f1,
        'precision': final_precision,
        'recall': final_recall
    },
    'test_metrics_original': final_results[best_model_name],  # Métricas en test set original
    'feature_selection_strategy': 'Backward',
    'resampling_strategy': 'Original',
    'total_training_samples': X_all_selected.shape[0],
    'all_strategies_tested': {
        'feature_selection': list(feature_selection_strategies.keys()),
        'resampling': list(resampling_strategies.keys())
    },
    'dummy_baseline': dummy_results,
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
}

joblib.dump(model_info_final, '../models/clasificacion_medicos_best_model.pkl')

print("="*80)
print("RESUMEN FINAL - CLASIFICACION GASTOS MEDICOS RC")
print("="*80)
print(f"Dataset original: {df.shape[0]:,} registros")
print(f"Distribución: {sum(df['target'])}/{len(df['target'])} positivos ({100*sum(df['target'])/len(df['target']):.1f}%)")
print()

print("MODELO FINAL SELECCIONADO:")
print(f"  Algoritmo: {best_model_name}")
print(f"  Entrenado con: {X_all_selected.shape[0]:,} registros completos")
print(f"  Variables: {len(final_selected_vars)} seleccionadas")
print(f"  Threshold: {best_threshold:.3f}")
print()

print("HIPERPARÁMETROS OPTIMIZADOS:")
for param, value in best_params.items():
    print(f"  {param}: {value}")
print()

print("RENDIMIENTO EN TEST SET ORIGINAL:")
test_metrics = final_results[best_model_name]
print(f"  F1-Score:         {test_metrics['f1_score']:.4f}")
print(f"  Precision:        {test_metrics['precision']:.4f}")
print(f"  Recall:           {test_metrics['recall']:.4f}")
print(f"  GINI:             {test_metrics['gini']:.4f}")
print(f"  PR-AUC:           {test_metrics['pr_auc']:.4f}")

print(f"\nVARIABLES SELECCIONADAS ({len(final_selected_vars)}):")
for i, var in enumerate(final_selected_vars, 1):
    print(f"  {i:2d}. {var}")

print(f"\nMODELO GUARDADO EN: ../models/clasificacion_medicos_best_model.pkl")

best_dummy_f1 = max([res['f1_score'] for res in dummy_results.values()])
improvement = test_metrics['f1_score'] / best_dummy_f1 if best_dummy_f1 > 0 else float('inf')
print(f"MEJORA SOBRE BASELINE: {improvement:.1f}x mejor F1-score que DummyClassifier")
print("="*80)

REENTRENAMIENTO CON TODOS LOS DATOS
Mejor modelo seleccionado: RandomForestClassifier_Test
Hiperparámetros óptimos: {'bootstrap': True, 'class_weight': 'balanced', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Threshold óptimo: 0.650

Preparando todos los datos para reentrenamiento...
Datos completos: (7999, 2)
Distribución target completa: [7816, 183]

Creando modelo final con hiperparámetros optimizados...
Modelo configurado: RandomForestClassifier(class_weight='balanced', max_depth=5, min_samples_leaf=4,
                       min_samples_split=10, n_estimators=200, random_state=42)

Entrenando modelo final con 7999 registros...
✓ Entrenamiento completado

Validación del modelo final:
Métricas en datos completos:
  Accuracy:  0.9645
  F1-Score:  0.1341
  Precision: 0.1517
  Recall:    0.1202
RESUMEN FINAL - CLASIFICACION GASTOS MEDICOS RC
Dataset original: 7,999 registros
Distribución: 183/7999 positivos (2.3%)

MODELO F