In [1]:
import pandas as pd
import numpy as np
import warnings
import os
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import matthews_corrcoef, balanced_accuracy_score, average_precision_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier

# Modelos específicos para multinomial
from sklearn.linear_model import LogisticRegression as LogisticRegressionMultinomial
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier

try:
    from imblearn.over_sampling import SMOTE, ADASYN
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier, EasyEnsembleClassifier
    IMBLEARN_AVAILABLE = True
except:
    IMBLEARN_AVAILABLE = False

try:
    from lightgbm import LGBMClassifier
    LGBM_AVAILABLE = True
except:
    LGBM_AVAILABLE = False

try:
    from catboost import CatBoostClassifier
    CATBOOST_AVAILABLE = True
except:
    CATBOOST_AVAILABLE = False

from lazypredict.Supervised import LazyClassifier
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Logit
import joblib
from collections import Counter

In [2]:
df = pd.read_csv('../data/processed/medicos_siniestrados.csv')
print(f"Shape: {df.shape}")
print(f"\nDistribución de Gastos_Medicos_RC_siniestros_num:")
freq_counts = df['Gastos_Medicos_RC_siniestros_num'].value_counts().sort_index()
print(freq_counts)
print(f"\nPorcentajes:")
for val, count in freq_counts.items():
    pct = (count / len(df)) * 100
    print(f"  {val}: {count} registros ({pct:.1f}%)")

Shape: (183, 15)

Distribución de Gastos_Medicos_RC_siniestros_num:
Gastos_Medicos_RC_siniestros_num
1.00    179
2.00      4
Name: count, dtype: int64

Porcentajes:
  1.0: 179 registros (97.8%)
  2.0: 4 registros (2.2%)


In [3]:
# La variable objetivo es la frecuencia (sin transformar)
# Valores: 1 = 1 siniestro, 2 = 2 siniestros
df['target'] = df['Gastos_Medicos_RC_siniestros_num'].astype(int)
print(f"Distribución target:")
print(df['target'].value_counts().sort_index())
print(f"\nTipo de problema: Clasificación multinomial (frecuencia 1 vs 2 siniestros)")
print(f"Clases: {sorted(df['target'].unique())}")

Distribución target:
target
1    179
2      4
Name: count, dtype: int64

Tipo de problema: Clasificación multinomial (frecuencia 1 vs 2 siniestros)
Clases: [np.int64(1), np.int64(2)]


In [4]:
feature_vars = ['año_cursado', 'estudios_area', 'calif_promedio', '2_o_mas_inquilinos', 
                'distancia_al_campus', 'genero', 'extintor_incendios']

print("Columnas disponibles en el dataset:")
for col in df.columns:
    print(f"  {col}")

# Agregar monto como variable explicativa
if 'Gastos_Medicos_RC_siniestros_monto' in df.columns:
    df['monto_promedio'] = df['Gastos_Medicos_RC_siniestros_monto'] / df['Gastos_Medicos_RC_siniestros_num']
    feature_vars.append('monto_promedio')
    print(f"\nVariable 'monto_promedio' creada (monto/frecuencia)")
    print(f"Correlación monto_promedio vs target: {df['monto_promedio'].corr(df['target']):.4f}")

X = df[feature_vars].copy()
y = df['target'].copy()

categorical_features = ['año_cursado', 'estudios_area', 'genero', '2_o_mas_inquilinos', 'extintor_incendios']
numerical_features = ['calif_promedio', 'distancia_al_campus']

if 'monto_promedio' in feature_vars:
    numerical_features.append('monto_promedio')

print(f"\nVariables finales seleccionadas ({len(feature_vars)}):")
for i, var in enumerate(feature_vars, 1):
    print(f"  {i:2d}. {var}")
    
print(f"\nCategóricas: {categorical_features}")
print(f"Numéricas: {numerical_features}")

Columnas disponibles en el dataset:
  año_cursado
  estudios_area
  calif_promedio
  2_o_mas_inquilinos
  distancia_al_campus
  genero
  extintor_incendios
  Gastos_Adicionales_siniestros_num
  Gastos_Adicionales_siniestros_monto
  Gastos_Medicos_RC_siniestros_num
  Gastos_Medicos_RC_siniestros_monto
  Resp_Civil_siniestros_num
  Resp_Civil_siniestros_monto
  Contenidos_siniestros_num
  Contenidos_siniestros_monto
  target

Variable 'monto_promedio' creada (monto/frecuencia)
Correlación monto_promedio vs target: 0.0264

Variables finales seleccionadas (8):
   1. año_cursado
   2. estudios_area
   3. calif_promedio
   4. 2_o_mas_inquilinos
   5. distancia_al_campus
   6. genero
   7. extintor_incendios
   8. monto_promedio

Categóricas: ['año_cursado', 'estudios_area', 'genero', '2_o_mas_inquilinos', 'extintor_incendios']
Numéricas: ['calif_promedio', 'distancia_al_campus', 'monto_promedio']


In [5]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
    ]
)

X_transformed = preprocessor.fit_transform(X)
feature_names = (numerical_features + 
                 list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))

print(f"Transformed shape: {X_transformed.shape}")
print(f"Feature names: {len(feature_names)} variables")

Transformed shape: (183, 15)
Feature names: 15 variables


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Train target dist: {Counter(y_train)}")
print(f"Test target dist: {Counter(y_test)}")
print(f"\nPorcentajes en train:")
for cls, count in Counter(y_train).items():
    pct = (count / len(y_train)) * 100
    print(f"  Clase {cls}: {count} ({pct:.1f}%)")

Train: (146, 15), Test: (37, 15)
Train target dist: Counter({1: 143, 2: 3})
Test target dist: Counter({1: 36, 2: 1})

Porcentajes en train:
  Clase 1: 143 (97.9%)
  Clase 2: 3 (2.1%)


In [7]:
def backward_selection_multinomial(X, y, feature_names, significance_level=0.05):
    """Selección backward para regresión logística multinomial"""
    from sklearn.linear_model import LogisticRegression
    from sklearn.feature_selection import f_classif
    from scipy.stats import chi2
    
    current_features = feature_names.copy()
    removed_features = []
    
    while True:
        # Ajustar modelo con características actuales
        feature_indices = [feature_names.index(f) for f in current_features]
        X_current = X[:, feature_indices]
        
        # Usar F-test para variables multinomiales
        f_scores, p_values = f_classif(X_current, y)
        
        max_p_value = p_values.max()
        max_p_idx = p_values.argmax()
        
        if max_p_value > significance_level:
            feature_to_remove = current_features[max_p_idx]
            current_features.remove(feature_to_remove)
            removed_features.append(feature_to_remove)
            print(f"Removed {feature_to_remove}, p-value: {max_p_value:.4f}")
        else:
            break
    
    selected_indices = [feature_names.index(f) for f in current_features]
    
    return current_features, selected_indices, removed_features

In [8]:
print("EVALUACION BASELINE - DUMMYCLASSIFIER")
print("="*50)

dummy_models = {
    'Most_Frequent': DummyClassifier(strategy="most_frequent", random_state=42),
    'Stratified': DummyClassifier(strategy="stratified", random_state=42),
    'Uniform': DummyClassifier(strategy="uniform", random_state=42)
}

dummy_results = {}
for name, dummy in dummy_models.items():
    dummy.fit(X_train, y_train)
    y_pred_dummy = dummy.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred_dummy)
    f1_macro = f1_score(y_test, y_pred_dummy, average='macro', zero_division=0)
    f1_weighted = f1_score(y_test, y_pred_dummy, average='weighted', zero_division=0)
    mae = mean_absolute_error(y_test, y_pred_dummy)
    
    dummy_results[name] = {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'mae': mae
    }
    
    print(f"{name}: Acc={accuracy:.4f}, F1_macro={f1_macro:.4f}, F1_weighted={f1_weighted:.4f}, MAE={mae:.4f}")

print("\nSELECCION BACKWARD DE VARIABLES (MULTINOMIAL)")
print("="*50)

selected_vars, selected_indices, removed_vars = backward_selection_multinomial(X_train, y_train, feature_names)
print(f"\nVariables seleccionadas: {len(selected_vars)}")
print(f"Variables: {selected_vars}")
print(f"\nVariables removidas: {removed_vars}")

EVALUACION BASELINE - DUMMYCLASSIFIER
Most_Frequent: Acc=0.9730, F1_macro=0.4932, F1_weighted=0.9596, MAE=0.0270
Stratified: Acc=0.9730, F1_macro=0.4932, F1_weighted=0.9596, MAE=0.0270
Uniform: Acc=0.4865, F1_macro=0.3684, F1_weighted=0.6267, MAE=0.5135

SELECCION BACKWARD DE VARIABLES (MULTINOMIAL)
Removed 2_o_mas_inquilinos_Si, p-value: 0.9248
Removed distancia_al_campus, p-value: 0.9225
Removed genero_No respuesta, p-value: 0.7194
Removed monto_promedio, p-value: 0.7053
Removed año_cursado_posgrado, p-value: 0.5718
Removed año_cursado_4to año, p-value: 0.5069
Removed estudios_area_Ciencias, p-value: 0.3477
Removed año_cursado_3er año, p-value: 0.3112
Removed estudios_area_Otro, p-value: 0.3025
Removed extintor_incendios_Si, p-value: 0.2093
Removed genero_Masculino, p-value: 0.1697
Removed estudios_area_Humanidades, p-value: 0.0977

Variables seleccionadas: 3
Variables: ['calif_promedio', 'año_cursado_2do año', 'genero_Otro']

Variables removidas: ['2_o_mas_inquilinos_Si', 'distancia

In [9]:
print("ESTRATEGIAS DE SELECCION DE VARIABLES")
print("="*50)

feature_selection_strategies = {}

# Estrategia 1: Variables seleccionadas por backward
categorical_base_names = ['año_cursado', 'estudios_area', 'genero', '2_o_mas_inquilinos', 'extintor_incendios']
backward_selected_indices = []
backward_selected_vars = []

for var in selected_vars:
    if var in numerical_features:
        backward_selected_indices.append(feature_names.index(var))
        backward_selected_vars.append(var)

for base_name in categorical_base_names:
    categories_in_selected = [var for var in selected_vars if var.startswith(base_name)]
    if categories_in_selected:
        all_categories = [var for var in feature_names if var.startswith(base_name)]
        for cat_var in all_categories:
            if cat_var not in backward_selected_vars:
                backward_selected_indices.append(feature_names.index(cat_var))
                backward_selected_vars.append(cat_var)

feature_selection_strategies['Backward'] = {
    'indices': backward_selected_indices,
    'names': backward_selected_vars
}

# Estrategia 2: Todas las variables
feature_selection_strategies['All_Features'] = {
    'indices': list(range(len(feature_names))),
    'names': feature_names
}

# Estrategia 3: SelectKBest con f_classif
for k in [5, 8, 10]:
    if k <= len(feature_names):
        selector = SelectKBest(score_func=f_classif, k=k)
        X_selected = selector.fit_transform(X_train, y_train)
        selected_features_mask = selector.get_support()
        kbest_indices = [i for i, selected in enumerate(selected_features_mask) if selected]
        kbest_names = [feature_names[i] for i in kbest_indices]
        
        feature_selection_strategies[f'KBest_f_classif_{k}'] = {
            'indices': kbest_indices,
            'names': kbest_names
        }

# Estrategia 4: SelectKBest con mutual_info
for k in [5, 8, 10]:
    if k <= len(feature_names):
        selector = SelectKBest(score_func=mutual_info_classif, k=k)
        X_selected = selector.fit_transform(X_train, y_train)
        selected_features_mask = selector.get_support()
        mi_indices = [i for i, selected in enumerate(selected_features_mask) if selected]
        mi_names = [feature_names[i] for i in mi_indices]
        
        feature_selection_strategies[f'KBest_mutual_info_{k}'] = {
            'indices': mi_indices,
            'names': mi_names
        }

print("Estrategias de seleccion creadas:")
for strategy_name, strategy_info in feature_selection_strategies.items():
    print(f"  {strategy_name}: {len(strategy_info['indices'])} variables")

print(f"\nUsando estrategia 'All_Features' para LazyPredict inicial...")
final_selected_indices = feature_selection_strategies['All_Features']['indices']
final_selected_vars = feature_selection_strategies['All_Features']['names']

ESTRATEGIAS DE SELECCION DE VARIABLES
Estrategias de seleccion creadas:
  Backward: 8 variables
  All_Features: 15 variables
  KBest_f_classif_5: 5 variables
  KBest_f_classif_8: 8 variables
  KBest_f_classif_10: 10 variables
  KBest_mutual_info_5: 5 variables
  KBest_mutual_info_8: 8 variables
  KBest_mutual_info_10: 10 variables

Usando estrategia 'All_Features' para LazyPredict inicial...


In [10]:
print("TECNICAS DE RESAMPLING PARA MULTINOMIAL")
print("="*50)

X_train_selected = X_train[:, final_selected_indices]
X_test_selected = X_test[:, final_selected_indices]

resampling_strategies = {}

# Datos originales
resampling_strategies['Original'] = {
    'X_train': X_train_selected,
    'y_train': y_train
}

if IMBLEARN_AVAILABLE:
    try:
        # SMOTE para multinomial
        smote = SMOTE(random_state=42, k_neighbors=1)  # k_neighbors=1 para dataset pequeño
        X_train_smote, y_train_smote = smote.fit_resample(X_train_selected, y_train)
        resampling_strategies['SMOTE'] = {
            'X_train': X_train_smote,
            'y_train': y_train_smote
        }
        print(f"SMOTE: {Counter(y_train_smote)}")
    except Exception as e:
        print(f"SMOTE error: {e}")

    try:
        # RandomUnderSampler
        undersampler = RandomUnderSampler(random_state=42, sampling_strategy='auto')
        X_train_under, y_train_under = undersampler.fit_resample(X_train_selected, y_train)
        resampling_strategies['UnderSample'] = {
            'X_train': X_train_under,
            'y_train': y_train_under
        }
        print(f"UnderSample: {Counter(y_train_under)}")
    except Exception as e:
        print(f"UnderSample error: {e}")
else:
    print("imblearn no disponible - usando solo datos originales")

print(f"\nDistribucion original: {Counter(y_train)}")
print(f"Estrategias de resampling disponibles: {list(resampling_strategies.keys())}")

# Usar datos originales por defecto
current_X_train = X_train_selected
current_y_train = y_train

TECNICAS DE RESAMPLING PARA MULTINOMIAL
SMOTE: Counter({1: 143, 2: 143})
UnderSample: Counter({1: 3, 2: 3})

Distribucion original: Counter({1: 143, 2: 3})
Estrategias de resampling disponibles: ['Original', 'SMOTE', 'UnderSample']


In [11]:
print("LAZYPREDICT - EXPLORACION DE MODELOS MULTINOMIALES")
print("="*50)

try:
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    models, predictions = clf.fit(current_X_train, X_test_selected, current_y_train, y_test)

    print("Resultados LazyPredict:")
    print(models.round(4))

    # Ordenar por F1 Score
    top_10_models = models.nlargest(10, 'F1 Score')
    print(f"\nTOP 10 MODELOS POR F1-SCORE:")
    for i, (model_name, row) in enumerate(top_10_models.iterrows(), 1):
        print(f"{i:2d}. {model_name:25s}: F1={row['F1 Score']:.4f}, Acc={row['Accuracy']:.4f}")

    best_f1_lazy = top_10_models.iloc[0]['F1 Score']
    best_dummy_f1 = max([res['f1_weighted'] for res in dummy_results.values()])

    print(f"\nComparacion con baseline:")
    print(f"Mejor DummyClassifier F1: {best_dummy_f1:.4f}")
    print(f"Mejor LazyPredict F1:    {best_f1_lazy:.4f}")
    if best_f1_lazy > best_dummy_f1:
        print("✓ LazyPredict supera baseline DummyClassifier")
    else:
        print("✗ LazyPredict NO supera baseline DummyClassifier")

    top_5_names = top_10_models.head(5).index.tolist()
    print(f"\nSeleccionando top 5 para optimizacion: {top_5_names}")
    
except Exception as e:
    print(f"Error en LazyPredict: {e}")
    print("Continuando con modelos predefinidos...")
    top_5_names = ['LogisticRegression', 'RandomForestClassifier', 'XGBClassifier', 'GradientBoostingClassifier', 'SVC']

LAZYPREDICT - EXPLORACION DE MODELOS MULTINOMIALES


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 3, number of negative: 143
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000302 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152
[LightGBM] [Info] Number of data points in the train set: 146, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.020548 -> initscore=-3.864232
[LightGBM] [Info] Start training from score -3.864232
Resultados LazyPredict:
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
BernoulliNB                        0.97               0.50     0.50      0.96   
ExtraTreesClassifier               0.97               0.50     0.50      0.96   
CalibratedClassifierCV             0.97               0.50     0.50      0.96   
ExtraTreeClassifier   

In [12]:
print("MAPEO Y CONFIGURACION DE MODELOS MULTINOMIALES")
print("="*50)

# Modelos base con soporte multinomial
model_mapping = {
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000, multi_class='multinomial', solver='lbfgs'),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
    'XGBClassifier': XGBClassifier(random_state=42, eval_metric='mlogloss', objective='multi:softprob'),
    'SVC': SVC(random_state=42, probability=True),
    'MLPClassifier': MLPClassifier(random_state=42, max_iter=500),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42, algorithm='SAMME'),
    'ExtraTreesClassifier': ExtraTreesClassifier(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
    'GaussianNB': GaussianNB(),
}

# Modelos específicos multinomiales
print("Agregando modelos multinomiales específicos:")
model_mapping['LogisticRegression_Multinomial'] = LogisticRegression(
    random_state=42, max_iter=1000, multi_class='multinomial', solver='newton-cg'
)
model_mapping['LogisticRegression_OvR'] = LogisticRegression(
    random_state=42, max_iter=1000, multi_class='ovr', solver='liblinear'
)
print("✓ Modelos logísticos multinomial y OvR agregados")

if IMBLEARN_AVAILABLE:
    model_mapping['BalancedRandomForestClassifier'] = BalancedRandomForestClassifier(random_state=42)
    model_mapping['BalancedBaggingClassifier'] = BalancedBaggingClassifier(random_state=42)
    print("✓ imblearn modelos balanceados disponibles")
else:
    print("✗ imblearn no disponible - sin modelos balanceados")

if LGBM_AVAILABLE:
    model_mapping['LGBMClassifier'] = LGBMClassifier(random_state=42, verbose=-1, objective='multiclass')
    print("✓ LightGBM disponible")
else:
    print("✗ LightGBM no disponible")

if CATBOOST_AVAILABLE:
    model_mapping['CatBoostClassifier'] = CatBoostClassifier(random_state=42, verbose=False, loss_function='MultiClass')
    print("✓ CatBoost disponible")
else:
    print("✗ CatBoost no disponible")

# Preparar modelos para optimización
models_to_optimize = []

# Agregar modelos multinomiales específicos
models_to_optimize.append(('LogisticRegression_Multinomial', model_mapping['LogisticRegression_Multinomial']))
models_to_optimize.append(('LogisticRegression_OvR', model_mapping['LogisticRegression_OvR']))

# Agregar top 5 de LazyPredict
for name in top_5_names[:3]:  # Solo top 3 para acelerar
    if name in model_mapping:
        models_to_optimize.append((name, model_mapping[name]))
        print(f"✓ {name} mapeado correctamente")
    else:
        models_to_optimize.append((name, LogisticRegression(random_state=42, max_iter=1000, multi_class='multinomial')))
        print(f"○ {name} usando LogisticRegression multinomial como backup")

print(f"\nModelos a optimizar: {len(models_to_optimize)}")
for name, _ in models_to_optimize:
    print(f"  - {name}")

MAPEO Y CONFIGURACION DE MODELOS MULTINOMIALES
Agregando modelos multinomiales específicos:
✓ Modelos logísticos multinomial y OvR agregados
✓ imblearn modelos balanceados disponibles
✓ LightGBM disponible
✗ CatBoost no disponible
○ BernoulliNB usando LogisticRegression multinomial como backup
✓ ExtraTreesClassifier mapeado correctamente
○ CalibratedClassifierCV usando LogisticRegression multinomial como backup

Modelos a optimizar: 5
  - LogisticRegression_Multinomial
  - LogisticRegression_OvR
  - BernoulliNB
  - ExtraTreesClassifier
  - CalibratedClassifierCV


In [13]:
print("OPTIMIZACION DE HIPERPARAMETROS MULTINOMIALES")
print("="*50)

# Grillas específicas para modelos multinomiales
param_grids_multinomial = {
    'LogisticRegression_Multinomial': {
        'C': [0.1, 1, 10],
        'class_weight': [None, 'balanced']
    },
    'LogisticRegression_OvR': {
        'C': [0.1, 1, 10],
        'class_weight': [None, 'balanced']
    },
    'LogisticRegression': {
        'C': [0.1, 1, 10],
        'class_weight': [None, 'balanced']
    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100],
        'max_depth': [5, 10, None],
        'class_weight': [None, 'balanced']
    },
    'XGBClassifier': {
        'n_estimators': [50, 100],
        'max_depth': [3, 5],
        'learning_rate': [0.1, 0.2]
    },
    'GradientBoostingClassifier': {
        'n_estimators': [50, 100],
        'learning_rate': [0.1, 0.2],
        'max_depth': [3, 5]
    },
    'SVC': {
        'C': [1, 10],
        'kernel': ['rbf', 'linear'],
        'class_weight': [None, 'balanced']
    }
}

optimization_results = {}

for model_name, model in models_to_optimize:
    print(f"\nOptimizing {model_name}...")
    
    try:
        if model_name in param_grids_multinomial:
            param_grid = param_grids_multinomial[model_name]
            print(f"  Grid específico: {len(param_grid)} parámetros")
        else:
            param_grid = {}
            print(f"  Sin grid específico")
        
        # Usar StratifiedKFold para multinomial
        skf_multi = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            scoring='f1_macro',  # F1 macro para multinomial
            cv=skf_multi,
            n_jobs=-1,
            verbose=0
        )
        
        print(f"  Iniciando búsqueda...")
        grid_search.fit(current_X_train, current_y_train)
        
        optimization_results[model_name] = {
            'best_estimator': grid_search.best_estimator_,
            'best_params': grid_search.best_params_,
            'best_cv_score': grid_search.best_score_
        }
        
        print(f"  ✓ Best F1-macro CV: {grid_search.best_score_:.4f}")
        print(f"  ✓ Best params: {grid_search.best_params_}")
        
    except Exception as e:
        print(f"  ✗ Error: {str(e)}")
        optimization_results[model_name] = {
            'best_estimator': model,
            'best_params': {},
            'best_cv_score': 0
        }

print(f"\n{'='*50}")
print("OPTIMIZATION COMPLETED")
print(f"{'='*50}")
print("Resultados de optimización:")
for name, results in optimization_results.items():
    print(f"{name}: F1-macro CV = {results['best_cv_score']:.4f}")

OPTIMIZACION DE HIPERPARAMETROS MULTINOMIALES

Optimizing LogisticRegression_Multinomial...
  Grid específico: 2 parámetros
  Iniciando búsqueda...
  ✓ Best F1-macro CV: 0.4948
  ✓ Best params: {'C': 0.1, 'class_weight': None}

Optimizing LogisticRegression_OvR...
  Grid específico: 2 parámetros
  Iniciando búsqueda...
  ✓ Best F1-macro CV: 0.5462
  ✓ Best params: {'C': 0.1, 'class_weight': 'balanced'}

Optimizing BernoulliNB...
  Sin grid específico
  Iniciando búsqueda...
  ✓ Best F1-macro CV: 0.4948
  ✓ Best params: {}

Optimizing ExtraTreesClassifier...
  Sin grid específico
  Iniciando búsqueda...
  ✓ Best F1-macro CV: 0.4948
  ✓ Best params: {}

Optimizing CalibratedClassifierCV...
  Sin grid específico
  Iniciando búsqueda...
  ✓ Best F1-macro CV: 0.4948
  ✓ Best params: {}

OPTIMIZATION COMPLETED
Resultados de optimización:
LogisticRegression_Multinomial: F1-macro CV = 0.4948
LogisticRegression_OvR: F1-macro CV = 0.5462
BernoulliNB: F1-macro CV = 0.4948
ExtraTreesClassifier: F1

In [14]:
def calculate_enhanced_metrics_multinomial(y_true, y_pred, y_proba=None):
    """Calcula métricas mejoradas para clasificación multinomial"""
    from sklearn.metrics import classification_report, confusion_matrix
    
    accuracy = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    
    try:
        mcc = matthews_corrcoef(y_true, y_pred)
    except:
        mcc = 0
    
    try:
        balanced_acc = balanced_accuracy_score(y_true, y_pred)
    except:
        balanced_acc = 0
    
    return {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'mae': mae,
        'mse': mse,
        'mcc': mcc,
        'balanced_accuracy': balanced_acc
    }

final_results = {}

print("EVALUACION FINAL CON METRICAS MULTINOMIALES")
print("="*60)

for model_name, model_info in optimization_results.items():
    model = model_info['best_estimator']
    
    try:
        y_pred = model.predict(X_test_selected)
        y_proba = None
        if hasattr(model, 'predict_proba'):
            y_proba = model.predict_proba(X_test_selected)
        
        metrics = calculate_enhanced_metrics_multinomial(y_test, y_pred, y_proba)
        metrics['model'] = model
        
        final_results[model_name] = metrics
        
        print(f"{model_name}:")
        print(f"  Accuracy:     {metrics['accuracy']:.4f}")
        print(f"  F1-macro:     {metrics['f1_macro']:.4f}")
        print(f"  F1-weighted:  {metrics['f1_weighted']:.4f}")
        print(f"  MAE:          {metrics['mae']:.4f}")
        print(f"  MSE:          {metrics['mse']:.4f}")
        print(f"  MCC:          {metrics['mcc']:.4f}")
        print(f"  Bal-Acc:      {metrics['balanced_accuracy']:.4f}")
        print()
        
        # Reporte detallado para el mejor modelo
        if len(final_results) == 1 or metrics['f1_macro'] == max([r['f1_macro'] for r in final_results.values()]):
            print(f"Reporte detallado para {model_name}:")
            print(classification_report(y_test, y_pred, zero_division=0))
            print(f"Matriz de confusión:")
            print(confusion_matrix(y_test, y_pred))
            print()
        
    except Exception as e:
        print(f"Error evaluando {model_name}: {e}")
        continue

print("Comparacion con DummyClassifier:")
print("-" * 40)
for dummy_name, dummy_metrics in dummy_results.items():
    print(f"{dummy_name}: F1_macro={dummy_metrics['f1_macro']:.4f}, Acc={dummy_metrics['accuracy']:.4f}, MAE={dummy_metrics['mae']:.4f}")

if final_results:
    best_model_name = max(final_results.keys(), key=lambda k: final_results[k]['f1_macro'])
    best_f1 = final_results[best_model_name]['f1_macro']
    best_dummy_f1 = max([res['f1_macro'] for res in dummy_results.values()])
    
    print(f"\nMejor modelo real:")
    print(f"{best_model_name}: F1_macro={best_f1:.4f}")

    if best_f1 > best_dummy_f1:
        print("✓ El mejor modelo SUPERA el baseline DummyClassifier")
    else:
        print("✗ El mejor modelo NO supera el baseline DummyClassifier")
        print("  Dataset muy pequeño, considerar más datos o regularización")

EVALUACION FINAL CON METRICAS MULTINOMIALES
LogisticRegression_Multinomial:
  Accuracy:     0.9730
  F1-macro:     0.4932
  F1-weighted:  0.9596
  MAE:          0.0270
  MSE:          0.0270
  MCC:          0.0000
  Bal-Acc:      0.5000

Reporte detallado para LogisticRegression_Multinomial:
              precision    recall  f1-score   support

           1       0.97      1.00      0.99        36
           2       0.00      0.00      0.00         1

    accuracy                           0.97        37
   macro avg       0.49      0.50      0.49        37
weighted avg       0.95      0.97      0.96        37

Matriz de confusión:
Error evaluando LogisticRegression_Multinomial: name 'confusion_matrix' is not defined
LogisticRegression_OvR:
  Accuracy:     0.7838
  F1-macro:     0.4394
  F1-weighted:  0.8550
  MAE:          0.2162
  MSE:          0.2162
  MCC:          -0.0805
  Bal-Acc:      0.4028

BernoulliNB:
  Accuracy:     0.9730
  F1-macro:     0.4932
  F1-weighted:  0.9596
  M

In [15]:
if final_results:
    best_model_name = max(final_results.keys(), key=lambda k: final_results[k]['f1_macro'])
    best_model = final_results[best_model_name]['model']
    best_params = optimization_results[best_model_name]['best_params']

    print("="*80)
    print("REENTRENAMIENTO CON TODOS LOS DATOS (FRECUENCIA)")
    print("="*80)
    print(f"Mejor modelo seleccionado: {best_model_name}")
    print(f"Hiperparámetros óptimos: {best_params}")

    print(f"\nPreparando todos los datos para reentrenamiento...")
    X_all_selected = X_transformed[:, final_selected_indices]
    y_all = y.copy()

    print(f"Datos completos: {X_all_selected.shape}")
    print(f"Distribución target completa: {Counter(y_all)}")

    print(f"\nCreando modelo final con hiperparámetros optimizados...")

    # Mapeo para crear modelo final
    model_mapping_final = {
        'LogisticRegression_Multinomial': LogisticRegression(random_state=42, max_iter=1000, multi_class='multinomial', solver='lbfgs'),
        'LogisticRegression_OvR': LogisticRegression(random_state=42, max_iter=1000, multi_class='ovr', solver='liblinear'),
        'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000, multi_class='multinomial'),
        'RandomForestClassifier': RandomForestClassifier(random_state=42),
        'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
        'XGBClassifier': XGBClassifier(random_state=42, eval_metric='mlogloss', objective='multi:softprob'),
        'SVC': SVC(random_state=42, probability=True),
    }

    if best_model_name in model_mapping_final:
        final_model = model_mapping_final[best_model_name]
    else:
        final_model = LogisticRegression(random_state=42, max_iter=1000, multi_class='multinomial')

    final_model.set_params(**best_params)
    print(f"Modelo configurado: {final_model}")

    print(f"\nEntrenando modelo final con {X_all_selected.shape[0]} registros...")
    final_model.fit(X_all_selected, y_all)
    print("✓ Entrenamiento completado")

    print(f"\nValidación del modelo final:")
    y_pred_all = final_model.predict(X_all_selected)

    final_accuracy = accuracy_score(y_all, y_pred_all)
    final_f1_macro = f1_score(y_all, y_pred_all, average='macro')
    final_f1_weighted = f1_score(y_all, y_pred_all, average='weighted')
    final_mae = mean_absolute_error(y_all, y_pred_all)

    print(f"Métricas en datos completos:")
    print(f"  Accuracy:     {final_accuracy:.4f}")
    print(f"  F1-macro:     {final_f1_macro:.4f}")
    print(f"  F1-weighted:  {final_f1_weighted:.4f}")
    print(f"  MAE:          {final_mae:.4f}")

    # Guardar modelo
    os.makedirs('../models', exist_ok=True)

    model_info_final = {
        'model': final_model,
        'preprocessor': preprocessor,
        'selected_indices': final_selected_indices,
        'feature_names': final_selected_vars,
        'best_params': best_params,
        'model_name': best_model_name,
        'training_metrics_full_data': {
            'accuracy': final_accuracy,
            'f1_macro': final_f1_macro,
            'f1_weighted': final_f1_weighted,
            'mae': final_mae
        },
        'test_metrics_original': final_results[best_model_name],
        'feature_selection_strategy': 'All_Features',
        'resampling_strategy': 'Original',
        'total_training_samples': X_all_selected.shape[0],
        'all_strategies_tested': {
            'feature_selection': list(feature_selection_strategies.keys()),
            'resampling': list(resampling_strategies.keys())
        },
        'dummy_baseline': dummy_results,
        'target_classes': sorted(y_all.unique()),
        'class_distribution': dict(Counter(y_all)),
        'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    joblib.dump(model_info_final, '../models/frecuencia_medicos_best_model.pkl')

    print("="*80)
    print("RESUMEN FINAL - FRECUENCIA GASTOS MEDICOS RC")
    print("="*80)
    print(f"Dataset original: {df.shape[0]:,} registros (solo siniestrados)")
    print(f"Distribución frecuencias: {dict(Counter(df['target']))}")
    print()

    print("BASELINE DUMMYCLASSIFIER:")
    for dummy_name, dummy_metrics in dummy_results.items():
        print(f"  {dummy_name:15s}: F1_macro={dummy_metrics['f1_macro']:.4f}, MAE={dummy_metrics['mae']:.4f}")

    print(f"\nMODELO FINAL SELECCIONADO:")
    print(f"  Algoritmo: {best_model_name}")
    print(f"  Tipo: Clasificación Multinomial")
    print(f"  Clases: {sorted(y_all.unique())} (frecuencias de siniestros)")
    print(f"  Entrenado con: {X_all_selected.shape[0]:,} registros completos")
    print(f"  Variables: {len(final_selected_vars)} seleccionadas")
    print()

    print("HIPERPARÁMETROS OPTIMIZADOS:")
    for param, value in best_params.items():
        print(f"  {param}: {value}")
    print()

    print("RENDIMIENTO EN TEST SET ORIGINAL:")
    test_metrics = final_results[best_model_name]
    print(f"  F1-macro:     {test_metrics['f1_macro']:.4f}")
    print(f"  F1-weighted:  {test_metrics['f1_weighted']:.4f}")
    print(f"  Accuracy:     {test_metrics['accuracy']:.4f}")
    print(f"  MAE:          {test_metrics['mae']:.4f}")
    print(f"  MCC:          {test_metrics['mcc']:.4f}")

    print(f"\nVARIABLES SELECCIONADAS ({len(final_selected_vars)}):")
    for i, var in enumerate(final_selected_vars, 1):
        print(f"  {i:2d}. {var}")

    print(f"\nCOMPARACION TODOS LOS MODELOS (por F1-macro):")
    print("-" * 50)
    sorted_models = sorted(final_results.items(), key=lambda x: x[1]['f1_macro'], reverse=True)
    best_dummy_f1 = max([res['f1_macro'] for res in dummy_results.values()])
    for i, (model_name, metrics) in enumerate(sorted_models, 1):
        superiority = "✓" if metrics['f1_macro'] > best_dummy_f1 else "✗"
        print(f"{i:2d}. {superiority} {model_name:25s}: F1_macro={metrics['f1_macro']:.4f}, MAE={metrics['mae']:.4f}")

    print(f"\nMODELO GUARDADO EN: ../models/frecuencia_medicos_best_model.pkl")

    improvement = test_metrics['f1_macro'] / best_dummy_f1 if best_dummy_f1 > 0 else float('inf')
    print(f"\nMEJORA SOBRE BASELINE: {improvement:.1f}x mejor F1-macro que DummyClassifier")
    print("="*80)
else:
    print("No se pudieron evaluar modelos. Revisar datos o configuración.")

REENTRENAMIENTO CON TODOS LOS DATOS (FRECUENCIA)
Mejor modelo seleccionado: LogisticRegression_Multinomial
Hiperparámetros óptimos: {'C': 0.1, 'class_weight': None}

Preparando todos los datos para reentrenamiento...
Datos completos: (183, 15)
Distribución target completa: Counter({1: 179, 2: 4})

Creando modelo final con hiperparámetros optimizados...
Modelo configurado: LogisticRegression(C=0.1, max_iter=1000, multi_class='multinomial',
                   random_state=42)

Entrenando modelo final con 183 registros...
✓ Entrenamiento completado

Validación del modelo final:
Métricas en datos completos:
  Accuracy:     0.9781
  F1-macro:     0.4945
  F1-weighted:  0.9673
  MAE:          0.0219
RESUMEN FINAL - FRECUENCIA GASTOS MEDICOS RC
Dataset original: 183 registros (solo siniestrados)
Distribución frecuencias: {1: 179, 2: 4}

BASELINE DUMMYCLASSIFIER:
  Most_Frequent  : F1_macro=0.4932, MAE=0.0270
  Stratified     : F1_macro=0.4932, MAE=0.0270
  Uniform        : F1_macro=0.3684, MA