In [2]:
import pandas as pd
import numpy as np
import warnings
import os
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import matthews_corrcoef, balanced_accuracy_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from xgboost import XGBClassifier

try:
    from imblearn.over_sampling import SMOTE, ADASYN
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier, EasyEnsembleClassifier
    IMBLEARN_AVAILABLE = True
except:
    IMBLEARN_AVAILABLE = False

try:
    from lightgbm import LGBMClassifier
    LGBM_AVAILABLE = True
except:
    LGBM_AVAILABLE = False

try:
    from catboost import CatBoostClassifier
    CATBOOST_AVAILABLE = True
except:
    CATBOOST_AVAILABLE = False

from lazypredict.Supervised import LazyClassifier
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Logit
import joblib

In [3]:
df = pd.read_csv('../data/processed/rc_full.csv')
print(df.shape)
print(df['Resp_Civil_siniestros_num'].value_counts())

(7999, 9)
Resp_Civil_siniestros_num
0.00    7932
1.00      67
Name: count, dtype: int64


In [4]:
df['target'] = (df['Resp_Civil_siniestros_num'] > 0).astype(int)
print(df['target'].value_counts())

target
0    7932
1      67
Name: count, dtype: int64


In [5]:
feature_vars = ['año_cursado', 'estudios_area', 'calif_promedio', '2_o_mas_inquilinos', 
                'distancia_al_campus', 'genero', 'extintor_incendios']

print("Columnas disponibles en el dataset:")
for col in df.columns:
    print(f"  {col}")

if 'Resp_Civil_siniestros_monto' in df.columns:
    df['tiene_monto'] = (df['Resp_Civil_siniestros_monto'] > 0).astype(int)
    print(f"\nVariable 'tiene_monto' creada: correlación con target = {df['tiene_monto'].corr(df['target']):.4f}")
    
    if df['tiene_monto'].corr(df['target']) < 0.99:
        feature_vars.append('tiene_monto')
        print("✓ Variable 'tiene_monto' agregada como predictora")
    else:
        print("✗ Variable 'tiene_monto' muy correlacionada con target - no agregada")

X = df[feature_vars].copy()
y = df['target'].copy()

categorical_features = ['año_cursado', 'estudios_area', 'genero', '2_o_mas_inquilinos', 'extintor_incendios']
numerical_features = ['calif_promedio', 'distancia_al_campus']

if 'tiene_monto' in feature_vars:
    categorical_features.append('tiene_monto')

print(f"\nVariables finales seleccionadas ({len(feature_vars)}):")
for i, var in enumerate(feature_vars, 1):
    print(f"  {i:2d}. {var}")
    
print(f"\nCategóricas: {categorical_features}")
print(f"Numéricas: {numerical_features}")

Columnas disponibles en el dataset:
  año_cursado
  estudios_area
  calif_promedio
  2_o_mas_inquilinos
  distancia_al_campus
  genero
  extintor_incendios
  Resp_Civil_siniestros_num
  Resp_Civil_siniestros_monto
  target

Variable 'tiene_monto' creada: correlación con target = 1.0000
✗ Variable 'tiene_monto' muy correlacionada con target - no agregada

Variables finales seleccionadas (7):
   1. año_cursado
   2. estudios_area
   3. calif_promedio
   4. 2_o_mas_inquilinos
   5. distancia_al_campus
   6. genero
   7. extintor_incendios

Categóricas: ['año_cursado', 'estudios_area', 'genero', '2_o_mas_inquilinos', 'extintor_incendios']
Numéricas: ['calif_promedio', 'distancia_al_campus']


In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
    ]
)

X_transformed = preprocessor.fit_transform(X)
feature_names = (numerical_features + 
                 list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))

print(f"Transformed shape: {X_transformed.shape}")

Transformed shape: (7999, 14)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Train target dist: {y_train.value_counts().tolist()}")
print(f"Test target dist: {y_test.value_counts().tolist()}")

Train: (6399, 14), Test: (1600, 14)
Train target dist: [6345, 54]
Test target dist: [1587, 13]


In [8]:
def backward_selection(X, y, feature_names, significance_level=0.05):
    X_with_const = sm.add_constant(X)
    current_features = feature_names.copy()
    
    while True:
        feature_indices = [feature_names.index(f) for f in current_features]
        current_X = X_with_const[:, [0] + [i+1 for i in feature_indices]]
        
        model = Logit(y, current_X).fit(disp=0)
        p_values = model.pvalues[1:]
        max_p_value = p_values.max()
        
        if max_p_value > significance_level:
            worst_idx_in_pvalues = p_values.idxmax()
            worst_idx_in_current = list(p_values.index).index(worst_idx_in_pvalues)
            feature_to_remove = current_features[worst_idx_in_current]
            
            current_features.remove(feature_to_remove)
            print(f"Removed {feature_to_remove}, p-value: {max_p_value:.4f}")
        else:
            break
    
    feature_indices = [feature_names.index(f) for f in current_features]
    final_X = X_with_const[:, [0] + [i+1 for i in feature_indices]]
    final_model = Logit(y, final_X).fit(disp=0)
    
    selected_indices = [feature_names.index(f) for f in current_features]
    
    return current_features, selected_indices, final_model

In [9]:
print("EVALUACION BASELINE - DUMMYCLASSIFIER")
print("="*50)

dummy_models = {
    'Most_Frequent': DummyClassifier(strategy="most_frequent", random_state=42),
    'Stratified': DummyClassifier(strategy="stratified", random_state=42),
    'Uniform': DummyClassifier(strategy="uniform", random_state=42)
}

dummy_results = {}
for name, dummy in dummy_models.items():
    dummy.fit(X_train, y_train)
    y_pred_dummy = dummy.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred_dummy)
    f1 = f1_score(y_test, y_pred_dummy, zero_division=0)
    precision = precision_score(y_test, y_pred_dummy, zero_division=0)
    recall = recall_score(y_test, y_pred_dummy, zero_division=0)
    
    dummy_results[name] = {
        'accuracy': accuracy,
        'f1_score': f1,
        'precision': precision,
        'recall': recall
    }
    
    print(f"{name}: Acc={accuracy:.4f}, F1={f1:.4f}, Prec={precision:.4f}, Rec={recall:.4f}")

print("\nSELECCION BACKWARD DE VARIABLES")
print("="*50)

selected_vars, selected_indices, glm_model = backward_selection(X_train, y_train, feature_names)
print(f"Variables seleccionadas: {len(selected_vars)}")
print(selected_vars)

EVALUACION BASELINE - DUMMYCLASSIFIER
Most_Frequent: Acc=0.9919, F1=0.0000, Prec=0.0000, Rec=0.0000
Stratified: Acc=0.9862, F1=0.0000, Prec=0.0000, Rec=0.0000
Uniform: Acc=0.4956, F1=0.0049, Prec=0.0025, Rec=0.1538

SELECCION BACKWARD DE VARIABLES
Removed genero_Otro, p-value: 0.9556
Removed extintor_incendios_Si, p-value: 0.7371
Removed genero_Masculino, p-value: 0.7185
Removed estudios_area_Ciencias, p-value: 0.6837
Removed estudios_area_Otro, p-value: 0.7494
Removed estudios_area_Humanidades, p-value: 0.6021
Removed año_cursado_posgrado, p-value: 0.5407
Removed año_cursado_4to año, p-value: 0.5401
Removed año_cursado_3er año, p-value: 0.4401
Removed calif_promedio, p-value: 0.3869
Removed genero_No respuesta, p-value: 0.2722
Removed año_cursado_2do año, p-value: 0.1045
Variables seleccionadas: 2
['distancia_al_campus', '2_o_mas_inquilinos_Si']


In [10]:
print("ESTRATEGIAS DE SELECCION DE VARIABLES")
print("="*50)

feature_selection_strategies = {}

categorical_base_names = ['año_cursado', 'estudios_area', 'genero', '2_o_mas_inquilinos', 'extintor_incendios']
backward_selected_indices = []
backward_selected_vars = []

for var in selected_vars:
    if var in numerical_features:
        backward_selected_indices.append(feature_names.index(var))
        backward_selected_vars.append(var)

for base_name in categorical_base_names:
    categories_in_selected = [var for var in selected_vars if var.startswith(base_name)]
    if categories_in_selected:
        all_categories = [var for var in feature_names if var.startswith(base_name)]
        for cat_var in all_categories:
            if cat_var not in backward_selected_vars:
                backward_selected_indices.append(feature_names.index(cat_var))
                backward_selected_vars.append(cat_var)

feature_selection_strategies['Backward'] = {
    'indices': backward_selected_indices,
    'names': backward_selected_vars
}

feature_selection_strategies['All_Features'] = {
    'indices': list(range(len(feature_names))),
    'names': feature_names
}

for k in [5, 8, 10]:
    selector = SelectKBest(score_func=f_classif, k=k)
    X_selected = selector.fit_transform(X_train, y_train)
    selected_features_mask = selector.get_support()
    kbest_indices = [i for i, selected in enumerate(selected_features_mask) if selected]
    kbest_names = [feature_names[i] for i in kbest_indices]
    
    feature_selection_strategies[f'KBest_f_classif_{k}'] = {
        'indices': kbest_indices,
        'names': kbest_names
    }

for k in [5, 8, 10]:
    selector = SelectKBest(score_func=mutual_info_classif, k=k)
    X_selected = selector.fit_transform(X_train, y_train)
    selected_features_mask = selector.get_support()
    mi_indices = [i for i, selected in enumerate(selected_features_mask) if selected]
    mi_names = [feature_names[i] for i in mi_indices]
    
    feature_selection_strategies[f'KBest_mutual_info_{k}'] = {
        'indices': mi_indices,
        'names': mi_names
    }

print("Estrategias de seleccion creadas:")
for strategy_name, strategy_info in feature_selection_strategies.items():
    print(f"  {strategy_name}: {len(strategy_info['indices'])} variables")

print(f"\nUsando estrategia 'Backward' para LazyPredict inicial...")
final_selected_indices = feature_selection_strategies['Backward']['indices']
final_selected_vars = feature_selection_strategies['Backward']['names']

ESTRATEGIAS DE SELECCION DE VARIABLES
Estrategias de seleccion creadas:
  Backward: 2 variables
  All_Features: 14 variables
  KBest_f_classif_5: 5 variables
  KBest_f_classif_8: 8 variables
  KBest_f_classif_10: 10 variables
  KBest_mutual_info_5: 5 variables
  KBest_mutual_info_8: 8 variables
  KBest_mutual_info_10: 10 variables

Usando estrategia 'Backward' para LazyPredict inicial...


In [11]:
print("TECNICAS DE RESAMPLING")
print("="*50)

X_train_selected = X_train[:, final_selected_indices]
X_test_selected = X_test[:, final_selected_indices]

resampling_strategies = {}

resampling_strategies['Original'] = {
    'X_train': X_train_selected,
    'y_train': y_train
}

if IMBLEARN_AVAILABLE:
    try:
        smote = SMOTE(random_state=42)
        X_train_smote, y_train_smote = smote.fit_resample(X_train_selected, y_train)
        resampling_strategies['SMOTE'] = {
            'X_train': X_train_smote,
            'y_train': y_train_smote
        }
        print(f"SMOTE: {y_train_smote.value_counts().tolist()}")
    except Exception as e:
        print(f"SMOTE error: {e}")

    try:
        adasyn = ADASYN(random_state=42)
        X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_selected, y_train)
        resampling_strategies['ADASYN'] = {
            'X_train': X_train_adasyn,
            'y_train': y_train_adasyn
        }
        print(f"ADASYN: {y_train_adasyn.value_counts().tolist()}")
    except Exception as e:
        print(f"ADASYN error: {e}")

    try:
        undersampler = RandomUnderSampler(random_state=42)
        X_train_under, y_train_under = undersampler.fit_resample(X_train_selected, y_train)
        resampling_strategies['UnderSample'] = {
            'X_train': X_train_under,
            'y_train': y_train_under
        }
        print(f"UnderSample: {y_train_under.value_counts().tolist()}")
    except Exception as e:
        print(f"UnderSample error: {e}")
else:
    print("imblearn no disponible - usando solo datos originales")

print(f"\nDistribucion original: {y_train.value_counts().tolist()}")
print(f"Estrategias de resampling disponibles: {list(resampling_strategies.keys())}")

current_X_train = X_train_selected
current_y_train = y_train

TECNICAS DE RESAMPLING
SMOTE: [6345, 6345]
ADASYN: [6345, 6318]
UnderSample: [54, 54]

Distribucion original: [6345, 54]
Estrategias de resampling disponibles: ['Original', 'SMOTE', 'ADASYN', 'UnderSample']


In [12]:
print("LAZYPREDICT - EXPLORACION DE MODELOS")
print("="*50)

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(current_X_train, X_test_selected, current_y_train, y_test)

print("Resultados LazyPredict:")
print(models.round(4))

top_10_models = models.nlargest(10, 'F1 Score')
print(f"\nTOP 10 MODELOS POR F1-SCORE:")
for i, (model_name, row) in enumerate(top_10_models.iterrows(), 1):
    print(f"{i:2d}. {model_name:25s}: F1={row['F1 Score']:.4f}, Acc={row['Accuracy']:.4f}")

best_f1_lazy = top_10_models.iloc[0]['F1 Score']
best_dummy_f1 = max([res['f1_score'] for res in dummy_results.values()])

print(f"\nComparacion con baseline:")
print(f"Mejor DummyClassifier F1: {best_dummy_f1:.4f}")
print(f"Mejor LazyPredict F1:    {best_f1_lazy:.4f}")
if best_f1_lazy > best_dummy_f1:
    print("✓ LazyPredict supera baseline DummyClassifier")
else:
    print("✗ LazyPredict NO supera baseline DummyClassifier")

top_5_names = top_10_models.head(5).index.tolist()
print(f"\nSeleccionando top 5 para optimizacion: {top_5_names}")

LAZYPREDICT - EXPLORACION DE MODELOS


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 54, number of negative: 6345
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000096 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 258
[LightGBM] [Info] Number of data points in the train set: 6399, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008439 -> initscore=-4.766438
[LightGBM] [Info] Start training from score -4.766438
Resultados LazyPredict:
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
NearestCentroid                    0.72               0.55     0.55      0.83   
BernoulliNB                        0.99               0.50     0.50      0.99   
AdaBoostClassifier                 0.99               0.50     0.50      0.99   
DummyClassifier     

In [13]:
print("MAPEO Y CONFIGURACION DE MODELOS EXPANDIDA")
print("="*50)

model_mapping = {
    'XGBClassifier': XGBClassifier(random_state=42, eval_metric='logloss'),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'SVC': SVC(random_state=42, probability=True),
    'MLPClassifier': MLPClassifier(random_state=42, max_iter=500),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'ExtraTreesClassifier': ExtraTreesClassifier(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
}

if IMBLEARN_AVAILABLE:
    model_mapping['BalancedRandomForestClassifier'] = BalancedRandomForestClassifier(random_state=42)
    model_mapping['BalancedBaggingClassifier'] = BalancedBaggingClassifier(random_state=42)
    model_mapping['EasyEnsembleClassifier'] = EasyEnsembleClassifier(random_state=42)
    print("✓ imblearn modelos balanceados disponibles")
else:
    print("✗ imblearn no disponible - sin modelos balanceados")

if LGBM_AVAILABLE:
    model_mapping['LGBMClassifier'] = LGBMClassifier(random_state=42, verbose=-1)
    print("✓ LightGBM disponible")
else:
    print("✗ LightGBM no disponible")

if CATBOOST_AVAILABLE:
    model_mapping['CatBoostClassifier'] = CatBoostClassifier(random_state=42, verbose=False)
    print("✓ CatBoost disponible")
else:
    print("✗ CatBoost no disponible")

print("\n+ Agregando RandomForestClassifier adicional para testing")
models_to_optimize = [('RandomForestClassifier_Test', RandomForestClassifier(random_state=42))]

for name in top_5_names:
    if name in model_mapping:
        models_to_optimize.append((name, model_mapping[name]))
        print(f"✓ {name} mapeado correctamente")
    else:
        models_to_optimize.append((name, LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')))
        print(f"○ {name} usando LogisticRegression como backup")

print(f"\nModelos a optimizar: {len(models_to_optimize)}")

param_grids_fast = {
    'RandomForestClassifier_Test': {
        'n_estimators': [100, 200],
        'max_depth': [5, 10, None],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 4],
        'max_features': ['sqrt', 0.5],
        'class_weight': ['balanced', {0: 1, 1: 15}],
        'bootstrap': [True]
    },
    'XGBClassifier': {
        'n_estimators': [100, 200],
        'max_depth': [3, 5],
        'learning_rate': [0.1, 0.2],
        'scale_pos_weight': [10, 20]
    },
    'RandomForestClassifier': {
        'n_estimators': [100, 200],
        'max_depth': [10, 15],
        'class_weight': ['balanced', {0: 1, 1: 15}]
    },
    'GradientBoostingClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.1, 0.2],
        'max_depth': [3, 5]
    },
    'LogisticRegression': {
        'C': [0.1, 1, 10],
        'class_weight': ['balanced', {0: 1, 1: 15}]
    },
    'SVC': {
        'C': [1, 10],
        'kernel': ['rbf'],
        'class_weight': ['balanced']
    },
    'MLPClassifier': {
        'hidden_layer_sizes': [(50,), (100,)],
        'alpha': [0.001, 0.01]
    },
    'AdaBoostClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [1.0, 1.5]
    },
    'ExtraTreesClassifier': {
        'n_estimators': [100, 200],
        'max_depth': [10, 15],
        'class_weight': ['balanced']
    },
    'KNeighborsClassifier': {
        'n_neighbors': [5, 7],
        'weights': ['uniform', 'distance']
    }
}

if IMBLEARN_AVAILABLE:
    param_grids_fast['BalancedRandomForestClassifier'] = {
        'n_estimators': [100, 200],
        'max_depth': [10, 15],
        'sampling_strategy': ['auto']
    }

if LGBM_AVAILABLE:
    param_grids_fast['LGBMClassifier'] = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5],
        'learning_rate': [0.1, 0.2],
        'is_unbalance': [True]
    }

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

MAPEO Y CONFIGURACION DE MODELOS EXPANDIDA
✓ imblearn modelos balanceados disponibles
✓ LightGBM disponible
✗ CatBoost no disponible

+ Agregando RandomForestClassifier adicional para testing
○ BernoulliNB usando LogisticRegression como backup
✓ AdaBoostClassifier mapeado correctamente
○ DummyClassifier usando LogisticRegression como backup
○ CategoricalNB usando LogisticRegression como backup
✓ KNeighborsClassifier mapeado correctamente

Modelos a optimizar: 6


In [14]:
print("OPTIMIZACION DE HIPERPARAMETROS (ACELERADA)")
print("="*50)

optimization_results = {}

for model_name, model in models_to_optimize:
    print(f"\nOptimizing {model_name}...")
    
    try:
        if model_name in param_grids_fast:
            param_grid = param_grids_fast[model_name]
            print(f"  Grid reducido: {len(param_grid)} parámetros")
        else:
            param_grid = {}
            print(f"  Sin grid específico")
        
        skf_fast = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            scoring='f1',
            cv=skf_fast,
            n_jobs=-1,
            verbose=1
        )
        
        print(f"  Iniciando búsqueda...")
        grid_search.fit(current_X_train, current_y_train)
        
        optimization_results[model_name] = {
            'best_estimator': grid_search.best_estimator_,
            'best_params': grid_search.best_params_,
            'best_cv_score': grid_search.best_score_
        }
        
        print(f"  ✓ Best F1-CV: {grid_search.best_score_:.4f}")
        print(f"  ✓ Best params: {grid_search.best_params_}")
        
    except Exception as e:
        print(f"  ✗ Error: {str(e)}")
        optimization_results[model_name] = {
            'best_estimator': model,
            'best_params': {},
            'best_cv_score': 0
        }

print(f"\n{'='*50}")
print("OPTIMIZATION COMPLETED")
print(f"{'='*50}")
print("Resultados de optimización:")
for name, results in optimization_results.items():
    print(f"{name}: F1-CV = {results['best_cv_score']:.4f}")

OPTIMIZACION DE HIPERPARAMETROS (ACELERADA)

Optimizing RandomForestClassifier_Test...
  Grid reducido: 7 parámetros
  Iniciando búsqueda...
Fitting 3 folds for each of 96 candidates, totalling 288 fits
  ✓ Best F1-CV: 0.0358
  ✓ Best params: {'bootstrap': True, 'class_weight': 'balanced', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

Optimizing BernoulliNB...
  Sin grid específico
  Iniciando búsqueda...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
  ✓ Best F1-CV: 0.0348
  ✓ Best params: {}

Optimizing AdaBoostClassifier...
  Grid reducido: 2 parámetros
  Iniciando búsqueda...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
  ✓ Best F1-CV: 0.0000
  ✓ Best params: {'learning_rate': 1.0, 'n_estimators': 100}

Optimizing DummyClassifier...
  Sin grid específico
  Iniciando búsqueda...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
  ✓ Best F1-CV: 0.0348
  ✓ Best params: {}

Optimizing Ca

In [15]:
def optimize_threshold(model, X_val, y_val):
    thresholds = np.arange(0.1, 0.9, 0.05)
    y_proba = model.predict_proba(X_val)[:, 1]
    
    best_threshold = 0.5
    best_f1 = 0
    
    for threshold in thresholds:
        y_pred = (y_proba >= threshold).astype(int)
        f1 = f1_score(y_val, y_pred)
        
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    
    return best_threshold, best_f1

print("OPTIMIZACION DE THRESHOLDS")
print("="*50)

threshold_results = {}

for model_name, model_info in optimization_results.items():
    model = model_info['best_estimator']
    best_threshold, best_f1 = optimize_threshold(model, X_test_selected, y_test)
    
    threshold_results[model_name] = {
        'best_threshold': best_threshold,
        'best_f1': best_f1,
        'model': model
    }
    
    print(f"{model_name}: threshold={best_threshold:.3f}, F1={best_f1:.4f}")

print("Threshold optimization completed")

OPTIMIZACION DE THRESHOLDS
RandomForestClassifier_Test: threshold=0.650, F1=0.0169
BernoulliNB: threshold=0.650, F1=0.0408
AdaBoostClassifier: threshold=0.250, F1=0.0219
DummyClassifier: threshold=0.650, F1=0.0408
CategoricalNB: threshold=0.650, F1=0.0408
KNeighborsClassifier: threshold=0.500, F1=0.0000
Threshold optimization completed


In [16]:
def calculate_enhanced_metrics(y_true, y_proba, y_pred):
    from sklearn.metrics import roc_auc_score, matthews_corrcoef, balanced_accuracy_score, average_precision_score
    
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    
    try:
        roc_auc = roc_auc_score(y_true, y_proba)
        gini = 2 * roc_auc - 1
    except:
        roc_auc = 0
        gini = 0
    
    try:
        pr_auc = average_precision_score(y_true, y_proba)
    except:
        pr_auc = 0
    
    try:
        mcc = matthews_corrcoef(y_true, y_pred)
    except:
        mcc = 0
    
    try:
        balanced_acc = balanced_accuracy_score(y_true, y_pred)
    except:
        balanced_acc = 0
    
    mae = np.mean(np.abs(y_true - y_pred))
    r2 = 1 - np.sum((y_true - y_pred)**2) / np.sum((y_true - np.mean(y_true))**2)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'mae': mae,
        'gini': gini,
        'r2': r2,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'mcc': mcc,
        'balanced_accuracy': balanced_acc
    }

final_results = {}

print("EVALUACION FINAL CON METRICAS MEJORADAS")
print("="*60)

for model_name, threshold_info in threshold_results.items():
    model = threshold_info['model']
    threshold = threshold_info['best_threshold']
    
    y_proba = model.predict_proba(X_test_selected)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)
    
    metrics = calculate_enhanced_metrics(y_test, y_proba, y_pred)
    metrics['threshold'] = threshold
    metrics['model'] = model
    
    final_results[model_name] = metrics
    
    print(f"{model_name}:")
    print(f"  Threshold: {threshold:.3f}")
    print(f"  Accuracy:  {metrics['accuracy']:.4f}")
    print(f"  F1-Score:  {metrics['f1_score']:.4f}")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall:    {metrics['recall']:.4f}")
    print(f"  GINI:      {metrics['gini']:.4f}")
    print(f"  PR-AUC:    {metrics['pr_auc']:.4f}")
    print(f"  MCC:       {metrics['mcc']:.4f}")
    print(f"  Bal-Acc:   {metrics['balanced_accuracy']:.4f}")
    print()

print("Comparacion con DummyClassifier:")
print("-" * 40)
for dummy_name, dummy_metrics in dummy_results.items():
    print(f"{dummy_name}: F1={dummy_metrics['f1_score']:.4f}, Acc={dummy_metrics['accuracy']:.4f}")

best_model_name = max(final_results.keys(), key=lambda k: final_results[k]['f1_score'])
best_f1 = final_results[best_model_name]['f1_score']
print(f"\nMejor modelo real:")
print(f"{best_model_name}: F1={best_f1:.4f}")

if best_f1 > best_dummy_f1:
    print("✓ El mejor modelo SUPERA el baseline DummyClassifier")
else:
    print("✗ El mejor modelo NO supera el baseline DummyClassifier")
    print("  Necesitas más técnicas de manejo de desbalance")

EVALUACION FINAL CON METRICAS MEJORADAS
RandomForestClassifier_Test:
  Threshold: 0.650
  Accuracy:  0.8550
  F1-Score:  0.0169
  Precision: 0.0090
  Recall:    0.1538
  GINI:      -0.2520
  PR-AUC:    0.0069
  MCC:       0.0038
  Bal-Acc:   0.5073

BernoulliNB:
  Threshold: 0.650
  Accuracy:  0.9119
  F1-Score:  0.0408
  Precision: 0.0224
  Recall:    0.2308
  GINI:      0.2888
  PR-AUC:    0.0131
  MCC:       0.0480
  Bal-Acc:   0.5741

AdaBoostClassifier:
  Threshold: 0.250
  Accuracy:  0.6644
  F1-Score:  0.0219
  Precision: 0.0112
  Recall:    0.4615
  GINI:      0.1426
  PR-AUC:    0.0097
  MCC:       0.0243
  Bal-Acc:   0.5638

DummyClassifier:
  Threshold: 0.650
  Accuracy:  0.9119
  F1-Score:  0.0408
  Precision: 0.0224
  Recall:    0.2308
  GINI:      0.2888
  PR-AUC:    0.0131
  MCC:       0.0480
  Bal-Acc:   0.5741

CategoricalNB:
  Threshold: 0.650
  Accuracy:  0.9119
  F1-Score:  0.0408
  Precision: 0.0224
  Recall:    0.2308
  GINI:      0.2888
  PR-AUC:    0.0131
  MCC:

In [17]:
best_model_name = max(final_results.keys(), key=lambda k: final_results[k]['f1_score'])
best_model = final_results[best_model_name]['model']
best_threshold = final_results[best_model_name]['threshold']
best_params = optimization_results[best_model_name]['best_params']

print("="*80)
print("REENTRENAMIENTO CON TODOS LOS DATOS")
print("="*80)
print(f"Mejor modelo seleccionado: {best_model_name}")
print(f"Hiperparámetros óptimos: {best_params}")
print(f"Threshold óptimo: {best_threshold:.3f}")

print(f"\nPreparando todos los datos para reentrenamiento...")
X_all_selected = X_transformed[:, final_selected_indices]
y_all = y.copy()

print(f"Datos completos: {X_all_selected.shape}")
print(f"Distribución target completa: {y_all.value_counts().tolist()}")

print(f"\nCreando modelo final con hiperparámetros optimizados...")

model_mapping_final = {
    'RandomForestClassifier_Test': RandomForestClassifier(random_state=42),
    'XGBClassifier': XGBClassifier(random_state=42, eval_metric='logloss'),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'SVC': SVC(random_state=42, probability=True),
    'MLPClassifier': MLPClassifier(random_state=42, max_iter=500),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'ExtraTreesClassifier': ExtraTreesClassifier(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
}

if best_model_name in model_mapping_final:
    final_model = model_mapping_final[best_model_name]
else:
    final_model = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')

final_model.set_params(**best_params)
print(f"Modelo configurado: {final_model}")

print(f"\nEntrenando modelo final con {X_all_selected.shape[0]} registros...")
final_model.fit(X_all_selected, y_all)
print("✓ Entrenamiento completado")

print(f"\nValidación del modelo final:")
y_proba_all = final_model.predict_proba(X_all_selected)[:, 1]
y_pred_all = (y_proba_all >= best_threshold).astype(int)

final_accuracy = accuracy_score(y_all, y_pred_all)
final_f1 = f1_score(y_all, y_pred_all)
final_precision = precision_score(y_all, y_pred_all, zero_division=0)
final_recall = recall_score(y_all, y_pred_all, zero_division=0)

print(f"Métricas en datos completos:")
print(f"  Accuracy:  {final_accuracy:.4f}")
print(f"  F1-Score:  {final_f1:.4f}")
print(f"  Precision: {final_precision:.4f}")
print(f"  Recall:    {final_recall:.4f}")

os.makedirs('../models', exist_ok=True)

model_info_final = {
    'model': final_model,
    'threshold': best_threshold,
    'preprocessor': preprocessor,
    'selected_indices': final_selected_indices,
    'feature_names': final_selected_vars,
    'best_params': best_params,
    'model_name': best_model_name,
    'training_metrics_full_data': {
        'accuracy': final_accuracy,
        'f1_score': final_f1,
        'precision': final_precision,
        'recall': final_recall
    },
    'test_metrics_original': final_results[best_model_name],
    'feature_selection_strategy': 'Backward',
    'resampling_strategy': 'Original',
    'total_training_samples': X_all_selected.shape[0],
    'all_strategies_tested': {
        'feature_selection': list(feature_selection_strategies.keys()),
        'resampling': list(resampling_strategies.keys())
    },
    'dummy_baseline': dummy_results,
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
}

joblib.dump(model_info_final, '../models/clasificacion_rc_best_model.pkl')

print("="*80)
print("RESUMEN FINAL - CLASIFICACION RESPONSABILIDAD CIVIL")
print("="*80)
print(f"Dataset original: {df.shape[0]:,} registros")
print(f"Distribución: {sum(df['target'])}/{len(df['target'])} positivos ({100*sum(df['target'])/len(df['target']):.1f}%)")
print()

print("BASELINE DUMMYCLASSIFIER:")
for dummy_name, dummy_metrics in dummy_results.items():
    print(f"  {dummy_name:15s}: F1={dummy_metrics['f1_score']:.4f}, Acc={dummy_metrics['accuracy']:.4f}")

print(f"\nMODELO FINAL SELECCIONADO:")
print(f"  Algoritmo: {best_model_name}")
print(f"  Entrenado con: {X_all_selected.shape[0]:,} registros completos")
print(f"  Variables: {len(final_selected_vars)} seleccionadas")
print(f"  Threshold: {best_threshold:.3f}")
print()

print("HIPERPARÁMETROS OPTIMIZADOS:")
for param, value in best_params.items():
    print(f"  {param}: {value}")
print()

print("RENDIMIENTO EN TEST SET ORIGINAL:")
test_metrics = final_results[best_model_name]
print(f"  F1-Score:         {test_metrics['f1_score']:.4f}")
print(f"  Precision:        {test_metrics['precision']:.4f}")
print(f"  Recall:           {test_metrics['recall']:.4f}")
print(f"  GINI:             {test_metrics['gini']:.4f}")
print(f"  PR-AUC:           {test_metrics['pr_auc']:.4f}")

print(f"\nVARIABLES SELECCIONADAS ({len(final_selected_vars)}):")
for i, var in enumerate(final_selected_vars, 1):
    print(f"  {i:2d}. {var}")

print(f"\nCOMPARACION TODOS LOS MODELOS (por F1-Score):")
print("-" * 50)
sorted_models = sorted(final_results.items(), key=lambda x: x[1]['f1_score'], reverse=True)
for i, (model_name, metrics) in enumerate(sorted_models, 1):
    superiority = "✓" if metrics['f1_score'] > best_dummy_f1 else "✗"
    print(f"{i:2d}. {superiority} {model_name:25s}: F1={metrics['f1_score']:.4f}, PR-AUC={metrics['pr_auc']:.4f}")

print(f"\nMODELO GUARDADO EN: ../models/clasificacion_rc_best_model.pkl")

improvement = test_metrics['f1_score'] / best_dummy_f1 if best_dummy_f1 > 0 else float('inf')
print(f"\nMEJORA SOBRE BASELINE: {improvement:.1f}x mejor F1-score que DummyClassifier")
print("="*80)

REENTRENAMIENTO CON TODOS LOS DATOS
Mejor modelo seleccionado: BernoulliNB
Hiperparámetros óptimos: {}
Threshold óptimo: 0.650

Preparando todos los datos para reentrenamiento...
Datos completos: (7999, 2)
Distribución target completa: [7932, 67]

Creando modelo final con hiperparámetros optimizados...
Modelo configurado: LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

Entrenando modelo final con 7999 registros...
✓ Entrenamiento completado

Validación del modelo final:
Métricas en datos completos:
  Accuracy:  0.9104
  F1-Score:  0.0376
  Precision: 0.0206
  Recall:    0.2090
RESUMEN FINAL - CLASIFICACION RESPONSABILIDAD CIVIL
Dataset original: 7,999 registros
Distribución: 67/7999 positivos (0.8%)

BASELINE DUMMYCLASSIFIER:
  Most_Frequent  : F1=0.0000, Acc=0.9919
  Stratified     : F1=0.0000, Acc=0.9862
  Uniform        : F1=0.0049, Acc=0.4956

MODELO FINAL SELECCIONADO:
  Algoritmo: BernoulliNB
  Entrenado con: 7,999 registros completos
  Variables: 2 s