In [219]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import OneHotEncoder
import folium
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import holidays
from datetime import datetime, timedelta
import contextily as cx
import geopandas as gpd
from shapely.geometry import Point
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, roc_auc_score, f1_score, precision_recall_curve, confusion_matrix
import xgboost as xgb
import joblib
import json
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from scipy.stats import randint, uniform, loguniform
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner as kt
import pickle

In [27]:
df_final = pd.read_csv(r"C:\Users\emili\sp-ml-17-final-project-g3\notebooks\df_final_a_modelar.csv")

In [28]:
df_final.drop("Unnamed: 0", axis=1, inplace=True)

In [29]:
df_final.drop("snowfall (cm)", axis=1, inplace=True)

In [30]:
df_final["Fecha"] = pd.to_datetime(df_final["Fecha"])

In [31]:
df_final = df_final.sort_values('Fecha').reset_index(drop=True)

- Separamos objetivo y features

In [44]:
X = df_final.drop(columns=["es_accidente", "Fecha"])
y = df_final["es_accidente"]

- Convertimos la unica columna categorica que nos quedaba

In [34]:
df_final = pd.get_dummies(df_final, columns=['Tipo_dia'], drop_first=True, dtype=int)

In [46]:
df_final


Unnamed: 0,Fecha,lon,lat,temperature_2m (°C),precipitation (mm),wind_speed_10m (km/h),cluster,es_accidente,Festivo,dia,hora,fiesta_local,mes,hora_sin,hora_cos,mes_sin,mes_cos,Tipo_dia_Fin de semana,Tipo_dia_Laboral
0,2017-01-01 00:00:00,2.102477,41.385437,3.5,0.0,7.1,94,0,0,7,0,0,1,0.000000,1.000000,5.000000e-01,0.866025,1,0
1,2017-01-01 00:00:00,2.150096,41.382647,7.2,0.0,7.1,133,0,0,7,0,0,1,0.000000,1.000000,5.000000e-01,0.866025,1,0
2,2017-01-01 01:00:00,2.211578,41.408096,7.0,0.0,8.9,41,0,0,7,1,0,1,0.258819,0.965926,5.000000e-01,0.866025,1,0
3,2017-01-01 01:00:00,2.156381,41.402319,-0.1,0.0,3.3,112,0,0,7,1,0,1,0.258819,0.965926,5.000000e-01,0.866025,1,0
4,2017-01-01 01:00:00,2.189408,41.398465,5.6,0.0,8.9,143,0,0,7,1,0,1,0.258819,0.965926,5.000000e-01,0.866025,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273718,2024-12-31 20:00:00,2.143247,41.387573,8.7,0.0,3.4,86,1,0,2,20,0,12,-0.866025,0.500000,-2.449294e-16,1.000000,0,1
273719,2024-12-31 21:00:00,2.175309,41.390951,8.7,0.0,5.5,129,0,0,2,21,0,12,-0.707107,0.707107,-2.449294e-16,1.000000,0,1
273720,2024-12-31 22:00:00,2.157283,41.389924,7.4,0.0,8.1,167,0,0,2,22,0,12,-0.500000,0.866025,-2.449294e-16,1.000000,0,1
273721,2024-12-31 22:00:00,2.182863,41.441627,7.8,0.0,8.1,127,0,0,2,22,0,12,-0.500000,0.866025,-2.449294e-16,1.000000,0,1


#### SPLIT

- Hacemos el split temporal

In [47]:
split_idx = int(len(df_final) * 0.8)
X_train, y_train = X.iloc[:split_idx], y.iloc[:split_idx]
X_test, y_test = X.iloc[split_idx:], y.iloc[split_idx:]


In [48]:
X_train.dtypes

lon                       float64
lat                       float64
temperature_2m (°C)       float64
precipitation (mm)        float64
wind_speed_10m (km/h)     float64
cluster                     int64
Festivo                     int64
dia                         int64
hora                        int64
fiesta_local                int64
mes                         int64
hora_sin                  float64
hora_cos                  float64
mes_sin                   float64
mes_cos                   float64
Tipo_dia_Fin de semana      int64
Tipo_dia_Laboral            int64
dtype: object

#### MODELO XGBCLASSIFIER

In [49]:
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42
)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)
y_proba_xgb = xgb_model.predict_proba(X_test)[:,1]


In [50]:
report = classification_report(y_test, y_pred_xgb)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.96      0.88     42372
           1       0.64      0.23      0.34     12373

    accuracy                           0.80     54745
   macro avg       0.73      0.60      0.61     54745
weighted avg       0.77      0.80      0.76     54745



----

#### MODELO LGBM

In [51]:
lgbm_model = LGBMClassifier(
    objective='binary',
    random_state=42
)
lgbm_model.fit(X_train, y_train)

y_pred_lgbm = lgbm_model.predict(X_test)
y_proba_lgbm = lgbm_model.predict_proba(X_test)[:,1]

[LightGBM] [Info] Number of positive: 54966, number of negative: 164012
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0,010228 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1438
[LightGBM] [Info] Number of data points in the train set: 218978, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0,251012 -> initscore=-1,093225
[LightGBM] [Info] Start training from score -1,093225


In [52]:
report_lgbm = classification_report(y_test, y_pred_lgbm)
print(report_lgbm)

              precision    recall  f1-score   support

           0       0.78      0.99      0.87     42372
           1       0.62      0.05      0.09     12373

    accuracy                           0.78     54745
   macro avg       0.70      0.52      0.48     54745
weighted avg       0.74      0.78      0.70     54745



------

#### Optimizacion con XGBoost

- Empezare haciendo Feature Engineering

In [53]:
def feature_engineering(df_final):
    """
    Aplica ingeniería de features adicionales para XGBoost / LightGBM
    """
    df_fe = df_final.copy()
    
    # --- Features temporales ---
    df_fe['es_fin_semana'] = df_fe['dia'].apply(lambda x: 1 if x >= 6 else 0)
    df_fe['es_hora_rush'] = df_fe['hora'].apply(lambda x: 1 if x in [7,8,9,17,18,19] else 0)
    df_fe['es_noche'] = df_fe['hora'].apply(lambda x: 1 if x >= 22 or x <= 6 else 0)
    
    # --- Interacciones ---
    df_fe['rush_x_lluvia'] = df_fe['es_hora_rush'] * df_fe['precipitation (mm)'].apply(lambda x: 1 if x>1 else 0)
    df_fe['noche_x_weekend'] = df_fe['es_noche'] * df_fe['es_fin_semana']
    
    # --- Features espaciales ---
    cluster_stats = df_fe.groupby('cluster')['es_accidente'].agg(['mean','count']).reset_index()
    cluster_stats.columns = ['cluster','cluster_accident_rate','cluster_total_events']
    df_fe = df_fe.merge(cluster_stats, on='cluster', how='left')
    
    center_lat, center_lon = 41.3851, 2.1734
    df_fe['dist_to_center'] = np.sqrt((df_fe['lat'] - center_lat)**2 + (df_fe['lon'] - center_lon)**2)
    
    # --- Features meteorológicos ---
    df_fe['lluvia'] = df_fe['precipitation (mm)'].apply(lambda x: 1 if x > 1 else 0)
    df_fe['viento_fuerte'] = df_fe['wind_speed_10m (km/h)'].apply(lambda x: 1 if x > 20 else 0)
    df_fe['temp_extrema'] = df_fe['temperature_2m (°C)'].apply(lambda x: 1 if x < 5 or x > 30 else 0)
    
    # --- Dummies ya aplicadas previamente (Tipo_dia) ---
    # Asegúrate que ya ejecutaste: pd.get_dummies(df, columns=['Tipo_dia'], drop_first=True, dtype=int)
    
    return df_fe


In [55]:
df_processed = feature_engineering(df_final)


- Nuevo split con features

In [56]:
X_f = df_processed.drop(columns=['es_accidente', 'Fecha'])
y_f = df_processed['es_accidente']

# Split temporal 80/20
split_idx = int(len(df_processed) * 0.8)
X_f_train, y_f_train = X_f.iloc[:split_idx], y_f.iloc[:split_idx]
X_f_test, y_f_test = X_f.iloc[split_idx:], y_f.iloc[split_idx:]

- Configuramos el desbalance

In [57]:
scale_pos_weight = (y_train==0).sum() / (y_train==1).sum()

- Preparamos el RandomizedSearchCV

In [61]:
# Modelo base
xgb_model_o = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    scale_pos_weight=scale_pos_weight
)

# Espacio de búsqueda
param_dist = {
    'n_estimators': randint(200, 1200),
    'max_depth': randint(3, 10),
    'learning_rate': loguniform(1e-3, 3e-1),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': loguniform(1e-8, 0.1),
    'min_child_weight': randint(1, 12),
    'reg_alpha': loguniform(1e-8, 0.1),
    'reg_lambda': loguniform(1e-3, 10)
}

tscv = TimeSeriesSplit(n_splits=4)

search = RandomizedSearchCV(
    estimator=xgb_model_o,
    param_distributions=param_dist,
    n_iter=60,               # ajusta según tu tiempo
    scoring='roc_auc',
    cv=tscv,
    n_jobs=-1,
    verbose=2,
    random_state=42
)


In [62]:
search.fit(X_f_train, y_f_train)

Fitting 4 folds for each of 60 candidates, totalling 240 fits


0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_distributions,"{'colsample_bytree': <scipy.stats....001EC0A59A540>, 'gamma': <scipy.stats....001EC0A59A120>, 'learning_rate': <scipy.stats....001EC0A598CB0>, 'max_depth': <scipy.stats....001EC0A598860>, ...}"
,n_iter,60
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,TimeSeriesSpl...est_size=None)
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,np.float64(0.9861007321563079)
,device,
,early_stopping_rounds,
,enable_categorical,False


In [64]:
best_params = search.best_params_
best_params['scale_pos_weight'] = scale_pos_weight  # mantener el balance

best_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    **best_params
)

best_model.fit(X_f_train, y_f_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,np.float64(0.9861007321563079)
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
y_proba = best_model.predict_proba(X_f_test)[:,1]

# Threshold por defecto 0.5
y_pred = (y_proba >= 0.5).astype(int)

print("Classification report con threshold 0.5:")
print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_proba))

# Threshold más bajo (para mejorar recall en clase 1)
threshold = 0.45
y_pred_thresh = (y_proba >= threshold).astype(int)

print(f"\nClassification report con threshold {threshold}:")
print(classification_report(y_test, y_pred_thresh))


Classification report con threshold 0.5:
              precision    recall  f1-score   support

           0       0.92      0.76      0.83     42372
           1       0.48      0.76      0.59     12373

    accuracy                           0.76     54745
   macro avg       0.70      0.76      0.71     54745
weighted avg       0.82      0.76      0.78     54745

AUC: 0.844292731417319

Classification report con threshold 0.35:
              precision    recall  f1-score   support

           0       0.95      0.61      0.74     42372
           1       0.40      0.88      0.55     12373

    accuracy                           0.67     54745
   macro avg       0.67      0.74      0.64     54745
weighted avg       0.82      0.67      0.70     54745



- Buscamos un threshold que nos de mejor recall y precison, pero no se encontro

In [83]:
y_proba = best_model.predict_proba(X_f_test)[:,1]
precision, recall, thresholds = precision_recall_curve(y_f_test, y_proba)

# Crear DataFrame con los valores alineados
pr_df = pd.DataFrame({
    "threshold": list(thresholds) + [1.0],  # se añade 1.0 para cuadrar longitudes
    "precision": precision,
    "recall": recall
})

# Filtrar candidatos donde ambas métricas >= 0.70
candidates = pr_df[(pr_df["precision"] >= 0.70) & (pr_df["recall"] >= 0.70)]

if not candidates.empty:
    # Mostrar el mejor balance según F1
    best = candidates.assign(f1 = 2 * (candidates["precision"]*candidates["recall"]) /
                                   (candidates["precision"]+candidates["recall"]))
    print(best.sort_values("f1", ascending=False).head(10))
else:
    print("⚠️ No hay threshold donde precision y recall sean ambos >= 0.70")


⚠️ No hay threshold donde precision y recall sean ambos >= 0.70


-------

#### Optimizacion con features en LightGBM

In [104]:
# Definimos el modelo base
lgbm_o = LGBMClassifier(random_state=42, class_weight='balanced')

# Espacio de hiperparámetros
param_dist = {
    'num_leaves': np.arange(20, 150, 10),       # tamaño de las hojas
    'max_depth': np.arange(3, 15, 1),           # profundidad máxima
    'learning_rate': np.linspace(0.01, 0.3, 20), # tasa de aprendizaje
    'n_estimators': np.arange(100, 1000, 100),   # número de árboles
    'subsample': np.linspace(0.5, 1.0, 6),       # fracción de datos usados
    'colsample_bytree': np.linspace(0.5, 1.0, 6),# fracción de features
    'reg_alpha': np.linspace(0, 1, 6),           # regularización L1
    'reg_lambda': np.linspace(0, 1, 6)           # regularización L2
}

# Configuramos RandomizedSearchCV
random_search_lgbm_o = RandomizedSearchCV(
    estimator=lgbm_o,
    param_distributions=param_dist,
    n_iter=50,                # número de combinaciones a probar
    scoring='f1',        # métrica a optimizar
    cv=3,                     # validación cruzada
    verbose=2,
    random_state=22,
    n_jobs=-1
)

# Entrenamos
random_search_lgbm_o.fit(X_f_train, y_f_train)

# Mejor modelo encontrado
best_lgbm_o = random_search_lgbm_o.best_estimator_
print("Mejores hiperparámetros:", random_search_lgbm_o.best_params_)

# Evaluamos en test
y_pred_lgbm_o = best_lgbm_o.predict(X_f_test)
y_proba_lgbm_o = best_lgbm_o.predict_proba(X_f_test)[:,1]

print("\nClassification Report:")
print(classification_report(y_f_test, y_pred_lgbm_o))
print("AUC:", roc_auc_score(y_f_test, y_proba_lgbm_o))


Fitting 3 folds for each of 50 candidates, totalling 150 fits
[LightGBM] [Info] Number of positive: 54966, number of negative: 164012
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0,010824 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2099
[LightGBM] [Info] Number of data points in the train set: 218978, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0,500000 -> initscore=0,000000
[LightGBM] [Info] Start training from score 0,000000
Mejores hiperparámetros: {'subsample': np.float64(0.5), 'reg_lambda': np.float64(1.0), 'reg_alpha': np.float64(0.8), 'num_leaves': np.int64(120), 'n_estimators': np.int64(900), 'max_depth': np.int64(10), 'learning_rate': np.float64(0.10157894736842105), 'colsample_bytree': np.float64(0.8)}

Classification Report:
              precision    recall  f1-score   support

          

In [105]:
report_lgbm_o = classification_report(y_f_test, y_pred_lgbm_o)
print(report_lgbm_o)

              precision    recall  f1-score   support

           0       0.91      0.80      0.85     42372
           1       0.51      0.73      0.60     12373

    accuracy                           0.78     54745
   macro avg       0.71      0.76      0.72     54745
weighted avg       0.82      0.78      0.79     54745



In [106]:


y_proba = best_lgbm_o.predict_proba(X_f_test)[:,1]
precision, recall, thresholds = precision_recall_curve(y_f_test, y_proba_lgbm_o)

# Crear DataFrame con valores
pr_df = pd.DataFrame({
    "threshold": list(thresholds) + [1.0],  # cuadramos longitudes
    "precision": precision,
    "recall": recall
})

# Filtrar donde ambas métricas >= 0.70
candidates = pr_df[(pr_df["precision"] >= 0.50) & (pr_df["recall"] >= 0.70)]

if not candidates.empty:
    best = candidates.assign(f1 = 2 * (candidates["precision"]*candidates["recall"]) /
                                   (candidates["precision"]+candidates["recall"]))
    print(best.sort_values("f1", ascending=False).head(10))
else:
    print("⚠️ No hay threshold que cumpla precision ≥ 0.70 y recall ≥ 0.70")


       threshold  precision    recall        f1
38466   0.528600   0.532723  0.700638  0.605250
38475   0.528836   0.532833  0.700396  0.605231
38465   0.528598   0.532690  0.700638  0.605229
38468   0.528649   0.532727  0.700558  0.605223
38474   0.528789   0.532800  0.700396  0.605210
38464   0.528587   0.532657  0.700638  0.605208
38477   0.528842   0.532837  0.700315  0.605203
38467   0.528634   0.532694  0.700558  0.605202
38473   0.528787   0.532768  0.700396  0.605189
38463   0.528585   0.532625  0.700638  0.605187


In [116]:
threshold_lgbm = 0.47
y_pred_thresh_lgbm = (y_proba_lgbm_o >= threshold_lgbm).astype(int)

print(f"\nClassification report con threshold {threshold_lgbm}:")
print(classification_report(y_f_test, y_pred_thresh_lgbm))


Classification report con threshold 0.47:
              precision    recall  f1-score   support

           0       0.92      0.77      0.84     42372
           1       0.49      0.76      0.60     12373

    accuracy                           0.77     54745
   macro avg       0.70      0.76      0.72     54745
weighted avg       0.82      0.77      0.78     54745



- Generamos un "predicciones_test.csv" para poder hacer pruebas en el pront y visualizar algo de nuestro proyecto

In [128]:
# Recuperamos las columnas originales de X_test para saber ubicación y tiempo
df_test = X_f_test.copy()
df_test["y_true"] = y_f_test.values  # etiqueta real
df_test["y_proba"] = y_proba_lgbm_o       # probabilidad predicha

# Si tienes la columna Fecha y hora en tu dataset original:
# Solo nos quedamos con las columnas que X_test no tiene
extra_cols = df_processed.loc[X_test.index, ["Fecha", "lon", "lat", "hora"]]

# Concatenamos
df_test = pd.concat([X_test, extra_cols], axis=1)
df_test["y_true"] = y_test.values
df_test["y_proba"] = y_proba


# Guardar para usar en la app
df_test.to_csv("predicciones_test.csv", index=False)


---------

#### SEGUNDA OPTIMIZACION DEL LGBM

In [213]:
# Definimos el modelo base
lgbm_o2 = LGBMClassifier(random_state=42, class_weight='balanced')

# Espacio de hiperparámetros
param_dist_2 = {
    'num_leaves': [200, 20, 100],
    'max_depth': [15,10,20],
    'learning_rate': [0.2806477930901046, 0.10],
    'n_estimators': [845, 900],
    'subsample': [0.9439010917466873, 0.90], 
    'colsample_bytree': [0.9967411011074678,1], 
    'reg_alpha': [0.5429749378530826,0.49], 
    'reg_lambda': [1.3008251868764253,1.35]
}

# Configuramos RandomizedSearchCV
random_search_lgbm_o2 = RandomizedSearchCV(
    estimator=lgbm_o2,
    param_distributions=param_dist_2,
    n_iter=50,                # número de combinaciones a probar
    scoring='f1',        # métrica a optimizar
    cv=3,                     # validación cruzada
    verbose=2,
    random_state=22,
    n_jobs=-1
)

# Entrenamos
random_search_lgbm_o2.fit(X_f_train, y_f_train)

# Mejor modelo encontrado
best_lgbm_o2 = random_search_lgbm_o2.best_estimator_
print("Mejores hiperparámetros:", random_search_lgbm_o2.best_params_)

# Evaluamos en test
y_pred_lgbm_o2 = best_lgbm_o2.predict(X_f_test)
y_proba_lgbm_o2 = best_lgbm_o2.predict_proba(X_f_test)[:,1]

print("\nClassification Report:")
print(classification_report(y_f_test, y_pred_lgbm_o))
print("AUC:", roc_auc_score(y_f_test, y_proba_lgbm_o))







Fitting 3 folds for each of 50 candidates, totalling 150 fits


KeyboardInterrupt: 

------

- Prueba modelo base lgbm 2

In [214]:
lgbm_model2 = LGBMClassifier(
    objective='binary',
    random_state=22,
    boosting_type='gbdt',
    num_leaves=200,
    max_depth=15,
    learning_rate=0.2806477930901046,
    n_estimators=845,
    subsample=0.9439010917466873,
    colsample_bytree=0.9967411011074678,
    reg_alpha=0.5429749378530826,
    reg_lambda=1.3008251868764253,
    class_weight='balanced',  # para compensar el desbalanceo
    verbose=-1, threshold= 0.48
)
lgbm_model2.fit(X_f_train, y_f_train)

y_pred_lgbm2 = lgbm_model2.predict(X_f_test)
y_proba_lgbm2 = lgbm_model2.predict_proba(X_f_test)[:,1]

In [217]:
report_lgbm2 = classification_report(y_f_test, y_pred_lgbm2)
print(report_lgbm2)

              precision    recall  f1-score   support

           0       0.88      0.87      0.88     42372
           1       0.58      0.61      0.59     12373

    accuracy                           0.81     54745
   macro avg       0.73      0.74      0.74     54745
weighted avg       0.81      0.81      0.81     54745



hola


In [177]:
threshold = 0.48  # <-- aquí puedes cambiarlo
y_pred_thresh = (y_proba >= threshold).astype(int)

print("Classification Report con threshold ajustado:")
print(classification_report(y_f_test, y_pred_thresh))

Classification Report con threshold ajustado:
              precision    recall  f1-score   support

           0       0.91      0.78      0.84     42372
           1       0.50      0.75      0.60     12373

    accuracy                           0.77     54745
   macro avg       0.71      0.76      0.72     54745
weighted avg       0.82      0.77      0.79     54745



--------

- Entrnamos red neuronal

In [197]:
def entrenar_red_neuronal_avanzada(X_train, y_train, X_val, y_val, epochs=50, batch_size=256):
    """Entrena una red neuronal más compleja para clasificación binaria"""
    
    model = keras.Sequential([
        layers.Dense(256, activation="relu", input_shape=(X_train.shape[1],)),
        layers.BatchNormalization(),
        layers.Dropout(0.4),
        
        layers.Dense(128, activation="relu"),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        
        layers.Dense(64, activation="relu"),
        layers.Dropout(0.2),
        
        layers.Dense(1, activation="sigmoid")  # salida probabilidades
    ])
    
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss="binary_crossentropy",
        metrics=["accuracy", keras.metrics.AUC(name="auc")]
    )
    
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        verbose=1,
        callbacks=[
            keras.callbacks.EarlyStopping(
                patience=5, restore_best_weights=True, monitor="val_auc"
            )
        ]
    )
    
    return model, history

In [211]:
def analizar_modelo_nn(model, X_test, y_test, threshold=0.4):
    """Analiza el rendimiento de la red neuronal"""
    
    y_proba_rd = model.predict(X_test).ravel()
    y_pred_rd = (y_proba >= threshold).astype(int)
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(f"🔎 Threshold actual: {threshold}")
    print(f"TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}")
    print("\n📊 Classification Report:")
    print(classification_report(y_test, y_pred_rd))
    
    return y_proba_rd

In [212]:
model, history = entrenar_red_neuronal_avanzada(
    X_f_train, y_f_train,
    X_f_test, y_f_test,
    epochs=50, batch_size=256, 
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m856/856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - accuracy: 0.7376 - auc: 0.5962 - loss: 0.5677 - val_accuracy: 0.7740 - val_auc: 0.6582 - val_loss: 0.5051
Epoch 2/50
[1m856/856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.7488 - auc: 0.6632 - loss: 0.5310 - val_accuracy: 0.7740 - val_auc: 0.6727 - val_loss: 0.5041
Epoch 3/50
[1m856/856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.7490 - auc: 0.6712 - loss: 0.5268 - val_accuracy: 0.7740 - val_auc: 0.6692 - val_loss: 0.5111
Epoch 4/50
[1m856/856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.7490 - auc: 0.6771 - loss: 0.5237 - val_accuracy: 0.7740 - val_auc: 0.6662 - val_loss: 0.5142
Epoch 5/50
[1m856/856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.7490 - auc: 0.6809 - loss: 0.5217 - val_accuracy: 0.7740 - val_auc: 0.6786 - val_loss: 0.4980
Epoch 6/50
[1m856/856[0m

In [204]:
y_proba_rd = analizar_modelo_nn(model, X_f_test, y_f_test, threshold=0.48)


[1m1711/1711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
🔎 Threshold actual: 0.48
TN: 32235, FP: 10137, FN: 2909, TP: 9464

📊 Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.78      0.84     42372
           1       0.50      0.75      0.60     12373

    accuracy                           0.77     54745
   macro avg       0.71      0.76      0.72     54745
weighted avg       0.82      0.77      0.79     54745



In [206]:
def build_model(hp):
    model = keras.Sequential()
    model.add(layers.Dense(
        units=hp.Int("units_1", min_value=64, max_value=512, step=64),
        activation="relu",
        input_shape=(X_f_train.shape[1],)
    ))
    model.add(layers.Dropout(hp.Float("dropout_1", 0.2, 0.5, step=0.1)))
    
    # Capa oculta adicional
    model.add(layers.Dense(
        units=hp.Int("units_2", min_value=32, max_value=256, step=32),
        activation="relu"
    ))
    model.add(layers.Dropout(hp.Float("dropout_2", 0.2, 0.5, step=0.1)))
    
    model.add(layers.Dense(1, activation="sigmoid"))
    
    model.compile(
        optimizer=keras.optimizers.Adam(
            learning_rate=hp.Choice("learning_rate", [1e-2, 1e-3, 1e-4])
        ),
        loss="binary_crossentropy",
        metrics=["accuracy", keras.metrics.AUC(name="auc")]
    )
    return model

# Búsqueda de hiperparámetros
tuner = kt.RandomSearch(
    build_model,
    objective="val_auc",
    max_trials=10,
    executions_per_trial=2,
    directory="nn_tuning",
    project_name="accidentes"
)

tuner.search(X_f_train, y_f_train, validation_data=(X_f_test, y_f_test), epochs=20, batch_size=512)

best_hps = tuner.get_best_hyperparameters(1)[0]
print("🔎 Mejores hiperparámetros encontrados:")
print(best_hps.values)


Trial 10 Complete [00h 01m 25s]
val_auc: 0.5169914960861206

Best val_auc So Far: 0.675528883934021
Total elapsed time: 00h 12m 03s
🔎 Mejores hiperparámetros encontrados:
{'units_1': 256, 'dropout_1': 0.4, 'units_2': 96, 'dropout_2': 0.2, 'learning_rate': 0.001}


- Hacer prueba ensamble

In [209]:
def ensamble_hibrido(model_lgbm_o, model_rd, X_test, weights=(0.5, 0.5)):
    """Ensamble híbrido LGBM + Red Neuronal (promedio ponderado de probabilidades)"""
    
    # Probabilidades de cada modelo
    y_proba_lgbm = model_lgbm_o.predict_proba(X_test)[:, 1]
    y_proba_nn = model_rd.predict(X_test).flatten()  # NN devuelve un vector
    
    # Promedio ponderado
    y_proba_ensemble = (weights[0] * y_proba_lgbm) + (weights[1] * y_proba_nn)
    
    return y_proba_ensemble


In [220]:
with open("modelo_lgbm_optimizado.pkl", "wb") as f:
    pickle.dump(lgbm_o2, f)
