## Optuna para Insecta

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
import optuna
import joblib
import json
import warnings

# Directory that receives the tuning result files; created if it does not exist yet.
output_dir = 'tuning_optuna'
os.makedirs(name=output_dir, exist_ok=True)

# Silence FutureWarning messages for the rest of the run.
warnings.simplefilter('ignore', FutureWarning)

# --- 1. Data loading and preparation ---
try:
    df_final = pd.read_csv('data_insecta_B.csv')
except FileNotFoundError:
    # Fall back to random example data so the rest of the script can still run.
    print("Advertencia: 'data_insecta_B.csv' no encontrado. Usando datos de ejemplo.")
    data = {f'feature{i}': np.random.rand(200) for i in range(10)}
    data['is_insecta'] = np.random.randint(0, 2, 200)
    df_final = pd.DataFrame(data)

# Make sure the target column holds integers.
df_final['is_insecta'] = df_final['is_insecta'].astype(int)

X = df_final.drop(columns=['is_insecta'])
y = df_final['is_insecta']

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Class balancing with SMOTE, applied to the training portion only.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Scaling for the linear models and the SVM: the scaler is fitted on the
# resampled training data and then applied unchanged to the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_bal)
X_test_scaled = scaler.transform(X_test)

# Negative/positive ratio handed to XGBoost as scale_pos_weight.
# It is computed from the resampled labels because those are the labels the
# models are fitted on. Using the pre-SMOTE y_train here would up-weight the
# positive class a second time on top of the oversampling.
negative_count = (y_train_bal == 0).sum()
positive_count = (y_train_bal == 1).sum()
y_ratio = float(negative_count / positive_count)

# --- 2. Funciones objetivo para Optuna ---
def objective_rf(trial):
    """Optuna objective for the random forest classifier.

    Samples hyperparameters from ``trial``, fits the forest on the module-level
    ``X_train_bal`` / ``y_train_bal`` and scores it on ``X_test`` / ``y_test``.
    F1, ROC AUC, accuracy, precision and recall are stored on the trial as
    user attributes.

    Returns:
        The F1 score, which is the value Optuna optimizes.

    NOTE(review): the score is computed on the split named ``X_test``. If that
    same split is later used for final reporting, the reported numbers will be
    optimistic; consider a separate validation split or cross-validation.
    """
    forest = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 300, step=50),
        max_depth=trial.suggest_int('max_depth', 5, 50, step=5),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.5]),
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        random_state=42,
    )
    forest.fit(X_train_bal, y_train_bal)

    predicted = forest.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = forest.predict_proba(X_test)[:, 1]

    metrics = {
        'f1': f1_score(y_test, predicted),
        'roc_auc': roc_auc_score(y_test, positive_proba),
        'accuracy': accuracy_score(y_test, predicted),
        'precision': precision_score(y_test, predicted),
        'recall': recall_score(y_test, predicted),
    }
    for metric_name, metric_value in metrics.items():
        trial.set_user_attr(metric_name, metric_value)
    return metrics['f1']

def objective_xgb(trial):
    """Optuna objective for the XGBoost classifier.

    Samples hyperparameters from ``trial``, fits the model on the module-level
    ``X_train_bal`` / ``y_train_bal`` and scores it on ``X_test`` / ``y_test``.
    ``scale_pos_weight`` is not tuned; it is taken from the module-level
    ``y_ratio``. F1, ROC AUC, accuracy, precision and recall are stored on the
    trial as user attributes.

    Returns:
        The F1 score, which is the value Optuna optimizes.

    NOTE(review): the score is computed on the split named ``X_test``. If that
    same split is later used for final reporting, the reported numbers will be
    optimistic; consider a separate validation split or cross-validation.
    """
    booster = XGBClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 300, step=50),
        max_depth=trial.suggest_int('max_depth', 5, 50, step=5),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        gamma=trial.suggest_float('gamma', 0.0, 5.0),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        scale_pos_weight=y_ratio,
        reg_alpha=trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        # NOTE(review): use_label_encoder is deprecated in recent XGBoost
        # releases - confirm against the installed version before removing.
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
    )
    booster.fit(X_train_bal, y_train_bal)

    predicted = booster.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = booster.predict_proba(X_test)[:, 1]

    metrics = {
        'f1': f1_score(y_test, predicted),
        'roc_auc': roc_auc_score(y_test, positive_proba),
        'accuracy': accuracy_score(y_test, predicted),
        'precision': precision_score(y_test, predicted),
        'recall': recall_score(y_test, predicted),
    }
    for metric_name, metric_value in metrics.items():
        trial.set_user_attr(metric_name, metric_value)
    return metrics['f1']

def objective_lr(trial):
    """Optuna objective for logistic regression.

    Samples the penalty, solver, regularization strength and class weighting
    from ``trial``, fits the model on the module-level scaled training data
    (``X_train_scaled`` / ``y_train_bal``) and scores it on ``X_test_scaled`` /
    ``y_test``. F1, ROC AUC, accuracy, precision and recall are stored on the
    trial as user attributes.

    Returns:
        The F1 score, which is the value Optuna optimizes.

    NOTE(review): the score is computed on the test split. If that same split
    is later used for final reporting, the reported numbers will be
    optimistic; consider a separate validation split or cross-validation.
    """
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])
    if penalty == 'elasticnet':
        # Elastic net is only paired with saga here, so the solver is not sampled.
        solver = 'saga'
    else:
        solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    params = {
        'C': trial.suggest_float('C', 1e-5, 100, log=True),
        'penalty': penalty,
        'solver': solver,
    }
    # l1_ratio is only meaningful for the elastic-net penalty. Passing it with
    # 'l1' or 'l2' makes scikit-learn warn on every fit, so it is left out there.
    if penalty == 'elasticnet':
        params['l1_ratio'] = trial.suggest_float('l1_ratio', 0.0, 1.0)
    params['class_weight'] = trial.suggest_categorical('class_weight', ['balanced', None])

    model = LogisticRegression(**params, max_iter=1000, random_state=42)
    model.fit(X_train_scaled, y_train_bal)
    y_pred = model.predict(X_test_scaled)
    # Column 1 of predict_proba is the probability of the positive class.
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    metrics = {
        'f1': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_proba),
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
    }
    for metric_name, metric_value in metrics.items():
        trial.set_user_attr(metric_name, metric_value)
    return metrics['f1']

def objective_svm(trial):
    """Optuna objective for the support vector classifier.

    Samples the kernel, ``C``, ``gamma``, class weighting and tolerance from
    ``trial`` (plus the polynomial degree when the kernel is ``'poly'``), fits
    the model on the module-level scaled training data (``X_train_scaled`` /
    ``y_train_bal``) and scores it on ``X_test_scaled`` / ``y_test``. F1,
    ROC AUC, accuracy, precision and recall are stored on the trial as user
    attributes.

    Returns:
        The F1 score, which is the value Optuna optimizes.

    NOTE(review): the score is computed on the test split. If that same split
    is later used for final reporting, the reported numbers will be
    optimistic; consider a separate validation split or cross-validation.
    """
    kernel = trial.suggest_categorical('kernel', ['rbf', 'poly'])
    svc_kwargs = dict(
        C=trial.suggest_float('C', 1e-3, 1000, log=True),
        kernel=kernel,
        gamma=trial.suggest_categorical('gamma', ['scale', 'auto']),
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        tol=trial.suggest_float('tol', 1e-4, 1e-2, log=True),
    )
    # The degree is only sampled for the polynomial kernel.
    if kernel == 'poly':
        svc_kwargs['degree'] = trial.suggest_int('degree', 2, 5)

    # probability=True is required for the predict_proba call below.
    classifier = SVC(probability=True, random_state=42, max_iter=10000, **svc_kwargs)
    classifier.fit(X_train_scaled, y_train_bal)

    predicted = classifier.predict(X_test_scaled)
    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = classifier.predict_proba(X_test_scaled)[:, 1]

    metrics = {
        'f1': f1_score(y_test, predicted),
        'roc_auc': roc_auc_score(y_test, positive_proba),
        'accuracy': accuracy_score(y_test, predicted),
        'precision': precision_score(y_test, predicted),
        'recall': recall_score(y_test, predicted),
    }
    for metric_name, metric_value in metrics.items():
        trial.set_user_attr(metric_name, metric_value)
    return metrics['f1']

# --- 3. Tuning loop and result export ---
objectives = {'Random Forest': objective_rf, 'XGBoost': objective_xgb, 'Logistic Regression': objective_lr, 'SVM': objective_svm}
# Metric names each objective stores on its trial as user attributes.
metric_names = ['f1', 'roc_auc', 'accuracy', 'precision', 'recall']
for model_name, objective in objectives.items():
    print(f"--- Afinando: {model_name} ---")
    # The study is persisted in SQLite and reopened if it already exists, so
    # study.trials can also contain trials from earlier runs of this script.
    study = optuna.create_study(
        direction='maximize',
        storage='sqlite:///optuna_tuning.db',
        study_name=f"{model_name}_tuning",
        load_if_exists=True,
    )
    study.optimize(objective, n_trials=50)

    # Save the results of every trial in the study.
    records = []
    for t in study.trials:
        rec = {'trial': t.number, 'model': model_name}
        rec.update(t.params)
        # Metrics are attached only at the end of an objective call, so a
        # trial that failed or was interrupted has none. .get() records a
        # null for those instead of raising KeyError and losing the export.
        rec.update({k: t.user_attrs.get(k) for k in metric_names})
        records.append(rec)
    df_res = pd.DataFrame(records)
    df_res.to_json(os.path.join(output_dir, f"results_{model_name}.json"), orient='records', indent=4)

    # Save the best hyperparameters as JSON.
    best_params = study.best_params
    summary_file = os.path.join(output_dir, f"best_params_{model_name}.json")
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(best_params, f, indent=4)
    print(f"Mejores hiperparámetros guardados en: {summary_file}")

print("--- Afinamiento completo y parámetros guardados en 'tuning_optuna' ---")