In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, matthews_corrcoef, log_loss, roc_curve

def improved_random_forest(csv_file, target_column='quality', scale=False,
                           sep=',', positive_threshold=7):
    """Tune a random forest on a binarized target and report test metrics.

    The CSV is loaded, ``target_column`` is turned into a binary label
    (1 when the value is ``>= positive_threshold``, else 0), and every other
    column is used as a feature. A stratified 80/20 train/test split is made,
    hyperparameters are chosen with a small randomized search on the training
    part, and AUC-ROC, MCC, log-loss and the chosen decision threshold are
    printed and returned.

    Parameters
    ----------
    csv_file : str or path-like
        Path of the CSV file to read.
    target_column : str, default 'quality'
        Name of the column that holds the raw target values.
    scale : bool, default False
        If True, standardize the features with a ``StandardScaler`` fitted on
        the training split only. If False, features are used as loaded.
    sep : str, default ','
        Field delimiter passed to ``pandas.read_csv``.
    positive_threshold : number, default 7
        Target values greater than or equal to this are labelled 1.

    Returns
    -------
    dict
        Keys ``'auc'``, ``'mcc'``, ``'log_loss'``, ``'threshold'`` (floats)
        and ``'best_params'`` (the hyperparameters picked by the search).

    Raises
    ------
    KeyError
        If ``target_column`` is not among the parsed columns.
    ValueError
        If the binarized target contains a single class only.
    """
    data = pd.read_csv(csv_file, sep=sep)

    # Fail early with an actionable message instead of a bare KeyError: a wrong
    # delimiter makes pandas parse the whole header as one column.
    if target_column not in data.columns:
        raise KeyError(
            f"Target column {target_column!r} not found in {csv_file!r}. "
            f"Columns parsed with sep={sep!r}: {list(data.columns)}. "
            "If the header collapsed into a single column, pass the file's "
            "delimiter via `sep`."
        )

    # 1. Binary target and feature matrix
    y = (data[target_column] >= positive_threshold).astype(int)
    X = data.drop(columns=target_column)

    if y.nunique() < 2:
        raise ValueError(
            f"Binarizing {target_column!r} at >= {positive_threshold} yields a "
            "single class; AUC-ROC, MCC and log-loss need both classes."
        )

    # 2. Stratified train/test split (done before scaling so that the scaler
    #    never sees the test rows)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # 3. Optional scaling, fitted on the training split only
    if scale:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # 4. Random forest with class weights
    base_rf = RandomForestClassifier(
        n_jobs=-1,
        random_state=42,
        class_weight='balanced_subsample'
    )

    # 5. Randomized hyperparameter search (n_iter can be tuned)
    param_dist = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'max_features': ['sqrt', 0.5],
        'bootstrap': [True, False]
    }

    search = RandomizedSearchCV(
        base_rf, param_distributions=param_dist,
        n_iter=5, cv=3, scoring='accuracy',
        random_state=42, n_jobs=-1, verbose=1
    )
    search.fit(X_train, y_train)
    best_rf = search.best_estimator_

    # 6. Predicted probability of the positive class
    y_pred_probs = best_rf.predict_proba(X_test)[:, 1]

    # 7. Decision threshold that maximizes TPR - FPR on the ROC curve.
    # NOTE(review): the threshold is picked on the same test split that MCC is
    # computed on, so the reported MCC is optimistic; consider selecting it on
    # a validation split instead.
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs)
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    y_pred_opt = (y_pred_probs >= optimal_threshold).astype(int)

    # 8. Metrics
    auc = roc_auc_score(y_test, y_pred_probs)
    mcc = matthews_corrcoef(y_test, y_pred_opt)
    ll  = log_loss(y_test, y_pred_probs)

    print(f"AUC-ROC    : {auc:.4f}")
    print(f"MCC        : {mcc:.4f}")
    print(f"Log-Loss   : {ll:.4f}")
    print(f"Threshold  : {optimal_threshold:.4f}")

    # Return the numbers as well, so callers that assign the result get data
    # rather than None.
    return {
        'auc': float(auc),
        'mcc': float(mcc),
        'log_loss': float(ll),
        'threshold': float(optimal_threshold),
        'best_params': dict(search.best_params_),
    }

In [36]:
# Run the same pipeline on both cleaned wine-quality datasets and keep each
# result under its own key, so the second run does not overwrite the first.
cleaned_datasets = {
    "red": "../cleanDatasets/winequality-red_cleaned.csv",
    "white": "../cleanDatasets/winequality-white_cleaned.csv",
}

metrics_by_dataset = {}
for wine_color, cleaned_csv in cleaned_datasets.items():
    # Label the output so the two printed metric blocks can be told apart.
    print(f"--- {wine_color} ---")
    metrics_by_dataset[wine_color] = improved_random_forest(
        cleaned_csv, target_column="quality", scale=True
    )

# `metrics` keeps its previous meaning: the result of the last (white) run.
metrics = metrics_by_dataset["white"]

KeyError: 'quality'