# ***SVC RFF***

In [None]:
import os
import time
import joblib
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, label_binarize
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.svm import SVC
from sklearn.metrics import (
    f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, log_loss, roc_auc_score, roc_curve, auc
)
import optuna
from optuna.pruners import MedianPruner
import optuna.visualization as vis
import plotly.graph_objects as go
import lime.lime_tabular
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler


from reporte_metricas import ReporteMetricas
# Project-local metrics reporter; used at the end of the notebook through
# reporte.save(...) and reporte.load().
reporte = ReporteMetricas()

# Single seed shared by the Python and NumPy RNGs here, and passed as
# random_state to the estimators and CV splitters defined further down.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Silence Optuna's per-trial log lines; only warnings and errors are shown.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:

# =======================
# LOAD DATA
# =======================
# The path is relative to the directory the notebook is run from.
file_path = "../Saber_pro_sampled_data.csv"
df = pd.read_csv(file_path)
# Uncomment to experiment on the first 3000 rows only.
#df = df.head(3000)
# "MOD_INGLES_DESEM" is the target; every other column is used as a feature.
X = df.drop(columns=["MOD_INGLES_DESEM"])
y = df["MOD_INGLES_DESEM"]

# Columns are routed to the categorical or numeric preprocessing branch by dtype.
# NOTE(review): a column with any other dtype (e.g. bool, int32, category) lands
# in neither list — confirm the dataset has none.
categorical_features = X.select_dtypes(include='object').columns.tolist()
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [None]:
# Replace the target labels with integer codes, keeping the original index so
# the encoded Series stays aligned with X.
le = LabelEncoder()
y = pd.Series(le.fit_transform(y), index=y.index)

# Original label -> integer code, printed for reference when reading reports.
class_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Class mapping:", class_mapping)

In [None]:
# Dimensions (rows, columns) of the loaded dataset, target column included.
df.shape

In [None]:
# ============================
# FILE PATHS
# ============================
# Artifacts written by nested_cv(). When both the model and the metrics file
# already exist, nested_cv() loads them instead of training again.
# Fitted pipeline of the best outer fold.
model_filename = "../Models/best_svc_model_rff.pkl"
# Dict {"studies": [...]} holding one Optuna study per outer fold.
study_filename = "../Study/optuna_study_SVC_nested_rff.pkl"
# Metrics, predictions and reports of the best outer fold.
metrics_filename = "../Models/best_svc_metrics_rff.pkl"
# CSV with the per-fold metrics followed by their mean/std summary.
fold_metrics_filename = "../Metrics/svc_folds_summary_rff.csv"

In [None]:
# ============================
# MODEL PIPELINE BUILDER
# ============================
def build_pipeline(params: dict, numeric_features: list, categorical_features: list):
    """Build the preprocessing + RFF-approximated linear SVM pipeline.

    Args:
        params: Hyper-parameters. The optional key ``"gamma_rff"`` (default
            1.0) is used as ``gamma`` of the ``RBFSampler``; every other entry
            is forwarded as a keyword argument to ``SGDClassifier``. The dict
            is not modified.
        numeric_features: Columns that are median-imputed and standard-scaled.
        categorical_features: Columns that are filled with the constant
            ``"Sin Dato"`` and one-hot encoded (unknown categories ignored,
            dense output).

    Returns:
        An unfitted ``Pipeline`` with two steps: ``"preprocessor"`` (the
        ``ColumnTransformer``) and ``"classifier"`` (itself a ``Pipeline`` of
        ``"rff"`` -> ``"sgd"``).
    """
    # Work on a copy: popping "gamma_rff" from the caller's dict would silently
    # remove it from hyper-parameter dicts that are stored or reused afterwards
    # (e.g. the best parameters saved alongside the metrics).
    sgd_params = dict(params)
    gamma_rff = sgd_params.pop("gamma_rff", 1.0)

    preprocessor = ColumnTransformer([
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), numeric_features),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="constant", fill_value="Sin Dato")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), categorical_features)
    ],
    verbose_feature_names_out=False,
    force_int_remainder_cols=False
    )

    # Random Fourier features approximate an RBF kernel feature map ...
    rff = RBFSampler(
        gamma=gamma_rff,
        random_state=SEED
    )

    # ... so a hinge-loss linear model on top behaves like an approximate
    # RBF-kernel SVM.
    sgd = SGDClassifier(
        loss="hinge",
        random_state=SEED,
        class_weight='balanced',
        **sgd_params
    )

    model = Pipeline([
        ("rff", rff),
        ("sgd", sgd)
    ])

    return Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])


In [None]:
# ============================
# MODEL EVALUATION
# ============================
def evaluate_model_sgd(model, X_data, y_data):
    """Score a fitted classifier on one data split using hard predictions only.

    Probability-based metrics (log-loss, ROC AUC) are not computed here; the
    function never calls ``predict_proba``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_data: Features of the split to evaluate.
        y_data: True labels of the split.

    Returns:
        Tuple ``(f1, accuracy, report, confusion_matrix, y_pred)`` where ``f1``
        is the weighted F1-score and ``report`` is the text classification
        report.
    """
    predictions = model.predict(X_data)

    weighted_f1 = f1_score(y_data, predictions, average='weighted')
    accuracy = accuracy_score(y_data, predictions)
    text_report = classification_report(y_data, predictions)
    matrix = confusion_matrix(y_data, predictions)

    return weighted_f1, accuracy, text_report, matrix, predictions

# ============================
# CONFUSION MATRIX PLOTTER
# ============================
def plot_confusion_matrix(cm, labels, title):
    """Draw a confusion matrix with integer cell counts and the given title.

    Args:
        cm: Confusion matrix to display.
        labels: Class labels shown on both axes.
        title: Title set on the plot's axes.
    """
    display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    display.plot(cmap="Blues", values_format="d")
    axes = display.ax_
    axes.set_title(title)

In [None]:
# ============================
# OPTUNA INNER CV OPTIMIZATION
# ============================
def run_inner_optuna(X_inner, y_inner, numeric_features, categorical_features, n_trials=50):
    """Tune the RFF + SGD pipeline with Optuna using a 3-fold stratified inner CV.

    Each trial samples ``alpha``, ``penalty`` and ``gamma_rff``, fits the
    pipeline on every inner fold and is scored by the mean weighted F1 over the
    folds. The running mean is reported after each fold so the configured
    ``MedianPruner`` can stop unpromising trials early.

    Args:
        X_inner: Training features (DataFrame, indexed positionally via iloc).
        y_inner: Training labels (Series, indexed positionally via iloc).
        numeric_features: Numeric column names passed to ``build_pipeline``.
        categorical_features: Categorical column names passed to
            ``build_pipeline``.
        n_trials: Number of Optuna trials to run.

    Returns:
        Tuple ``(best_params, study)``. ``best_params`` holds only the sampled
        parameters (``alpha``, ``penalty``, ``gamma_rff``); the fixed
        ``max_iter`` and ``tol`` are not part of it.
    """
    n_splits = 3

    def objective(trial):
        params = {
            "alpha": trial.suggest_float("alpha", 1e-6, 1e-2, log=True),
            "penalty": trial.suggest_categorical("penalty", ["l2"]),
            "gamma_rff": trial.suggest_float("gamma_rff", 0.01, 1.0, log=True),
            "max_iter": 1000,
            "tol": 1e-3
        }

        model = build_pipeline(params, numeric_features, categorical_features)
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
        scores = []

        for step, (train_idx, val_idx) in enumerate(skf.split(X_inner, y_inner)):
            X_t, X_v = X_inner.iloc[train_idx], X_inner.iloc[val_idx]
            y_t, y_v = y_inner.iloc[train_idx], y_inner.iloc[val_idx]
            model.fit(X_t, y_t)
            y_pred = model.predict(X_v)
            scores.append(f1_score(y_v, y_pred, average='weighted'))

            # Without reporting intermediate values the pruner is never
            # consulted. Skip the check after the last fold: the trial is
            # already fully evaluated, so pruning would only discard its result.
            if step < n_splits - 1:
                trial.report(float(np.mean(scores)), step)
                if trial.should_prune():
                    raise optuna.TrialPruned()

        return np.mean(scores)

    study = optuna.create_study(direction="maximize", pruner=MedianPruner())
    # Trials run in 5 parallel threads, so the trial order (and therefore the
    # search trajectory) is not reproducible from run to run.
    study.optimize(objective, n_trials=n_trials, n_jobs=5)
    return study.best_params, study


In [None]:
# ============================
# SAVE FOLD METRICS
# ============================
def save_metrics_folds(folds_metrics: list, filename: str):
    """Write per-fold metrics plus their mean/std summary to a CSV file.

    The CSV contains the per-fold rows first, followed by one summary row per
    metric (columns ``metric``, ``mean``, ``std``); cells that do not apply to
    a row are left empty.

    Args:
        folds_metrics: One dict of metric name -> value per fold.
        filename: Destination CSV path.

    Returns:
        Tuple ``(per_fold_df, summary_df)``.
    """
    per_fold = pd.DataFrame(folds_metrics)

    # describe() yields one column per metric; transpose so each metric is a row.
    stats = per_fold.describe().T
    summary = stats[['mean', 'std']].reset_index().rename(columns={'index': 'metric'})

    combined = pd.concat([per_fold, summary], axis=0)
    combined.to_csv(filename, index=False)
    return per_fold, summary

In [None]:
# ============================
# LIME EXPLAINER BUILDER
# ============================
def get_lime_explainer(model_pipeline, X_train_raw, y_train_raw):
    """Create a LIME tabular explainer over the preprocessed training data.

    Args:
        model_pipeline: Fitted pipeline that has a ``"preprocessor"`` step.
        X_train_raw: Raw (un-preprocessed) training features.
        y_train_raw: Training labels; their unique values, cast to strings,
            become the explainer's class names.

    Returns:
        Tuple ``(explainer, X_transformed)`` where ``X_transformed`` is the
        output of the preprocessor on ``X_train_raw``.
    """
    preprocessor = model_pipeline.named_steps['preprocessor']
    transformed = preprocessor.transform(X_train_raw)

    explainer = lime.lime_tabular.LimeTabularExplainer(
        training_data=transformed,
        feature_names=preprocessor.get_feature_names_out(),
        class_names=np.unique(y_train_raw).astype(str),
        mode='classification',
    )
    return explainer, transformed

In [None]:
# ============================
# NESTED CV LOOP
# ============================
def _build_study_visualizations(study_dt):
    """Return the dict of Optuna/Plotly figures displayed for one study."""
    figures = {
        'optimization_history': vis.plot_optimization_history(study_dt),
        'parallel_coordinate': vis.plot_parallel_coordinate(study_dt),
        'param_importances': vis.plot_param_importances(study_dt),
    }

    f1_scores = [t.value for t in study_dt.trials]
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=list(range(len(f1_scores))), y=f1_scores, mode='lines+markers', name='F1-score'))
    fig.update_layout(title='F1-Score Evolution During Optuna Optimization', xaxis_title='Trial', yaxis_title='F1-Score', template='plotly_dark')
    figures['f1_score_evolution'] = fig
    return figures


def _ensure_parent_dir(path):
    """Create the directory that will contain ``path`` if it is missing."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def nested_cv(X, y, numeric_features, categorical_features):
    """Run 5-fold nested cross-validation, or reload a previously saved run.

    If both ``model_filename`` and ``metrics_filename`` exist, the saved model
    and metrics are loaded and no training happens. Otherwise, for every outer
    fold the training part is split 80/20 into train/validation, the
    hyper-parameters are tuned with ``run_inner_optuna`` on the train part, the
    pipeline is refitted with the best parameters and evaluated on the train,
    validation and outer-test splits. The fold with the highest test F1 is
    kept, and the model, studies, metrics and per-fold CSV are written to the
    module-level file paths.

    NOTE(review): the best fold is chosen by its own test F1, so the stored
    test metrics are an optimistic estimate of generalisation performance.

    Args:
        X: Feature DataFrame.
        y: Target Series aligned positionally with ``X``.
        numeric_features: Numeric column names.
        categorical_features: Categorical column names.

    Returns:
        Tuple ``(best_model, best_metrics, study_dt, visualizations)``.
        ``study_dt`` is the Optuna study of the best fold (``None`` when a
        cached run is loaded and its study is unavailable), and
        ``visualizations`` maps figure names to Plotly figures (empty in that
        same case).
    """
    visualizations = {}
    all_folds_metrics = []

    if os.path.exists(model_filename) and os.path.exists(metrics_filename):
        # joblib.load unpickles arbitrary objects: only load artifacts that
        # were produced by this notebook.
        best_model = joblib.load(model_filename)
        best_metrics = joblib.load(metrics_filename)
        study_dt = None
        # A missing study file must not prevent reusing the cached model.
        if "best_fold" in best_metrics and os.path.exists(study_filename):
            study_dt = joblib.load(study_filename)['studies'][best_metrics['best_fold']]
        if study_dt is not None:
            visualizations = _build_study_visualizations(study_dt)
        return best_model, best_metrics, study_dt, visualizations

    # Create the output folders up front so a missing directory cannot make
    # the run fail at save time, after all folds have been trained.
    for output_path in (model_filename, study_filename, metrics_filename, fold_metrics_filename):
        _ensure_parent_dir(output_path)

    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    best_model, best_metrics = None, {}
    best_f1 = -np.inf
    optuna_studies = []

    for fold, (train_idx, test_idx) in tqdm(
        enumerate(outer_cv.split(X, y), 1),
        total=outer_cv.get_n_splits(),
        desc = "Training folds"
    ):

        X_train_fold, X_test_fold = X.iloc[train_idx], X.iloc[test_idx]
        y_train_fold, y_test_fold = y.iloc[train_idx], y.iloc[test_idx]

        X_tr, X_val, y_tr, y_val = train_test_split(
                X_train_fold, y_train_fold,
                test_size=0.2, stratify=y_train_fold, random_state=SEED)

        start_time = time.time()
        best_params, study = run_inner_optuna(X_tr, y_tr, numeric_features, categorical_features)
        elapsed = time.time() - start_time
        optuna_studies.append(study)

        # Pass a copy so the parameters stored in best_metrics stay complete
        # (including "gamma_rff") whatever build_pipeline does with its input.
        model = build_pipeline(dict(best_params), numeric_features, categorical_features)
        model.fit(X_tr, y_tr)

        f1_train, acc_train, report_train, cm_train, y_train_pred = evaluate_model_sgd(model, X_tr, y_tr)
        f1_val, acc_val, report_val, cm_val, y_val_pred = evaluate_model_sgd(model, X_val, y_val)
        f1_test, acc_test, report_test, cm_test, y_test_pred = evaluate_model_sgd(model, X_test_fold, y_test_fold)

        # log_loss_* and auc_* are fixed placeholders (999 / 0): no class
        # probabilities are computed for this model.
        fold_metrics = {
            "fold": fold,
            "f1_train": f1_train,
            "accuracy_train": acc_train,
            "log_loss_train": 999,
            "auc_train": 0,
            "f1_val": f1_val,
            "accuracy_val": acc_val,
            "log_loss_val": 999,
            "auc_val": 0,
            "f1_test": f1_test,
            "accuracy_test": acc_test,
            "log_loss_test":999,
            "auc_test": 0,
            "optuna_time": elapsed
        }
        all_folds_metrics.append(fold_metrics)

        if f1_test > best_f1:
            best_f1 = f1_test
            best_model = model
            best_metrics = {
                **fold_metrics,
                "params": best_params,
                "labels": np.unique(y),
                # Zero-based index into optuna_studies ("fold" itself is 1-based).
                "best_fold": fold - 1,
                "X_train_fold": X_tr,
                "y_train_fold": y_tr,
                "y_train_pred": y_train_pred,
                "y_train_true": y_tr,
                "classification_report_train": report_train,
                "confusion_matrix_train": cm_train,
                "y_val_true": y_val,
                "y_val_pred": y_val_pred,
                "classification_report_val": report_val,
                "confusion_matrix_val": cm_val,
                "y_test_fold": y_test_fold,
                "y_test_pred": y_test_pred,
                "classification_report_test": report_test,
                "confusion_matrix_test": cm_test
            }

    joblib.dump(best_model, model_filename)
    joblib.dump({"studies": optuna_studies}, study_filename)
    joblib.dump(best_metrics, metrics_filename)
    save_metrics_folds(all_folds_metrics, fold_metrics_filename)

    study_dt = optuna_studies[best_metrics['best_fold']]
    visualizations = _build_study_visualizations(study_dt)

    return best_model, best_metrics, study_dt, visualizations


In [None]:
# Run the nested CV (or reload the saved artifacts if they already exist).
# `model` and `metrics` belong to the best outer fold; `study` is that fold's
# Optuna study and `visualizations` the figures built from it.
model, metrics, study, visualizations = nested_cv(X, y, numeric_features, categorical_features)

In [None]:
# Display the selected pipeline (notebook repr).
model

In [None]:
# Classification report of the best fold on its training split.
print(metrics['classification_report_train'])

In [None]:
# Confusion matrix of the best fold on its training split.
plot_confusion_matrix(metrics['confusion_matrix_train'], metrics['labels'], "Train Confusion Matrix")

In [None]:
# Classification report of the best fold on its validation split.
print(metrics['classification_report_val'])

In [None]:
# Confusion matrix of the best fold on its validation split.
plot_confusion_matrix(metrics['confusion_matrix_val'], metrics['labels'], "Validation Confusion Matrix")

In [None]:
# Classification report of the best fold on its outer test split.
print(metrics['classification_report_test'])

In [None]:
# Confusion matrix of the best fold on its outer test split.
plot_confusion_matrix(metrics['confusion_matrix_test'], metrics['labels'], "Test Confusion Matrix")

In [None]:
# Optuna optimization-history figure for the best fold's study.
visualizations['optimization_history']

In [None]:
# Optuna parallel-coordinate figure for the best fold's study.
visualizations['parallel_coordinate']

In [None]:
# Objective value (mean weighted F1) of each trial, in trial order.
visualizations['f1_score_evolution']

In [None]:
# Optuna hyper-parameter importance figure for the best fold's study.
visualizations['param_importances']

In [None]:
# Hand the best-fold metrics to the project reporter under the name "SVC_RFF".
# NOTE(review): presumably persists them to a shared report — confirm in reporte_metricas.
reporte.save(metrics, model_name="SVC_RFF")

In [None]:
# Load the report table used by the queries below.
# NOTE: this rebinds `df`, which until now held the raw dataset.
df = reporte.load()

In [None]:
# Non-global (Class != 'global') rows of SVC_RFF for the train split; first six columns only.
df.query("Model == 'SVC_RFF' and Type == 'train' and Class != 'global'").iloc[:, 0:6]

In [None]:
# Non-global (Class != 'global') rows of SVC_RFF for the val split; first six columns only.
df.query("Model == 'SVC_RFF' and Type == 'val' and Class != 'global'").iloc[:, 0:6]

In [None]:
# Non-global (Class != 'global') rows of SVC_RFF for the test split; first six columns only.
df.query("Model == 'SVC_RFF' and Type == 'test' and Class != 'global'").iloc[:, 0:6]


In [None]:
# SVC_RFF rows whose auc is not '-', showing accuracy, log_loss and auc.
# NOTE(review): '-' presumably marks rows without these metrics — confirm in reporte_metricas.
# log_loss and auc are placeholders for this model (999 / 0 set in nested_cv).
df.query("Model == 'SVC_RFF' and auc != '-'")[["Model", "Type", "accuracy", "log_loss", "auc"]]


In [None]:
# Global row of SVC_RFF for the validation split only.
df.query("Model == 'SVC_RFF' and Type == 'val' and Class	== 'global'")[["Model", "Type", "accuracy", "log_loss", "auc"]]

In [None]:
# Parameters of the selected model's "classifier" step (the inner RFF -> SGD pipeline).
model.named_steps['classifier'].get_params()