In [None]:
# Import libraries: standard library first, then third-party packages
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import chi2, RFE, SelectFromModel
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix, cohen_kappa_score
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, label_binarize, FunctionTransformer
from sklearn.svm import SVC

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")

In [None]:
# Definimos las funciones
# Funcion para crear las DF necesarios para cada columna
def prepare_filtered_datasets(df):
    """Build one modelling DataFrame per target column.

    For each target (``sinovitis``, ``deposito``, ``microcristales``) the
    columns listed for that target (the other two targets plus shared
    metadata columns) are dropped when present, and rows whose target value
    is missing are removed.

    Missing ``leucocito`` values are replaced by the column median. The
    imputation is performed once on a private copy of ``df``: every returned
    dataset sees the filled column, but the caller's DataFrame is no longer
    modified as a side effect. If ``leucocito`` is absent the imputation is
    skipped instead of raising ``KeyError``.

    Args:
        df: Source DataFrame containing the three target columns.

    Returns:
        dict mapping each target name to its filtered DataFrame.

    Raises:
        KeyError: If one of the target columns is missing from ``df``.
    """
    filters = {
        "sinovitis": ["Assay", "deposito", "microcristales", "suero", "sexo", "edad","ac_urico"],
        "deposito": ["Assay", "microcristales", "sinovitis", "suero", "sexo", "edad","ac_urico"],
        "microcristales": ["Assay", "deposito", "sinovitis", "suero", "sexo", "edad","ac_urico"]
    }

    # Work on a copy so the imputation never leaks into the caller's frame.
    working = df.copy()
    if 'leucocito' in working.columns:
        # NOTE(review): the median is computed over all rows, i.e. before any
        # train/test split downstream -- confirm this leakage is acceptable.
        working['leucocito'] = working['leucocito'].fillna(working['leucocito'].median())

    datasets = {}
    for target, drop_cols in filters.items():
        # Only drop the columns that actually exist in this frame.
        present = [col for col in drop_cols if col in working.columns]
        datasets[target] = working.drop(columns=present).dropna(subset=[target])
    return datasets

def get_metrics(y_true, y_pred, y_proba, target, average='macro'):
    """Compute the evaluation metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_proba: Scores handed unchanged to ``roc_auc_score``.
        target: Name of the target being evaluated (currently not used in
            any computation).
        average: Averaging mode used for the ``*_macro`` entries and for
            ``Precision (VPP)``.

    Returns:
        dict with, in order: ``VPN`` (negative predictive value, only
        defined when the confusion matrix is 2x2, otherwise NaN), accuracy,
        F1, precision, recall, AUC (one-vs-one for multiclass) and Cohen's
        kappa. Every value is rounded to two decimals.
    """
    conf_mat = confusion_matrix(y_true, y_pred)

    # Negative predictive value only makes sense for a binary confusion matrix.
    npv = np.nan
    if conf_mat.shape == (2, 2):
        tn, fp, fn, tp = conf_mat.ravel()
        predicted_negative = tn + fn
        if predicted_negative > 0:
            npv = round(tn / predicted_negative, 2)

    scores = {'VPN': npv}
    scores['Accuracy'] = round(accuracy_score(y_true, y_pred), 2)
    scores['F1_macro'] = round(f1_score(y_true, y_pred, average=average), 2)
    scores['F1_weighted'] = round(f1_score(y_true, y_pred, average='weighted'), 2)
    scores['Precision (VPP)'] = round(precision_score(y_true, y_pred, average=average), 2)
    scores['Recall_macro'] = round(recall_score(y_true, y_pred, average=average), 2)
    scores['Recall_weighted'] = round(recall_score(y_true, y_pred, average='weighted'), 2)
    scores['AUC_macro'] = round(
        roc_auc_score(y_true, y_proba, average=average, multi_class='ovo'), 2
    )
    scores['AUC_weighted'] = round(
        roc_auc_score(y_true, y_proba, average='weighted', multi_class='ovo'), 2
    )
    scores['Kappa'] = round(cohen_kappa_score(y_true, y_pred), 2)
    return scores

def build_selector(method, k):
    """Create the feature-selection step for the requested method.

    Args:
        method: One of ``'all'``, ``'f_classif'``, ``'chi2'``, ``'rfe'``,
            ``'lasso'`` or ``'ridge'``.
        k: Number of features to keep. Ignored for ``'all'``. For
            ``'lasso'`` and ``'ridge'`` it is an upper bound
            (``max_features``): the default ``SelectFromModel`` threshold
            still applies, so fewer than ``k`` features may be kept.

    Returns:
        The string ``'passthrough'`` for ``'all'``, otherwise an unfitted
        selector instance.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == 'all':
        # Pipeline treats the literal 'passthrough' as a no-op step.
        return 'passthrough'
    if method == 'f_classif':
        return SelectKBest(score_func=f_classif, k=k)
    if method == 'chi2':
        return SelectKBest(score_func=chi2, k=k)
    if method == 'rfe':
        return RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=k)
    if method == 'lasso':
        return SelectFromModel(Lasso(alpha=0.01), max_features=k)
    if method == 'ridge':
        # Honour k like the lasso branch does; previously k was silently
        # ignored, so every feature count produced the same selection.
        return SelectFromModel(Ridge(alpha=1.0, max_iter=1000), max_features=k)
    raise ValueError(f"Unknown method: {method}")

# English labels used in plot titles and plot file names.
_TARGET_DISPLAY_NAMES = {
    "microcristales": "Microcrystals",
    "deposito": "Deposits",
    "sinovitis": "Synovitis"
}


def _default_models(target_col):
    """Return the default name -> unfitted classifier mapping for ``target_col``."""
    # Conditional choice of the logistic regression configuration.
    if target_col == 'microcristales':
        logistic_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
    else:
        logistic_model = LogisticRegression()

    return {
        "LogisticRegression": logistic_model,
        "RandomForest": RandomForestClassifier(random_state=42),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        "SVM": SVC(kernel='linear', probability=True, random_state=42),
        "NaiveBayes": GaussianNB(),
        "KNN": KNeighborsClassifier(),
        "LightGBM": LGBMClassifier(random_state=42),
        "AdaBoost": AdaBoostClassifier(random_state=42),
        "CatBoost": CatBoostClassifier(verbose=0, random_state=42)
    }


def _build_preprocessor(method, k, numeric_features):
    """Build the scaling + feature-selection transformer for the numeric columns."""
    selector = build_selector(method, k)
    if method == 'chi2':
        # chi2 needs non-negative inputs. MinMaxScaler learns its offset on
        # the training data only, so train and test are transformed
        # consistently (the previous stateless shift re-computed the minimum
        # on every batch it was given).
        first_step = ('positive', MinMaxScaler())
    else:
        first_step = ('scaler', StandardScaler())
    return ColumnTransformer(transformers=[
        ('num', Pipeline([first_step, ('selector', selector)]), numeric_features)
    ])


def _selected_feature_names(pipe, numeric_features):
    """Return the names of the numeric features kept by the fitted selector."""
    fitted_selector = pipe.named_steps['preprocessor'].named_transformers_['num'].named_steps['selector']
    if hasattr(fitted_selector, 'get_support'):
        mask = fitted_selector.get_support()
        return [f for f, m in zip(numeric_features, mask) if m]
    # 'passthrough' has no support mask: every numeric feature is kept.
    return numeric_features


def _model_importances(pipe, X_test, y_test, name, method, k):
    """Return per-feature importances of the fitted classifier, or None.

    Uses ``feature_importances_`` or ``|coef_|`` when the model exposes
    them, and falls back to permutation importance on the test split.
    """
    model_final = pipe.named_steps['classifier']

    if hasattr(model_final, "feature_importances_"):
        return model_final.feature_importances_
    if hasattr(model_final, "coef_"):
        coef = np.abs(model_final.coef_)
        return coef.mean(axis=0) if coef.ndim > 1 else coef.ravel()

    # Fallback: permutation importance when no native attribute exists.
    try:
        # The classifier expects the already-preprocessed test matrix.
        X_test_transformed = pipe.named_steps['preprocessor'].transform(X_test)
        result = permutation_importance(
            model_final,
            X_test_transformed,
            y_test,
            n_repeats=10,
            random_state=42,
            scoring='accuracy',
            n_jobs=-1  # use every available core
        )
        return result.importances_mean
    except Exception as e:
        print(f"⚠️ No se pudo calcular permutation_importance para {name} ({method}, k={k}): {e}")
        return None


def _save_importance_plot(selected_features, importances, display_target, name, method, k, importance_dir):
    """Save a bar plot of the 20 most important features as a PNG file."""
    imp_df = pd.DataFrame({
        'Feature': selected_features,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False).head(20)

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.barplot(data=imp_df, x='Importance', y='Feature', ax=ax)
        ax.set_title(f'Feature selection — {display_target} — {name} ({method}, k={k})')
        fig.tight_layout()

        fname = f"{display_target}_{name}_{method}_k{k}_importance.png"
        fig.savefig(os.path.join(importance_dir, fname))
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)


def run_pipeline_feature_selection(df, target_col, feature_counts=[10], methods=['all', 'f_classif'], models=None, output_dir='./plots'):
    """Evaluate every (selection method, k, model) combination on one target.

    The data is split once (70/30, stratified, ``random_state=42``). For each
    combination a pipeline of numeric preprocessing, feature selection and
    classifier is fitted on the training split and scored on the test split
    with ``get_metrics``. A feature-importance plot is written to
    ``<output_dir>/feature_importance`` whenever importances are available.

    Only numeric columns of ``df`` are used as features. A failure in one
    combination is printed and skipped so the remaining ones still run.

    Args:
        df: DataFrame holding the features and the target column.
        target_col: Name of the target column.
        feature_counts: Values of ``k`` handed to ``build_selector``
            (the default list is only read, never mutated).
        methods: Feature-selection method names understood by
            ``build_selector`` (the default list is only read).
        models: Optional mapping name -> unfitted classifier; a default set
            is used when ``None``.
        output_dir: Base directory for the generated plots.

    Returns:
        DataFrame with one row per successfully evaluated combination: the
        metrics plus ``Model``, ``Feature_Selection`` and ``Num_Features``.

    Raises:
        ValueError: If ``methods`` contains a name ``build_selector`` rejects.
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]
    numeric_features = X.select_dtypes(include=np.number).columns.tolist()
    results = []

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

    if models is None:
        models = _default_models(target_col)

    # Keep the display label separate so ``target_col`` itself is never
    # rebound while the loops below are still running.
    display_target = _TARGET_DISPLAY_NAMES.get(target_col, target_col)

    importance_dir = os.path.join(output_dir, 'feature_importance')
    os.makedirs(importance_dir, exist_ok=True)

    for method in methods:
        for k in feature_counts:
            preprocessor = _build_preprocessor(method, k, numeric_features)

            for name, model in models.items():
                pipe = Pipeline([
                    ('preprocessor', preprocessor),
                    ('classifier', model)
                ])
                try:
                    pipe.fit(X_train, y_train)
                    y_pred = pipe.predict(X_test)
                    y_proba = pipe.predict_proba(X_test)
                    # Binary problems: roc_auc_score wants the positive-class column only.
                    y_proba_input = y_proba[:, 1] if y_proba.shape[1] == 2 else y_proba

                    metrics = get_metrics(y_test, y_pred, y_proba_input, target=target_col)
                    metrics.update({
                        'Model': name,
                        'Feature_Selection': method,
                        'Num_Features': k
                    })
                    results.append(metrics)

                    selected_features = _selected_feature_names(pipe, numeric_features)
                    importances = _model_importances(pipe, X_test, y_test, name, method, k)

                    # Plot only when each importance maps onto exactly one feature name.
                    if importances is not None and selected_features is not None and len(importances) == len(selected_features):
                        _save_importance_plot(
                            selected_features, importances, display_target,
                            name, method, k, importance_dir
                        )

                except Exception as e:
                    print(f"⚠️ Error con modelo {name}, método {method}, k={k}: {e}")

    return pd.DataFrame(results)

def save_boxplots(results_df, target_name, metrics=None, by_fields=["Model", "Feature_Selection"], output_dir="./plots"):
    """Write one boxplot PNG per (metric, grouping field) combination.

    Args:
        results_df: DataFrame containing the metric columns and the
            grouping columns.
        target_name: Label used in plot titles and file names.
        metrics: Metric columns to plot; a default list is used when ``None``.
        by_fields: Columns used as the x-axis grouping.
        output_dir: Directory receiving the images (created if missing).
    """
    os.makedirs(output_dir, exist_ok=True)

    metric_names = metrics
    if metric_names is None:
        metric_names = ["F1_macro", 'F1_weighted', 'Recall_macro', 'Recall_weighted', 'AUC_macro', 'AUC_weighted', "Kappa"]

    def _render(metric, group_field):
        # Draw, decorate and save a single boxplot, then release the figure.
        plt.figure(figsize=(12, 6))
        sns.boxplot(data=results_df, x=group_field, y=metric)
        plt.title(f"{metric} por {group_field} — {target_name}")
        plt.xticks(rotation=45)
        plt.grid(True, axis='y')
        plt.tight_layout()
        plt.savefig(f"{output_dir}/{target_name}_{metric}_by_{group_field}.png")
        plt.close()

    # Same iteration order as nested loops: metrics outer, grouping fields inner.
    for metric, group_field in ((m, b) for m in metric_names for b in by_fields):
        _render(metric, group_field)

def run_all_experiments(df):
    """Run the feature-selection benchmark for every target column.

    For each dataset produced by ``prepare_filtered_datasets`` the first
    column is dropped (treated as the sample ID), the benchmark is run, and
    the results are written under ``./plots`` as CSV files and boxplots.

    Args:
        df: Raw DataFrame handed to ``prepare_filtered_datasets``.

    Returns:
        dict mapping each target name to its results DataFrame, or to
        ``None`` when processing that target raised an exception.
    """
    per_target_frames = prepare_filtered_datasets(df)
    selection_methods = ['all', 'f_classif', 'chi2', 'rfe', 'lasso', 'ridge']
    k_values = [5, 10, 15, 20]

    outcome = {}
    for target, frame in per_target_frames.items():
        # Remove sample ID (assumed to be the first column -- TODO confirm).
        id_column = frame.columns[0]
        frame = frame.drop(id_column, axis=1)
        print(frame.columns)
        n_rows, n_cols = frame.shape
        print(f"\n▶️ Ejecutando para target: {target.upper()} — {n_rows} muestras, {n_cols-1} features")

        plot_dir = f"./plots/{target}"
        try:
            table = run_pipeline_feature_selection(
                df=frame,
                target_col=target,
                feature_counts=k_values,
                methods=selection_methods,
                output_dir=plot_dir
            )
            outcome[target] = table
            table.to_csv(f"./plots/results_{target}.csv", index=False)

            # Separate export restricted to runs with Kappa above 0.5.
            high_kappa = table[table['Kappa'] > 0.5]
            high_kappa.to_csv(f"./plots/results_{target}_filter_kappa.csv", index=False)

            save_boxplots(table, target_name=target, output_dir=plot_dir)
        except Exception as e:
            print(f"⚠️ Error ejecutando target {target}: {e}")
            outcome[target] = None

    return outcome

In [None]:
# Load the dataset from a semicolon-separated CSV file.
df = pd.read_csv("./datos/combinacion.csv",sep=';')

In [None]:
# Run the benchmark for every target; the result is a dict keyed by target name.
all_results = run_all_experiments(df)