In [None]:
#Cargar librerias
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.svm import SVC
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Load the SPSS dataset into the working DataFrame
df = pd.read_spss(path='DB.sav')

In [None]:
# Variables to convert to categorical dtype; each one also gets a numeric `<name>_CAT` companion
cols = ["TIPO_HIST","FENOTIPO","TvsS","NECR","INVA","GANGLIOS","RECIDIVA","SIT_ACTUAL"]

for col in cols:
    as_category = df[col].astype("category")
    df[col] = as_category
    # pandas encodes a missing category as code -1; turn it back into NaN
    df[col + "_CAT"] = as_category.cat.codes.replace(-1, np.nan)

## Funciones

In [None]:
def crear_dfs(df, columna):
    """
    Build the three column subsets of ``df`` used to train the models.

    :param df: Input DataFrame; it must contain every column listed below.
    :param columna: Name of the target column.
    :return: Tuple of three DataFrames:
        (1) the fixed full variable list,
        (2) the cytokine columns plus the target,
        (3) the volcano-plot-selected cytokine columns plus the target.

    NOTE(review): the first frame uses a fixed column list that does not depend on
    ``columna``, so it only contains the target when the target is part of that list,
    and the other outcome columns stay in it as features -- confirm this is intended.
    """
    # Full variable list. Individual cytokine names have been anonymized as
    # 'cytokines' to preserve confidentiality.
    all_columns = [
        'EDAD', 'TAMAÑO', 'GDIF2CAT', 'LUM_NO_LUM', 'KI67', 'TILS', 'SLE', 'SG',
        'cytokines',
        'TIPO_HIST_CAT', 'FENOTIPO_CAT', 'TvsS_CAT', 'NECR_CAT', 'INVA_CAT',
        'GANGLIOS_CAT', 'RECIDIVA_CAT', 'SIT_ACTUAL_CAT',
    ]

    # Cytokines only (anonymized as 'cytokines'), followed by the target
    cytokine_columns = ['cytokines', columna]

    # Cytokines selected from the volcano plot (anonymized as
    # 'cytokines_volcano_plot'), followed by the target
    selected_columns = ['cytokines_volcano_plot', columna]

    return tuple(
        df[subset]
        for subset in (all_columns, cytokine_columns, selected_columns)
    )

# Binary-target training (used for TvsS_CAT and LUM_NO_LUM)
def _binary_confusion_metrics(conf_matrix):
    """
    Compute diagnostic metrics from a 2x2 confusion matrix.

    :param conf_matrix: 2x2 matrix with true classes on rows and predicted classes on
        columns, i.e. [[TN, FP], [FN, TP]] with the second class taken as positive.
    :return: Dict with the keys accuracy, ppv, npv, sensibilidad, especificidad and riesgo.
        A ratio whose denominator is zero is reported as 0, except ``riesgo`` (see code).
    """
    tn, fp, fn, tp = (int(v) for v in np.asarray(conf_matrix).ravel())

    def ratio(num, den):
        return num / den if den > 0 else 0

    sensibilidad = ratio(tp, tp + fn)
    false_positive_rate = ratio(fp, fp + tn)

    # riesgo = sensitivity / false-positive rate (positive likelihood ratio)
    if (tp + fn) == 0 or (fp + tn) == 0:
        riesgo = 0  # one class is absent from the test set
    elif false_positive_rate == 0:
        # No false positives: the ratio is unbounded (undefined if there are no true positives either)
        riesgo = float('inf') if sensibilidad > 0 else float('nan')
    else:
        riesgo = sensibilidad / false_positive_rate

    return {
        'accuracy': ratio(tp + tn, tp + tn + fp + fn),
        'ppv': ratio(tp, tp + fp),
        'npv': ratio(tn, tn + fn),
        'sensibilidad': sensibilidad,
        'especificidad': ratio(tn, tn + fp),
        'riesgo': riesgo,
    }


def train_model_binary(df, columna):
    """
    Train and evaluate RandomForest, XGBoost and LightGBM classifiers on a binary target.

    For each model this prints the classification report, the ROC-AUC and the metrics
    derived from the confusion matrix, and plots the confusion matrix and the top-20
    feature importances.

    :param df: Input DataFrame holding the features and the target column.
    :param columna: Name of the target column.
    :return: Dict mapping model name to the fitted estimator.
    """
    # Rows without a target value cannot be used for training or evaluation
    df = df.dropna(subset=[columna])

    # Split features and target
    X = df.drop(columns=[columna])
    y = df[columna]

    # 80/20 train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print(f'Tamaño entrenamiento {len(X_train)}')
    print(f'Tamaño prueba {len(X_test)}')

    # Model hyperparameters
    rf_model = RandomForestClassifier(n_estimators=500, random_state=42)
    xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
    lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)

    models = {'Random Forest': rf_model, 'XGBoost': xgb_model, 'LightGBM': lgb_model}

    # Sorted class labels: the order scikit-learn uses for confusion-matrix rows/columns.
    # Passing them explicitly keeps the matrix 2x2 even if the test split misses a class.
    class_labels = list(np.unique(y))

    for model_name, model in models.items():
        print(f"Entrenando {model_name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Model evaluation
        print(f"{model_name} - Classification Report:\n", classification_report(y_test, y_pred))
        try:
            # ROC-AUC needs a continuous score; hard class predictions reduce the
            # ROC curve to a single operating point.
            y_score = model.predict_proba(X_test)[:, 1]
            print(f"{model_name} - AUC-ROC Score:", roc_auc_score(y_test, y_score))
        except ValueError as err:
            # e.g. only one class present in the test split
            print(f"{model_name} - AUC-ROC Score: no calculable ({err})")

        # Confusion matrix
        conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
        plt.title(f'{model_name} - Matriz de Confusión')
        plt.show()

        # Additional metrics, only meaningful for a 2x2 matrix
        if conf_matrix.shape == (2, 2):
            metrics = _binary_confusion_metrics(conf_matrix)

            # Print KPIs
            print(f"{model_name} - Accuracy: {metrics['accuracy']:.4f}")
            print(f"{model_name} - PPV (Positive Predictive Value): {metrics['ppv']:.4f}")
            print(f"{model_name} - NPV (Negative Predictive Value): {metrics['npv']:.4f}")
            print(f"{model_name} - Sensibilidad (Recall): {metrics['sensibilidad']:.4f}")
            print(f"{model_name} - Especificidad: {metrics['especificidad']:.4f}")
            # The value is sensitivity / (1 - specificity), i.e. LR+, not an odds ratio
            print(f"{model_name} - Riesgo (razón de verosimilitud positiva, LR+): {metrics['riesgo']:.4f}")

        # Feature importances
        feature_importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)

        # Plot the 20 most important variables
        plt.figure(figsize=(10, 6))
        sns.barplot(x=feature_importances[:20], y=feature_importances.index[:20], hue=feature_importances.index[:20], palette='viridis', dodge=False, legend=False)
        plt.title(f"{model_name} - Importancia de Variables (Top 20)")
        plt.xlabel("Importancia")
        plt.ylabel("Variables (Proteínas)")
        plt.show()

    return models

# Multi-class target training (used for FENOTIPO_CAT)
def train_model_multi(df, columna):
    """
    Train and evaluate RandomForest, XGBoost and LightGBM classifiers on a multi-class target.

    For each model this prints the classification report and the ROC-AUC, and plots the
    confusion matrix and the top-20 feature importances.

    :param df: Input DataFrame holding the features and the target column.
    :param columna: Name of the (multi-class) target column.
    :return: None.
    """
    # Rows without a target value cannot be used for training or evaluation
    # (same handling as train_model_binary)
    df = df.dropna(subset=[columna])

    X = df.drop(columns=[columna])  # independent variables
    y = df[columna]                 # dependent variable

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Model definitions (hyperparameters); `use_label_encoder` is deprecated in XGBoost
    # and is not passed by train_model_binary either
    models = {
        "RandomForest": RandomForestClassifier(n_estimators=500, random_state=42),
        "XGBoost": xgb.XGBClassifier(eval_metric='logloss', random_state=42),
        "LightGBM": lgb.LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)
    }

    # Sorted class labels: fixes the row/column order of the confusion matrix and keeps
    # its shape stable even if the test split misses a class
    class_labels = list(np.unique(y))

    # Training, prediction and evaluation
    for name, model in models.items():
        print(f"\n--- {name} ---")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print("Classification Report:")
        print(classification_report(y_test, y_pred))

        try:
            proba = model.predict_proba(X_test)
            if proba.shape[1] == 2:
                # Binary target: score of the second (positive) class
                auc = roc_auc_score(y_test, proba[:, 1])
            else:
                # Multi-class target: one-vs-rest AUC over the full probability matrix,
                # whose columns follow model.classes_
                auc = roc_auc_score(y_test, proba, multi_class='ovr', labels=model.classes_)
            print(f"AUC: {auc:.4f}")
        except ValueError as err:
            # e.g. a class is absent from the test split; report it instead of hiding it
            print(f"AUC no calculable: {err}")

        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
        plt.title(f"{name} - Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

        # Feature importances (all three models expose feature_importances_)
        feature_importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
        feature_importances[:20].plot(kind='barh', title=f"{name} - Top 20 Features", figsize=(8, 6))
        plt.gca().invert_yaxis()
        plt.show()

In [None]:
# For each target build three frames: full variable list, cytokines only, volcano-plot selection
df_tvss, df_cito_tvss, df_tvss_seleccion = crear_dfs(df, columna='TvsS_CAT')
df_lum, df_cito_lum, df_lum_seleccion = crear_dfs(df, columna='LUM_NO_LUM')
df_fen, df_cito_fen, df_fen_seleccion = crear_dfs(df, columna='FENOTIPO_CAT')

# TvsS

In [None]:
# TvsS target, full variable list (first frame returned by crear_dfs)
train_model_binary(df=df_tvss, columna='TvsS_CAT')

In [None]:
# TvsS target, cytokine columns only (second frame returned by crear_dfs)
train_model_binary(df=df_cito_tvss, columna='TvsS_CAT')

In [None]:
# TvsS target, volcano-plot-selected cytokines (third frame returned by crear_dfs)
train_model_binary(df=df_tvss_seleccion, columna='TvsS_CAT')

# Luminal vs No Luminal

In [None]:
# LUM_NO_LUM target, full variable list (first frame returned by crear_dfs)
train_model_binary(df=df_lum, columna='LUM_NO_LUM')

In [None]:
# LUM_NO_LUM target, cytokine columns only (second frame returned by crear_dfs)
train_model_binary(df=df_cito_lum, columna='LUM_NO_LUM')

In [None]:
# LUM_NO_LUM target, volcano-plot-selected cytokines (third frame returned by crear_dfs)
train_model_binary(df=df_lum_seleccion, columna='LUM_NO_LUM')

# Fenotipo

In [None]:
# FENOTIPO_CAT target, full variable list (first frame returned by crear_dfs)
train_model_multi(df=df_fen, columna='FENOTIPO_CAT')

In [None]:
# FENOTIPO_CAT target, cytokine columns only (second frame returned by crear_dfs)
train_model_multi(df=df_cito_fen, columna='FENOTIPO_CAT')

In [None]:
# FENOTIPO_CAT target, volcano-plot-selected cytokines (third frame returned by crear_dfs)
train_model_multi(df=df_fen_seleccion, columna='FENOTIPO_CAT')