In [2]:
import torch
import pandas as pd
import numpy as np
from captum.attr import IntegratedGradients
from convolutional import ConvRegressorModel, ConvClassifierModel, train_test_val_split, ConvRegressor, ConvClassifier
from label_encoder_unk import LabelEncoderWithUnknown
from sklearn.preprocessing import StandardScaler


def interpret_model(model_path: str,
                    df: pd.DataFrame,
                    target_col: str,
                    task: str,
                    embedding_dim: int,
                    conv_filters: list,
                    dense_units: int,
                    sample_idx: int = 0):
    """
    Print per-feature importances for one preprocessed row of ``df``.

    Numeric features are attributed with Integrated Gradients against an
    all-zeros baseline (the column mean once the data is standard-scaled with
    the default ``StandardScaler``). Categorical features are scored by
    ablation: each encoded column is set to code 0 in turn and the change in
    the model output (original minus ablated) is reported.

    Args:
        model_path: Path to the saved ``state_dict`` of the model to explain.
        df: Raw dataset. Rows containing any NaN are dropped before anything
            else happens.
        target_col: Name of the target column; excluded from the features.
        task: ``'regression'`` builds a ``ConvRegressorModel``; any other value
            builds a ``ConvClassifierModel`` and attributes output index 0.
        embedding_dim: Embedding size passed to the model constructor.
        conv_filters: Convolution filter sizes passed to the model constructor.
        dense_units: Dense layer width passed to the model constructor.
        sample_idx: Positional index of the row to explain, counted in the
            frame that remains AFTER ``dropna()`` (not in the original ``df``).

    Returns:
        None. The report is printed to stdout.

    Notes:
        * Encoders and the scaler are re-fit here on ``df`` rather than loaded
          from training. NOTE(review): this presumably has to reproduce the
          training-time encoding for the loaded weights to be meaningful --
          confirm against the training code.
        * A categorical feature whose encoded value is already 0 for the chosen
          row is ablated to itself, so its reported delta is 0 regardless of
          how much the model relies on it.
    """
    # The "other" task's target is also removed so that it never leaks in as a feature.
    other = 'eco' if task == 'regression' else 'punt_matematicas'
    exclude = {target_col, other}

    cat_cols = [c for c in df.select_dtypes(include=['object', 'category']).columns
                if c not in exclude]
    num_cols = [c for c in df.select_dtypes(include=[np.number]).columns
                if c not in exclude]

    # Work on a copy so the caller's frame is never modified.
    df_proc = df.dropna().copy()
    for c in cat_cols:
        le = LabelEncoderWithUnknown()
        df_proc[c] = le.fit_transform(df_proc[c].astype(str))
    scaler = StandardScaler()
    df_proc[num_cols] = scaler.fit_transform(df_proc[num_cols])

    # One embedding table per categorical column, sized by its number of distinct codes.
    cat_dims = [df_proc[c].nunique() for c in cat_cols]
    num_feats = len(num_cols)
    if task == 'regression':
        model = ConvRegressorModel(cat_dims, num_feats,
                                   embedding_dim, conv_filters, dense_units)
        tgt = None
    else:
        model = ConvClassifierModel(cat_dims, num_feats,
                                    embedding_dim, conv_filters, dense_units)
        tgt = 0
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model.eval()

    # Double brackets keep a one-row DataFrame, so both tensors have a batch dimension of 1.
    sample = df_proc.iloc[[sample_idx]]
    cat_tensor = torch.tensor(sample[cat_cols].values, dtype=torch.long)
    num_tensor = torch.tensor(sample[num_cols].values, dtype=torch.float32)

    def forward_fn(num_in, cat_in):
        # Captum differentiates w.r.t. the first argument, so the numeric input goes
        # first here even though the model itself takes (categorical, numeric).
        return model(cat_in, num_in)

    ig = IntegratedGradients(forward_fn)
    baseline_num = torch.zeros_like(num_tensor)
    # squeeze(0) removes only the batch dimension; a bare squeeze() would also
    # collapse a single numeric feature to a 0-d array and break the zip() below.
    attributions_num = ig.attribute(
        inputs=num_tensor,
        baselines=baseline_num,
        additional_forward_args=(cat_tensor,),
        target=tgt
    ).squeeze(0).detach().numpy()

    # Ablation needs forward passes only, so skip autograd bookkeeping.
    cat_importances = {}
    with torch.no_grad():
        orig_pred = model(cat_tensor, num_tensor).item()
        for idx, name in enumerate(cat_cols):
            cat_baseline = cat_tensor.clone()
            cat_baseline[:, idx] = 0
            pred_baseline = model(cat_baseline, num_tensor).item()
            cat_importances[name] = orig_pred - pred_baseline

    print(f"\n— Importancias muestra idx={sample_idx} ({task}) —\n")

    print("Numéricas (Integrated Gradients):")
    for name, val in zip(num_cols, attributions_num):
        print(f"  {name}: {val:.4f}")

    print("\nCategóricas (Feature Ablation Δpredicción):")
    # Largest absolute effect first.
    for name, imp in sorted(cat_importances.items(), key=lambda x: abs(x[1]), reverse=True):
        print(f"  {name}: Δpred = {imp:.4f}")



In [3]:
# Load the dataset of selected variables; later cells read it through the global `df`.
df = pd.read_csv("./data/datos_variables_seleccionadas.csv")

In [4]:
# Explain row 0 with each saved model. Only the checkpoint, target, task and
# embedding size differ between the two runs; the rest is shared.
interpretation_runs = (
    dict(
        model_path="./best_models/regression/model_regression_20250522_180347.pt",
        target_col="punt_matematicas",
        task="regression",
        embedding_dim=4,
    ),
    dict(
        model_path="./best_models/classification/model_classification_20250522_100219.pt",
        target_col="eco",
        task="classification",
        embedding_dim=8,
    ),
)

for run_kwargs in interpretation_runs:
    interpret_model(
        df=df,
        conv_filters=[32, 64],
        dense_units=64,
        sample_idx=0,
        **run_kwargs,
    )



— Importancias muestra idx=0 (regression) —

Numéricas (Integrated Gradients):
  periodo: -0.0578
  cole_cod_dane_establecimiento: -0.0181
  cole_cod_depto_ubicacion: 0.8829
  cole_codigo_icfes: 0.8491
  fami_estratovivienda: -1.1398
  fami_tieneautomovil: -0.8658
  fami_tienelavadora: 0.1326
  punt_ingles: -0.0351
  fami_cuartoshogar_int: -0.0393
  edad: 0.4375
  fami_nivel_tecnologia: 0.2796

Categóricas (Feature Ablation Δpredicción):
  cole_mcpio_ubicacion: Δpred = 3.3308
  estu_tipodocumento: Δpred = 3.0369
  desemp_ingles: Δpred = 1.9709
  estu_mcpio_reside: Δpred = 1.4625
  cole_naturaleza: Δpred = 0.9739
  fami_educacionmadre: Δpred = -0.7295
  fami_educacionpadre: Δpred = -0.7123
  fami_cuartoshogar: Δpred = -0.6007
  estu_mcpio_presentacion: Δpred = 0.5738
  cole_jornada: Δpred = -0.4479
  cole_area_ubicacion: Δpred = 0.2719
  estu_depto_presentacion: Δpred = -0.2356
  cole_depto_ubicacion: Δpred = 0.1357
  cole_caracter: Δpred = 0.1039
  estu_depto_reside: Δpred = -0.0126
 

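# Importancia Regresión

Nota: los valores de las variables numéricas provienen de Integrated Gradients y los de las categóricas de ablación (Δpredicción); al ser métodos distintos, el orden conjunto de la tabla es solo orientativo.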

| Variable                         | Tipo       | Importancia |
| -------------------------------- | ---------- | ----------: |
| cole\_mcpio\_ubicacion           | Categórica |      3.3308 |
| estu\_tipodocumento              | Categórica |      3.0369 |
| desemp\_ingles                   | Categórica |      1.9709 |
| estu\_mcpio\_reside              | Categórica |      1.4625 |
| fami\_estratovivienda            | Numérica   |     -1.1398 |
| cole\_naturaleza                 | Categórica |      0.9739 |
| cole\_cod\_depto\_ubicacion      | Numérica   |      0.8829 |
| fami\_tieneautomovil             | Numérica   |     -0.8658 |
| cole\_codigo\_icfes              | Numérica   |      0.8491 |
| fami\_educacionmadre             | Categórica |     -0.7295 |
| fami\_educacionpadre             | Categórica |     -0.7123 |
| fami\_cuartoshogar               | Categórica |     -0.6007 |
| estu\_mcpio\_presentacion        | Categórica |      0.5738 |
| cole\_jornada                    | Categórica |     -0.4479 |
| edad                             | Numérica   |      0.4375 |
| fami\_nivel\_tecnologia          | Numérica   |      0.2796 |
| cole\_area\_ubicacion            | Categórica |      0.2719 |
| estu\_depto\_presentacion        | Categórica |     -0.2356 |
| cole\_depto\_ubicacion           | Categórica |      0.1357 |
| fami\_tienelavadora              | Numérica   |      0.1326 |
| cole\_caracter                   | Categórica |      0.1039 |
| periodo                          | Numérica   |     -0.0578 |
| fami\_cuartoshogar\_int          | Numérica   |     -0.0393 |
| punt\_ingles                     | Numérica   |     -0.0351 |
| cole\_cod\_dane\_establecimiento | Numérica   |     -0.0181 |
| estu\_depto\_reside              | Categórica |     -0.0126 |
| estu\_genero                     | Categórica |      0.0000 |


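# Importancia Clasificación

Nota: los valores de las variables numéricas provienen de Integrated Gradients y los de las categóricas de ablación (Δpredicción); al ser métodos distintos, el orden conjunto de la tabla es solo orientativo.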

| Variable                         | Tipo       | Importancia |
| -------------------------------- | ---------- | ----------: |
| estu\_tipodocumento              | Categórica |      0.1063 |
| desemp\_ingles                   | Categórica |      0.0790 |
| cole\_jornada                    | Categórica |     -0.0279 |
| fami\_tienelavadora              | Numérica   |      0.0268 |
| cole\_mcpio\_ubicacion           | Categórica |      0.0252 |
| cole\_area\_ubicacion            | Categórica |      0.0224 |
| fami\_estratovivienda            | Numérica   |     -0.0215 |
| cole\_caracter                   | Categórica |      0.0176 |
| fami\_educacionpadre             | Categórica |     -0.0168 |
| fami\_cuartoshogar               | Categórica |     -0.0152 |
| fami\_cuartoshogar\_int          | Numérica   |      0.0149 |
| cole\_cod\_depto\_ubicacion      | Numérica   |      0.0142 |
| cole\_codigo\_icfes              | Numérica   |      0.0133 |
| cole\_naturaleza                 | Categórica |      0.0128 |
| estu\_mcpio\_presentacion        | Categórica |      0.0073 |
| estu\_depto\_reside              | Categórica |      0.0055 |
| edad                             | Numérica   |      0.0048 |
| estu\_mcpio\_reside              | Categórica |     -0.0042 |
| estu\_depto\_presentacion        | Categórica |      0.0021 |
| fami\_educacionmadre             | Categórica |     -0.0016 |
| cole\_cod\_dane\_establecimiento | Numérica   |      0.0011 |
| cole\_depto\_ubicacion           | Categórica |      0.0011 |
| fami\_nivel\_tecnologia          | Numérica   |      0.0009 |
| fami\_tieneautomovil             | Numérica   |     -0.0005 |
| punt\_ingles                     | Numérica   |     -0.0005 |
| periodo                          | Numérica   |     -0.0002 |
| estu\_genero                     | Categórica |      0.0000 |


# Guardar Predicciones

In [5]:
# Regression split: target is `punt_matematicas`.
X_train_r, y_train_r, X_val_r, y_val_r, X_test_r, y_test_r = train_test_val_split(
    df, target_cols=['punt_matematicas'], test_size=0.2, val_frac=0.1
)
# Remove the other task's target so it is not used as a feature. The frames are
# rebound instead of using drop(inplace=True), so the objects returned by the
# split helper are never mutated; errors='ignore' keeps the original
# "drop only if present" behaviour.
X_train_r, X_val_r, X_test_r = (
    X.drop(columns=['eco'], errors='ignore') for X in (X_train_r, X_val_r, X_test_r)
)

# Classification split: target is `eco`.
X_train_c, y_train_c, X_val_c, y_val_c, X_test_c, y_test_c = train_test_val_split(
    df, target_cols=['eco'], test_size=0.2, val_frac=0.1
)
X_train_c, X_val_c, X_test_c = (
    X.drop(columns=['punt_matematicas'], errors='ignore') for X in (X_train_c, X_val_c, X_test_c)
)


In [6]:

# Recreate the regression wrapper with the same hyperparameters used for this
# checkpoint elsewhere in the notebook (embedding_dim=4, conv_filters=[32, 64], dense_units=64).
reg = ConvRegressor(embedding_dim=4, conv_filters=[32, 64], dense_units=64)
# build_model receives features and target together in one frame.
df_train_r = pd.concat([X_train_r, y_train_r], axis=1)
# NOTE(review): build_model presumably creates reg.model and fits the encoders/scaler
# that later cells read (reg.encoders, reg.scaler, reg.preprocessed_df) -- confirm in convolutional.py.
reg.build_model(df_train_r, target_col='punt_matematicas')
# Load the saved weights. As the last expression of the cell, the value returned by
# load_state_dict is what the notebook displays underneath.
reg.model.load_state_dict(torch.load(
    "./best_models/regression/model_regression_20250522_180347.pt",
    map_location=reg.device
))

<All keys matched successfully>

In [7]:
# Move the model to the wrapper's device and switch it to evaluation mode before predicting.
reg.model.to(reg.device).eval()
# flatten() turns the prediction array into 1-D so it can be stored as a single column later.
preds_r = reg.predict(X_test_r).flatten()

In [8]:
# Display the regression predictions (NumPy abbreviates the repr of long arrays).
preds_r

array([46.62287 , 48.274952, 58.614334, ..., 46.79555 , 55.85653 ,
       59.16332 ], dtype=float32)

In [9]:
# Recreate the classification wrapper with the same hyperparameters used for this
# checkpoint elsewhere in the notebook (embedding_dim=8, conv_filters=[32, 64], dense_units=64).
clf = ConvClassifier(embedding_dim=8, conv_filters=[32, 64], dense_units=64)
# build_model receives features and target together in one frame.
df_train_c = pd.concat([X_train_c, y_train_c], axis=1)
# NOTE(review): build_model presumably creates clf.model and fits the encoders/scaler
# that later cells read (clf.encoders, clf.scaler, clf.preprocessed_df) -- confirm in convolutional.py.
clf.build_model(df_train_c, target_col='eco')
# Load the saved weights. As the last expression of the cell, the value returned by
# load_state_dict is what the notebook displays underneath.
clf.model.load_state_dict(torch.load(
    "./best_models/classification/model_classification_20250522_100219.pt",
    map_location=clf.device
))

<All keys matched successfully>

In [10]:
# Move the model to the wrapper's device and switch it to evaluation mode before predicting.
clf.model.to(clf.device).eval()
# NOTE(review): the 0.5 threshold below treats predict() output as a probability
# (i.e. a sigmoid is applied inside the model or predict) -- confirm in convolutional.py.
preds_proba_c = clf.predict(X_test_c).flatten()
# Hard 0/1 labels: 1 when the predicted score is at least 0.5.
preds_label_c = (preds_proba_c >= 0.5).astype(int)



In [11]:
# The regression and classification predictions come from two independent calls to
# train_test_val_split and are attached below purely by position. If the two test
# sets do not contain the same rows in the same order, the classification columns
# would silently be written next to the wrong rows, so fail loudly instead.
if not X_test_r.index.equals(X_test_c.index):
    raise ValueError(
        "Regression and classification test sets are not aligned "
        "(X_test_r.index != X_test_c.index); predictions cannot be merged by position."
    )

# Recover the full original rows (including both targets) for the test set.
df_test_full = df.loc[X_test_r.index].reset_index(drop=True)

df_test_full['punt_matematicas_pred'] = preds_r
df_test_full['eco_pred_proba']        = preds_proba_c
df_test_full['eco_pred_label']        = preds_label_c

df_test_full.to_csv("data/predicciones_completas.csv", index=False)


In [None]:
import joblib
import os
from datetime import datetime

# Persist everything needed to preprocess new samples for both models:
# fitted encoders, fitted scaler, column lists and the per-column category counts.
os.makedirs("./encoders", exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Number of distinct encoded values per categorical column, in column order.
reg_cat_dims = [reg.preprocessed_df[name].nunique() for name in reg.cat_columns]
clf_cat_dims = [clf.preprocessed_df[name].nunique() for name in clf.cat_columns]

reg_artifacts = {
    'reg_encoders': reg.encoders,
    'reg_scaler': reg.scaler,
    'reg_cat_columns': reg.cat_columns,
    'reg_num_columns': reg.num_columns,
    'reg_cat_dims': reg_cat_dims,
}
clf_artifacts = {
    'clf_encoders': clf.encoders,
    'clf_scaler': clf.scaler,
    'clf_cat_columns': clf.cat_columns,
    'clf_num_columns': clf.num_columns,
    'clf_cat_dims': clf_cat_dims,
}
# Regression entries first, then classification, in one flat dictionary.
encoders_data = {**reg_artifacts, **clf_artifacts}

encoders_path = f"./encoders/all_encoders_{timestamp}.pkl"
joblib.dump(encoders_data, encoders_path)
print(f"Encoders guardados: all_encoders_{timestamp}.pkl")

Encoders guardados: all_encoders_20250525_164501.pkl


In [None]:
import pandas as pd
import torch
import joblib

def _predict_with_saved_model(features, encoders_data, prefix, model_cls,
                              model_path, embedding_dim, device):
    """Preprocess `features` with the saved `<prefix>_*` artifacts and return the model's scalar output."""
    cat_columns = encoders_data[f'{prefix}_cat_columns']
    num_columns = encoders_data[f'{prefix}_num_columns']
    encoders = encoders_data[f'{prefix}_encoders']
    scaler = encoders_data[f'{prefix}_scaler']

    # Copy so the regression and classification passes never see each other's encoding.
    frame = features.copy()
    for col in cat_columns:
        if col in frame.columns:
            frame[col] = encoders[col].transform(frame[col].astype(str))
    frame[num_columns] = scaler.transform(frame[num_columns])

    cat_tensor = torch.tensor(frame[cat_columns].values, dtype=torch.long).to(device)
    num_tensor = torch.tensor(frame[num_columns].values, dtype=torch.float32).to(device)

    # Embedding sizes come from the saved dims: a single row cannot reveal how many
    # categories each column had when the encoders were fitted.
    # conv_filters / dense_units are the values used for these checkpoints elsewhere
    # in the notebook.
    model = model_cls(
        cat_dims=encoders_data[f'{prefix}_cat_dims'],
        num_features=len(num_columns),
        embedding_dim=embedding_dim,
        conv_filters=[32, 64],
        dense_units=64
    ).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    with torch.no_grad():
        # Output is indexed as (batch, 1); take the single value of the single row.
        return float(model(cat_tensor, num_tensor).cpu().numpy()[0][0])


def predict_new_sample(sample_data, encoders_file,
                       reg_model_path="best_models/regression/model_regression_20250522_180347.pt",
                       clf_model_path="best_models/classification/model_classification_20250522_100219.pt"):
    """
    Predict `punt_matematicas` and `eco` for one new sample.

    Args:
        sample_data: One CSV-formatted record (a single line) with the 29 fields
            listed in `columns` below, in that order. Fields containing commas
            must be double-quoted. The two target fields (`punt_matematicas`,
            `eco`) must be present positionally but their values are ignored.
        encoders_file: Path to the joblib file holding the fitted encoders,
            scalers, column lists and category dims for both models.
        reg_model_path: Path to the regression model `state_dict`.
        clf_model_path: Path to the classification model `state_dict`.

    Returns:
        dict with keys `punt_matematicas_pred` (float), `eco_pred_proba` (float)
        and `eco_pred_label` (int, 1 when the probability is >= 0.5).

    Raises:
        ValueError: if `sample_data` is not exactly one record with the expected
            number of fields, or a numeric field cannot be parsed.

    Both models are rebuilt and their weights reloaded from disk on every call.
    """
    import csv  # local import: only this function needs it

    # SECURITY NOTE: joblib.load unpickles the file and can execute arbitrary code;
    # only load encoder files produced by this project.
    encoders_data = joblib.load(encoders_file)

    columns = ['periodo','estu_tipodocumento','cole_area_ubicacion','cole_caracter',
               'cole_cod_dane_establecimiento','cole_cod_depto_ubicacion','cole_codigo_icfes',
               'cole_depto_ubicacion','cole_jornada','cole_mcpio_ubicacion','cole_naturaleza',
               'estu_depto_presentacion','estu_depto_reside','estu_genero','estu_mcpio_presentacion',
               'estu_mcpio_reside','fami_cuartoshogar','fami_educacionmadre','fami_educacionpadre',
               'fami_estratovivienda','fami_tieneautomovil','fami_tienelavadora','desemp_ingles',
               'punt_ingles','punt_matematicas','eco','fami_cuartoshogar_int','edad','fami_nivel_tecnologia']

    # csv.reader honours quoted fields, so a value containing a comma is not split
    # into two columns the way a plain str.split(',') would split it.
    rows = list(csv.reader(sample_data.strip().splitlines()))
    if len(rows) != 1 or len(rows[0]) != len(columns):
        found = len(rows[0]) if len(rows) == 1 else 0
        raise ValueError(
            f"Expected exactly one record with {len(columns)} comma-separated fields, "
            f"got {len(rows)} record(s) and {found} field(s)."
        )
    df = pd.DataFrame([rows[0]], columns=columns)

    # Convert exactly the columns the saved scalers treat as numeric. Categorical
    # columns stay as raw strings for the encoders, and the target columns are left
    # untouched because they are dropped below.
    numeric_cols = set(encoders_data['reg_num_columns']) | set(encoders_data['clf_num_columns'])
    for col in columns:
        if col in numeric_cols:
            df[col] = pd.to_numeric(df[col])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Neither model may see either target as a feature.
    features = df.drop(columns=['eco', 'punt_matematicas'])

    # === REGRESSION ===
    reg_prediction = _predict_with_saved_model(
        features, encoders_data, 'reg', ConvRegressorModel, reg_model_path,
        embedding_dim=4, device=device
    )

    # === CLASSIFICATION ===
    clf_prediction_proba = _predict_with_saved_model(
        features, encoders_data, 'clf', ConvClassifierModel, clf_model_path,
        embedding_dim=8, device=device
    )
    clf_prediction_label = int(clf_prediction_proba >= 0.5)

    return {
        'punt_matematicas_pred': reg_prediction,
        'eco_pred_proba': clf_prediction_proba,
        'eco_pred_label': clf_prediction_label
    }

# One example record: 29 comma-separated fields in the column order that
# predict_new_sample expects (both target fields included).
sample = "20102,TI,urbano,TÉCNICO,186219000070,86,29025,OTROS,MAÑANA,Colón,OFICIAL,OTROS,OTROS,F,SIBUNDOY,COLÓN,5,SECUNDARIA (BACHILLERATO) INCOMPLETA,PRIMARIA COMPLETA,1,0,1,A1,49.66,48.17,0,5,31.0,1"

predictions = predict_new_sample(sample, "encoders/all_encoders_20250525_164501.pkl")
print(predictions)
# The user must enter the data in the format this function expects: all fields joined into a single comma-separated string.

{'punt_matematicas_pred': 51.94544982910156, 'eco_pred_proba': 0.6068722605705261, 'eco_pred_label': 1}
