In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, accuracy_score
import torch

# Seed the global NumPy and PyTorch generators before anything else runs, so
# that any code drawing from them (sampling, splitting, weight init, shuffling)
# produces the same numbers after "Restart Kernel -> Run All".
# NOTE(review): some CUDA kernels are still non-deterministic; see the PyTorch
# reproducibility notes if bit-exact runs are required.
np.random.seed(42)
torch.manual_seed(42)

# Report whether PyTorch can see a CUDA device and, if so, which one.
print("PyTorch GPU disponible:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

# Importar los modelos y funciones
from convolutional import ConvRegressor, ConvClassifier, run_kfold_cv, train_test_val_split

# Tunable inputs for this cell, kept in one place so the log messages below
# are derived from the values actually used instead of repeating them by hand.
DATA_PATH = "data/datos_variables_seleccionadas.csv"
SAMPLE_FRAC = 0.05   # fraction of rows kept for quick experiments
SAMPLE_SEED = 42     # makes the row sample repeatable
TEST_SIZE = 0.2
VAL_FRAC = 0.1

# Load the full dataset.
print("\nCargando datos...")
df = pd.read_csv(DATA_PATH)
print(f"Dimensiones del DataFrame: {df.shape}")

# Keep a small random sample for fast experiments; the index is reset so the
# sample is numbered 0..n-1.
df_sample = df.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED).reset_index(drop=True)
print(f"Muestra para pruebas: {df_sample.shape} ({SAMPLE_FRAC:.0%} del total)")

# Target variables (one regression target, one classification target).
target_cols = ['punt_matematicas', 'eco']

# Split into train / validation / test.
# NOTE(review): judging by the recorded row counts, val_frac seems to be taken
# from the non-test remainder rather than from the whole sample — confirm in
# convolutional.train_test_val_split.
print("\nDividiendo datos en train/val/test...")
X_train, y_train, X_val, y_val, X_test, y_test = train_test_val_split(
    df_sample, target_cols, test_size=TEST_SIZE, val_frac=VAL_FRAC
)

print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

def _frames_for_target(features, targets, keep_target, other_target):
    """Build the model inputs for one split and one task.

    Drops `other_target` from `features` if it happens to be present, then
    appends the `keep_target` column taken from `targets`.

    Returns a pair: (trimmed feature frame, trimmed features + target column).
    """
    trimmed = features.drop(columns=[other_target], errors='ignore')
    combined = pd.concat([trimmed, targets[[keep_target]]], axis=1)
    return trimmed, combined


target_col_reg = 'punt_matematicas'
target_col_clf = 'eco'

# (features, targets) pairs in the fixed order train, validation, test.
_splits = ((X_train, y_train), (X_val, y_val), (X_test, y_test))

# === Regression set (target: punt_matematicas) ===
(
    (X_train_reg, df_train_reg),
    (X_val_reg, df_val_reg),
    (X_test_reg, df_test_reg),
) = [_frames_for_target(X, y, target_col_reg, target_col_clf) for X, y in _splits]

# === Classification set (target: eco) ===
(
    (X_train_clf, df_train_clf),
    (X_val_clf, df_val_clf),
    (X_test_clf, df_test_clf),
) = [_frames_for_target(X, y, target_col_clf, target_col_reg) for X, y in _splits]


PyTorch GPU disponible: True
GPU: NVIDIA GeForce RTX 4060 Laptop GPU

Cargando datos...
Dimensiones del DataFrame: (565658, 29)
Muestra para pruebas: (28283, 29) (5% del total)

Dividiendo datos en train/val/test...
Train: (20363, 27), Validation: (2263, 27), Test: (5657, 27)


In [2]:

# 1. Smoke-test the regression model
print("\n=== PRUEBA MODELO DE REGRESIÓN (punt_matematicas) ===")
target_col_reg = 'punt_matematicas'

# Architecture settings collected in one mapping so they can be read (and
# tweaked) at a glance before the model is instantiated.
reg_hparams = {
    'embedding_dim': 4,
    'conv_filters': [16, 32],
    'dense_units': 64,
}
reg_model = ConvRegressor(**reg_hparams)

print("Construyendo modelo...")
# A copy of the training frame is handed over, so df_train_reg itself is not
# the object the model works on. This call is the cell's last expression, so
# its return value is what the notebook displays.
reg_model.build_model(df_train_reg.copy(), target_col_reg)



=== PRUEBA MODELO DE REGRESIÓN (punt_matematicas) ===
Construyendo modelo...


ConvRegressorModel(
  (embeddings): ModuleList(
    (0): Embedding(4, 4)
    (1): Embedding(3, 4)
    (2): Embedding(5, 4)
    (3-4): 2 x Embedding(7, 4)
    (5): Embedding(933, 4)
    (6): Embedding(3, 4)
    (7-8): 2 x Embedding(7, 4)
    (9): Embedding(3, 4)
    (10): Embedding(421, 4)
    (11): Embedding(941, 4)
    (12-14): 3 x Embedding(7, 4)
    (15): Embedding(6, 4)
  )
  (conv_layers): ModuleList(
    (0): Conv1d(1, 16, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): Conv1d(16, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  )
  (bn_layers): ModuleList(
    (0): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (pool_layers): ModuleList(
    (0-1): 2 x MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (dropouts): ModuleList(
    (0-1): 2 x Dropout(p=0.2, inplace=False)
  )
  (final_conv): Conv1d(32, 64, kernel_size

In [None]:
# Training settings for the regressor; the validation frame drives early stopping.
# NOTE(review): the saved output of this cell logs "EarlyStopping counter: N out
# of 5" although patience=15 is requested here — either the output is stale or
# train() does not honour `patience`; confirm before relying on this value.
reg_train_config = dict(
    target_col='punt_matematicas',
    val_df=df_val_reg,
    early_stopping=True,
    patience=15,
    epochs=30,
    batch_size=32,
)
reg_model.train(**reg_train_config)


[LabelEncoderWithUnknown] 20 valores no vistos transformados a '__unknown__'
[LabelEncoderWithUnknown] 2 valores no vistos transformados a '__unknown__'
[LabelEncoderWithUnknown] 14 valores no vistos transformados a '__unknown__'
 Iniciando entrenamiento en cuda con 20363 muestras, 27 características (16 categóricas, 11 numéricas)


                                                                          

Validation loss decreased (inf --> 148.289413). Saving model...
 Época 1/30 | Train Loss: 243.4732 | Tiempo: 8.60s | Val Loss: 148.2894


                                                                        

Validation loss decreased (148.289413 --> 114.299919). Saving model...
 Época 2/30 | Train Loss: 153.3909 | Tiempo: 7.38s | Val Loss: 114.2999


                                                                        

Validation loss decreased (114.299919 --> 99.289520). Saving model...
 Época 3/30 | Train Loss: 148.8191 | Tiempo: 7.01s | Val Loss: 99.2895


                                                                         

Validation loss decreased (99.289520 --> 98.590302). Saving model...
 Época 4/30 | Train Loss: 145.3541 | Tiempo: 6.84s | Val Loss: 98.5903


                                                                         

EarlyStopping counter: 1 out of 5
 Época 5/30 | Train Loss: 144.4756 | Tiempo: 6.79s | Val Loss: 120.0122


                                                                         

Validation loss decreased (98.590302 --> 79.903175). Saving model...
 Época 6/30 | Train Loss: 141.1890 | Tiempo: 6.85s | Val Loss: 79.9032


                                                                        

EarlyStopping counter: 1 out of 5
 Época 7/30 | Train Loss: 140.1187 | Tiempo: 6.96s | Val Loss: 119.3887


                                                                        

EarlyStopping counter: 2 out of 5
 Época 8/30 | Train Loss: 137.1812 | Tiempo: 7.23s | Val Loss: 113.4046


                                                                        

EarlyStopping counter: 3 out of 5
 Época 9/30 | Train Loss: 137.6073 | Tiempo: 7.65s | Val Loss: 93.2020


                                                                         

EarlyStopping counter: 4 out of 5
 Época 10/30 | Train Loss: 137.2121 | Tiempo: 7.24s | Val Loss: 88.1607


                                                                         

EarlyStopping counter: 5 out of 5
 Época 11/30 | Train Loss: 132.2996 | Tiempo: 7.12s | Val Loss: 105.0071
Early stopping en época 11




In [4]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def _regression_report(model, frame, target_col, split_label):
    """Predict on `frame`, print MAE / MSE / RMSE / R² and return the results.

    Parameters
    ----------
    model : object exposing ``predict(features)``.
    frame : DataFrame holding the feature columns plus `target_col`.
    target_col : name of the column with the true values.
    split_label : short tag shown in parentheses in each printed line.

    Returns
    -------
    (y_true, y_pred, mae, mse, rmse, r2)
    """
    y_true = frame[target_col].values
    raw_pred = model.predict(frame.drop(columns=[target_col]))
    # reshape(-1) rather than squeeze(): squeeze() turns a single-row
    # prediction into a 0-d array, which the sklearn metrics reject.
    y_pred = np.asarray(raw_pred).reshape(-1)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    print(f"MAE  ({split_label}): {mae:.4f}")
    print(f"MSE  ({split_label}): {mse:.4f}")
    print(f"RMSE ({split_label}): {rmse:.4f}")
    print(f"R²   ({split_label}): {r2:.4f}")
    return y_true, y_pred, mae, mse, rmse, r2


# Evaluate on the validation split
print("Evaluando en conjunto de validación...")
(y_true_val_reg, y_pred_val_reg,
 mae_val, mse_val, rmse_val, r2_val) = _regression_report(
    reg_model, df_val_reg, target_col_reg, "val"
)

# Evaluate on the held-out test split
print("\nEvaluando en conjunto de test...")
(y_true_test_reg, y_pred_test_reg,
 mae_test, mse_test, rmse_test, r2_test) = _regression_report(
    reg_model, df_test_reg, target_col_reg, "test"
)


Evaluando en conjunto de validación...
[LabelEncoderWithUnknown] 20 valores no vistos transformados a '__unknown__'
[LabelEncoderWithUnknown] 2 valores no vistos transformados a '__unknown__'
[LabelEncoderWithUnknown] 14 valores no vistos transformados a '__unknown__'
MAE  (val): 6.8542
MSE  (val): 79.9032
RMSE (val): 8.9389
R²   (val): 0.2418

Evaluando en conjunto de test...
[LabelEncoderWithUnknown] 38 valores no vistos transformados a '__unknown__'
[LabelEncoderWithUnknown] 2 valores no vistos transformados a '__unknown__'
[LabelEncoderWithUnknown] 32 valores no vistos transformados a '__unknown__'
MAE  (test): 6.8062
MSE  (test): 76.0322
RMSE (test): 8.7196
R²   (test): 0.2408


In [5]:
# Classifier for the `eco` target, created with the constructor's defaults.
clf_model = ConvClassifier()
# A copy of the training frame is handed over, so df_train_clf itself is not
# the object the model works on.
clf_model.build_model(df_train_clf.copy(), target_col_clf)

# Training settings; the validation frame drives early stopping.
clf_train_config = dict(
    target_col=target_col_clf,
    val_df=df_val_clf,
    early_stopping=True,
    patience=5,
    epochs=50,
)
clf_model.train(**clf_train_config)


[LabelEncoderWithUnknown] 20 valores no vistos transformados a '__unknown__'
[LabelEncoderWithUnknown] 2 valores no vistos transformados a '__unknown__'
[LabelEncoderWithUnknown] 14 valores no vistos transformados a '__unknown__'
 Iniciando entrenamiento en cuda con 20363 muestras, 27 características (16 categóricas, 11 numéricas)


                                                                           

Validation loss decreased (inf --> 0.597047). Saving model...
 Época 1/50 | Train Loss: 0.6289 | Tiempo: 14.55s | Val Loss: 0.5970


                                                                            

Validation loss decreased (0.597047 --> 0.596874). Saving model...
 Época 2/50 | Train Loss: 0.6068 | Tiempo: 14.13s | Val Loss: 0.5969


                                                                           

Validation loss decreased (0.596874 --> 0.592899). Saving model...
 Época 3/50 | Train Loss: 0.6030 | Tiempo: 14.01s | Val Loss: 0.5929


                                                                           

Validation loss decreased (0.592899 --> 0.587385). Saving model...
 Época 4/50 | Train Loss: 0.5993 | Tiempo: 15.07s | Val Loss: 0.5874


                                                                            

EarlyStopping counter: 1 out of 5
 Época 5/50 | Train Loss: 0.5990 | Tiempo: 14.13s | Val Loss: 0.5943


                                                                           

Validation loss decreased (0.587385 --> 0.585343). Saving model...
 Época 6/50 | Train Loss: 0.5942 | Tiempo: 13.66s | Val Loss: 0.5853


                                                                            

EarlyStopping counter: 1 out of 5
 Época 7/50 | Train Loss: 0.5917 | Tiempo: 14.14s | Val Loss: 0.5874


                                                                           

Validation loss decreased (0.585343 --> 0.582730). Saving model...
 Época 8/50 | Train Loss: 0.5907 | Tiempo: 14.92s | Val Loss: 0.5827


                                                                           

EarlyStopping counter: 1 out of 5
 Época 9/50 | Train Loss: 0.5896 | Tiempo: 15.57s | Val Loss: 0.5835


                                                                            

Validation loss decreased (0.582730 --> 0.581689). Saving model...
 Época 10/50 | Train Loss: 0.5851 | Tiempo: 15.45s | Val Loss: 0.5817


                                                                             

EarlyStopping counter: 1 out of 5
 Época 11/50 | Train Loss: 0.5837 | Tiempo: 14.05s | Val Loss: 0.5851


                                                                            

EarlyStopping counter: 2 out of 5
 Época 12/50 | Train Loss: 0.5829 | Tiempo: 14.44s | Val Loss: 0.5878


                                                                            

EarlyStopping counter: 3 out of 5
 Época 13/50 | Train Loss: 0.5804 | Tiempo: 15.08s | Val Loss: 0.5876


                                                                            

EarlyStopping counter: 4 out of 5
 Época 14/50 | Train Loss: 0.5768 | Tiempo: 15.36s | Val Loss: 0.5818


                                                                            

EarlyStopping counter: 5 out of 5
 Época 15/50 | Train Loss: 0.5753 | Tiempo: 15.42s | Val Loss: 0.5847
Early stopping en época 15


In [None]:
# Decision threshold applied to the model output to obtain 0/1 labels.
# NOTE(review): assumes predict() returns scores on a 0..1 scale (e.g.
# probabilities); if it returns logits the cut-off should be 0 — confirm.
CLF_THRESHOLD = 0.5


def _accuracy_for_split(model, frame, target_col, threshold=CLF_THRESHOLD):
    """Predict on `frame`, binarise the scores and compute accuracy.

    Parameters
    ----------
    model : object exposing ``predict(features)``.
    frame : DataFrame holding the feature columns plus `target_col`.
    target_col : name of the column with the true labels.
    threshold : scores strictly above this value are labelled 1, others 0.

    Returns
    -------
    (y_true, y_score, y_label, accuracy)
    """
    y_true = frame[target_col].values
    raw_score = model.predict(frame.drop(columns=[target_col]))
    # reshape(-1) rather than squeeze(): squeeze() turns a single-row
    # prediction into a 0-d array, which accuracy_score rejects.
    y_score = np.asarray(raw_score).reshape(-1)
    y_label = (y_score > threshold).astype(int)
    return y_true, y_score, y_label, accuracy_score(y_true, y_label)


# Evaluate on the validation split
print("Evaluando en conjunto de validación...")
(y_true_val_clf, y_pred_val_clf,
 y_pred_val_clf_binary, accuracy_val) = _accuracy_for_split(
    clf_model, df_val_clf, target_col_clf
)
print(f"Accuracy en validación: {accuracy_val:.4f}")

# Evaluate on the held-out test split
print("Evaluando en conjunto de test...")
(y_true_test_clf, y_pred_test_clf,
 y_pred_test_clf_binary, accuracy_test) = _accuracy_for_split(
    clf_model, df_test_clf, target_col_clf
)
print(f"Accuracy en test: {accuracy_test:.4f}")
