In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, accuracy_score

In [22]:
df_modelo = pd.read_csv("preprocessing_pisa_data.csv")

In [23]:
df_modelo.columns

Index(['PV1MATH', 'PV2MATH', 'PV3MATH', 'PV4MATH', 'PV5MATH', 'PV6MATH',
       'PV7MATH', 'PV8MATH', 'PV9MATH', 'PV10MATH', 'AGE', 'TFGender',
       'REPEAT', 'MISSSC', 'SKIPPING', 'TARDYSD', 'EXERPRAC', 'STUDYHMW',
       'WORKPAY', 'WORKHOME', 'ST034Q01TA', 'ST034Q02TA', 'ST034Q03TA',
       'ST034Q04TA', 'ST034Q05TA', 'ST034Q06TA', 'ST300Q01JA', 'ST300Q02JA',
       'ST300Q03JA', 'ST300Q04JA', 'ST300Q05JA', 'ST300Q06JA', 'ST300Q07JA',
       'ST300Q08JA', 'ST300Q09JA', 'ST300Q10JA', 'ST270Q01JA', 'ST270Q02JA',
       'ST270Q03JA', 'ST270Q04JA', 'ST038Q03NA', 'ST038Q04NA', 'ST038Q05NA',
       'ST038Q06NA', 'ST038Q07NA', 'ST038Q08NA', 'ST250Q01JA', 'ST250Q02JA',
       'ST250Q03JA', 'ST250Q04JA', 'ST250Q05JA', 'ST251Q01JA', 'ST251Q02JA',
       'ST251Q03JA', 'ST251Q04JA', 'ST255Q01JA', 'MATH_CANTIDAD',
       'MATH_CAMBIO_REL', 'MATH_ESPACIO_FORMA', 'MATH_DATOS_INCERT',
       'MATH_FORMULACION', 'MATH_PROCEDIMIENTOS', 'MATH_INTERPRETACION',
       'MATH_RAZONAMIENTO', 'MATH_PROMED

In [26]:
# ===============================
# 1. Carga y selección de columnas
# ===============================
# Ya deberías tener df_modelo preparado con columnas limpias y subcompetencias agregadas

# Definir variables input y target
X = df_modelo.drop(columns=['MATH_LOGRO'])
y = df_modelo['MATH_LOGRO']

# ===============================
# 2. Escalado de características
# ===============================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ===============================
# 3. Redimensionar para RNN (samples, timesteps, features)
# Aquí usamos 1 timestep (puede ser ajustado si lo deseas)
# ===============================
X_rnn = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))

# ===============================
# 4. División de datos
# ===============================
X_train, X_test, y_train, y_test = train_test_split(X_rnn, y, test_size=0.2, random_state=42)

# ===============================
# 5. Definición del modelo RNN
# ===============================
model = Sequential()
model.add(SimpleRNN(units=64, input_shape=(X_rnn.shape[1], X_rnn.shape[2]), activation='tanh', return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Binaria

# ===============================
# 6. Compilación y entrenamiento
# ===============================
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# ===============================
# 7. Evaluación final
# ===============================
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Pérdida (loss): {loss:.4f}, Precisión (accuracy): {accuracy:.4f}")

  super().__init__(**kwargs)


Epoch 1/50
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.8024 - loss: 0.4097 - val_accuracy: 0.9462 - val_loss: 0.1713
Epoch 2/50
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9448 - loss: 0.1677 - val_accuracy: 0.9614 - val_loss: 0.1071
Epoch 3/50
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9488 - loss: 0.1255 - val_accuracy: 0.9668 - val_loss: 0.0811
Epoch 4/50
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9520 - loss: 0.1100 - val_accuracy: 0.9614 - val_loss: 0.0813
Epoch 5/50
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9551 - loss: 0.0981 - val_accuracy: 0.9695 - val_loss: 0.0693
Epoch 6/50
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9685 - loss: 0.0761 - val_accuracy: 0.9668 - val_loss: 0.0662
Epoch 7/50
[1m140/140[0m 

In [25]:
loss, mae = model.evaluate(X_test, y_test)
print(f"Pérdida (MSE): {loss:.4f}, MAE: {mae:.4f}")


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9848 - loss: 0.0378
Pérdida (MSE): 0.0299, MAE: 0.9871


In [27]:
import joblib

# Guardar el modelo
model.save("modelo_rnn_logro_matematica.h5")

# Guardar el scaler
joblib.dump(scaler, "scaler_modelo_rnn.pkl")




['scaler_modelo_rnn.pkl']

In [28]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)
y_pred_class = (y_pred > 0.5).astype(int)

print(confusion_matrix(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class))


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step
[[1218    9]
 [  11  156]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1227
           1       0.95      0.93      0.94       167

    accuracy                           0.99      1394
   macro avg       0.97      0.96      0.97      1394
weighted avg       0.99      0.99      0.99      1394

