## Instalación de dependencias

In [1]:
%pip install pandas numpy scikit-learn matplotlib

Note: you may need to restart the kernel to use updated packages.


## Carga de librerías y archivos

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier


# Directory holding the competition CSV files. It can be overridden with the
# SABER_PRO_DATA_DIR environment variable so the notebook is not tied to one
# machine; when the variable is unset the original location is used.
DATA_DIR = os.environ.get(
    "SABER_PRO_DATA_DIR",
    "C:/Users/lopez/Desktop/udea-ai-4-eng-20251-pruebas-saber-pro-colombia",
)

# The working directory is still switched because the rest of the notebook
# reads and writes files through relative paths.
os.chdir(DATA_DIR)

# Load the training data
train = pd.read_csv("train.csv")

## Selección de variables y división de datos

In [3]:
# Columns that are fed to the model as categorical inputs.
categorical_features = [
    "ESTU_VALORMATRICULAUNIVERSIDAD",
    "ESTU_HORASSEMANATRABAJA",
    "FAMI_ESTRATOVIVIENDA",
    "FAMI_EDUCACIONPADRE",
    "FAMI_EDUCACIONMADRE",
    "ESTU_PRGM_DEPARTAMENTO",
    "FAMI_TIENEINTERNET",
    "ESTU_PAGOMATRICULAPROPIO",
    "ESTU_PRGM_ACADEMICO",
]

# Work on a copy so the raw `train` frame is left untouched.
temp = train.copy()

# Cast each categorical column that exists in the frame to string, then fill
# whatever is still null with 'missing'.
# NOTE(review): depending on the pandas version, astype(str) may already have
# turned NaN into the literal string "nan", leaving fillna with nothing to
# fill - confirm whether the fill was meant to happen before the cast. The
# same transformation has to be applied to any data passed to the fitted model.
present_categoricals = [name for name in categorical_features if name in temp.columns]
for name in present_categoricals:
    as_text = temp[name].astype(str)
    temp[name] = as_text.fillna('missing')

# Encode the target labels as integers; `le` is kept so predictions can be
# mapped back to the original labels later.
target_column = "RENDIMIENTO_GLOBAL"
le = LabelEncoder()
y = le.fit_transform(temp[target_column])
X = temp.drop(columns=[target_column, "ID", "PERIODO"])

# First split: 80% for training, 20% held out, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Second split of the held-out 20%: 12.5% of it becomes the test set and the
# remaining 87.5% the validation set, again stratified on the target.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.125,
    random_state=42,
    stratify=y_temp,
)

## Preprocesamiento para RandomForestClassifier

In [5]:
# Numeric inputs: every numeric-dtype column of X_train that is not already
# declared as categorical.
numeric_dtype_columns = X_train.select_dtypes(include=[np.number]).columns
numeric_features = []
for column_name in numeric_dtype_columns:
    if column_name not in categorical_features:
        numeric_features.append(column_name)

# Numeric columns are standardised; categorical columns are one-hot encoded
# into a sparse matrix, and categories unseen during fit are ignored.
numeric_transformer = Pipeline([("scaler", StandardScaler())])
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=True)

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

## Pipeline y entrenamiento con RandomForestClassifier

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed; n_jobs=-1 uses every core available on the
# machine, and verbose=1 makes joblib print its progress lines.
forest = RandomForestClassifier(
    n_estimators=2000,
    max_depth=8,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Preprocessing and classifier chained so both are fitted and applied together.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", forest),
])

pipeline.fit(X_train, y_train)

# Original label names, used as row headers in the reports below.
class_names = le.classes_

# Validation split: accuracy followed by the per-class report.
y_val_pred = pipeline.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"Accuracy validación: {val_accuracy:.4f}")
val_report = classification_report(y_val, y_val_pred, target_names=class_names)
print(val_report)

# Test split: same two metrics.
y_test_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"Accuracy test: {test_accuracy:.4f}")
test_report = classification_report(y_test, y_test_pred, target_names=class_names)
print(test_report)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   55.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:  7.1min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    4.8s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    7.0s
[Parallel(n_jobs=8)]: Do

Accuracy validación: 0.3864
              precision    recall  f1-score   support

        alto       0.45      0.64      0.53     30733
        bajo       0.38      0.60      0.47     30272
  medio-alto       0.30      0.14      0.19     30034
  medio-bajo       0.30      0.17      0.22     30148

    accuracy                           0.39    121187
   macro avg       0.36      0.38      0.35    121187
weighted avg       0.36      0.39      0.35    121187



[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.8s


Accuracy test: 0.3872
              precision    recall  f1-score   support

        alto       0.45      0.64      0.53      4391
        bajo       0.38      0.61      0.47      4325
  medio-alto       0.29      0.13      0.18      4290
  medio-bajo       0.31      0.17      0.22      4307

    accuracy                           0.39     17313
   macro avg       0.36      0.39      0.35     17313
weighted avg       0.36      0.39      0.35     17313



[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 2000 out of 2000 | elapsed:    1.2s finished


## Kaggle Submission

In [None]:
# Load the Kaggle test set from the current working directory.
test_data = pd.read_csv(filepath_or_buffer="test.csv")

In [None]:
# Apply the same categorical cast used on the training frame, in place on
# `test_data`, for every categorical column the test file actually contains.
# NOTE(review): depending on the pandas version, astype(str) may already have
# turned NaN into the literal string "nan", leaving fillna with nothing to
# fill - confirm, and keep this in step with how the training data was prepared.
present_categoricals = [name for name in categorical_features if name in test_data.columns]
for name in present_categoricals:
    as_text = test_data[name].astype(str)
    test_data[name] = as_text.fillna('missing')

# Drop the identifier columns when present; errors='ignore' tolerates their absence.
X_test_kaggle = test_data.drop(columns=["ID", "PERIODO"], errors='ignore')

# Predict encoded classes, then map them back to the original label strings.
predictions = pipeline.predict(X_test_kaggle)
predictions_labels = le.inverse_transform(predictions)

# Submission frame: the ID column plus the predicted label.
submission_df = test_data[["ID"]].assign(RENDIMIENTO_GLOBAL=predictions_labels)
submission_df.to_csv("submission_rf.csv", index=False)

print("Archivo de submission generado: submission_rf.csv")
print(submission_df.head())

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    5.6s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    5.6s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    8.8s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    8.8s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:   12.8s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:   12.8s


Archivo de submission generado: submission_rf.csv
       ID RENDIMIENTO_GLOBAL
0  550236         medio-alto
1   98545         medio-alto
2  499179               alto
3  782980               bajo
4  785185         medio-bajo


[Parallel(n_jobs=8)]: Done 2000 out of 2000 | elapsed:   14.4s finished
