## Instalacion de dependencias

In [20]:
# Instalación de dependencias necesarias para el análisis
%pip install pandas numpy scikit-learn catboost matplotlib

Note: you may need to restart the kernel to use updated packages.


## Carga de archivos

In [2]:
import os
import pandas as pd
import numpy as np
os.chdir("C:\\Users\\lopez\\Desktop\\udea-ai-4-eng-20251-pruebas-saber-pro-colombia") # Ruta donde se encuentra la carpeta con los archivos
os.listdir()

['catboost_info', 'submission.csv', 'test.csv', 'train.csv']

In [3]:
# Load csv file
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,ID,PERIODO,ESTU_PRGM_ACADEMICO,ESTU_PRGM_DEPARTAMENTO,ESTU_VALORMATRICULAUNIVERSIDAD,ESTU_HORASSEMANATRABAJA,FAMI_ESTRATOVIVIENDA,FAMI_TIENEINTERNET,FAMI_EDUCACIONPADRE,FAMI_TIENELAVADORA,...,ESTU_PRIVADO_LIBERTAD,ESTU_PAGOMATRICULAPROPIO,FAMI_TIENECOMPUTADOR,FAMI_TIENEINTERNET.1,FAMI_EDUCACIONMADRE,RENDIMIENTO_GLOBAL,coef_1,coef_2,coef_3,coef_4
0,904256,20212,ENFERMERIA,BOGOTÁ,Entre 5.5 millones y menos de 7 millones,Menos de 10 horas,Estrato 3,Si,Técnica o tecnológica incompleta,Si,...,N,No,Si,Si,Postgrado,medio-alto,0.322,0.208,0.31,0.267
1,645256,20212,DERECHO,ATLANTICO,Entre 2.5 millones y menos de 4 millones,0,Estrato 3,No,Técnica o tecnológica completa,Si,...,N,No,Si,No,Técnica o tecnológica incompleta,bajo,0.311,0.215,0.292,0.264
2,308367,20203,MERCADEO Y PUBLICIDAD,BOGOTÁ,Entre 2.5 millones y menos de 4 millones,Más de 30 horas,Estrato 3,Si,Secundaria (Bachillerato) completa,Si,...,N,No,No,Si,Secundaria (Bachillerato) completa,bajo,0.297,0.214,0.305,0.264
3,470353,20195,ADMINISTRACION DE EMPRESAS,SANTANDER,Entre 4 millones y menos de 5.5 millones,0,Estrato 4,Si,No sabe,Si,...,N,No,Si,Si,Secundaria (Bachillerato) completa,alto,0.485,0.172,0.252,0.19
4,989032,20212,PSICOLOGIA,ANTIOQUIA,Entre 2.5 millones y menos de 4 millones,Entre 21 y 30 horas,Estrato 3,Si,Primaria completa,Si,...,N,No,Si,Si,Primaria completa,medio-bajo,0.316,0.232,0.285,0.294


## Selección de variables y división de datos
Definimos las variables categóricas y separamos los datos en entrenamiento, validación y prueba, igual que en el flujo original.

In [4]:
# Definir variables categóricas
categorical_features = [
    "ESTU_VALORMATRICULAUNIVERSIDAD",
    "ESTU_HORASSEMANATRABAJA",
    "FAMI_ESTRATOVIVIENDA",
    "FAMI_EDUCACIONPADRE",
    "FAMI_EDUCACIONMADRE",
    "ESTU_PRGM_DEPARTAMENTO",
    "FAMI_TIENEINTERNET",
    "ESTU_PAGOMATRICULAPROPIO",
    "ESTU_PRGM_ACADEMICO"
 ]

# Asegurar que las variables categóricas sean string y sin nulos
for col in categorical_features:
    if col in df.columns:
        df[col] = df[col].astype(str).fillna('missing')

from sklearn.model_selection import train_test_split
y = df["RENDIMIENTO_GLOBAL"]
X = df.drop(columns=["RENDIMIENTO_GLOBAL", "ID", "PERIODO"] )
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.125, random_state=42, stratify=y_temp)

## Entrenamiento y evaluación del modelo CatBoost
Entrenamos el modelo CatBoost usando las variables categóricas y evaluamos el desempeño en validación y test.

In [None]:
# Forzar todas las columnas categóricas a string y sin nulos
from catboost import CatBoostClassifier
for col in categorical_features:
    if col in X_train.columns:
        X_train[col] = X_train[col].astype(str).fillna('missing')
    if col in X_val.columns:
        X_val[col] = X_val[col].astype(str).fillna('missing')
    if col in X_test.columns:
        X_test[col] = X_test[col].astype(str).fillna('missing')
# Forzar todas las columnas numéricas a float (si aplica)
for col in X_train.columns:
    if col not in categorical_features:
        X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
        X_val[col] = pd.to_numeric(X_val[col], errors='coerce')
        X_test[col] = pd.to_numeric(X_test[col], errors='coerce')
# Reemplazar nulos numéricos por la media
for col in X_train.columns:
    if col not in categorical_features:
        mean_val = X_train[col].mean()
        X_train[col] = X_train[col].fillna(mean_val)
        X_val[col] = X_val[col].fillna(mean_val)
        X_test[col] = X_test[col].fillna(mean_val)

# Entrenamiento del modelo CatBoost
cat_features_indices = [X_train.columns.get_loc(col) for col in categorical_features if col in X_train.columns]
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    random_state=42,
    verbose=100,
    loss_function="MultiClass",
    cat_features=cat_features_indices
 )
model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

# Evaluación en validación
from sklearn.metrics import accuracy_score, classification_report
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Accuracy validación: {val_accuracy:.4f}")
print(classification_report(y_val, y_val_pred))

# Evaluación en test
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Accuracy test: {test_accuracy:.4f}")
print(classification_report(y_test, y_test_pred))

0:	learn: 1.3616059	test: 1.3616969	best: 1.3616969 (0)	total: 2.45s	remaining: 40m 45s
100:	learn: 1.1933344	test: 1.1914801	best: 1.1914801 (100)	total: 5m 11s	remaining: 46m 8s
200:	learn: 1.1876388	test: 1.1875019	best: 1.1875019 (200)	total: 11m 20s	remaining: 45m 6s
300:	learn: 1.1850971	test: 1.1865467	best: 1.1865467 (300)	total: 17m 14s	remaining: 40m 3s
400:	learn: 1.1829284	test: 1.1858228	best: 1.1858228 (400)	total: 22m 48s	remaining: 34m 4s
500:	learn: 1.1810688	test: 1.1855217	best: 1.1855217 (500)	total: 28m 19s	remaining: 28m 12s
600:	learn: 1.1793952	test: 1.1853487	best: 1.1853294 (577)	total: 35m 17s	remaining: 23m 25s
700:	learn: 1.1775497	test: 1.1851160	best: 1.1851120 (697)	total: 41m 1s	remaining: 17m 29s
800:	learn: 1.1758478	test: 1.1849710	best: 1.1849700 (799)	total: 46m 43s	remaining: 11m 36s
900:	learn: 1.1741614	test: 1.1848284	best: 1.1848244 (899)	total: 52m 18s	remaining: 5m 44s
999:	learn: 1.1724975	test: 1.1847790	best: 1.1847726 (969)	total: 57m 50

## Kaggle Submission

In [None]:
# Carga de datos de prueba
test_data = pd.read_csv("test.csv")

### Procesamiento de variables (Igual que en entrenamiento)


In [14]:
# Variables categóricas
for col in categorical_features:
    if col in test_data.columns:
        test_data[col] = test_data[col].astype(str).fillna('missing')

# Variables numéricas
for col in test_data.columns:
    if col not in categorical_features and col not in ["ID", "PERIODO"]:
        test_data[col] = pd.to_numeric(test_data[col], errors='coerce')
        mean_val = X_train[col].mean() if col in X_train.columns else 0
        test_data[col] = test_data[col].fillna(mean_val)

In [15]:
X_test_kaggle = test_data.drop(columns=["ID", "PERIODO"], errors='ignore')

# Predicciones en el conjunto de test
predictions = model.predict(X_test_kaggle)
predictions = predictions.flatten() if hasattr(predictions, 'flatten') else predictions

# Crear archivo de submission
submission_df = test_data[["ID"]].copy()
submission_df["RENDIMIENTO_GLOBAL"] = predictions
submission_df.to_csv("submission.csv", index=False)
print("Archivo de submission generado: submission.csv")
print(submission_df.head())

Archivo de submission generado: submission.csv
       ID RENDIMIENTO_GLOBAL
0  550236               bajo
1   98545         medio-bajo
2  499179               alto
3  782980               bajo
4  785185               bajo


In [16]:
# Guardar el archivo de submission
submission_df.to_csv("submission_catBoost.csv", index=False)