In [2]:
!pip install -U numpy==1.26.4 scipy==1.16.0 scikit-learn==1.6.1 catboost==1.2.3 optuna
!pip uninstall -y spacy thinc fastai tsfresh




In [1]:
import platform, numpy as np, scipy, sklearn, optuna, catboost
print("Python  :", platform.python_version())
print("NumPy   :", np.__version__)
print("SciPy   :", scipy.__version__)
print("sklearn :", sklearn.__version__)
print("Optuna  :", optuna.__version__)
print("CatBoost:", catboost.__version__)

Python  : 3.11.13
NumPy   : 1.26.4
SciPy   : 1.16.0
sklearn : 1.6.1
Optuna  : 4.4.0
CatBoost: 1.2.3


In [2]:
from google.colab import drive
drive.mount('/content/drive')

BASE_DIR   = "/content/drive/MyDrive/ai_data"
TRAIN_PATH = f"{BASE_DIR}/train.csv"
TEST_PATH  = f"{BASE_DIR}/test.csv"

Mounted at /content/drive


In [3]:
import pandas as pd, numpy as np
df_train = pd.read_csv(TRAIN_PATH)
df_test  = pd.read_csv(TEST_PATH)
print("Shapes:", df_train.shape, df_test.shape)
display(df_train.head())

Shapes: (692500, 21) (296786, 20)


Unnamed: 0,ID,PERIODO,ESTU_PRGM_ACADEMICO,ESTU_PRGM_DEPARTAMENTO,ESTU_VALORMATRICULAUNIVERSIDAD,ESTU_HORASSEMANATRABAJA,FAMI_ESTRATOVIVIENDA,FAMI_TIENEINTERNET,FAMI_EDUCACIONPADRE,FAMI_TIENELAVADORA,...,ESTU_PRIVADO_LIBERTAD,ESTU_PAGOMATRICULAPROPIO,FAMI_TIENECOMPUTADOR,FAMI_TIENEINTERNET.1,FAMI_EDUCACIONMADRE,RENDIMIENTO_GLOBAL,coef_1,coef_2,coef_3,coef_4
0,904256,20212,ENFERMERIA,BOGOTÁ,Entre 5.5 millones y menos de 7 millones,Menos de 10 horas,Estrato 3,Si,Técnica o tecnológica incompleta,Si,...,N,No,Si,Si,Postgrado,medio-alto,0.322,0.208,0.31,0.267
1,645256,20212,DERECHO,ATLANTICO,Entre 2.5 millones y menos de 4 millones,0,Estrato 3,No,Técnica o tecnológica completa,Si,...,N,No,Si,No,Técnica o tecnológica incompleta,bajo,0.311,0.215,0.292,0.264
2,308367,20203,MERCADEO Y PUBLICIDAD,BOGOTÁ,Entre 2.5 millones y menos de 4 millones,Más de 30 horas,Estrato 3,Si,Secundaria (Bachillerato) completa,Si,...,N,No,No,Si,Secundaria (Bachillerato) completa,bajo,0.297,0.214,0.305,0.264
3,470353,20195,ADMINISTRACION DE EMPRESAS,SANTANDER,Entre 4 millones y menos de 5.5 millones,0,Estrato 4,Si,No sabe,Si,...,N,No,Si,Si,Secundaria (Bachillerato) completa,alto,0.485,0.172,0.252,0.19
4,989032,20212,PSICOLOGIA,ANTIOQUIA,Entre 2.5 millones y menos de 4 millones,Entre 21 y 30 horas,Estrato 3,Si,Primaria completa,Si,...,N,No,Si,Si,Primaria completa,medio-bajo,0.316,0.232,0.285,0.294


In [4]:
TARGET, ID_COL = "RENDIMIENTO_GLOBAL", "ID"

df_test[TARGET] = np.nan
full = pd.concat([df_train, df_test], ignore_index=True)

cat_cols = [c for c in full.select_dtypes(include=["object", "category"]).columns
            if c != TARGET]
num_cols = [c for c in full.columns if c not in cat_cols + [TARGET]]

# Imputación
for col in num_cols:
    full[col] = full[col].fillna(full[col].median())
for col in cat_cols:
    full[col] = full[col].fillna(full[col].mode()[0])

df_train = full[full[TARGET].notna()].copy()
df_test  = full[full[TARGET].isna()].drop(columns=[TARGET]).copy()

print(f"Categorical: {len(cat_cols)}  |  Numeric: {len(num_cols)}")

Categorical: 14  |  Numeric: 6


In [5]:
from sklearn.model_selection import train_test_split

label_map     = {lbl: idx for idx, lbl in enumerate(sorted(df_train[TARGET].unique()))}
inv_label_map = {v: k for k, v in label_map.items()}

df_train[TARGET] = df_train[TARGET].map(label_map).astype(int)

X_full = df_train.drop(columns=[TARGET, ID_COL])
y_full = df_train[TARGET].values
cat_idx = [X_full.columns.get_loc(c) for c in cat_cols]

SEED = 42
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.20, stratify=y_full, random_state=SEED
)
print("Train/Val shapes:", X_train.shape, X_val.shape)

Train/Val shapes: (554000, 19) (138500, 19)


In [6]:
from collections import Counter
cnt    = Counter(y_train)
total  = len(y_train)
class_weights = [total / cnt[c] for c in sorted(cnt)]
print(dict(zip(sorted(cnt), class_weights)))

{0: 3.9432008256521582, 1: 4.003179420478358, 2: 4.035106886630977, 3: 4.0197358873893485}


In [7]:
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

def objective(trial):
    params = {
        "loss_function" : "MultiClass",
        "depth"         : trial.suggest_int("depth", 5, 10),
        "learning_rate" : trial.suggest_float("learning_rate", 0.03, 0.2, log=True),
        "iterations"    : trial.suggest_int("iterations", 800, 2000),
        "l2_leaf_reg"   : trial.suggest_float("l2_leaf_reg", 1.0, 6.0, log=True),
        "random_seed"   : SEED,
        "eval_metric"   : "Accuracy",
        "verbose"       : False,
        "allow_writing_files": False,
        "cat_features"  : cat_idx,
        "task_type"     : "GPU",
        "devices"       : "0",
        "class_weights" : class_weights,
    }
    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train,
              eval_set=(X_val, y_val),
              early_stopping_rounds=50,
              verbose=False)
    preds = model.predict(X_val)
    return accuracy_score(y_val, preds)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20, show_progress_bar=True)

best_params = study.best_trial.params
print("🟢 Mejores hiperparámetros:", best_params)
print("🟢 Best accuracy:", study.best_value)

[I 2025-07-02 01:59:49,167] A new study created in memory with name: no-name-378140d7-bf40-4f7b-af50-63f2c4ec9c54


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-07-02 02:00:22,241] Trial 0 finished with value: 0.4284981949458484 and parameters: {'depth': 8, 'learning_rate': 0.030761852571312077, 'iterations': 1315, 'l2_leaf_reg': 2.6186281178157094}. Best is trial 0 with value: 0.4284981949458484.
[I 2025-07-02 02:00:48,373] Trial 1 finished with value: 0.42934296028880864 and parameters: {'depth': 8, 'learning_rate': 0.1066494150077193, 'iterations': 1789, 'l2_leaf_reg': 4.902140702370841}. Best is trial 1 with value: 0.42934296028880864.
[I 2025-07-02 02:01:03,868] Trial 2 finished with value: 0.4284765342960289 and parameters: {'depth': 9, 'learning_rate': 0.14956248574082412, 'iterations': 1935, 'l2_leaf_reg': 1.0181871538004512}. Best is trial 1 with value: 0.42934296028880864.
[I 2025-07-02 02:01:23,173] Trial 3 finished with value: 0.4287653429602888 and parameters: {'depth': 10, 'learning_rate': 0.12097472856550007, 'iterations': 1309, 'l2_leaf_reg': 1.1301374725551259}. Best is trial 1 with value: 0.42934296028880864.
[I 2025-

In [8]:
# Construimos el diccionario completo (ya tiene nombres correctos)
final_params = {
    **best_params,
    "loss_function": "MultiClass",
    "cat_features" : cat_idx,
    "random_seed"  : SEED,
    "eval_metric"  : "Accuracy",
    "task_type"    : "GPU",
    "devices"      : "0",
    "verbose"      : 200,
    "class_weights": class_weights,
}

model_final = CatBoostClassifier(**final_params)
model_final.fit(
    X_full, y_full,
    eval_set=(X_val, y_val),
    early_stopping_rounds=60,   # ← aquí (no en __init__)
)

0:	learn: 0.3723514	test: 0.3721350	best: 0.3721350 (0)	total: 58.4ms	remaining: 1m 35s
200:	learn: 0.4335402	test: 0.4344576	best: 0.4344790 (198)	total: 7.21s	remaining: 51.6s
400:	learn: 0.4457812	test: 0.4446913	best: 0.4447050 (399)	total: 14.9s	remaining: 46.1s
600:	learn: 0.4573823	test: 0.4548661	best: 0.4548661 (600)	total: 23.5s	remaining: 40.6s
800:	learn: 0.4675278	test: 0.4622463	best: 0.4623048 (797)	total: 30.1s	remaining: 31.4s
1000:	learn: 0.4772183	test: 0.4705780	best: 0.4705780 (1000)	total: 38.8s	remaining: 24.7s
1200:	learn: 0.4869533	test: 0.4783568	best: 0.4784636 (1197)	total: 45.4s	remaining: 16.5s
1400:	learn: 0.4964237	test: 0.4857589	best: 0.4857589 (1400)	total: 54s	remaining: 9.14s
1600:	learn: 0.5056314	test: 0.4932016	best: 0.4932016 (1600)	total: 1m 2s	remaining: 1.45s
1637:	learn: 0.5072326	test: 0.4946184	best: 0.4946184 (1637)	total: 1m 4s	remaining: 0us
bestTest = 0.4946183781
bestIteration = 1637


<catboost.core.CatBoostClassifier at 0x7dca813d4890>

In [9]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

N_FOLDS = 5
oof, test_proba = np.zeros(len(df_train)), np.zeros((len(df_test), len(label_map)))

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
for f, (tr, va) in enumerate(skf.split(X_full, y_full), 1):
    print(f"Fold {f}/{N_FOLDS}")
    m = CatBoostClassifier(**final_params)
    m.fit(X_full.iloc[tr], y_full[tr],
          eval_set=(X_full.iloc[va], y_full[va]),
          early_stopping_rounds=60, verbose=False)
    oof[va] = m.predict(X_full.iloc[va]).flatten()
    test_proba += m.predict_proba(df_test.drop(columns=[ID_COL])) / N_FOLDS

print("OOF accuracy:", accuracy_score(y_full, oof))

Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5
OOF accuracy: 0.43042310469314077


In [10]:
#@title 9. Predicción y archivo de envío (corregido)
# ——— Generamos las predicciones como lista para evitar alineación por índice
test_features = df_test.drop(columns=[ID_COL])
pred_ints     = model_final.predict(test_features).astype(int).flatten()
pred_labels   = [inv_label_map[i] for i in pred_ints]

# ——— Construimos el DataFrame usando directamente los valores
submission = pd.DataFrame({
    ID_COL: df_test[ID_COL].values,
    TARGET: pred_labels
})

# ——— Guardamos sin índice para que quede “ID,RENDIMIENTO_GLOBAL” en la primera fila
OUT_PATH = f"{BASE_DIR}/submission_catboost.csv"
submission.to_csv(OUT_PATH, index=False)

print(f"✅ Archivo guardado en: {OUT_PATH}")
display(submission.head())

✅ Archivo guardado en: /content/drive/MyDrive/ai_data/submission_catboost.csv


Unnamed: 0,ID,RENDIMIENTO_GLOBAL
0,550236,bajo
1,98545,medio-alto
2,499179,alto
3,782980,bajo
4,785185,bajo
