In [2]:
!pip install -q lightgbm==3.3.5 category_encoders scikit-learn pandas numpy

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m1.9/2.0 MB[0m [31m56.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# Mount Google Drive (Colab only) and read the train/test CSVs stored on it.
from google.colab import drive

drive.mount('/content/drive')

# Folder on the mounted drive that holds train.csv and test.csv.
BASE_DIR = '/content/drive/MyDrive/ai_data'

# Both splits live side by side under BASE_DIR and are read the same way.
train, test = (
    pd.read_csv(f"{BASE_DIR}/{split_name}.csv")
    for split_name in ("train", "test")
)

Mounted at /content/drive


In [5]:
# Quick look at the training frame: dimensions, relative frequency of each
# target class, and per-column dtypes / non-null counts.
print(f"Train shape: {train.shape}")

class_share = train['RENDIMIENTO_GLOBAL'].value_counts(normalize=True)
print(class_share)

train.info()

Train shape: (692500, 21)
RENDIMIENTO_GLOBAL
alto          0.253601
bajo          0.249801
medio-bajo    0.248773
medio-alto    0.247825
Name: proportion, dtype: float64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 692500 entries, 0 to 692499
Data columns (total 21 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   ID                              692500 non-null  int64  
 1   PERIODO                         692500 non-null  int64  
 2   ESTU_PRGM_ACADEMICO             692500 non-null  object 
 3   ESTU_PRGM_DEPARTAMENTO          692500 non-null  object 
 4   ESTU_VALORMATRICULAUNIVERSIDAD  686213 non-null  object 
 5   ESTU_HORASSEMANATRABAJA         661643 non-null  object 
 6   FAMI_ESTRATOVIVIENDA            660363 non-null  object 
 7   FAMI_TIENEINTERNET              665871 non-null  object 
 8   FAMI_EDUCACIONPADRE             669322 non-null  object 
 9   FAMI_TIENELAVADORA              

In [15]:
# Preprocessing: target-encode the categorical columns, median-impute the
# numeric ones, and rebuild the train / test matrices used by the model cells.
TARGET = 'RENDIMIENTO_GLOBAL'
ID_COL = 'ID'

# Label <-> integer code mapping. The codes are needed *before* fitting the
# target encoder (it averages the target, so it needs a numeric one) and again
# when turning predictions back into labels.
y_map = {'bajo': 0, 'medio-bajo': 1, 'medio-alto': 2, 'alto': 3}
inv_y = {v: k for k, v in y_map.items()}

# Stack train and test so both go through exactly the same transforms.
# ignore_index=True gives every row a unique label; without it the two frames'
# indexes overlap and label-based alignment becomes ambiguous.
full = pd.concat(
    [train.assign(_is_train=1), test.assign(_is_train=0)],
    axis=0,
    ignore_index=True,
)
train_mask = full['_is_train'] == 1
# Train rows that actually carry a label; only these can be used for fitting.
labeled_mask = train_mask & full[TARGET].notna()

# Identify feature columns by dtype.
cat_cols = full.select_dtypes(include=['object']).columns.drop([TARGET])
num_cols = full.select_dtypes(include=[np.number]).columns.drop([ID_COL, '_is_train'])

# Integer-coded target for the labelled train rows. Fail loudly on labels that
# are missing from y_map instead of silently training on NaN targets.
y_codes = full.loc[labeled_mask, TARGET].map(y_map)
if y_codes.isna().any():
    unknown_labels = full.loc[labeled_mask, TARGET][y_codes.isna()].unique()
    raise ValueError(f"Unexpected labels in {TARGET}: {list(unknown_labels)}")
y_codes = y_codes.astype(int)

# Fit the target encoder on labelled train rows only (no test information),
# using the numeric codes, then transform train and test together.
# NOTE(review): train rows are encoded with statistics that include their own
# label; consider out-of-fold encoding if validation scores look optimistic.
tencoder = TargetEncoder(cols=list(cat_cols))
tencoder.fit(full.loc[labeled_mask, cat_cols], y_codes)
full_cat = tencoder.transform(full[cat_cols])

# Median imputation of numeric columns, with medians learned from train rows
# only so the test set does not influence the fitted statistics.
imp = SimpleImputer(strategy='median')
imp.fit(full.loc[train_mask, num_cols])
full_num = pd.DataFrame(
    imp.transform(full[num_cols]),
    columns=num_cols,
    index=full.index
)

# Combine ID, target and the processed feature columns (row order == `full`).
df_proc = pd.concat([
    full[[ID_COL, TARGET]].reset_index(drop=True),
    full_cat.reset_index(drop=True),
    full_num.reset_index(drop=True)
], axis=1)

# 10. Rebuild the train / test sets.
# Split on the explicit `_is_train` flag rather than on "target is NaN": a
# train row with a missing label must never end up in the test matrix, or the
# predictions would no longer line up with the IDs in `test`.
df_train = df_proc[labeled_mask.to_numpy()].copy()
df_test = df_proc[~train_mask.to_numpy()].drop(columns=[TARGET]).copy()

# Features and target.
X = df_train.drop(columns=[ID_COL, TARGET])
y = df_train[TARGET].map(y_map).astype(int)
X_test = df_test.drop(columns=[ID_COL], errors='ignore')

# Invariants the training and submission cells rely on.
assert len(X) == len(y)
assert len(X_test) == len(test), "X_test must have exactly one row per test ID"
assert list(X.columns) == list(X_test.columns)

In [16]:
# 11. Train LightGBM on the whole training set (no hold-out split here).
from lightgbm import LGBMClassifier

# Hyperparameters kept in one place so they are easy to inspect and tweak.
lgb_params = dict(
    objective='multiclass',
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,  # fixed seed for reproducible runs
)

# fit() returns the fitted estimator, so construction and fitting can be chained.
model_lgb = LGBMClassifier(**lgb_params).fit(X, y)
print('✅ Model_lgb entrenado sobre todo el train')

✅ Model_lgb entrenado sobre todo el train


In [17]:
# Predict integer class codes for the test features, translate them back to
# the original string labels, and write the submission file next to the data.
final_pred = model_lgb.predict(X_test)
pred_labels = [inv_y[code] for code in final_pred.astype(int).tolist()]

sub = pd.DataFrame({ID_COL: test[ID_COL], TARGET: pred_labels})

submission_path = f"{BASE_DIR}/submission_alternative.csv"
sub.to_csv(submission_path, index=False)
print("✅ Submission saved en submission_alternative.csv")

✅ Submission saved en submission_alternative.csv
