<a href="https://colab.research.google.com/github/OsirisValencia/InteligenciaArtificialUdeA/blob/main/99_modelo_soluci%C3%B3n.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '.'
!kaggle
!chmod 600 ./kaggle.json

usage: kaggle [-h] [-v] [-W]
              {competitions,c,datasets,d,kernels,k,models,m,files,f,config}
              ...
kaggle: error: the following arguments are required: command


In [4]:
!kaggle competitions download -c udea-ai-4-eng-20251-pruebas-saber-pro-colombia
!unzip udea-ai-4-eng-20251-pruebas-saber-pro-colombia.zip

Downloading udea-ai-4-eng-20251-pruebas-saber-pro-colombia.zip to /content
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 836MB/s]
Archive:  udea-ai-4-eng-20251-pruebas-saber-pro-colombia.zip
  inflating: submission_example.csv  
  inflating: test.csv                
  inflating: train.csv               


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import warnings
import lightgbm as lgb
from lightgbm import LGBMClassifier

# Silence all warnings emitted from here on.
warnings.filterwarnings('ignore')

print("🚀 SOLUCIÓN ULTRA-RÁPIDA SABER PRO (LightGBM)")
print("=" * 50)

# ===================================================================
# 1. LOAD AND AGGRESSIVE SAMPLING
# ===================================================================
print("📂 Cargando datos...")
dtr, dts = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

print(f"Dataset original: {dtr.shape}")

# Keep only a stratified 50,000-row sample of the training data.
# train_test_split returns (remainder, sample); the sample is its "test" part,
# so it is the second element that replaces `dtr`.
print("⚡ Aplicando muestreo ultra-agresivo...")
sampling_options = dict(
    test_size=50000,
    random_state=42,
    stratify=dtr['RENDIMIENTO_GLOBAL'],
)
_, dtr = train_test_split(dtr, **sampling_options)
print(f"Dataset muestreado: {dtr.shape}")

# ===================================================================
# 2. PREPROCESAMIENTO ULTRA-RÁPIDO
# ===================================================================
def ultra_fast_preprocessing(df_train, df_test):
    """Build aligned, fully numeric feature matrices from the raw frames.

    Steps, applied identically to train and test:
      1. drop 'ID', 'FAMI_TIENEINTERNET.1' and the target column;
      2. replace 'PERIODO' by 'SEMESTRE' (its last digit);
      3. add the row-wise sum and standard deviation of coef_1..coef_4;
      4. map weekly work hours and housing stratum to numbers;
      5. add 0/1 flags for internet, computer and car ownership;
      6. add a socio-economic score (stratum + car flag + computer flag);
      7. map the parents' education level to an ordinal scale plus their max;
      8. impute numeric columns with the training median and the remaining
         columns with the training mode, then label-encode the latter.

    Args:
        df_train: training frame; must contain 'RENDIMIENTO_GLOBAL' plus the
            raw feature columns used above.
        df_test: test frame; must contain 'ID' plus the same raw features.

    Returns:
        Tuple ``(X_train, X_test, y_train, test_ids)`` where the first two are
        NumPy arrays sharing the same column order, ``y_train`` is the target
        Series and ``test_ids`` is the 'ID' Series of the test frame.

    Raises:
        KeyError: if a required column is missing, or if the test frame lacks
            a feature column that the training frame ends up with.
    """
    print("⚡ Preprocesamiento ultra-rápido...")

    # Save the submission IDs and the target before they are dropped.
    test_ids = df_test['ID'].copy()
    y_train = df_train['RENDIMIENTO_GLOBAL'].copy()

    # drop() returns new frames, so the in-place edits below never touch the
    # frames owned by the caller.
    drop_cols = ['ID', 'FAMI_TIENEINTERNET.1', 'RENDIMIENTO_GLOBAL']
    df_train = df_train.drop(drop_cols, axis=1, errors='ignore')
    df_test = df_test.drop(drop_cols, axis=1, errors='ignore')

    # 1. Period -> SEMESTRE (last character of the period code, as an int)
    for df in [df_train, df_test]:
        df['SEMESTRE'] = df['PERIODO'].astype(str).str[-1].astype(int)
        df.drop('PERIODO', axis=1, inplace=True)

    # 2. Sum and standard deviation of the coefficients
    coef_cols = ['coef_1', 'coef_2', 'coef_3', 'coef_4']
    for df in [df_train, df_test]:
        df['coef_sum'] = df[coef_cols].sum(axis=1)
        df['coef_std'] = df[coef_cols].std(axis=1).fillna(0)

    # 3. Ordinal variables (unmapped or missing values fall back to 0 hours / stratum 2)
    horas_map = {
        'Menos de 10 horas': 5,
        'Entre 11 y 20 horas': 15,
        'Entre 21 y 30 horas': 25,
        'Más de 30 horas': 35,
        '0': 0
    }
    estrato_map = {
        'Estrato 1': 1, 'Estrato 2': 2, 'Estrato 3': 3,
        'Estrato 4': 4, 'Estrato 5': 5, 'Estrato 6': 6,
        'Sin Estrato': 0, 'No aplica': 0
    }
    for df in [df_train, df_test]:
        df['HORAS_NUM'] = df['ESTU_HORASSEMANATRABAJA'].astype(str)\
                            .map(horas_map).fillna(0)
        df['ESTRATO_NUM'] = df['FAMI_ESTRATOVIVIENDA'].astype(str)\
                            .map(estrato_map).fillna(2)

    # 4. Binary variables: 1 when the answer is 'Si'/'S', 0 otherwise (missing included)
    binary_cols = ['FAMI_TIENEINTERNET', 'FAMI_TIENECOMPUTADOR', 'FAMI_TIENEAUTOMOVIL']
    for col in binary_cols:
        for df in [df_train, df_test]:
            if col in df.columns:
                df[f'{col}_BIN'] = (df[col].astype(str).isin(['Si', 'S'])).astype(int)

    # 5. Socio-economic score; a missing flag column contributes 0
    for df in [df_train, df_test]:
        df['SOCIO_SCORE'] = (
            df['ESTRATO_NUM']
            + df.get('FAMI_TIENEAUTOMOVIL_BIN', 0)
            + df.get('FAMI_TIENECOMPUTADOR_BIN', 0)
        )

    # 6. Parents' education (unmapped or missing values fall back to level 4)
    edu_map = {
        'Ninguno': 0, 'No sabe': 0,
        'Primaria incompleta': 1, 'Primaria completa': 2,
        'Secundaria (Bachillerato) incompleta': 3,
        'Secundaria (Bachillerato) completa': 4,
        'Técnica o tecnológica incompleta': 5,
        'Técnica o tecnológica completa': 6,
        'Educación profesional incompleta': 7,
        'Educación profesional completa': 8,
        'Postgrado': 9
    }
    for df in [df_train, df_test]:
        df['EDU_PADRE'] = df['FAMI_EDUCACIONPADRE'].map(edu_map).fillna(4)
        df['EDU_MADRE'] = df['FAMI_EDUCACIONMADRE'].map(edu_map).fillna(4)
        df['EDU_MAX'] = df[['EDU_PADRE', 'EDU_MADRE']].max(axis=1)

    # The matrices are returned positionally (.values), so the test frame must
    # follow exactly the training column order; extra test columns are dropped
    # and a missing one raises KeyError instead of silently shifting features.
    df_test = df_test[list(df_train.columns)].copy()

    # Split columns by dtype. select_dtypes('number') also catches int32 and
    # float32 columns, which a literal int64/float64 check would send to the
    # label encoder as strings.
    numeric_cols = list(df_train.select_dtypes(include='number').columns)
    numeric_lookup = set(numeric_cols)
    categorical_cols = [c for c in df_train.columns if c not in numeric_lookup]

    print(f"Procesando {len(numeric_cols)} numéricas y {len(categorical_cols)} categóricas")

    # Numeric imputation with medians learned on the training frame only
    if numeric_cols:
        medians = df_train[numeric_cols].median()
        df_train[numeric_cols] = df_train[numeric_cols].fillna(medians)
        df_test[numeric_cols] = df_test[numeric_cols].fillna(medians)

    # Categorical imputation (training mode) followed by label encoding
    for col in categorical_cols:
        modes = df_train[col].mode()
        mode_val = modes.iloc[0] if not modes.empty else 'unknown'
        df_train[col] = df_train[col].fillna(mode_val)
        df_test[col] = df_test[col].fillna(mode_val)
        # Fit on train + test together so labels seen only in test can still
        # be transformed.
        le = LabelEncoder()
        combined = pd.concat([df_train[col], df_test[col]]).astype(str)
        le.fit(combined)
        df_train[col] = le.transform(df_train[col].astype(str))
        df_test[col] = le.transform(df_test[col].astype(str))

    return df_train.values, df_test.values, y_train, test_ids

# Run the preprocessing on the sampled training frame and the test frame.
X, X_test, y, test_ids = ultra_fast_preprocessing(dtr, dts)
print(f"Features finales: {X.shape[1]}")

# ===================================================================
# 3. QUICK SPLIT
# ===================================================================
print("📊 Dividiendo datos en train/val...")
# Hold out 20% of the rows for validation, stratified on the target.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)
print(f"Train: {X_train.shape}, Validation: {X_val.shape}")

# ===================================================================
# 4. TRAIN LightGBM (no early stopping in fit)
# ===================================================================
print("🎯 Entrenando LightGBM con parámetros fijos...")
model = LGBMClassifier(
    n_estimators=200,
    max_depth=15,
    learning_rate=0.1,
    num_leaves=31,
    colsample_bytree=0.8,
    subsample=0.8,
    # LightGBM only applies `subsample` when bagging is switched on with
    # subsample_freq > 0; with the default of 0 the 0.8 above is ignored.
    subsample_freq=1,
    random_state=42,
    n_jobs=-1
)

# Plain fit on the training split; the validation split is only scored afterwards.
model.fit(X_train, y_train)

# ===================================================================
# 5. EVALUATION
# ===================================================================
print("📈 Evaluando en el set de validación...")
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f"\n🎯 ACCURACY (LightGBM): {acc:.4f}")
print("\n📊 Reporte de clasificación:")
validation_report = classification_report(y_val, y_pred, zero_division=0)
print(validation_report)

# ===================================================================
# 6. FINAL TRAINING AND PREDICTIONS
# ===================================================================
print("🔄 Reentrenando en todos los datos disponibles...")
# Stack the training and validation splits back together (rows, then labels)
# and refit the same estimator on the result.
X_full = np.concatenate([X_train, X_val], axis=0)
y_full = np.concatenate([y_train, y_val])
model.fit(X_full, y_full)

print("🎯 Generando predicciones para test...")
preds_test = model.predict(X_test)
# ===================================================================
# 7. CREATE SUBMISSION
# ===================================================================
# Two-column file: the test IDs next to their predicted class.
submission = pd.DataFrame({'ID': test_ids, 'RENDIMIENTO_GLOBAL': preds_test})
submission.to_csv('submission_lightgbm.csv', index=False)
print("✅ Submission creada: submission_lightgbm.csv")

# ===================================================================
# 8. FEATURE IMPORTANCES
# ===================================================================
print("\n🔝 Top 5 características (LightGBM):")
fi = model.feature_importances_
# argsort is ascending: take the last five positions and reverse them so the
# most important feature comes first. Features are reported by column index.
top5 = np.argsort(fi)[-5:][::-1]
for rank, feature_idx in enumerate(top5, start=1):
    print(f"  {rank}. Feature {feature_idx} → {fi[feature_idx]}")

print("\n🎉 ¡Todo listo para Kaggle!")


🚀 SOLUCIÓN ULTRA-RÁPIDA SABER PRO (LightGBM)
📂 Cargando datos...
Dataset original: (692500, 21)
⚡ Aplicando muestreo ultra-agresivo...
Dataset muestreado: (50000, 21)
⚡ Preprocesamiento ultra-rápido...
Procesando 16 numéricas y 13 categóricas
Features finales: 29
📊 Dividiendo datos en train/val...
Train: (40000, 29), Validation: (10000, 29)
🎯 Entrenando LightGBM con parámetros fijos...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005601 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1726
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 28
[LightGBM] [Info] Start training from score -1.371997
[LightGBM] [Info] Start training from score -1.387095
[LightGBM] [Info] Start training from score -1.395032
[LightGBM] [Info] Start training from score -1.391206
📈 Evaluando en el set de validación...

🎯 

In [9]:
# Upload submission_lightgbm.csv to the competition with a description message.
# NOTE: every run of this cell sends a new submission to Kaggle.
!kaggle competitions submit -c udea-ai-4-eng-20251-pruebas-saber-pro-colombia -f submission_lightgbm.csv -m 'Solución final con lightGBM'


100% 4.06M/4.06M [00:00<00:00, 6.09MB/s]
Successfully submitted to UDEA/ai4eng 20251 - Pruebas Saber Pro Colombia