In [None]:
# Clean install of mutually compatible dependency versions.
# The existing copies are uninstalled first, then each package is reinstalled
# at an explicit version.
!pip uninstall -y -q catboost numpy scipy scikit-learn
!pip install -q numpy==1.24.4
!pip install -q scipy==1.10.1
!pip install -q scikit-learn==1.2.2
# NOTE(review): catboost and optuna are not imported in the visible cells of
# this notebook — confirm they are still needed. optuna is left unpinned.
!pip install -q catboost==1.2.3 optuna

In [17]:
# 0. Library installation
# NOTE(review): none of these packages is version-pinned, and category_encoders
# is not imported in the visible cells — confirm it is required and that this
# install does not disturb versions pinned elsewhere in the notebook.
!pip install -q scikit-learn pandas numpy category_encoders lightgbm

# %%
# 1. Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
from google.colab import drive

# %%
# 2. Mount Google Drive and define the data paths
drive.mount('/content/drive')
# Folder on the mounted drive that holds the input CSVs; the submission file
# produced at the end of the notebook is written to this same folder.
BASE_DIR = '/content/drive/MyDrive/ai_data'
TRAIN_PATH = f"{BASE_DIR}/train.csv"
TEST_PATH  = f"{BASE_DIR}/test.csv"

# %%
# 3. Data loading
# Read the raw train and test tables and print their (rows, columns) shapes
# as a quick sanity check.
df_train = pd.read_csv(TRAIN_PATH)
df_test  = pd.read_csv(TEST_PATH)
print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)

# %%
# 4. Preprocessing
TARGET = 'RENDIMIENTO_GLOBAL'  # label column; mapped to integer class ids before training
ID_COL = 'ID'  # row identifier; dropped from the features and written to the submission

def freq_encode(df, cols):
    """Frequency-encode the given columns of ``df`` in place.

    Each value in a listed column is replaced by the share of rows of that
    column holding the same value (as computed by
    ``value_counts(normalize=True)``). Missing values are not counted and
    stay missing after encoding.

    Args:
        df: DataFrame to encode. It is modified in place.
        cols: Iterable of column names to encode.

    Returns:
        The same ``df`` object, with the listed columns overwritten.
    """
    for name in cols:
        column = df[name]
        # Relative frequency of every distinct value, indexed by that value.
        df[name] = column.map(column.value_counts(normalize=True))
    return df

# Stack train and test into one frame so every transformation below sees the
# same columns. Test rows receive a NaN target, which is how they are told
# apart again in step 5.
df_test[TARGET] = np.nan
full = pd.concat([df_train, df_test], ignore_index=True)
# Object-dtype columns other than the target are treated as categorical.
# drop([TARGET]) has no errors='ignore', so the target must itself be an
# object-dtype column or this line raises.
cat_cols = full.select_dtypes(include=['object']).columns.drop([TARGET])
# Numeric columns other than the ID (and the target, if it were numeric).
num_cols = full.select_dtypes(include=[np.number]).columns.drop([ID_COL, TARGET], errors='ignore')

# Encoding + imputation / scaling.
# freq_encode works on a copy so `full` keeps the raw categorical values.
# NOTE(review): the category frequencies, the imputer, the scaler and the PCA
# are all fit on train and test rows together — confirm this is intended.
full_enc = freq_encode(full.copy(), cat_cols)
imp = SimpleImputer(strategy='median')
scaler = StandardScaler()
num_scaled = scaler.fit_transform(imp.fit_transform(full_enc[num_cols]))

# PCA on the scaled numeric block only. A fractional n_components asks
# scikit-learn to keep as many components as needed to reach that share of
# explained variance (0.95 here).
pca = PCA(n_components=0.95, random_state=42)
pca_feats = pca.fit_transform(num_scaled)
print(f"Componentes PCA: {pca.n_components_}")

# Processed frame: [ID, target] + principal components + encoded categoricals.
# Every piece is positionally aligned with `full`, whose index was reset by
# the concat above.
df_proc = pd.DataFrame(pca_feats, columns=[f"pc{i+1}" for i in range(pca.n_components_)])
df_proc[cat_cols] = full_enc[cat_cols].reset_index(drop=True)
meta = full[[ID_COL, TARGET]].reset_index(drop=True)
df_proc = pd.concat([meta, df_proc], axis=1)

# %%
# 5. Split the processed frame back into train and test
# Training rows are the ones that carry a target; test rows were given a NaN
# target before the merge.
df_train2 = df_proc.loc[df_proc[TARGET].notna()]
df_test2  = df_proc.loc[df_proc[TARGET].isna()].drop(TARGET, axis=1)

# Target mapping: ordinal class ids for the four performance levels, plus the
# reverse lookup used to turn predictions back into labels.
y_map = {
    'bajo': 0,
    'medio-bajo': 1,
    'medio-alto': 2,
    'alto': 3,
}
inv_map = dict(zip(y_map.values(), y_map.keys()))

# Feature matrices exclude the identifier (and, for train, the target).
y = df_train2[TARGET].map(y_map).astype(int)
X = df_train2.drop([ID_COL, TARGET], axis=1)
X_test = df_test2.drop(ID_COL, axis=1)

# %%
# 6. Hold-out split for local validation
# Stratified 80/20 split so both parts keep the class proportions of `y`.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Median imputation of the remaining NaNs in the feature matrices.
# Column labels are cast to str first so all three frames share identical,
# string-typed feature names.
cols = X_tr.columns.astype(str)
X_tr.columns = cols; X_val.columns = cols; X_test.columns = cols
imp_s = SimpleImputer(strategy='median')
# The imputer is fit on the training part only and then applied to the
# validation and test parts. It returns plain arrays, so the original row
# index is passed back explicitly: without it X_tr / X_val would get a fresh
# 0..n-1 index that no longer matches y_tr / y_val.
X_tr = pd.DataFrame(imp_s.fit_transform(X_tr), columns=cols, index=X_tr.index)
X_val = pd.DataFrame(imp_s.transform(X_val), columns=cols, index=X_val.index)
X_test = pd.DataFrame(imp_s.transform(X_test), columns=cols, index=X_test.index)

# %%
# 7. Quick LightGBM fit on GPU (early stopping is supplied as a callback to
# avoid the early-stop errors of the wrapper)
gbm_quick = lgb.LGBMClassifier(
    # Task definition
    objective='multiclass',
    num_class=4,
    # Device selection
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
    # Tree shape and boosting schedule
    n_estimators=200,
    learning_rate=0.1,
    num_leaves=31,
    max_depth=10,
    # Reproducibility
    random_state=42,
)

gbm_quick.fit(
    X_tr,
    y_tr,
    # The hold-out split is monitored with multi-class log-loss.
    eval_set=[(X_val, y_val)],
    eval_metric='multi_logloss',
    callbacks=[
        # Stop once the validation metric has not improved for 20 rounds.
        lgb.early_stopping(stopping_rounds=20),
        # Print the validation metric every 50 rounds.
        lgb.log_evaluation(period=50),
    ],
)
print("✅ LightGBM rápido completado")

# %%
# 8. Local evaluation on the hold-out split
pred_val = gbm_quick.predict(X_val)
print("Accuracy validación:", accuracy_score(y_val, pred_val))
# Pass the class ids explicitly and derive each display name from its id, so
# the report rows cannot be mislabelled if the mapping order changes or a
# class happens to be absent from y_val / pred_val.
class_ids = sorted(inv_map)
print(classification_report(
    y_val,
    pred_val,
    labels=class_ids,
    target_names=[inv_map[i] for i in class_ids],
))

# %%
# %%
# 9. Final training and submission

# Refit on train + validation without an internal evaluation set.
# The fit in step 7 used early stopping to find the number of boosting rounds
# that performed best on the hold-out split; that count is reused here so the
# final model is not trained for the full n_estimators regardless of what
# validation showed. If no best iteration is available (None or 0), the
# configured n_estimators is kept.
best_iter = gbm_quick.best_iteration_
if best_iter:
    gbm_quick.set_params(n_estimators=best_iter)
# ignore_index gives features and labels the same clean 0..n-1 index.
gbm_quick.fit(
    pd.concat([X_tr, X_val], ignore_index=True),
    pd.concat([y_tr, y_val], ignore_index=True)
)

# Predict the test set and write the submission file.
# Predicted class ids are mapped back to their original string labels; IDs are
# taken positionally from df_test2, which has the same row order as X_test.
preds = gbm_quick.predict(X_test)
submission = pd.DataFrame({
    ID_COL: df_test2[ID_COL].values,
    TARGET: [inv_map[int(p)] for p in preds]
})
submission_path = f"{BASE_DIR}/submission_pca_lgbm_quick.csv"
submission.to_csv(submission_path, index=False)
print("✅ Submission creada en:", submission_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train shape: (692500, 21)
Test shape: (296786, 20)
Componentes PCA: 4
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1328
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 18
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 13 dense feature groups (8.45 MB) transferred to GPU in 0.014582 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score -1.387089
[LightGBM] [Info] Start training from score -1.391216
[LightGBM] [Info] Start training from score -1.395033
[LightGBM] [Info] Start training from score -1.371993
Training un