<a href="https://colab.research.google.com/github/Nomaqui/ENTREGAS/blob/main/02%20-%20preprocesado.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Load the training set from the working directory.
df = pd.read_csv('train.csv')

# Target variable to predict.
y = df['RENDIMIENTO_GLOBAL']

# Feature matrix: every column except the target, the row identifier and
# ESTU_PRIVADO_LIBERTAD. The ID column is discarded here, not stored.
columnas_descartadas = ['RENDIMIENTO_GLOBAL', 'ID', 'ESTU_PRIVADO_LIBERTAD']
X = df.drop(columns=columnas_descartadas)


Separar columnas por tipo

In [None]:
# Split the feature names by dtype so each group can get its own imputation
# strategy and only the categorical group is one-hot encoded.
tipos_categoricos = ['object', 'category']
tipos_numericos = ['int64', 'float64']

cat_cols = list(X.select_dtypes(include=tipos_categoricos).columns)
num_cols = list(X.select_dtypes(include=tipos_numericos).columns)

print("Categóricas:", cat_cols)
print("Numéricas:", num_cols)

Imputar numéricas con la media

In [None]:
# Replace missing numeric values with the per-column mean.
# NOTE(review): the imputer is fitted on all of X, before the train/validation
# split performed later in the notebook, so the validation rows contribute to
# the means used to fill the training rows.
imp_num = SimpleImputer(strategy='mean').fit(X[num_cols])
X[num_cols] = imp_num.transform(X[num_cols])

Imputar categóricas con la moda

In [None]:
# Replace missing categorical values with the most frequent value of each column.
# NOTE(review): like the numeric imputer, this one is fitted on all of X,
# before the train/validation split performed later in the notebook.
imp_cat = SimpleImputer(strategy='most_frequent').fit(X[cat_cols])
X[cat_cols] = imp_cat.transform(X[cat_cols])

In [None]:
# Count how many rows each academic program has.
frecuencias = X['ESTU_PRGM_ACADEMICO'].value_counts()

# Programs with 1000 rows or fewer are treated as infrequent.
categorias_poco_frecuentes = frecuencias.loc[frecuencias.le(1000)].index

# Collapse every infrequent program into the single level "POCO_FRECUENTE"
# so the one-hot encoding does not create a column per rare program.
es_poco_frecuente = X['ESTU_PRGM_ACADEMICO'].isin(categorias_poco_frecuentes)
X['ESTU_PRGM_ACADEMICO'] = X['ESTU_PRGM_ACADEMICO'].mask(es_poco_frecuente, 'POCO_FRECUENTE')

Convertir las columnas categóricas a codificación one-hot

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column, so that level is represented by all-zero indicators.
X_convertida = pd.get_dummies(
    data=X,
    columns=cat_cols,
    drop_first=True,
)

Mostrar la tabla resultante, con las columnas categóricas ya codificadas en one-hot

In [None]:
# Display the one-hot encoded feature table as this cell's output.
X_convertida

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; stratify=y keeps the class
# proportions of the target the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_convertida, y, test_size=0.2, random_state=19, stratify=y
)

# Random forest baseline.
clf = RandomForestClassifier(
    n_estimators=300,         # more trees usually improve performance
    max_depth=20,             # cap tree depth to limit overfitting
    min_samples_split=5,      # require more samples before splitting a node
    min_samples_leaf=2,       # avoid single-sample leaves (less overfitting)
    max_features='sqrt',      # consider sqrt(n_features) candidates per split
    n_jobs=-1,                # build the trees on all available cores
    random_state=19,
)
clf.fit(X_train, y_train)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Standardize the features: the scaler is fitted on the training split only
# and then reused on the validation split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base models
clf1 = LogisticRegression(max_iter=2000, solver='lbfgs', random_state=19)
# NOTE(review): kernel SVC fit time grows at least quadratically with the
# number of samples, and probability=True adds an internal cross-validation.
# The validation split reported in this notebook has 138500 rows at
# test_size=0.2, which implies roughly 554000 training rows -- confirm this
# model can actually finish training at that size.
clf2 = SVC(probability=True, random_state=19)
clf3 = RandomForestClassifier(n_estimators=300, max_depth=20,
                              min_samples_split=5, min_samples_leaf=2,
                              max_features='sqrt', n_jobs=-1,
                              random_state=19)

# Soft-voting ensemble: averages the class probabilities of the three models.
eclf = VotingClassifier(estimators=[
    ('lr', clf1),
    ('svc', clf2),
    ('rf', clf3)
], voting='soft')

# Train
eclf.fit(X_train_scaled, y_train)

# Predict on the validation split. Stored under its own name because the
# evaluation cell further down assigns `y_pred` from the random forest `clf`,
# which would otherwise overwrite the ensemble predictions before any use.
y_pred_ensemble = eclf.predict(X_test_scaled)

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
# NOTE(review): the cells in this notebook read and write their files through
# relative paths ('train.csv', "test.csv", "submission.csv"), not through
# /content/drive -- confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the random forest `clf` on the held-out validation split.
y_pred = clf.predict(X_test)

# Per-class precision/recall/F1, followed by the raw confusion matrix.
informe = classification_report(y_test, y_pred)
matriz_confusion = confusion_matrix(y_test, y_pred)
print(informe)
print(matriz_confusion)

              precision    recall  f1-score   support

        alto       0.50      0.61      0.55     35124
        bajo       0.42      0.54      0.47     34597
  medio-alto       0.31      0.21      0.25     34324
  medio-bajo       0.31      0.25      0.28     34455

    accuracy                           0.41    138500
   macro avg       0.39      0.40      0.39    138500
weighted avg       0.39      0.41      0.39    138500

[[21595  4066  5590  3873]
 [ 4101 18663  4303  7530]
 [10856  8720  7287  7461]
 [ 6596 13104  6105  8650]]


In [None]:
# Load the test set and keep its identifiers for the submission file.
df_test = pd.read_csv("test.csv")
ids_test = df_test["ID"]

# Drop the same non-feature columns that were removed from the training
# features; errors="ignore" tolerates a test file without ESTU_PRIVADO_LIBERTAD.
# NOTE(review): this reuses the name X_test, replacing the validation features
# produced by train_test_split; re-running the evaluation cell after this one
# would score these rows against y_test instead.
X_test = df_test.drop(columns=["ID", "ESTU_PRIVADO_LIBERTAD"], errors="ignore")

In [None]:
# Fill missing values with the imputers already fitted on the training features.
X_test[num_cols] = imp_num.transform(X_test[num_cols])
X_test[cat_cols] = imp_cat.transform(X_test[cat_cols])

# Mirror the training-time grouping of ESTU_PRGM_ACADEMICO: any program that is
# not one of the levels present in the training column (which already contains
# 'POCO_FRECUENTE') is mapped to 'POCO_FRECUENTE'. Without this step the rare
# programs keep their own names, their indicator columns are discarded by the
# reindex below, and those rows end up encoded as the all-zero baseline level.
niveles_entrenamiento = X['ESTU_PRGM_ACADEMICO'].unique()
X_test['ESTU_PRGM_ACADEMICO'] = X_test['ESTU_PRGM_ACADEMICO'].where(
    X_test['ESTU_PRGM_ACADEMICO'].isin(niveles_entrenamiento),
    'POCO_FRECUENTE',
)

# One-hot encode every level (no drop_first here): the first level found in the
# test data is not guaranteed to be the one dropped during training, so the
# choice of which columns survive is left entirely to the reindex below.
X_test_convertida = pd.get_dummies(X_test, columns=cat_cols)

# Align with the training design matrix: keep exactly its columns, in its
# order, and fill indicators that never occur in the test data with 0.
X_test_convertida = X_test_convertida.reindex(columns=X_convertida.columns, fill_value=0)

In [None]:
# Predict the target for the aligned test features with the random forest `clf`.
y_test_pred = clf.predict(X_test_convertida)


In [None]:
# Sanity check: number of predictions produced for the test set.
print(len(y_test_pred))

296786


In [None]:
# Build the submission table: one row per test ID with its predicted label.
submission = pd.DataFrame({"ID": ids_test}).assign(
    RENDIMIENTO_GLOBAL=y_test_pred
)

In [None]:
# Write the submission to submission.csv in the working directory, without the index column.
submission.to_csv("submission.csv", index=False)
