In [8]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# 1. Load the dataset; the file is read with ';' as the field separator.
print("Cargando dataset...")
try:
    # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also discards
    # a leading byte-order mark. With plain 'utf-8' a BOM would stay glued to
    # the first header as an invisible '\ufeff' prefix, which neither the
    # regex nor strip() below removes, so that name would stop matching.
    df = pd.read_csv('data.csv', sep=';', encoding='utf-8-sig')
    # Clean the column names: remove stray tabs, double quotes and
    # semicolons, then trim surrounding whitespace.
    df.columns = df.columns.str.replace(r'[\t";]', '', regex=True).str.strip()
except Exception as e:
    # Tell the user what went wrong, then re-raise so execution stops here
    # instead of continuing without a DataFrame.
    print(f"Error al cargar el archivo: {e}")
    raise

# 2. Print a quick summary of the loaded data: its dimensions and a preview
#    of the first rows.
n_rows, n_cols = df.shape
print("\nInformación del dataset:")
print(f"Filas: {n_rows}, Columnas: {n_cols}")
print("\nPrimeras 5 filas:")
print(df.head())

# 3. Choose the candidate feature columns and the target column.
feature_columns = [
    'Marital status',
    'Application mode',
    'Application order',
    'Daytime/evening attendance',
    'Previous qualification',
    'Previous qualification (grade)',
    'Admission grade',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'Age at enrollment',
]

target_column = 'Target'

# Check the requested columns against what the file actually provides.
# Absent features are reported and dropped so the run can continue with the
# ones that are present.
available_columns = set(df.columns)
missing_features = [col for col in feature_columns if col not in available_columns]
if missing_features:
    print("\n¡Advertencia! Columnas no encontradas:", missing_features)
    print("Columnas disponibles:", df.columns.tolist())
    feature_columns = [col for col in feature_columns if col in available_columns]

# The target is mandatory: without it there is nothing to predict.
if target_column not in available_columns:
    raise ValueError(f"Columna objetivo '{target_column}' no encontrada")

# Fail fast when no feature survived the filter above; otherwise an empty
# feature matrix would only surface later as a less helpful error during
# preprocessing or training.
if not feature_columns:
    raise ValueError("Ninguna de las columnas de características fue encontrada")

# 4. Separate the predictor columns (X) from the label column (y).
X, y = df[feature_columns], df[target_column]

# 5. Preprocessing
# Group the feature columns by dtype. 'number' matches integer and float
# columns of any width (not only int64/float64), so e.g. int32 or float32
# columns are routed to the numeric branch as well.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# ColumnTransformer discards every column that is not assigned to a
# transformer (its default is remainder='drop'). Name any column that matched
# neither group so it is not lost from the model without notice.
unassigned_features = [
    col for col in X.columns
    if col not in numeric_features and col not in categorical_features
]
if unassigned_features:
    print("\n¡Advertencia! Columnas sin preprocesador (serán descartadas):",
          unassigned_features)

# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps prediction from failing on categories
# that were not seen during fitting.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each branch to its own group of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# 6. Assemble the end-to-end pipeline: preprocessing followed by a random
#    forest with 200 trees, balanced class weights and a fixed seed.
forest = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

# 7. Hold out 30% of the rows for testing; the split is seeded and
#    stratified on y.
split_options = {'test_size': 0.3, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# 8. Fit the whole pipeline (preprocessing + classifier) on the training part.
print("\nEntrenando modelo...")
model.fit(X_train, y_train)

# 9. Evaluate on the held-out rows: predict once, then print each report
#    under its own heading.
print("\nEvaluación del modelo:")
y_pred = model.predict(X_test)

for heading, metric in (
    ("\nReporte de clasificación:", classification_report),
    ("\nMatriz de confusión:", confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))

# 10. Persist the fitted pipeline to disk.
# The output path is defined once so the confirmation message always names
# the file that was actually written.
model_path = 'student_dropout_predictor.joblib'
joblib.dump(model, model_path)
print(f"\n✅ Modelo guardado como '{model_path}'")

# Optional: show the ten features with the highest importance scores.
fitted_classifier = model.named_steps['classifier']
if hasattr(fitted_classifier, 'feature_importances_'):
    print("\nImportancia de las características:")
    try:
        # Rebuild the column names as they look after preprocessing, in the
        # order the ColumnTransformer was declared: numeric columns first,
        # then the one-hot encoded columns.
        feature_names = list(numeric_features)
        if categorical_features:
            cat_pipeline = model.named_steps['preprocessor'].named_transformers_['cat']
            encoder = cat_pipeline.named_steps['onehot']
            feature_names += list(encoder.get_feature_names_out(categorical_features))

        importances = pd.DataFrame({
            'Feature': feature_names,
            'Importance': fitted_classifier.feature_importances_,
        })
        importances = importances.sort_values('Importance', ascending=False)

        print(importances.head(10))
    except Exception as err:
        # Best-effort report: a failure here is only printed, not re-raised.
        print(f"No se pudo obtener importancia de características: {err}")

Cargando dataset...

Información del dataset:
Filas: 4424, Columnas: 37

Primeras 5 filas:
   Marital status  Application mode  Application order  Course  \
0               1                17                  5     171   
1               1                15                  1    9254   
2               1                 1                  5    9070   
3               1                17                  2    9773   
4               2                39                  1    8014   

   Daytime/evening attendance  Previous qualification  \
0                           1                       1   
1                           1                       1   
2                           1                       1   
3                           1                       1   
4                           0                       1   

   Previous qualification (grade)  Nacionality  Mother's qualification  \
0                           122.0            1                      19   
1                    