In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Cargar los datos procesados
data_path = './data/label_encoded_scaled_data.csv'  # Reemplaza con la ruta correcta
data = pd.read_csv(data_path)

# Remover columnas no necesarias
data = data.drop(['dropout.semester'], axis=1)

# Verificar la naturaleza de la variable objetivo
target_column = 'retention'
y = data[target_column]

# Si 'retention' no es categórica, la convertimos
if y.dtype != 'object' and len(y.unique()) > 20:
    y = (y > y.median()).astype(int)  # Convertir a binario basado en la mediana

# Usar LabelEncoder si 'retention' es categórica
y = LabelEncoder().fit_transform(y)

# Separar las características
X = data.drop(target_column, axis=1)

# Escalar las características
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# División del dataset en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)

# Modelos a evaluar
models = {
    'DecisionTree': DecisionTreeClassifier(),
    'NeuralNetwork': MLPClassifier(max_iter=1000, random_state=42)
}

# Entrenamiento y evaluación de los modelos
results = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    results[model_name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'classification_report': classification_report(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred)
    }

# Mostrar los resultados
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['accuracy']}")
    print(f"Precision: {metrics['precision']}")
    print(f"Recall: {metrics['recall']}")
    print(f"F1 Score: {metrics['f1_score']}")
    print("Classification Report:")
    print(metrics['classification_report'])
    print("Confusion Matrix:")
    print(metrics['confusion_matrix'])
    print("\n" + "-"*60 + "\n")

# Ajuste de hiperparámetros usando GridSearchCV para el mejor modelo
param_grid = {
    'DecisionTree': {
        'max_depth': [3, 5, 7, 10],
        'min_samples_split': [2, 5, 10]
    },
    'NeuralNetwork': {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd']
    }
}

best_model = None
best_score = 0

for model_name, params in param_grid.items():
    grid_search = GridSearchCV(models[model_name], params, cv=5, scoring='f1', n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best F1 Score: {grid_search.best_score_}")

    if grid_search.best_score_ > best_score:
        best_model = grid_search.best_estimator_
        best_score = grid_search.best_score_

# Evaluar el mejor modelo en el conjunto de prueba
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)

print("Best Model Performance on Test Set")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best)}")
print(f"Precision: {precision_score(y_test, y_pred_best)}")
print(f"Recall: {recall_score(y_test, y_pred_best)}")
print(f"F1 Score: {f1_score(y_test, y_pred_best)}")
print("Classification Report:")
print(classification_report(y_test, y_pred_best))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_best))


Model: DecisionTree
Accuracy: 0.9039199561403509
Precision: 0.9557986870897155
Recall: 0.9427338129496403
F1 Score: 0.9492212966316552
Classification Report:
              precision    recall  f1-score   support

           0       0.10      0.12      0.11       346
           1       0.96      0.94      0.95      6950

    accuracy                           0.90      7296
   macro avg       0.53      0.53      0.53      7296
weighted avg       0.92      0.90      0.91      7296

Confusion Matrix:
[[  43  303]
 [ 398 6552]]

------------------------------------------------------------

Model: NeuralNetwork
Accuracy: 0.928453947368421
Precision: 0.9542114188807236
Recall: 0.9715107913669064
F1 Score: 0.9627834022529588
Classification Report:
              precision    recall  f1-score   support

           0       0.10      0.06      0.08       346
           1       0.95      0.97      0.96      6950

    accuracy                           0.93      7296
   macro avg       0.53      0.


Los resultados de los modelos muestran que, aunque tanto el árbol de decisión como la red neuronal tienen un buen rendimiento general en términos de precisión y recall para la clase mayoritaria (1), ambos modelos tienen dificultades para predecir correctamente la clase minoritaria (0). Esto se refleja en las matrices de confusión y en los valores de precisión y recall para la clase 0.

In [4]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.combine import SMOTEENN

# Combinar sobremuestreo y submuestreo
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# Modelos a evaluar con las nuevas técnicas
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'NeuralNetwork': MLPClassifier(hidden_layer_sizes=(100, 50), activation='tanh', solver='sgd', max_iter=1000, random_state=42)
}

# Entrenamiento y evaluación de los modelos
results = {}

for model_name, model in models.items():
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    results[model_name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'classification_report': classification_report(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred)
    }

# Mostrar los resultados
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['accuracy']}")
    print(f"Precision: {metrics['precision']}")
    print(f"Recall: {metrics['recall']}")
    print(f"F1 Score: {metrics['f1_score']}")
    print("Classification Report:")
    print(metrics['classification_report'])
    print("Confusion Matrix:")
    print(metrics['confusion_matrix'])
    print("\n" + "-"*60 + "\n")


Model: RandomForest
Accuracy: 0.9255756578947368
Precision: 0.9591514977784148
Recall: 0.9628776978417266
F1 Score: 0.9610109858548144
Classification Report:
              precision    recall  f1-score   support

           0       0.19      0.18      0.18       346
           1       0.96      0.96      0.96      6950

    accuracy                           0.93      7296
   macro avg       0.58      0.57      0.57      7296
weighted avg       0.92      0.93      0.92      7296

Confusion Matrix:
[[  61  285]
 [ 258 6692]]

------------------------------------------------------------

Model: GradientBoosting
Accuracy: 0.891858552631579
Precision: 0.9612217397814043
Recall: 0.9237410071942446
F1 Score: 0.9421087387189082
Classification Report:
              precision    recall  f1-score   support

           0       0.14      0.25      0.18       346
           1       0.96      0.92      0.94      6950

    accuracy                           0.89      7296
   macro avg       0.55     

In [5]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Ajuste de hiperparámetros usando GridSearchCV
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'class_weight': ['balanced', 'balanced_subsample', None]
}

param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10]
}

# Random Forest Grid Search
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring='f1', n_jobs=-1, verbose=2)
grid_search_rf.fit(X_resampled, y_resampled)

print(f"Best parameters for RandomForest: {grid_search_rf.best_params_}")
print(f"Best F1 Score: {grid_search_rf.best_score_}")

# Gradient Boosting Grid Search
grid_search_gb = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid_gb, cv=5, scoring='f1', n_jobs=-1, verbose=2)
grid_search_gb.fit(X_resampled, y_resampled)

print(f"Best parameters for GradientBoosting: {grid_search_gb.best_params_}")
print(f"Best F1 Score: {grid_search_gb.best_score_}")

# Evaluar el mejor modelo en el conjunto de prueba
best_rf_model = grid_search_rf.best_estimator_
best_gb_model = grid_search_gb.best_estimator_

# Evaluación de RandomForest con mejores hiperparámetros
best_rf_model.fit(X_resampled, y_resampled)
y_pred_best_rf = best_rf_model.predict(X_test)
print("Best RandomForest Model Performance on Test Set")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best_rf)}")
print(f"Precision: {precision_score(y_test, y_pred_best_rf)}")
print(f"Recall: {recall_score(y_test, y_pred_best_rf)}")
print(f"F1 Score: {f1_score(y_test, y_pred_best_rf)}")
print("Classification Report:")
print(classification_report(y_test, y_pred_best_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_best_rf))

# Evaluación de GradientBoosting con mejores hiperparámetros
best_gb_model.fit(X_resampled, y_resampled)
y_pred_best_gb = best_gb_model.predict(X_test)
print("Best GradientBoosting Model Performance on Test Set")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best_gb)}")
print(f"Precision: {precision_score(y_test, y_pred_best_gb)}")
print(f"Recall: {recall_score(y_test, y_pred_best_gb)}")
print(f"F1 Score: {f1_score(y_test, y_pred_best_gb)}")
print("Classification Report:")
print(classification_report(y_test, y_pred_best_gb))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_best_gb))


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters for RandomForest: {'class_weight': 'balanced_subsample', 'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 300}
Best F1 Score: 0.9377029017959628
Fitting 5 folds for each of 36 candidates, totalling 180 fits


KeyboardInterrupt: 