In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import precision_recall_curve, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
%pip install shap
import shap
import warnings

In [24]:
# Load the raw data set; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='dataset.csv')

In [None]:
# Show every column name so the feature lists below can be checked against the file.
print(df.columns.to_list())

In [26]:
# Encode the outcome as a binary target: 'Dropout' -> 1, 'Graduate' -> 0.
# Any other 'Target' value has no entry in the mapping and becomes <NA>
# (nullable Int64). This is the vectorized equivalent of an element-wise lambda.
df['evasao'] = df['Target'].map({'Dropout': 1, 'Graduate': 0}).astype('Int64')

# Keep only the rows with a defined outcome. With the missing values gone the
# nullable extension dtype is no longer needed, so cast to a plain NumPy int64:
# depending on the pandas / scikit-learn versions, an 'Int64' column can turn
# into an object array or be rejected as a classification target.
df_clean = df.dropna(subset=['evasao']).copy()
df_clean['evasao'] = df_clean['evasao'].astype('int64')


In [None]:
# Report the class balance of the cleaned data: absolute counts per class,
# followed by the share of positive (dropout) rows.
print("\nDistribuição da variável target:")
class_counts = df_clean['evasao'].value_counts()
print(class_counts.rename(index={0: 'Não Evasão', 1: 'Evasão'}))
dropout_rate = df_clean['evasao'].mean()
print(f"Taxa de evasão: {dropout_rate:.2%}")

In [28]:
# Columns used as model inputs, in the order they are handed to the pipeline.
features_keep = (
    # Student-level attributes
    [
        'Age at enrollment',
        'Gender',
        'Daytime/evening attendance',
        'Scholarship holder',
        'Educational special needs',
    ]
    # First-semester curricular results
    + [
        'Curricular units 1st sem (approved)',
        'Curricular units 1st sem (enrolled)',
        'Curricular units 1st sem (grade)',
    ]
    # Second-semester curricular results
    + [
        'Curricular units 2nd sem (approved)',
        'Curricular units 2nd sem (enrolled)',
        'Curricular units 2nd sem (grade)',
    ]
)

# Feature matrix (an independent copy, so later edits cannot touch df_clean)
# and the binary target column.
X = df_clean.loc[:, features_keep].copy()
y = df_clean['evasao']

In [29]:
# Columns routed to the numeric branch of the preprocessing step.
numeric_features = ['Age at enrollment'] + [
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (grade)',
]

# Columns routed to the categorical (one-hot) branch of the preprocessing step.
categorical_features = [
    'Gender', 'Daytime/evening attendance',
    'Scholarship holder', 'Educational special needs',
]

In [32]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array, dropping the first level of every feature.
# NOTE(review): with drop='first' and handle_unknown='ignore' an unseen
# category is encoded as all zeros, i.e. the same as the dropped level.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)),
])

# Apply each branch to its own column list; the 'num' block comes first.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
# Hold out 20% of the rows for the final evaluation. The split is stratified on
# the target so both parts keep the same class balance, and seeded so it is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)



Divisão dos dados:
Treino: 2904 amostras (39.15% evasão)
Teste:  726 amostras (39.12% evasão)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# End-to-end model: preprocessing followed by a seeded gradient boosting
# classifier, so each CV fold fits the preprocessing on its own training part.
gb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])

# Search space: two values for each of five hyper-parameters (32 combinations).
# The 'classifier__' prefix routes every entry to the pipeline's final step.
param_grid = dict(
    classifier__n_estimators=[200, 300],
    classifier__learning_rate=[0.05, 0.1],
    classifier__max_depth=[3, 4],
    classifier__min_samples_leaf=[20, 50],
    classifier__subsample=[0.8, 1.0],
)

# Exhaustive 5-fold search scored by ROC AUC, using all available cores.
grid_search = GridSearchCV(
    gb_pipeline,
    param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search; used for evaluation and export below.
best_gb_model = grid_search.best_estimator_


🚀 OTIMIZANDO GRADIENT BOOSTING...
⏳ Executando Grid Search...
Fitting 5 folds for each of 32 candidates, totalling 160 fits

🎯 Melhores parâmetros:
  classifier__learning_rate: 0.05
  classifier__max_depth: 3
  classifier__min_samples_leaf: 50
  classifier__n_estimators: 200
  classifier__subsample: 1.0


In [None]:
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Positive-class probabilities and hard class predictions on the held-out set.
y_proba = best_gb_model.predict_proba(X_test)[:, 1]
y_pred = best_gb_model.predict(X_test)

# Ranking metrics use the probabilities; the rest use the hard predictions.
metrics = {
    'AUC': roc_auc_score(y_test, y_proba),
    'PR-AUC': average_precision_score(y_test, y_proba),
    'Accuracy': accuracy_score(y_test, y_pred),
}
# zero_division=0 reports 0 instead of warning when a denominator is empty.
metrics.update({
    name: scorer(y_test, y_pred, zero_division=0)
    for name, scorer in (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )
})

print("\nMétricas Finais:")
for metric, value in metrics.items():
    print(f"{metric:12}: {value:.4f}")

# Per-class breakdown; the label order (0, 1) matches the target encoding.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Não Evasão', 'Evasão']))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
import json
import warnings
from pathlib import Path

import joblib
from sklearn.metrics import precision_recall_curve

MODEL_PATH = Path("gradient_boosting_dropout.pkl")
META_PATH  = Path("model_meta.json")

# Fallback decision threshold used when no tuned value can be computed.
DEFAULT_THRESHOLD = 0.5

# Persist the fitted pipeline (preprocessing + classifier) as a single artifact.
joblib.dump(best_gb_model, MODEL_PATH)

# Pick the probability threshold that maximises F1 on the precision-recall curve.
# NOTE(review): the threshold is tuned on the same test set used to report the
# metrics, so it is optimistic; consider tuning it on a validation split or on
# cross-validated training predictions instead.
try:
    _prec, _rec, _thr = precision_recall_curve(y_test, y_proba)
    # Clip the denominator so points with precision + recall == 0 give F1 = 0
    # instead of a division by zero.
    _f1s = 2 * _prec * _rec / (np.clip(_prec + _rec, 1e-9, None))
    _best_idx = int(np.argmax(_f1s))
    # The precision/recall arrays carry one more point than the thresholds;
    # if that final point wins there is no matching threshold, so fall back.
    best_threshold = float(_thr[_best_idx]) if _best_idx < len(_thr) else DEFAULT_THRESHOLD
except Exception as exc:
    # Still best-effort, but no longer silent: the exported metadata would
    # otherwise present the default as if it were a tuned value.
    warnings.warn(
        f"Could not compute an F1-optimal threshold ({exc!r}); "
        f"falling back to {DEFAULT_THRESHOLD}."
    )
    best_threshold = DEFAULT_THRESHOLD

# Probability cut-offs separating the low / medium / high risk bands.
risk_bands = {
    "low": 0.40,
    "high": 0.70
}

meta = {
    "model_file": str(MODEL_PATH),
    "features_expected": list(X.columns),
    # Coerce to built-in floats so json can serialise them whatever NumPy
    # scalar type the metric functions returned.
    "metrics": {name: float(score) for name, score in metrics.items()},
    "best_threshold": best_threshold,
    "risk_bands": risk_bands
}
with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)
