In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import precision_recall_curve, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
%pip install shap
import shap
import warnings

In [24]:
# Load the raw dataset from a CSV file located in the notebook's working directory.
df = pd.read_csv('dataset.csv')

In [None]:
# Show every column name so the feature lists defined below can be checked against it.
print(list(df.columns))

In [26]:
# Binary target: 'Dropout' -> 1, 'Graduate' -> 0; any other status maps to a missing value.
target_codes = {'Dropout': 1, 'Graduate': 0}
df['evasao'] = df['Target'].map(target_codes).astype('Int64')

# Keep only the rows whose target is defined; copy so later edits do not touch `df`.
df_clean = df[df['evasao'].notna()].copy()


In [41]:
# Class balance of the binary target, with readable labels for the 0/1 codes.
target_labels = {0: 'N√£o Evas√£o', 1: 'Evas√£o'}
target_counts = df_clean['evasao'].value_counts()
dropout_rate = df_clean['evasao'].mean()

print("\nDistribui√ß√£o da vari√°vel target:")
print(target_counts.rename(index=target_labels))
print(f"Taxa de evas√£o: {dropout_rate:.2%}")


Distribuição da variável target:
evasao
Não Evasão    2209
Evasão        1421
Name: count, dtype: Int64
Taxa de evasão: 39.15%


In [28]:
# Model inputs: enrollment/demographic columns first, then the per-semester
# curricular-unit columns (approved, enrolled, grade for the 1st and 2nd semester).
profile_columns = [
    'Age at enrollment',
    'Gender',
    'Daytime/evening attendance',
    'Scholarship holder',
    'Educational special needs',
]
curricular_columns = [
    f'Curricular units {semester} sem ({measure})'
    for semester in ('1st', '2nd')
    for measure in ('approved', 'enrolled', 'grade')
]
features_keep = profile_columns + curricular_columns

# Feature matrix (copied so it is independent of df_clean) and target vector.
X = df_clean.loc[:, features_keep].copy()
y = df_clean['evasao']

In [29]:
# Columns routed to the numeric branch of the preprocessing ColumnTransformer.
numeric_features = ['Age at enrollment'] + [
    f'Curricular units {semester} sem ({measure})'
    for semester in ('1st', '2nd')
    for measure in ('approved', 'enrolled', 'grade')
]

# Columns routed to the categorical (one-hot) branch.
categorical_features = [
    'Gender',
    'Daytime/evening attendance',
    'Scholarship holder',
    'Educational special needs',
]

In [30]:
# Report how many columns go through each preprocessing branch.
n_numeric = len(numeric_features)
n_categorical = len(categorical_features)
print(f"Vari√°veis num√©ricas: {n_numeric}")
print(f"Vari√°veis categ√≥ricas: {n_categorical}")

Variáveis numéricas: 7
Variáveis categóricas: 4


In [31]:
# Per-column count of missing cells in the feature matrix, plus the grand total.
missing_values = X.isna().sum()
total_missing = missing_values.sum()

if total_missing <= 0:
    print("\n‚úÖ Nenhum valor ausente encontrado.")
else:
    # List only the columns that actually contain gaps.
    print(f"\n‚ö†Ô∏è Valores ausentes detectados: {total_missing}")
    print(missing_values.loc[missing_values.gt(0)])



✅ Nenhum valor ausente encontrado.


In [32]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot encode
# with the first level of each column dropped.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch; the step names 'num' and 'cat' become
# the prefixes of the output feature names.
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [42]:
# Hold out 20% of the rows for testing; stratify on the target so both splits keep
# the same dropout rate, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

n_train, n_test = len(X_train), len(X_test)
train_rate, test_rate = y_train.mean(), y_test.mean()

print("\nDivis√£o dos dados:")
print(f"Treino: {n_train} amostras ({train_rate:.2%} evas√£o)")
print(f"Teste:  {n_test} amostras ({test_rate:.2%} evas√£o)")


Divisão dos dados:
Treino: 2904 amostras (39.15% evasão)
Teste:  726 amostras (39.12% evasão)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

print("\nOTIMIZANDO GRADIENT BOOSTING...")
print("="*50)

# Preprocessing and classifier live in one pipeline, so every CV fold refits the
# imputers/scaler/encoder on its own training part only.
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', gb_classifier),
])

# Search space: 2 values for each of 5 hyper-parameters -> 32 candidates.
param_grid = dict(
    classifier__n_estimators=[200, 300],
    classifier__learning_rate=[0.05, 0.1],
    classifier__max_depth=[3, 4],
    classifier__min_samples_leaf=[20, 50],
    classifier__subsample=[0.8, 1.0],
)

print("Executando Grid Search...")
# 5-fold CV scored by ROC AUC, using all available cores.
search_settings = dict(cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)
grid_search = GridSearchCV(gb_pipeline, param_grid, **search_settings)

grid_search.fit(X_train, y_train)

# Pipeline refitted on the full training split with the winning parameters.
best_gb_model = grid_search.best_estimator_
print("\nMelhores par√¢metros:")
for param_name in grid_search.best_params_:
    print(f"  {param_name}: {grid_search.best_params_[param_name]}")



🚀 OTIMIZANDO GRADIENT BOOSTING...
⏳ Executando Grid Search...
Fitting 5 folds for each of 32 candidates, totalling 160 fits

🎯 Melhores parâmetros:
  classifier__learning_rate: 0.05
  classifier__max_depth: 3
  classifier__min_samples_leaf: 50
  classifier__n_estimators: 200
  classifier__subsample: 1.0


In [None]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    average_precision_score
)

print("\nAVALIA√á√ÉO FINAL DO MODELO OTIMIZADO:")
print("="*50)

# Hard labels from the pipeline's default predict(), plus the probability of class 1
# for the ranking-based metrics.
y_pred = best_gb_model.predict(X_test)
y_proba = best_gb_model.predict_proba(X_test)[:, 1]

# Ranking metrics use the probabilities; the remaining ones use the hard labels.
metrics = {
    'AUC': roc_auc_score(y_test, y_proba),
    'PR-AUC': average_precision_score(y_test, y_proba),
    'Accuracy': accuracy_score(y_test, y_pred),
}
label_scorers = (
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1-Score', f1_score),
)
for metric_name, scorer in label_scorers:
    metrics[metric_name] = scorer(y_test, y_pred, zero_division=0)

print("\nM√©tricas Finais:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name:12}: {metric_value:.4f}")

class_names = ['N√£o Evas√£o', 'Evas√£o']
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

print("\nüî¢ Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Rows are true classes and columns are predicted classes, so the 2x2 matrix
# unpacks row by row as (TN, FP) and (FN, TP).
(tn, fp), (fn, tp) = cm
print("\nInterpreta√ß√£o:")
print(f"  Verdadeiros Negativos (TN): {tn}")
print(f"  Falsos Positivos (FP): {fp}")
print(f"  Falsos Negativos (FN): {fn}")
print(f"  Verdadeiros Positivos (TP): {tp}")



📊 AVALIAÇÃO FINAL DO MODELO OTIMIZADO:

📈 Métricas Finais:
AUC         : 0.9615
PR-AUC      : 0.9583
Accuracy    : 0.9008
Precision   : 0.8841
Recall      : 0.8592
F1-Score    : 0.8714

📋 Classification Report:
              precision    recall  f1-score   support

  Não Evasão       0.91      0.93      0.92       442
      Evasão       0.88      0.86      0.87       284

    accuracy                           0.90       726
   macro avg       0.90      0.89      0.90       726
weighted avg       0.90      0.90      0.90       726


🔢 Confusion Matrix:
[[410  32]
 [ 40 244]]

Interpretação:
  Verdadeiros Negativos (TN): 410
  Falsos Positivos (FP): 32
  Falsos Negativos (FN): 40
  Verdadeiros Positivos (TP): 244


In [None]:
print("\nIMPORT√ÇNCIA DAS FEATURES:")
print("="*50)

fitted_steps = best_gb_model.named_steps

# NOTE: this rebinds the notebook-level `preprocessor` name to the fitted
# transformer stored inside the best pipeline.
preprocessor = fitted_steps['preprocessor']
feature_names = preprocessor.get_feature_names_out()

# One importance value per transformed column, in the same order as feature_names.
importances = fitted_steps['classifier'].feature_importances_

feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

print("\nTop 10 Features mais importantes:")
print(feature_importance_df.head(10))



🔍 IMPORTÂNCIA DAS FEATURES:

Top 10 Features mais importantes:
                                    feature  importance
4  num__Curricular units 2nd sem (approved)    0.778213
2  num__Curricular units 1st sem (enrolled)    0.049346
1  num__Curricular units 1st sem (approved)    0.046671
5  num__Curricular units 2nd sem (enrolled)    0.037272
6     num__Curricular units 2nd sem (grade)    0.030716
0                    num__Age at enrollment    0.021103
9                 cat__Scholarship holder_1    0.018134
3     num__Curricular units 1st sem (grade)    0.014188
7                             cat__Gender_1    0.003018
8         cat__Daytime/evening attendance_1    0.001339


In [47]:
import json, joblib
from pathlib import Path
from sklearn.metrics import precision_recall_curve
import numpy as np
import pandas as pd

print("\nüíæ Salvando modelo + metadados...")

MODEL_PATH = Path("gradient_boosting_dropout.pkl")
META_PATH  = Path("model_meta.json")

joblib.dump(best_gb_model, MODEL_PATH)
print(f"‚úÖ Modelo salvo em: {MODEL_PATH}")

try:
    _prec, _rec, _thr = precision_recall_curve(y_test, y_proba)
    _f1s = 2 * _prec * _rec / (np.clip(_prec + _rec, 1e-9, None))
    _best_idx = int(np.argmax(_f1s))
    best_threshold = float(_thr[_best_idx]) if _best_idx < len(_thr) else 0.5
except Exception:
    best_threshold = 0.5  

risk_bands = {
    "low": 0.40,
    "high": 0.70
}

meta = {
    "model_file": str(MODEL_PATH),
    "features_expected": list(X.columns),
    "metrics": metrics,             
    "best_threshold": best_threshold,
    "risk_bands": risk_bands
}
with open(META_PATH, "w") as f:
    json.dump(meta, f, indent=2)
print(f"‚úÖ Metadados salvos em: {META_PATH}")


def predict_with_bands(new_data, model_path=str(MODEL_PATH), meta_path=str(META_PATH)):
    """
    Score one student with the saved pipeline and return label, probability and risk band.

    Parameters
    ----------
    new_data : dict or pandas.DataFrame
        Feature values for the student. A non-DataFrame input is wrapped into a
        one-row DataFrame. If a DataFrame with several rows is passed, only the
        FIRST row is scored. The caller's DataFrame is never modified.
    model_path : str
        Path of the joblib-serialised pipeline.
    meta_path : str
        Path of the JSON metadata file; it must contain "features_expected" and may
        contain "best_threshold" (default 0.5) and "risk_bands"
        (default {"low": 0.4, "high": 0.7}).

    Returns
    -------
    dict
        "prediction" (label string), "probability" (class-1 probability as float),
        "confidence" (that same probability formatted as a percentage; it is NOT the
        confidence in the predicted label), "risk_level", "threshold_used", "bands".
    """
    # NOTE(security): joblib.load unpickles the file and can execute arbitrary code;
    # only load model files from a trusted source.
    mdl = joblib.load(model_path)
    with open(meta_path, "r", encoding="utf-8") as f:
        m = json.load(f)

    features_expected = m["features_expected"]
    best_thr = float(m.get("best_threshold", 0.5))
    bands = m.get("risk_bands", {"low": 0.4, "high": 0.7})

    if not isinstance(new_data, pd.DataFrame):
        new_data = pd.DataFrame([new_data])

    # reindex builds a NEW frame holding exactly the expected columns in the expected
    # order; columns absent from the input are filled with NaN. Unlike assigning the
    # missing columns in place, this leaves the caller's DataFrame untouched.
    X_in = new_data.reindex(columns=features_expected)

    # Probability of class 1 for the first row only.
    proba = float(mdl.predict_proba(X_in)[:, 1][0])
    pred  = int(proba >= best_thr)

    label = "Evas√£o" if pred == 1 else "N√£o Evas√£o"
    if proba >= bands["high"]:
        band = "Alto"
    elif proba >= bands["low"]:
        band = "M√©dio"
    else:
        band = "Baixo"

    return {
        "prediction": label,
        "probability": proba,
        "confidence": f"{proba:.2%}",
        "risk_level": band,
        "threshold_used": best_thr,
        "bands": bands
    }

print("\nüß™ Exemplo r√°pido com predict_with_bands:")
res = predict_with_bands(estudante_teste)
print(res)



💾 Salvando modelo + metadados...
✅ Modelo salvo em: gradient_boosting_dropout.pkl
✅ Metadados salvos em: model_meta.json

🧪 Exemplo rápido com predict_with_bands:
{'prediction': 'Não Evasão', 'probability': 0.5539532236946263, 'confidence': '55.40%', 'risk_level': 'Médio', 'threshold_used': 0.6102142877598777, 'bands': {'low': 0.4, 'high': 0.7}}
