In [None]:
import pandas as pd
import joblib
import numpy as np
from lightgbm import LGBMClassifier
from lightgbm.callback import early_stopping, log_evaluation
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score

# Grouped 5-fold cross-validation of a LightGBM binary classifier.
#
# Reads the cleaned dataset, evaluates the model fold by fold (ROC AUC and
# average precision), prints per-fold and aggregate scores, and finally
# pickles the model fitted on the last evaluated fold.
df = pd.read_csv("../data/df_clean.csv")

# Identifier columns and the raw target column are excluded from the features.
X = df.drop(columns=["vaga_id", "codigo_candidato", "situacao_ord"])
# Binary target: rows whose situacao_ord equals 5 form the positive class.
y = (df["situacao_ord"] == 5).astype(int)
# Rows sharing a vaga_id always land in the same fold, so a given vaga_id
# never appears on both the training and the validation side of a split.
groups = df["vaga_id"]

gkf = GroupKFold(n_splits=5)

auc_scores = []
pr_scores = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=groups)):
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Count positives once per split with the vectorised Series.sum().
    n_pos_train = int(y_train.sum())
    n_pos_val = int(y_val.sum())

    print(f"Fold {fold + 1} - Positives in train: {n_pos_train}, Positives in val: {n_pos_val}")

    # Skip the fold when either side has no positive example: the model
    # could not learn the class, or the metrics would be undefined.
    if n_pos_train == 0 or n_pos_val == 0:
        print(f"Skipping fold {fold + 1} due to lack of positive class")
        continue

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]

    model = LGBMClassifier(
        objective="binary",
        n_estimators=1000,  # upper bound; early stopping normally ends sooner
        learning_rate=0.05,
        num_leaves=31,
        random_state=42,
    )

    # NOTE(review): with several eval metrics and the default
    # first_metric_only=False, early stopping checks every metric, not only
    # "auc" -- confirm that this is the intended stopping criterion.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric=["auc", "average_precision"],
        callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=50)],
    )

    # Score the validation fold with the iteration chosen by early stopping.
    # NOTE(review): the same fold both selects the best iteration and is
    # scored here, so the reported metrics are somewhat optimistic.
    y_pred = model.predict_proba(X_val, num_iteration=model.best_iteration_)[:, 1]

    auc = roc_auc_score(y_val, y_pred)
    pr = average_precision_score(y_val, y_pred)

    print(f"ROC AUC: {auc:.4f}, Average Precision: {pr:.4f}")

    auc_scores.append(auc)
    pr_scores.append(pr)

# Without at least one evaluated fold there is nothing to aggregate and no
# freshly fitted model to persist; fail loudly instead of printing NaN means
# and then saving an undefined (or stale, in a notebook session) `model`.
if not auc_scores:
    raise RuntimeError(
        "No fold had positive examples in both train and validation; "
        "no model was trained or evaluated."
    )

print(f"Mean ROC AUC: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")
print(f"Mean Average Precision: {np.mean(pr_scores):.4f} ± {np.std(pr_scores):.4f}")

# Persist the model from the last evaluated fold for later use. It was fitted
# on that fold's training split only, not on the full dataset.
# (The booster's own save_model() could be used instead of pickling if a
# plain-text LightGBM model file is preferred.)
joblib.dump(model, "../models/model_lgbm.pkl")



Fold 1 - Positives in train: 2374, Positives in val: 610
[LightGBM] [Info] Number of positive: 2374, number of negative: 40633
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005682 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 43007, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.055200 -> initscore=-2.840004
[LightGBM] [Info] Start training from score -2.840004
Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.854524	valid_0's average_precision: 0.279775	valid_0's binary_logloss: 0.167005
[100]	valid_0's auc: 0.854218	valid_0's average_precision: 0.284184	valid_0's binary_logloss: 0.166801
Early stopping, best iteration is:
[68]	valid_0's auc: 0.8558	valid_0's average_precision: 0.279808	valid_0's binary_logl

['../models/model_lgbm.pkl']

Balanceamento das Classes
Fold 1: 2.374 positivos (contratados) no treino, 610 na validação

Fold 2: 2.396 positivos no treino, 588 na validação

Taxa de positivos: ~5,5% (2.374/43.007 no treino do Fold 1), ou seja, classes fortemente desbalanceadas — um cenário realista para recrutamento

Performance do Modelo - Excelente
ROC AUC: 0.8655 ± 0.0066 (probabilidade de ~86,6% de um candidato contratado, escolhido ao acaso, receber score maior que um não contratado)

Average Precision: 0.3131 ± 0.0206 (área sob a curva precisão-recall; a linha de base de um classificador aleatório é a taxa de positivos, ~0.055)

Estabilidade: Baixo desvio padrão indica modelo consistente entre folds

Interpretação das Métricas
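| Métrica | Valor | O que Significa |
| --- | --- | --- |
| ROC AUC = 0.87 | Excelente | Modelo distingue muito bem candidatos que serão contratados vs não contratados |
| Average Precision = 0.31 | Bom | Em dados desbalanceados (5.5% positivos), conseguir 31% de precisão média é muito bom: cerca de 5 a 6 vezes a linha de base de um classificador aleatório |
| Early Stopping | ~68 iterações | O treino foi interrompido quando a métrica de validação parou de melhorar, o que limita o overfitting |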
