In [89]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint, uniform

In [74]:
data = pd.read_csv('data/features/features.csv')

In [75]:
data = data[['similaridade_cv_competencias','similaridade_cv_atividades','avaliador_idioma_ingles','avaliador_idioma_espanhol','situacao_candidado_avaliador']]

In [76]:
def categorias(x):
    if x <= 0.4:
        return 'baixo'
    elif x <= 0.7:
        return 'medio'
    else:
        return 'alto'

In [77]:
data['situacao_candidado_avaliador'] = data['situacao_candidado_avaliador'].apply(categorias)

In [None]:
X = data[['similaridade_cv_competencias','similaridade_cv_atividades','avaliador_idioma_ingles','avaliador_idioma_espanhol']]
y = data['situacao_candidado_avaliador']
le = LabelEncoder()
y_encoded = le.fit_transform(y)  
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
# baixo = 1, medio = 2, alto = 0
# SMOTE para balancear
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [82]:
# Pipeline com XGBoost
pipeline_xgb = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))
])

# Treinar
pipeline_xgb.fit(X_train, y_train)

# Avaliar
print("XGBoost Results:")
y_pred_xgb = pipeline_xgb.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Results:
              precision    recall  f1-score   support

           0       0.49      0.09      0.16       659
           1       0.66      0.93      0.77      7134
           2       0.56      0.18      0.27      3515

    accuracy                           0.65     11308
   macro avg       0.57      0.40      0.40     11308
weighted avg       0.62      0.65      0.58     11308



In [98]:
# Pipeline com LightGBM
pipeline_lgb = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('clf', LGBMClassifier(class_weight='balanced', random_state=42))
])
param_grid = {
    'clf__n_estimators': [100],
    'clf__learning_rate': [0.1],
    'clf__max_depth': [5, 10, 15],
}
grid_search = GridSearchCV(
    estimator=pipeline_lgb,
    param_grid=param_grid,
    scoring='f1_macro',  
    cv=3,
    n_jobs=-1,
    verbose=2
)

# Treinar
grid_search.fit(X_train, y_train)

# Resultados
print("✅ Melhor combinação de hiperparâmetros:")
print(grid_search.best_params_)

print("\n📊 Avaliação no conjunto de teste:")

y_pred = grid_search.predict(X_test)
print(classification_report(le.inverse_transform(y_test), le.inverse_transform(y_pred)))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.219669 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 788
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.206899 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 788
[LightGBM] [Info] Number of data points in the train set: 57315, number of used features: 4
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.229236 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 788
[LightGBM] [Info] Number of data points in the train set: 57315, number of used features: 4
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.239272 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM]



[CV] END clf__learning_rate=0.1, clf__max_depth=5, clf__n_estimators=100; total time=34.6min




[CV] END clf__learning_rate=0.1, clf__max_depth=5, clf__n_estimators=100; total time=35.0min




[CV] END clf__learning_rate=0.1, clf__max_depth=5, clf__n_estimators=100; total time=35.3min




[CV] END clf__learning_rate=0.1, clf__max_depth=10, clf__n_estimators=100; total time=42.6min




[CV] END clf__learning_rate=0.1, clf__max_depth=10, clf__n_estimators=100; total time=42.6min




[CV] END clf__learning_rate=0.1, clf__max_depth=10, clf__n_estimators=100; total time=42.6min




[CV] END clf__learning_rate=0.1, clf__max_depth=15, clf__n_estimators=100; total time=42.6min
[CV] END clf__learning_rate=0.1, clf__max_depth=15, clf__n_estimators=100; total time=42.6min




[CV] END clf__learning_rate=0.1, clf__max_depth=15, clf__n_estimators=100; total time=42.6min
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000250 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 787
[LightGBM] [Info] Number of data points in the train set: 85974, number of used features: 4
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
✅ Melhor combinação de hiperparâmetros:
{'clf__learning_rate': 0.1, 'clf__max_depth': 15, 'clf__n_estimators': 100}

📊 Avaliação no conjunto de teste:
              precision    recall  f1-score   support

        alto       0.14      0.49      0.22       659
       baixo       0.73      0.49      0.59      7134
       medio       0.41      0.50      0.45      3515

    accuracy            

