In [1]:
import os
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from xgboost import XGBClassifier

SEED = 42 # Muito importante manter a SEED igual em todos os modelos para garantir a consistência dos dados no ensemble
FOLDS = 5 # Muito importante manter o mesmo número de FOLDS em todos os modelos para garantir a consistência dos dados no ensemble

In [2]:
sintetico = pd.read_csv('../../src/train/train.csv', index_col='id')
original = pd.read_csv('../../src/train/original.csv')
test = pd.read_csv('../../src/test/test.csv', index_col='id')

train = pd.concat([sintetico, original], ignore_index=True)

initial_features = list(test.columns)

In [3]:
def cross_validation(model, X, y, encoder, scoring=accuracy_score):

    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

    scores = []
    out_of_fold = []
    
    for i, (train_index, val_index) in enumerate(skf.split(X, y)):
        print(f"Fold {i + 1}")
        
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]

        X_val = X.iloc[val_index]
        y_val = y.iloc[val_index]
        
        model.fit(X_train, y_train)

        probabilidades = model.predict_proba(X_val)

        # Recuperar a predição final a partir das probabilidades
        indices_predicoes = np.argmax(probabilidades, axis=1)
        classes_preditas = model.classes_[indices_predicoes]

        score = scoring(y_val, classes_preditas)

        scores.append(score)

        true_label = pd.Series(y_val.values, name='true')

        pred_label_df = pd.DataFrame(probabilidades)

        oof_pred = pd.concat([pred_label_df, true_label], axis=1, ignore_index=True)
        oof_pred.columns = [f'pred_{encoder[model.classes_[0]]}', f'pred_{encoder[model.classes_[1]]}', f'pred_{encoder[model.classes_[2]]}', 'true']

        out_of_fold.append(oof_pred)

    print(f"Score: {np.mean(scores)}")
    
    return scores, out_of_fold

In [4]:
def save_oof(oof):

    os.makedirs('oof', exist_ok=True)

    for i, fold in enumerate(oof):
        fold.to_csv(f'oof/fold_{i+1}.csv', index=False)

In [5]:
def predict_test(model, X_train, y_train, X_test, encoder):

    model.fit(X_train, y_train)

    probabilidades = model.predict_proba(X_test)
    pred_label_df = pd.DataFrame(probabilidades)

    pred_label_df.columns = [f'pred_{encoder[model.classes_[0]]}', f'pred_{encoder[model.classes_[1]]}', f'pred_{encoder[model.classes_[2]]}']

    os.makedirs('test', exist_ok=True)

    pred_label_df.to_csv(f'test/test_pred.csv', index=False)

    return pred_label_df

In [6]:
y = train['Target'].replace({"Enrolled": 0, "Graduate": 1, "Dropout": 2})
X = train[initial_features]

  y = train['Target'].replace({"Enrolled": 0, "Graduate": 1, "Dropout": 2})


In [7]:
encoder = {0: "Enrolled", 1: "Graduate", 2: "Dropout"}

In [8]:
xgb_best_params = {'n_estimators': 813, 'learning_rate': 0.02882415807304588, 'max_depth': 7, 'subsample': 0.9255300384934926, 'colsample_bytree': 0.27455204237965924, 'reg_alpha': 5.570494795060933e-05, 'reg_lambda': 0.42084989223847086, 'gamma': 0.0008504062699390611, 'min_child_weight': 6.248463737443627}

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), initial_features),
    ])

xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(**xgb_best_params)),
])

In [10]:
scores,oof = cross_validation(xgb, X, y, encoder)

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Score: 0.8331274032068956


In [42]:
for fold in oof:
    fold['true'] = fold['true'].replace(encoder)

save_oof(oof)

In [43]:
X_test = test[initial_features]

predict_test(xgb, X, y, X_test, encoder)

Unnamed: 0,pred_Enrolled,pred_Graduate,pred_Dropout
0,0.002691,0.003098,0.994210
1,0.012086,0.983531,0.004383
2,0.236701,0.724339,0.038961
3,0.420639,0.411695,0.167666
4,0.661019,0.051783,0.287198
...,...,...,...
51007,0.050255,0.070388,0.879357
51008,0.011786,0.000220,0.987994
51009,0.016953,0.008298,0.974749
51010,0.111754,0.016315,0.871931
