In [27]:
import os
import warnings
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from lightgbm import LGBMClassifier

import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

SEED = 42 # Muito importante manter a SEED igual em todos os modelos para garantir a consistência dos dados no ensemble
FOLDS = 5 # Muito importante manter o mesmo número de FOLDS em todos os modelos para garantir a consistência dos dados no ensemble

In [28]:
sintetico = pd.read_csv('../../src/train/train.csv', index_col='id')
original = pd.read_csv('../../src/train/original.csv')
test = pd.read_csv('../../src/test/test.csv', index_col='id')

train = pd.concat([sintetico, original], ignore_index=True)

initial_features = list(test.columns)

In [29]:
def cross_validation(model, X, y, encoder, scoring=accuracy_score, k=FOLDS):

    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=SEED)

    scores = []
    out_of_fold = []
    
    for i, (train_index, val_index) in enumerate(skf.split(X, y)):
        print(f"Fold {i + 1}")
        
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]

        X_val = X.iloc[val_index]
        y_val = y.iloc[val_index]
        
        model.fit(X_train, y_train)
        probabilidades = model.predict_proba(X_val)

        # Recuperar a predição final a partir das probabilidades
        indices_predicoes = np.argmax(probabilidades, axis=1)
        classes_preditas = model.classes_[indices_predicoes]

        score = scoring(y_val, classes_preditas)

        scores.append(score)

        true_label = pd.Series(y_val.values, name='true')

        pred_label_df = pd.DataFrame(probabilidades)

        oof_pred = pd.concat([pred_label_df, true_label], axis=1, ignore_index=True)
        oof_pred.columns = [f'pred_{encoder[model.classes_[0]]}', f'pred_{encoder[model.classes_[1]]}', f'pred_{encoder[model.classes_[2]]}', 'true']

        out_of_fold.append(oof_pred)

    print(f"Score: {np.mean(scores)}")
    
    return scores, out_of_fold

In [30]:
def save_oof(oof):

    os.makedirs('oof', exist_ok=True)

    for i, fold in enumerate(oof):
        fold.to_csv(f'oof/fold_{i+1}.csv', index=False)

In [31]:
def predict_test(model, X_train, y_train, X_test, encoder):

    model.fit(X_train, y_train)

    probabilidades = model.predict_proba(X_test)
    pred_label_df = pd.DataFrame(probabilidades)

    pred_label_df.columns = [f'pred_{encoder[model.classes_[0]]}', f'pred_{encoder[model.classes_[1]]}', f'pred_{encoder[model.classes_[2]]}']

    os.makedirs('test', exist_ok=True)

    pred_label_df.to_csv(f'test/test_pred.csv', index=False)

    return pred_label_df

In [32]:
cat_features = ['Marital status', 'Application mode', 'Application order', 'Course',
        'Daytime/evening attendance', 'Previous qualification', 'Nacionality',
        "Mother's qualification", "Father's qualification",
        "Mother's occupation", "Father's occupation", "Displaced",
        'Educational special needs', 'Debtor', 'Tuition fees up to date',
        'Gender', 'Scholarship holder', 'International', 'Curricular units 1st sem (credited)',
        'Curricular units 1st sem (enrolled)',
        'Curricular units 1st sem (evaluations)',
        'Curricular units 1st sem (approved)',
        'Curricular units 1st sem (without evaluations)',
        'Curricular units 2nd sem (credited)',
        'Curricular units 2nd sem (enrolled)',
        'Curricular units 2nd sem (evaluations)',
        'Curricular units 2nd sem (approved)',
        'Curricular units 2nd sem (without evaluations)']

for feature in cat_features:
    for df in [train, test]:
        df[feature] = df[feature].astype('category')

num_features = df.select_dtypes(include=['int64', 'float64']).columns

In [33]:
y = train['Target'].replace({"Enrolled": 0, "Graduate": 1, "Dropout": 2})
X = train[initial_features]

In [34]:
encoder = {0: "Enrolled", 1: "Graduate", 2: "Dropout"}

In [35]:
lgbm_best_params = {'n_estimators': 1189, 'num_leaves': 44, 'min_child_samples': 15, 'learning_rate': 0.02259846401043735, 'log_max_bin': 10, 'colsample_bytree': 0.5219137979097763, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.11335128586062176}

In [36]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ])

lgbm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(**lgbm_best_params)
    )
])

In [37]:
scores,oof = cross_validation(lgbm, X, y, encoder)

Fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1581
[LightGBM] [Info] Number of data points in the train set: 64753, number of used features: 329
[LightGBM] [Info] Start training from score -1.637915
[LightGBM] [Info] Start training from score -0.743325
[LightGBM] [Info] Start training from score -1.108405
Fold 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013839 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1576
[LightGBM] [Info] Number of data points in the train set: 64753, number of used features: 326
[LightGBM] [Info] Start training from score -1.637915
[LightGBM] [Info] Start training from score -0.743293
[LightGBM] [Info] Start training from score -1.108452
Fold 3
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of t

In [38]:
for fold in oof:
    fold['true'] = fold['true'].replace(encoder)
    
save_oof(oof)

In [39]:
# Função objetivo para a otimização
def objective(trial: Trial, X, y):
    param_grid = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 2500),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'max_depth': trial.suggest_int('max_depth', -1, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 1e1, log=True),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0, step=0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
        'verbose_eval':False,
        'verbose':-1,
    }

    lgbm = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LGBMClassifier(**param_grid)
        )
    ])
    
    scores,_ = cross_validation(lgbm, X, y, encoder, k=3)
    accuracy = np.mean(scores)

    return accuracy

In [40]:
# warnings.filterwarnings("ignore")

# study = optuna.create_study(direction='maximize', sampler=TPESampler(), pruner=MedianPruner())
# study.optimize(lambda trial: objective(trial, X, y), n_trials=3000)

In [41]:
# # Melhor conjunto de hiperparâmetros
# print(f"Best trial: {study.best_trial.number}")
# print(f"Best value: {study.best_value}")
# print(f"Best params: {study.best_trial.params}")

In [42]:
lgbm_best_params = {'n_estimators': 1634, 'learning_rate': 0.007256713936891003, 'num_leaves': 164, 'max_depth': 64, 'min_child_samples': 21, 'min_child_weight': 0.3465367660329096, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.4, 'reg_alpha': 0.08079791517562476, 'reg_lambda': 9.816836839794332}

In [43]:
lgbm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(**lgbm_best_params)
    )
])

In [44]:
scores,oof = cross_validation(lgbm, X, y, encoder)

Fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015806 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1543
[LightGBM] [Info] Number of data points in the train set: 64753, number of used features: 310
[LightGBM] [Info] Start training from score -1.637915
[LightGBM] [Info] Start training from score -0.743325
[LightGBM] [Info] Start training from score -1.108405
Fold 2
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006455 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1544
[LightGBM] [Info] Number of data points in the train set: 64753, number of used features: 310
[LightGBM] [Info] Start training from score -1.637915
[LightGBM] [Info] Start training from score -0.743293
[LightGBM] [Info] Start training from score -1.108452
Fold 3
[LightGBM] 

In [45]:
for fold in oof:
    fold['true'] = fold['true'].replace(encoder)
    
save_oof(oof)

In [46]:
X_test = test[initial_features]

predict_test(lgbm, X, y, X_test, encoder)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013203 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1566
[LightGBM] [Info] Number of data points in the train set: 80942, number of used features: 321
[LightGBM] [Info] Start training from score -1.637909
[LightGBM] [Info] Start training from score -0.743308
[LightGBM] [Info] Start training from score -1.108433


Unnamed: 0,pred_Enrolled,pred_Graduate,pred_Dropout
0,0.002742,0.003835,0.993423
1,0.013872,0.980136,0.005992
2,0.259784,0.696097,0.044119
3,0.329280,0.525888,0.144832
4,0.664679,0.034908,0.300414
...,...,...,...
51007,0.067967,0.095318,0.836715
51008,0.014413,0.000710,0.984877
51009,0.013028,0.012390,0.974582
51010,0.108327,0.014593,0.877081
