In [1]:
import os
import warnings
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

SEED = 42 # Muito importante manter a SEED igual em todos os modelos para garantir a consistência dos dados no ensemble
FOLDS = 5 # Muito importante manter o mesmo número de FOLDS em todos os modelos para garantir a consistência dos dados no ensemble

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sintetico = pd.read_csv('../../src/train/train.csv', index_col='id')
original = pd.read_csv('../../src/train/original.csv')
test = pd.read_csv('../../src/test/test.csv', index_col='id')

train = pd.concat([sintetico, original], ignore_index=True)

initial_features = list(test.columns)

In [3]:
cat_features = ['Marital status', 'Application mode', 'Course',
                'Previous qualification', 'Nacionality', "Mother's qualification", 
                "Father's qualification", "Mother's occupation",
                "Father's occupation"]

for feature in cat_features:
    for df in [train, test]:
        df[feature] = df[feature].astype('category')

num_features = df.select_dtypes(include=['int64', 'float64']).columns

In [4]:
def cross_validation(model, X, y, encoder, scoring=accuracy_score, k=FOLDS):

    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=SEED)

    scores = []
    out_of_fold = []
    
    for i, (train_index, val_index) in enumerate(skf.split(X, y)):
        print(f"Fold {i + 1}")
        
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]

        X_val = X.iloc[val_index]
        y_val = y.iloc[val_index]
        
        model.fit(X_train, y_train)
        probabilidades = model.predict_proba(X_val)

        # Recuperar a predição final a partir das probabilidades
        indices_predicoes = np.argmax(probabilidades, axis=1)
        classes_preditas = model.classes_[indices_predicoes]

        score = scoring(y_val, classes_preditas)

        scores.append(score)

        true_label = pd.Series(y_val.values, name='true')

        pred_label_df = pd.DataFrame(probabilidades)

        oof_pred = pd.concat([pred_label_df, true_label], axis=1, ignore_index=True)
        oof_pred.columns = [f'pred_{encoder[model.classes_[0]]}', f'pred_{encoder[model.classes_[1]]}', f'pred_{encoder[model.classes_[2]]}', 'true']

        out_of_fold.append(oof_pred)

    print(f"Score: {np.mean(scores)}")
    
    return scores, out_of_fold

In [5]:
def save_oof(oof):

    os.makedirs('oof', exist_ok=True)

    for i, fold in enumerate(oof):
        fold.to_csv(f'oof/fold_{i+1}.csv', index=False)

In [6]:
def predict_test(model, X_train, y_train, X_test, encoder):

    model.fit(X_train, y_train)

    probabilidades = model.predict_proba(X_test)
    pred_label_df = pd.DataFrame(probabilidades)

    pred_label_df.columns = [f'pred_{encoder[model.classes_[0]]}', f'pred_{encoder[model.classes_[1]]}', f'pred_{encoder[model.classes_[2]]}']

    os.makedirs('test', exist_ok=True)

    pred_label_df.to_csv(f'test/test_pred.csv', index=False)

    return pred_label_df

In [7]:
y = train['Target'].replace({"Enrolled": 0, "Graduate": 1, "Dropout": 2})
X = train[initial_features]

In [8]:
encoder = {0: "Enrolled", 1: "Graduate", 2: "Dropout"}

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ])

cat = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier()),
])

In [10]:
scores,oof = cross_validation(cat, X, y, encoder)

Fold 1
Learning rate set to 0.097881
0:	learn: 0.9957865	total: 60.5ms	remaining: 1m
1:	learn: 0.9172033	total: 72.1ms	remaining: 36s
2:	learn: 0.8565766	total: 84.5ms	remaining: 28.1s
3:	learn: 0.8053204	total: 95.7ms	remaining: 23.8s
4:	learn: 0.7618755	total: 107ms	remaining: 21.3s
5:	learn: 0.7257481	total: 118ms	remaining: 19.5s
6:	learn: 0.6951589	total: 129ms	remaining: 18.4s
7:	learn: 0.6687095	total: 141ms	remaining: 17.4s
8:	learn: 0.6467005	total: 152ms	remaining: 16.7s
9:	learn: 0.6275694	total: 163ms	remaining: 16.2s
10:	learn: 0.6105136	total: 174ms	remaining: 15.7s
11:	learn: 0.5958180	total: 186ms	remaining: 15.3s
12:	learn: 0.5827742	total: 198ms	remaining: 15.1s
13:	learn: 0.5707986	total: 211ms	remaining: 14.8s
14:	learn: 0.5604127	total: 221ms	remaining: 14.5s
15:	learn: 0.5514776	total: 232ms	remaining: 14.3s
16:	learn: 0.5439288	total: 244ms	remaining: 14.1s
17:	learn: 0.5364292	total: 254ms	remaining: 13.9s
18:	learn: 0.5299926	total: 266ms	remaining: 13.7s
19:	l

In [11]:
for fold in oof:
    fold['true'] = fold['true'].replace(encoder)

save_oof(oof)

In [12]:
# Função objetivo para a otimização
def objective(trial: Trial, X, y):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 2000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 0.1),
        'depth': trial.suggest_int('depth', 4, 12),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 100),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Plain']),
        'task_type': 'CPU',
        'loss_function': 'MultiClass'
    }

    # O CatBoost não suporta bootstrap_type='Bayesian' com boosting_type='Plain'
    if params['bootstrap_type'] == 'Bayesian' and params['boosting_type'] == 'Plain':
        params['bootstrap_type'] = 'Bernoulli'

    cat = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(**params, verbose=0)),
                    ])

    scores,_ = cross_validation(cat, X, y, encoder, k=3)
    accuracy = np.mean(scores)


    return accuracy

In [13]:
# warnings.filterwarnings("ignore")

# # Configuração do estudo
# study = optuna.create_study(direction='maximize', sampler=TPESampler(), pruner=MedianPruner())
# study.optimize(lambda trial: objective(trial, X, y), n_trials=1000)

In [14]:
# # Melhor conjunto de hiperparâmetros
# print(f"Best trial: {study.best_trial.number}")
# print(f"Best value: {study.best_value}")
# print(f"Best params: {study.best_trial.params}")

In [15]:
cat_best_params = {'iterations': 1740, 'learning_rate': 0.04505281135910402, 'depth': 6, 'l2_leaf_reg': 0.07065084253966697, 'bootstrap_type': 'Bernoulli', 'od_type': 'IncToDec', 'random_strength': 40, 'border_count': 95, 'boosting_type': 'Plain'}
# cat_best_params = {'iterations': 956, 'learning_rate': 0.0450347941806318, 'depth': 7, 'l2_leaf_reg': 2.419872461609025e-05, 'bootstrap_type': 'MVS', 'od_type': 'IncToDec', 'random_strength': 6, 'border_count': 132, 'boosting_type': 'Plain'}

In [16]:
cat = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(**cat_best_params, verbose=0)),
])

In [17]:
scores,oof = cross_validation(cat, X, y, encoder)

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Score: 0.8321761308455409


In [18]:
scores

[0.8322935326456236,
 0.8316758292667861,
 0.8320360761057574,
 0.830491722263405,
 0.834383493946133]

In [19]:
for fold in oof:
    fold['true'] = fold['true'].replace(encoder)

save_oof(oof)

In [20]:
X_test = test[initial_features]

predict_test(cat, X, y, X_test, encoder)

Unnamed: 0,pred_Enrolled,pred_Graduate,pred_Dropout
0,0.001140,0.003083,0.995776
1,0.009426,0.986765,0.003809
2,0.210910,0.758224,0.030866
3,0.461117,0.303266,0.235617
4,0.682174,0.065745,0.252081
...,...,...,...
51007,0.085861,0.110532,0.803608
51008,0.009205,0.000206,0.990588
51009,0.014722,0.010461,0.974818
51010,0.098330,0.012846,0.888824
