In [90]:
import numpy as np
import pandas as pd
import warnings
from autogluon.tabular import TabularDataset, TabularPredictor
import os
from sklearn.model_selection import StratifiedKFold, cross_val_score

from sklearn.metrics import accuracy_score

SEED = 42 # Keep the same SEED in every model: it guarantees the data stays consistent across the ensemble
FOLDS = 5 # Keep the same number of FOLDS in every model: it guarantees the data stays consistent across the ensemble

In [91]:
# Name of the label column; passed as the label/target argument to the helpers below.
target = "Target"

In [92]:
# Load the three input tables. `sintetico` and `test` use the 'id' column as index.
# NOTE(review): `original` is read without index_col — confirm original.csv has no 'id'
# column, otherwise it would end up as a feature in `train` that `test` does not have.
sintetico = pd.read_csv('../../src/train/train.csv', index_col='id')
original = pd.read_csv('../../src/train/original.csv')
test = pd.read_csv('../../src/test/test.csv', index_col='id')

# Stack both training sources row-wise; ignore_index=True replaces the ids with a fresh 0..n-1 index.
train = pd.concat([sintetico, original], ignore_index=True)

# Column names of the test frame (the test frame is loaded without a label column being removed here).
initial_features = list(test.columns)

In [93]:
def cross_validation(label, X, y, encoder, hyperparameters, scoring=accuracy_score, time_limit=3600*1.5):
    """Fit one AutoGluon predictor per stratified fold and collect out-of-fold predictions.

    The split uses ``StratifiedKFold`` with the notebook-level ``FOLDS`` and ``SEED``
    constants (shuffled), so every model that shares those constants sees the same folds.

    Parameters
    ----------
    label : str
        Name given to the label column in the frames handed to AutoGluon.
    X : pandas.DataFrame
        Feature frame; rows are selected positionally with ``iloc``.
    y : pandas.Series
        Labels aligned with ``X``.
    encoder : mapping
        Maps each entry of ``model.classes_`` to the suffix used in the
        ``pred_<suffix>`` column names. A missing class raises ``KeyError``.
    hyperparameters : dict
        Forwarded unchanged to ``TabularPredictor.fit``.
    scoring : callable, default ``accuracy_score``
        Called as ``scoring(y_true, y_pred)`` on each validation fold.
    time_limit : float, default ``3600*1.5``
        Per-fold training budget forwarded to ``TabularPredictor.fit``.

    Returns
    -------
    (scores, out_of_fold) : tuple[list, list[pandas.DataFrame]]
        ``scores`` holds one score per fold. ``out_of_fold`` holds one frame per fold
        with one ``pred_<class>`` column per class followed by a ``true`` column,
        indexed 0..n-1 (the original row index is not kept).
    """
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

    scores = []
    out_of_fold = []

    for i, (train_index, val_index) in enumerate(skf.split(X, y)):
        print(f"Fold {i + 1}")

        model = TabularPredictor(label=label)

        # Name the label column after `label` so it always matches the predictor's
        # configured label (previously hard-coded to 'Target').
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]
        train_fold = pd.concat([X_train, y_train.rename(label)], axis=1)

        X_val = X.iloc[val_index]
        y_val = y.iloc[val_index]
        val_fold = pd.concat([X_val, y_val.rename(label)], axis=1)

        train_data = TabularDataset(train_fold)
        val_data = TabularDataset(val_fold)

        model.fit(
            train_data,
            presets='best_quality',
            time_limit=time_limit,
            hyperparameters=hyperparameters,
        )

        probabilidades = model.predict_proba(val_data)

        # Work on a plain 2-D array so argmax and the frame construction below do not
        # depend on how numpy dispatches on a DataFrame.
        # Assumes the probability columns follow the order of `model.classes_`.
        class_labels = list(model.classes_)
        proba_values = np.asarray(probabilidades)

        # Recover the final prediction from the probabilities.
        indices_predicoes = proba_values.argmax(axis=1)
        classes_preditas = np.asarray(class_labels)[indices_predicoes]

        score = scoring(y_val, classes_preditas)
        scores.append(score)

        # One `pred_<class>` column per class (any number of classes), then the true label.
        oof_pred = pd.DataFrame(
            proba_values,
            columns=[f'pred_{encoder[cls]}' for cls in class_labels],
        )
        oof_pred['true'] = y_val.to_numpy()

        print(oof_pred.shape)
        out_of_fold.append(oof_pred)

    print(f"Score: {np.mean(scores)}")

    return scores, out_of_fold

In [94]:
def save_oof(oof):
    """Persist every out-of-fold frame as ``oof/fold_<k>.csv``, with k counted from 1.

    The ``oof`` directory is created when missing and reused when present.
    Frames are written without their row index.
    """
    os.makedirs('oof', exist_ok=True)

    for fold_number, frame in enumerate(oof, start=1):
        destination = f'oof/fold_{fold_number}.csv'
        frame.to_csv(destination, index=False)

In [95]:
def predict_test(target, X_train, y_train, X_test, encoder, hyperparameters, time_limit=3600*2):
    """Fit a single AutoGluon predictor on the full training data and predict the test set.

    Parameters
    ----------
    target : str
        Name given to the label column in the frame handed to AutoGluon.
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training labels aligned with ``X_train``.
    X_test : pandas.DataFrame
        Features to predict on.
    encoder : mapping
        Maps each entry of ``model.classes_`` to the suffix used in the
        ``pred_<suffix>`` column names. A missing class raises ``KeyError``.
    hyperparameters : dict
        Forwarded unchanged to ``TabularPredictor.fit``.
    time_limit : float, default ``3600*2``
        Training budget forwarded to ``TabularPredictor.fit``.

    Returns
    -------
    pandas.DataFrame
        One ``pred_<class>`` column per class. The same frame is written to
        ``test/test_pred.csv`` without its row index.
    """
    # Name the label column after `target` so it always matches the predictor's
    # configured label (previously hard-coded to 'Target').
    train_fold = pd.concat([X_train, y_train.rename(target)], axis=1)

    model = TabularPredictor(label=target)

    train_data = TabularDataset(train_fold)
    test_data = TabularDataset(X_test)

    model.fit(
        train_data,
        presets='best_quality',
        time_limit=time_limit,
        hyperparameters=hyperparameters,
    )

    probabilidades = model.predict_proba(test_data)
    pred_label_df = pd.DataFrame(probabilidades)

    # One column per class, whatever the number of classes.
    # Assumes the probability columns follow the order of `model.classes_`.
    pred_label_df.columns = [f'pred_{encoder[cls]}' for cls in model.classes_]

    os.makedirs('test', exist_ok=True)

    pred_label_df.to_csv('test/test_pred.csv', index=False)

    return pred_label_df

In [96]:
# Split the combined training frame into labels and features without mutating `train`.
# The previous `train.pop(...)` removed the column in place, so re-running this cell
# raised KeyError; selecting and dropping keeps the cell idempotent.
y = train[target]
X = train.drop(columns=[target])

# Feature column names; used later to select the same columns from the test frame.
initial_features = list(X.columns)

# Maps each raw class label to the name used in the `pred_<name>` output columns
# (an identity mapping here).
encoder = {
        'Graduate':'Graduate',
        'Enrolled':'Enrolled',
        'Dropout':'Dropout'
           }

In [97]:
# Model families and per-family configurations passed as `hyperparameters` to the
# AutoGluon fits below. Key order is kept as: NN_TORCH, GBM, CAT, XGB, FASTAI, RF, XT, KNN.
# NOTE(review): the RF and XT entries are restricted to problem_types=['regression'];
# the labels in this notebook are class names, so these two families are presumably
# skipped at fit time — confirm this is intended.
hyperparameters = {
    'NN_TORCH': {},
    'GBM': [
        {'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}},
        {},
        'GBMLarge',
    ],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    # RF and XT share the same configuration; each family gets its own fresh dict.
    **{
        family: [
            {
                'criterion': 'squared_error',
                'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression']},
            }
        ]
        for family in ('RF', 'XT')
    },
    # Two KNN variants that differ only in the neighbour weighting scheme.
    'KNN': [
        {'weights': weighting, 'ag_args': {'name_suffix': suffix}}
        for weighting, suffix in (('uniform', 'Unif'), ('distance', 'Dist'))
    ],
}

In [98]:
# Suppress every Python warning from this point on (applies process-wide, not just to this cell).
warnings.filterwarnings("ignore")

# Run the cross-validation: one AutoGluon fit per fold, each with its own time limit,
# so this cell is expensive. Returns the per-fold scores and the out-of-fold frames.
scores,oof = cross_validation(target, X, y, encoder, hyperparameters)

No path specified. Models will be saved in: "AutogluonModels/ag-20240622_013857"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.19
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #35~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Tue May  7 09:00:52 UTC 2
CPU Count:          16
Memory Avail:       7.60 GB / 22.84 GB (33.3%)
Disk Space Avail:   13.79 GB / 95.56 GB (14.4%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout

Fold 1


Leaderboard on holdout data (DyStack):
                     model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0        LightGBMXT_BAG_L2       0.827797   0.831318    accuracy        5.599851      38.199768  538.495377                 0.112821                0.341934           9.251419            2       True         12
1     LightGBMLarge_BAG_L2       0.827241   0.831596    accuracy        5.766252      38.736983  549.000515                 0.279222                0.879149          19.756557            2       True         17
2      WeightedEnsemble_L3       0.827241   0.832274    accuracy        6.620630      40.249900  689.869828                 0.002578                0.004351           1.650802            3       True         18
3   NeuralNetFastAI_BAG_L2       0.827102   0.831526    accuracy        6.267866      39.387248  648.300016          

(16189, 4)
Fold 2


Leaderboard on holdout data (DyStack):
                     model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0          LightGBM_BAG_L2       0.831967   0.830137    accuracy        4.773469      15.406202  334.043512                 0.128734                0.359432           8.550268            2       True         13
1     LightGBMLarge_BAG_L2       0.831411   0.830328    accuracy        4.867823      15.691530  344.654612                 0.223087                0.644760          19.161367            2       True         17
2   NeuralNetFastAI_BAG_L2       0.830855   0.830241    accuracy        5.400217      16.326406  419.707553                 0.755481                1.279636          94.214308            2       True         11
3           XGBoost_BAG_L1       0.830716   0.830571    accuracy        0.486028       1.466990   13.304776          

(16189, 4)
Fold 3


Leaderboard on holdout data (DyStack):
                     model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0     LightGBMLarge_BAG_L2       0.828214   0.831373    accuracy        4.982735      15.328460  373.272161                 0.248788                0.802093          23.528622            2       True         17
1    NeuralNetTorch_BAG_L2       0.828214   0.831634    accuracy        5.190775      15.251335  459.357677                 0.456828                0.724968         109.614138            2       True         16
2   NeuralNetFastAI_BAG_L2       0.827658   0.831894    accuracy        5.540198      15.588882  445.354517                 0.806250                1.062515          95.610978            2       True         11
3        LightGBMXT_BAG_L2       0.827519   0.832051    accuracy        4.878831      14.940564  357.937488          

(16188, 4)
Fold 4


Leaderboard on holdout data (DyStack):
                     model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0           XGBoost_BAG_L2       0.837248   0.830748    accuracy        5.822788      20.249507  373.260993                 0.214089                0.485047          12.073379            2       True         15
1          LightGBM_BAG_L2       0.836414   0.830626    accuracy        5.728186      20.099093  370.288321                 0.119487                0.334633           9.100708            2       True         13
2     LightGBMLarge_BAG_L1       0.836136   0.829879    accuracy        0.762292       4.263760   18.820672                 0.762292                4.263760          18.820672            1       True          9
3          LightGBM_BAG_L1       0.835997   0.830018    accuracy        0.526825       1.662234   11.006286          

(16188, 4)
Fold 5


Leaderboard on holdout data (DyStack):
                     model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0        LightGBMXT_BAG_L2       0.835719   0.829914    accuracy        5.837531      20.933567  382.825358                 0.164621                0.448862           8.111563            2       True         12
1   NeuralNetFastAI_BAG_L2       0.835719   0.829723    accuracy        6.574782      21.565141  474.223644                 0.901872                1.080436          99.509849            2       True         11
2          LightGBM_BAG_L2       0.835302   0.830487    accuracy        5.777387      20.754549  381.757032                 0.104476                0.269843           7.043237            2       True         13
3      WeightedEnsemble_L3       0.835302   0.830487    accuracy        5.778923      20.759094  383.430094          

(16188, 4)
Score: 0.8306688682060775


In [99]:
# Map the true labels through `encoder` so they use the same names as the pred_* columns.
# This overwrites the 'true' column of each fold frame in place.
for fold in oof:
    fold['true'] = fold['true'].replace(encoder)

# Write one CSV per fold under the `oof` directory.
save_oof(oof)

In [100]:
# Select the test columns in the same order as the training features.
X_test = test[initial_features]

# Fit on the full training data and predict the test set; the returned frame is the
# cell's displayed output and is also written to test/test_pred.csv by the function.
predict_test(target, X, y, X_test, encoder, hyperparameters)

No path specified. Models will be saved in: "AutogluonModels/ag-20240622_034550"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.19
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #35~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Tue May  7 09:00:52 UTC 2
CPU Count:          16
Memory Avail:       11.54 GB / 22.84 GB (50.5%)
Disk Space Avail:   12.37 GB / 95.56 GB (12.9%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdou

Leaderboard on holdout data (DyStack):
                     model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0     LightGBMLarge_BAG_L2       0.837225   0.830503    accuracy        7.918051      27.164440  483.602638                 0.480039                1.359168          31.412802            2       True         17
1    NeuralNetTorch_BAG_L2       0.837114   0.831128    accuracy        7.978929      26.664206  572.762059                 0.540917                0.858934         120.572223            2       True         16
2      WeightedEnsemble_L3       0.837114   0.831601    accuracy        9.052578      28.208344  704.543140                 0.003487                0.005432           2.170193            3       True         18
3     LightGBMLarge_BAG_L1       0.836669   0.829655    accuracy        1.382966       4.637996   22.568185          

Unnamed: 0_level_0,pred_Dropout,pred_Enrolled,pred_Graduate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
76518,0.994802,0.002493,0.002705
76519,0.005714,0.013154,0.981132
76520,0.032434,0.221852,0.745713
76521,0.250632,0.442643,0.306725
76522,0.315104,0.638861,0.046036
...,...,...,...
127525,0.853473,0.043565,0.102961
127526,0.986771,0.012971,0.000258
127527,0.977890,0.012382,0.009728
127528,0.884646,0.098007,0.017347
