In [25]:
import shutil
import numpy as np
import pandas as pd
import warnings
from autogluon.tabular import TabularDataset, TabularPredictor
import os
from sklearn.model_selection import StratifiedKFold, cross_val_score

from sklearn.metrics import accuracy_score

SEED = 42 # Keep SEED identical across all models so the data stays consistent in the ensemble
FOLDS = 5 # Keep the number of FOLDS identical across all models so the data stays consistent in the ensemble

In [26]:
# Name of the label column; passed as `label`/`target` to the training helpers below.
target = "Target"

In [27]:
# Synthetic training set, indexed by its `id` column.
sintetico = pd.read_csv('../../src/train/train.csv', index_col='id')
# Original dataset, read without an index column.
# NOTE(review): if original.csv also carries an `id` column it will end up as a feature in `train` -- confirm.
original = pd.read_csv('../../src/train/original.csv')
# Test set, indexed by its `id` column.
test = pd.read_csv('../../src/test/test.csv', index_col='id')

# Stack both training sources into a single frame with a fresh positional index.
train = pd.concat([sintetico, original], ignore_index=True)

# Column names of the test set.
initial_features = list(test.columns)

In [28]:
def cross_validation(label, X, y, encoder, hyperparameters, scoring=accuracy_score):
    """Run stratified K-fold cross-validation with AutoGluon and collect out-of-fold predictions.

    For every split a fresh ``TabularPredictor`` is fitted on the training part
    (``best_quality`` preset, two-hour time limit) and used to predict class
    probabilities for the validation part. The predictor's artefacts are removed
    from disk once the fold's predictions have been extracted.

    Args:
        label: Name of the target column handed to ``TabularPredictor``.
        X: Feature DataFrame.
        y: Target Series sharing ``X``'s index.
        encoder: Mapping from each class label to the name used in the
            ``pred_<name>`` output columns.
        hyperparameters: Optional AutoGluon model configuration. It is forwarded
            to ``fit`` only when non-empty, so an empty dict (or ``None``) leaves
            the preset's own model selection in place.
        scoring: Callable ``scoring(y_true, y_pred)`` returning the fold score.

    Returns:
        ``(scores, out_of_fold)`` where ``scores`` holds one score per fold and
        ``out_of_fold`` holds one DataFrame per fold with a ``pred_<name>``
        probability column per class followed by a ``true`` column.
    """
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

    # Keyword arguments given explicitly to fit() take priority over the preset,
    # so `hyperparameters` is only added when the caller actually supplied some.
    fit_kwargs = {'presets': 'best_quality', 'time_limit': 3600 * 2}
    if hyperparameters:
        fit_kwargs['hyperparameters'] = hyperparameters

    scores = []
    out_of_fold = []

    for i, (train_index, val_index) in enumerate(skf.split(X, y)):

        print(f"Fold {i + 1}")

        model = TabularPredictor(label=label)

        # Re-attach the target under the same name the predictor was told to look for.
        train_fold = pd.concat([X.iloc[train_index], y.iloc[train_index].rename(label)], axis=1)

        y_val = y.iloc[val_index]
        val_fold = pd.concat([X.iloc[val_index], y_val.rename(label)], axis=1)

        try:
            model.fit(TabularDataset(train_fold), **fit_kwargs)

            probabilidades = model.predict_proba(TabularDataset(val_fold))
            class_labels = list(model.classes_)
        finally:
            # Remove only this predictor's own output directory (also on failure)
            # instead of a hard-coded, machine-specific path.
            shutil.rmtree(model.path, ignore_errors=True)

        # Recover the final prediction from the probabilities
        indices_predicoes = np.asarray(probabilidades).argmax(axis=1)
        classes_preditas = np.array(class_labels)[indices_predicoes]

        score = scoring(y_val, classes_preditas)
        scores.append(score)

        true_label = pd.Series(y_val.values, name='true').reset_index(drop=True)
        pred_label_df = pd.DataFrame(probabilidades).reset_index(drop=True)

        oof_pred = pd.concat([pred_label_df, true_label], axis=1, ignore_index=True)
        # One probability column per class, whatever the number of classes.
        oof_pred.columns = [f'pred_{encoder[cls]}' for cls in class_labels] + ['true']

        print(score)
        out_of_fold.append(oof_pred)

    print(f"Score: {np.mean(scores)}")

    return scores, out_of_fold

In [29]:
def save_oof(oof):
    """Write each out-of-fold DataFrame to ``oof/fold_<n>.csv`` (1-based, no index column).

    Args:
        oof: Iterable of DataFrames, one per fold, in fold order.
    """
    # Make sure the output directory exists before writing anything.
    os.makedirs('oof', exist_ok=True)

    for position, frame in enumerate(oof):
        destination = f'oof/fold_{position+1}.csv'
        frame.to_csv(destination, index=False)

In [30]:
def predict_test(target, X_train, y_train, X_test, encoder, hyperparameters):
    """Fit AutoGluon on the full training data and save class probabilities for the test set.

    Args:
        target: Name of the target column handed to ``TabularPredictor``.
        X_train: Training feature DataFrame.
        y_train: Training target Series sharing ``X_train``'s index.
        X_test: Test feature DataFrame.
        encoder: Mapping from each class label to the name used in the
            ``pred_<name>`` output columns.
        hyperparameters: Optional AutoGluon model configuration. It is forwarded
            to ``fit`` only when non-empty, so an empty dict (or ``None``) falls
            back to the preset's own model selection.

    Returns:
        DataFrame with one ``pred_<name>`` probability column per class. The same
        frame is written to ``test/test_pred.csv`` without its index.
    """
    # Attach the target under the same name the predictor is told to look for.
    train_fold = pd.concat([X_train, y_train.rename(target)], axis=1)

    model = TabularPredictor(label=target)

    train_data = TabularDataset(train_fold)
    test_data = TabularDataset(X_test)

    # Keyword arguments given explicitly to fit() take priority over the preset,
    # so `hyperparameters` is only added when the caller actually supplied some.
    fit_kwargs = {'presets': 'best_quality', 'time_limit': 3600 * 2}
    if hyperparameters:
        fit_kwargs['hyperparameters'] = hyperparameters

    model.fit(train_data, **fit_kwargs)

    probabilidades = model.predict_proba(test_data)
    pred_label_df = pd.DataFrame(probabilidades)

    # One probability column per class, whatever the number of classes.
    pred_label_df.columns = [f'pred_{encoder[cls]}' for cls in model.classes_]

    os.makedirs('test', exist_ok=True)

    pred_label_df.to_csv('test/test_pred.csv', index=False)

    return pred_label_df

In [31]:
# Split the label from the features without mutating `train`, so this cell can
# be re-run on the same kernel without raising a KeyError.
y = train[target]
X = train.drop(columns=[target])

initial_features = list(X.columns)

# Maps each class label to the name used in the `pred_<name>` output columns
# (currently an identity mapping).
encoder = {
        'Graduate':'Graduate',
        'Enrolled':'Enrolled',
        'Dropout':'Dropout'
           }

In [32]:
# AutoGluon model configuration handed to the training helpers.
# Left empty: every candidate entry below is commented out.
hyperparameters = {
    # 'NN_TORCH': {},
    # 'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    # 'CAT': {},
    # 'XGB': {},
    # 'FASTAI': {},
    # 'RF': [{'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression']}}],
    # 'XT': [{'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression']}}],
    # 'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}

In [33]:
# Suppress all Python warnings before launching the cross-validation run.
warnings.filterwarnings("ignore")

# Per-fold scores and per-fold out-of-fold prediction frames.
scores, oof = cross_validation(
    label=target,
    X=X,
    y=y,
    encoder=encoder,
    hyperparameters=hyperparameters,
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240627_035707"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.19
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #41~22.04.2-Ubuntu SMP PREEMPT_DYNAMIC Mon Jun  3 11:32:55 UTC 2
CPU Count:          16
Memory Avail:       18.15 GB / 22.84 GB (79.5%)
Disk Space Avail:   20.59 GB / 95.56 GB (21.5%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdou

Fold 1


Leaderboard on holdout data (DyStack):
                          model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0         ExtraTreesGini_BAG_L2       0.828214   0.829216    accuracy       20.115663      83.726230  1140.042438                 0.689617                2.530772           5.164579            2       True         29
1        NeuralNetFastAI_BAG_L2       0.827658   0.831874    accuracy       20.246776      82.217633  1224.147849                 0.820729                1.022174          89.269990            2       True         23
2         ExtraTreesEntr_BAG_L2       0.827241   0.828660    accuracy       20.086526      83.555226  1139.872666                 0.660480                2.359768           4.994808            2       True         30
3   NeuralNetFastAI_r191_BAG_L2       0.827241   0.830449    accuracy       20.311714      83

0.8317993699425535


No path specified. Models will be saved in: "AutogluonModels/ag-20240627_055824"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.19
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #41~22.04.2-Ubuntu SMP PREEMPT_DYNAMIC Mon Jun  3 11:32:55 UTC 2
CPU Count:          16
Memory Avail:       16.91 GB / 22.84 GB (74.0%)
Disk Space Avail:   20.57 GB / 95.56 GB (21.5%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdou

Fold 2


Leaderboard on holdout data (DyStack):
                          model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0               LightGBM_BAG_L2       0.831967   0.830849    accuracy       18.763400      76.057972  1153.101242                 0.091597                0.263945          12.162197            2       True         25
1           WeightedEnsemble_L3       0.831967   0.830849    accuracy       18.764965      76.062855  1156.073170                 0.001566                0.004883           2.971928            3       True         39
2        NeuralNetFastAI_BAG_L2       0.831133   0.830571    accuracy       19.506302      76.886406  1237.714378                 0.834500                1.092379          96.775333            2       True         23
3     NeuralNetTorch_r79_BAG_L2       0.830994   0.830501    accuracy       19.412278      76

0.8330965470381123


No path specified. Models will be saved in: "AutogluonModels/ag-20240627_075933"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.19
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #41~22.04.2-Ubuntu SMP PREEMPT_DYNAMIC Mon Jun  3 11:32:55 UTC 2
CPU Count:          16
Memory Avail:       15.82 GB / 22.84 GB (69.3%)
Disk Space Avail:   20.55 GB / 95.56 GB (21.5%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdou

Fold 3


Leaderboard on holdout data (DyStack):
                          model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0          LightGBMLarge_BAG_L2       0.828492   0.831304    accuracy       16.870537      61.857804  1173.517680                 0.294072                0.840685          33.303041            2       True         31
1     NeuralNetTorch_r79_BAG_L2       0.828214   0.831964    accuracy       17.286967      61.929400  1279.678258                 0.710503                0.912281         139.463619            2       True         33
2                XGBoost_BAG_L2       0.827797   0.831738    accuracy       16.818045      61.456520  1153.094787                 0.241580                0.439401          12.880148            2       True         29
3         ExtraTreesEntr_BAG_L2       0.827797   0.829045    accuracy       17.274835      63

0.8303063998023227


No path specified. Models will be saved in: "AutogluonModels/ag-20240627_100040"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.19
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #41~22.04.2-Ubuntu SMP PREEMPT_DYNAMIC Mon Jun  3 11:32:55 UTC 2
CPU Count:          16
Memory Avail:       15.82 GB / 22.84 GB (69.3%)
Disk Space Avail:   20.58 GB / 95.56 GB (21.5%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdou

Fold 4


Leaderboard on holdout data (DyStack):
                          model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0                XGBoost_BAG_L2       0.836692   0.830313    accuracy       17.489901      71.327108  1159.793807                 0.278836                0.598307          16.986262            2       True         29
1         ExtraTreesGini_BAG_L2       0.836414   0.827256    accuracy       17.914774      73.608713  1149.995047                 0.703709                2.879912           7.187501            2       True         27
2          LightGBMLarge_BAG_L1       0.836136   0.829879    accuracy        0.873151       4.453290    15.730089                 0.873151                4.453290          15.730089            1       True         13
3               LightGBM_BAG_L2       0.836136   0.830852    accuracy       17.323929      71

0.8307388188781814


No path specified. Models will be saved in: "AutogluonModels/ag-20240627_120148"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.19
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #41~22.04.2-Ubuntu SMP PREEMPT_DYNAMIC Mon Jun  3 11:32:55 UTC 2
CPU Count:          16
Memory Avail:       15.71 GB / 22.84 GB (68.8%)
Disk Space Avail:   20.56 GB / 95.56 GB (21.5%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdou

Fold 5


Leaderboard on holdout data (DyStack):
                          model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0          CatBoost_r177_BAG_L2       0.836275   0.829358    accuracy       16.130586      66.617586  1165.172214                 0.076639                0.104657          19.264137            2       True         32
1               CatBoost_BAG_L2       0.835858   0.829601    accuracy       16.134558      66.630620  1169.807006                 0.080611                0.117691          23.898929            2       True         26
2             LightGBMXT_BAG_L2       0.835719   0.830053    accuracy       16.211172      66.941692  1157.142295                 0.157224                0.428763          11.234217            2       True         22
3                XGBoost_BAG_L2       0.835719   0.830661    accuracy       16.405228      67

0.8319125277983692
Score: 0.8315707326919078


In [36]:
# Map the true labels through `encoder` in every fold frame, then persist all folds.
for oof_frame in oof:
    mapped_truth = oof_frame['true'].replace(encoder)
    oof_frame['true'] = mapped_truth

save_oof(oof)

[33m(raylet)[0m The node with node id: 31f9ec2d3de7f0db3e1ffc06cb17f76a2435d35ff07b16b9b79681db and address: 192.168.15.200 and node name: 192.168.15.200 has been marked dead because the detector has missed too many heartbeats from it. This can happen when a 	(1) raylet crashes unexpectedly (OOM, preempted node, etc.) 
	(2) raylet has lagging heartbeats due to slow network or busy workload.
[33m(raylet)[0m Raylet is terminated. Termination is unexpected. Possible reasons include: (1) SIGKILL by the user or system OOM killer, (2) Invalid memory access from Raylet causing SIGSEGV or SIGBUS, (3) Other termination signals. Last 20 lines of the Raylet logs:
    *** StackTrace Information ***
    /home/rafael/anaconda3/envs/penidoEnv/lib/python3.9/site-packages/ray/core/src/ray/raylet/raylet(+0xb8124a) [0x63266773a24a] ray::operator<<()
    /home/rafael/anaconda3/envs/penidoEnv/lib/python3.9/site-packages/ray/core/src/ray/raylet/raylet(+0xb82a07) [0x63266773ba07] ray::SpdLogMessage::Flus

: 

In [35]:
# X_test = test[initial_features]

# predict_test(target, X, y, X_test, encoder, hyperparameters)