In [3]:
#surrogate models
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split

data_name = '5040'

data_file = f'../{data_name}_vectors_drags_lifts.csv'
df = TabularDataset(data_file)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=777)

#exclue the first two columns of train data
train_data = train_df.drop(columns=['i', 'name'])
train_data.head()

Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,...,dim_5033,dim_5034,dim_5035,dim_5036,dim_5037,dim_5038,dim_5039,dim_5040,drag,lift
406,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.395,0.425
54,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.645,0.885
241,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.446,0.756
952,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.384,0.177
337,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.381,0.074


In [4]:
label = 'drag'
print("Summary of class variable: \n", train_data[label].describe())

Summary of class variable: 
 count    837.000000
mean       0.445233
std        0.112500
min        0.051000
25%        0.369000
50%        0.427000
75%        0.513000
max        0.944000
Name: drag, dtype: float64


In [5]:
import os
save_path = f'./agModels-{data_name}_{label}'  # specifies folder to store trained models
if not os.path.exists(save_path):
    os.makedirs(save_path)

bag_folds = 5 #suggestion range [5, 10]
bag_sets = 3 #suggestion range [1, 20]
stack_levels = 3 #suggestion range [0, 3]
metric = 'root_mean_squared_error' #Regression:mean_absolute_error, mean_squared_error,root_mean_squared_error (default), r2
predictor = TabularPredictor(label=label, path=save_path, eval_metric=metric).fit(train_data, 
                                                                                  presets='best_quality', 
                                                                                  auto_stack="True", 
                                                                                  num_bag_folds=bag_folds, 
                                                                                  num_bag_sets=bag_sets,
                                                                                  num_stack_levels=stack_levels,
                                                                                  verbosity=4)

Presets specified: ['best_quality']
User Specified kwargs:
{'auto_stack': 'True',
 'num_bag_folds': 5,
 'num_bag_sets': 3,
 'num_stack_levels': 3,
 'verbosity': 4}
Full kwargs:
{'_feature_generator_kwargs': None,
 '_save_bag_folds': None,
 'ag_args': None,
 'ag_args_ensemble': None,
 'ag_args_fit': None,
 'auto_stack': 'True',
 'calibrate': 'auto',
 'excluded_model_types': None,
 'feature_generator': 'auto',
 'feature_prune_kwargs': None,
 'holdout_frac': None,
 'hyperparameter_tune_kwargs': None,
 'keep_only_best': False,
 'name_suffix': None,
 'num_bag_folds': 5,
 'num_bag_sets': 3,
 'num_stack_levels': 3,
 'pseudo_data': None,
 'refit_full': False,
 'save_space': False,
 'set_best_to_refit_full': False,
 'unlabeled_data': None,
 'use_bag_holdout': False,
 'verbosity': 4}
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=5, num_bag_sets=3
Saving ./agModels-5040_drag/learner.pkl
Saving ./agModels-5040_drag/predictor.pkl
Beginning AutoGluon training ...
AutoGluon

In [6]:
test_data = test_df.drop(columns=['i', 'name'])
# val_data.head()
y_val = test_data[label]
test_data_nolab = test_data.drop(columns=[label])  # delete label column to prove we're not cheating
test_data_nolab.head()

Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,...,dim_5032,dim_5033,dim_5034,dim_5035,dim_5036,dim_5037,dim_5038,dim_5039,dim_5040,lift
1046,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.346
657,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.654
127,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052
951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.604
134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.199


In [7]:
# %%capture log_output
# %config InlineBackend.figure_format = 'retina'
# %config Application.log_level = 'DEBUG'
# %config IPCompleter.greedy = True

predictor = TabularPredictor.load(save_path)  # unnecessary, just demonstrates how to load previously-trained predictor from file
y_pred = predictor.predict(test_data_nolab)
for item in y_pred:
    print(item)
print("Predictions:  \n", y_pred)
perf = predictor.evaluate_predictions(y_true=y_val, y_pred=y_pred, auxiliary_metrics=True)
print(perf)

results = predictor.fit_summary(show_plot=True)
print(results)
print(predictor.leaderboard(test_data, silent=True))

# with open('./output_all_parts.log', 'w') as f:
#     f.write(log_output.stdout)

Loading: ./agModels-5040_drag/predictor.pkl
Loading: ./agModels-5040_drag/learner.pkl
Loading: ./agModels-5040_drag/models/trainer.pkl
Loading: ./agModels-5040_drag/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/ExtraTreesMSE_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/KNeighborsDist_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/KNeighborsUnif_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/LightGBMLarge_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/LightGBMXT_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/LightGBM_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/RandomForestMSE_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/XGBoost_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/CatBoost_BAG_L2/model.pkl
Loading: ./agModels-5040_drag/models/ExtraTreesMSE_BAG_L2/model.pk

0.4708501398563385
0.3105873465538025
0.41971200704574585
0.4330872595310211
0.5016636848449707
0.4271191954612732
0.42445090413093567
0.3876354992389679
0.4487707018852234
0.41042816638946533
0.5689775943756104
0.5397976636886597
0.3999384641647339
0.3276209235191345
0.534246563911438
0.5498250126838684
0.35203123092651367
0.5387858748435974
0.583497166633606
0.5126181840896606
0.46681925654411316
0.4243188798427582
0.7640983462333679
0.46134060621261597
0.4196740686893463
0.40404966473579407
0.5462983250617981
0.5255455374717712
0.5018879771232605
0.3336881995201111
0.429036021232605
0.4537949860095978
0.43198448419570923
0.4554148018360138
0.48612555861473083
0.3711499273777008
0.4384874403476715
0.35778361558914185
0.32366979122161865
0.4186623990535736
0.48951417207717896
0.3866749107837677
0.6377565264701843
0.42556917667388916
0.5494204759597778
0.38316482305526733
0.38651859760284424
0.4687873125076294
0.3786322772502899
0.33416813611984253
0.5447913408279419
0.4363080561161041

Loading: ./agModels-5040_drag/models/LightGBMXT_BAG_L4/model.pkl
Loading: ./agModels-5040_drag/models/LightGBM_BAG_L4/model.pkl
Loading: ./agModels-5040_drag/models/RandomForestMSE_BAG_L4/model.pkl
Loading: ./agModels-5040_drag/models/CatBoost_BAG_L4/model.pkl
Loading: ./agModels-5040_drag/models/ExtraTreesMSE_BAG_L4/model.pkl
Loading: ./agModels-5040_drag/models/NeuralNetFastAI_BAG_L4/model.pkl
Loading: ./agModels-5040_drag/models/XGBoost_BAG_L4/model.pkl
Loading: ./agModels-5040_drag/models/NeuralNetTorch_BAG_L4/model.pkl
Loading: ./agModels-5040_drag/models/LightGBMLarge_BAG_L4/model.pkl
Loading: ./agModels-5040_drag/models/WeightedEnsemble_L5/model.pkl


*** Summary of fit() ***
Estimated performance of each model:
                     model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L4  -0.072930      51.320571  4672.522587                0.001133           0.473217            4       True         32
1      WeightedEnsemble_L3  -0.073415      26.095921  3690.381935                0.002025           0.721536            3       True         22
2      WeightedEnsemble_L5  -0.073421      70.555756  5199.052420                0.001169           0.700007            5       True         42
3   RandomForestMSE_BAG_L3  -0.073780      47.357864  4396.293966                3.027060          11.529337            3       True         25
4           XGBoost_BAG_L3  -0.073923      45.423328  4421.864330                1.092523          37.099701            3       True         29
5      WeightedEnsemble_L2  -0.073985       4.893378   164.646418         

Loading: ./agModels-5040_drag/models/KNeighborsUnif_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/KNeighborsDist_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/LightGBMXT_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/LightGBM_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/RandomForestMSE_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/ExtraTreesMSE_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/XGBoost_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/LightGBMLarge_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/WeightedEnsemble_L2/model.pkl
Loading: ./agModels-5040_drag/models/LightGBMXT_BAG_L2/model.pkl
Loading: ./agModels-5040_drag/models/LightGBM_BAG_L2/model.pkl
Loading: ./agModels-5040_drag/models/RandomForestMSE_BAG_L2/model.pkl

                     model  score_test  score_val  pred_time_test  \
0   NeuralNetFastAI_BAG_L4   -0.075341  -0.076997       79.941691   
1   NeuralNetFastAI_BAG_L3   -0.075626  -0.076988       56.934652   
2   NeuralNetFastAI_BAG_L2   -0.076216  -0.075959       31.162003   
3   NeuralNetFastAI_BAG_L1   -0.076675  -0.076616        3.810989   
4      WeightedEnsemble_L3   -0.077370  -0.073415       33.909282   
5          LightGBM_BAG_L2   -0.077624  -0.074573       29.576725   
6      WeightedEnsemble_L2   -0.077920  -0.073985       11.081120   
7    NeuralNetTorch_BAG_L1   -0.078128  -0.075039        2.662219   
8    NeuralNetTorch_BAG_L3   -0.078168  -0.076578       55.571142   
9          CatBoost_BAG_L3   -0.078355  -0.074734       60.774081   
10         CatBoost_BAG_L2   -0.078592  -0.075026       35.584023   
11         LightGBM_BAG_L4   -0.078805  -0.074992       79.973810   
12   NeuralNetTorch_BAG_L2   -0.078905  -0.075747       30.033053   
13   NeuralNetTorch_BAG_L4   -0.07

In [8]:
print("AutoGluon infers problem type is: ", predictor.problem_type)
print("AutoGluon identified the following types of features:")
print(predictor.feature_metadata)

AutoGluon infers problem type is:  regression
AutoGluon identified the following types of features:
('float', [])     :    1 | ['lift']
('int', ['bool']) : 4807 | ['dim_1', 'dim_2', 'dim_3', 'dim_4', 'dim_5', ...]


In [9]:
train_data_pred = predictor.predict(train_data)
test_data_pred = predictor.predict(test_data)

import numpy as np
#save np array y_train_hat to a csv file
np.savetxt(f'./{data_name}_vectors_y_test_hat_{label}.csv', test_data_pred, delimiter=',')
np.savetxt(f'./{data_name}_vectors_y_train_hat_{label}.csv', train_data_pred, delimiter=',')

Loading: ./agModels-5040_drag/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/ExtraTreesMSE_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/KNeighborsDist_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/KNeighborsUnif_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/LightGBMLarge_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/LightGBMXT_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/LightGBM_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/RandomForestMSE_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/XGBoost_BAG_L1/model.pkl
Loading: ./agModels-5040_drag/models/CatBoost_BAG_L2/model.pkl
Loading: ./agModels-5040_drag/models/ExtraTreesMSE_BAG_L2/model.pkl
Loading: ./agModels-5040_drag/models/LightGBMLarge_BAG_L2/model.pkl
Loading: ./agModels-5040_drag/models/LightGBMXT_BAG_L2/model.pkl
