In [1]:
#surrogate models
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split

data_name = '10240'

data_file = f'../{data_name}_vectors_drags_lifts.csv'
df = TabularDataset(data_file)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=777)

#exclue the first two columns of train data
train_data = train_df.drop(columns=['i', 'name'])
train_data.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,...,dim_10233,dim_10234,dim_10235,dim_10236,dim_10237,dim_10238,dim_10239,dim_10240,drag,lift
406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.395,0.425
54,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.645,0.885
241,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.446,0.756
952,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.384,0.177
337,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.381,0.074


In [2]:
label = 'lift'
print("Summary of class variable: \n", train_data[label].describe())

Summary of class variable: 
 count    837.000000
mean       0.378389
std        0.238650
min        0.002000
25%        0.184000
50%        0.344000
75%        0.540000
max        0.988000
Name: lift, dtype: float64


In [3]:
import os
save_path = f'./agModels-{data_name}_{label}'  # specifies folder to store trained models
if not os.path.exists(save_path):
    os.makedirs(save_path)

bag_folds = 5 #suggestion range [5, 10]
bag_sets = 3 #suggestion range [1, 20]
stack_levels = 3 #suggestion range [0, 3]
metric = 'root_mean_squared_error' #Regression:mean_absolute_error, mean_squared_error,root_mean_squared_error (default), r2
predictor = TabularPredictor(label=label, path=save_path, eval_metric=metric).fit(train_data, 
                                                                                  presets='best_quality', 
                                                                                  auto_stack="True", 
                                                                                  num_bag_folds=bag_folds, 
                                                                                  num_bag_sets=bag_sets,
                                                                                  num_stack_levels=stack_levels,
                                                                                  verbosity=4)

Presets specified: ['best_quality']
User Specified kwargs:
{'auto_stack': 'True',
 'num_bag_folds': 5,
 'num_bag_sets': 3,
 'num_stack_levels': 3,
 'verbosity': 4}
Full kwargs:
{'_feature_generator_kwargs': None,
 '_save_bag_folds': None,
 'ag_args': None,
 'ag_args_ensemble': None,
 'ag_args_fit': None,
 'auto_stack': 'True',
 'calibrate': 'auto',
 'excluded_model_types': None,
 'feature_generator': 'auto',
 'feature_prune_kwargs': None,
 'holdout_frac': None,
 'hyperparameter_tune_kwargs': None,
 'keep_only_best': False,
 'name_suffix': None,
 'num_bag_folds': 5,
 'num_bag_sets': 3,
 'num_stack_levels': 3,
 'pseudo_data': None,
 'refit_full': False,
 'save_space': False,
 'set_best_to_refit_full': False,
 'unlabeled_data': None,
 'use_bag_holdout': False,
 'verbosity': 4}
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=5, num_bag_sets=3
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of

In [4]:
test_data = test_df.drop(columns=['i', 'name'])
# val_data.head()
y_val = test_data[label]
test_data_nolab = test_data.drop(columns=[label])  # delete label column to prove we're not cheating
test_data_nolab.head()

Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,...,dim_10232,dim_10233,dim_10234,dim_10235,dim_10236,dim_10237,dim_10238,dim_10239,dim_10240,drag
1046,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.527
657,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.363
127,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316
951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.413
134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.495


In [5]:
# %%capture log_output
# %config InlineBackend.figure_format = 'retina'
# %config Application.log_level = 'DEBUG'
# %config IPCompleter.greedy = True

predictor = TabularPredictor.load(save_path)  # unnecessary, just demonstrates how to load previously-trained predictor from file
y_pred = predictor.predict(test_data_nolab)
for item in y_pred:
    print(item)
print("Predictions:  \n", y_pred)
perf = predictor.evaluate_predictions(y_true=y_val, y_pred=y_pred, auxiliary_metrics=True)
print(perf)

results = predictor.fit_summary(show_plot=True)
print(results)
print(predictor.leaderboard(test_data, silent=True))

# with open('./output_all_parts.log', 'w') as f:
#     f.write(log_output.stdout)

Loading: ./agModels-10240_lift/predictor.pkl
Loading: ./agModels-10240_lift/learner.pkl
Loading: ./agModels-10240_lift/models/trainer.pkl
Loading: ./agModels-10240_lift/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/LightGBM_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/WeightedEnsemble_L2/model.pkl
Evaluation: root_mean_squared_error on test data: -0.19977494276765445
	Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
    "root_mean_squared_error": -0.19977494276765445,
    "mean_squared_error": -0.03991002775781961,
    "mean_absolute_error": -0.15900082445229802,
    "r2": 0.3270949467979942,
    "pearsonr": 0.573316023569146,
    "median_absolute_error": -0.1364591052532196
}
Loading: ./agModels-10240_lift/models/KNeighborsUn

0.42932721972465515
0.3970879912376404
0.20748108625411987
0.5426919460296631
0.222565159201622
0.4266810417175293
0.3404097557067871
0.6998717784881592
0.36531877517700195
0.24871331453323364
0.33855658769607544
0.23935869336128235
0.35266757011413574
0.48823752999305725
0.39339664578437805
0.37452393770217896
0.3332509994506836
0.2586742639541626
0.7964808940887451
0.6065281629562378
0.2144181728363037
0.3843652606010437
0.26784801483154297
0.29929211735725403
0.11705530434846878
0.5538468956947327
0.4785143733024597
0.41201311349868774
0.34644263982772827
0.4850931763648987
0.2639618515968323
0.3361703157424927
0.5468820333480835
0.2373318076133728
0.22965623438358307
0.4396401643753052
0.4633280634880066
0.18526241183280945
0.26337215304374695
0.3070150911808014
0.22482794523239136
0.5589609742164612
0.2537224292755127
0.36625364422798157
0.5539132356643677
0.6481390595436096
0.432869553565979
0.23155465722084045
0.5392648577690125
0.5980666875839233
0.5983052253723145
0.2857429385

Loading: ./agModels-10240_lift/models/WeightedEnsemble_L3/model.pkl
Loading: ./agModels-10240_lift/models/LightGBMXT_BAG_L3/model.pkl
Loading: ./agModels-10240_lift/models/LightGBM_BAG_L3/model.pkl
Loading: ./agModels-10240_lift/models/RandomForestMSE_BAG_L3/model.pkl
Loading: ./agModels-10240_lift/models/CatBoost_BAG_L3/model.pkl
Loading: ./agModels-10240_lift/models/ExtraTreesMSE_BAG_L3/model.pkl
Loading: ./agModels-10240_lift/models/NeuralNetFastAI_BAG_L3/model.pkl
Loading: ./agModels-10240_lift/models/XGBoost_BAG_L3/model.pkl
Loading: ./agModels-10240_lift/models/NeuralNetTorch_BAG_L3/model.pkl
Loading: ./agModels-10240_lift/models/LightGBMLarge_BAG_L3/model.pkl
Loading: ./agModels-10240_lift/models/WeightedEnsemble_L4/model.pkl
Loading: ./agModels-10240_lift/models/LightGBMXT_BAG_L4/model.pkl
Loading: ./agModels-10240_lift/models/LightGBM_BAG_L4/model.pkl
Loading: ./agModels-10240_lift/models/RandomForestMSE_BAG_L4/model.pkl
Loading: ./agModels-10240_lift/models/CatBoost_BAG_L4/mo

*** Summary of fit() ***
Estimated performance of each model:
                     model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L2  -0.181083      20.734021  6680.793364                0.001173           0.526274            2       True         12
1      WeightedEnsemble_L3  -0.183377      49.347743  7363.578995                0.002695           0.436705            3       True         22
2          CatBoost_BAG_L1  -0.184126      16.548150  6546.401002               16.548150        6546.401002            1       True          6
3          CatBoost_BAG_L2  -0.184429      42.163069  7244.851973                9.794422         263.534730            2       True         16
4      WeightedEnsemble_L4  -0.184672      64.847459  7645.395595                0.001005           0.419719            4       True         32
5        LightGBMXT_BAG_L2  -0.184767      33.530894  6999.368669         

Loading: ./agModels-10240_lift/models/KNeighborsUnif_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/KNeighborsDist_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/LightGBMXT_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/LightGBM_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/RandomForestMSE_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/ExtraTreesMSE_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/XGBoost_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/LightGBMLarge_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/WeightedEnsemble_L2/model.pkl
Loading: ./agModels-10240_lift/models/LightGBMXT_BAG_L2/model.pkl
Loading: ./agModels-10240_lift/models/LightGBM_BAG_L2/model.pkl
Loading: ./agModels-10240_lift/models/RandomForestMSE_B

                     model  score_test  score_val  pred_time_test  \
0      WeightedEnsemble_L2   -0.199775  -0.181083       14.486616   
1          LightGBM_BAG_L2   -0.199871  -0.186675       24.056381   
2          LightGBM_BAG_L4   -0.200496  -0.189784       66.873926   
3     LightGBMLarge_BAG_L2   -0.200624  -0.189086       24.291028   
4          CatBoost_BAG_L2   -0.200761  -0.184429       30.564956   
5          CatBoost_BAG_L4   -0.201087  -0.188765       73.513451   
6      WeightedEnsemble_L3   -0.201746  -0.183377       36.009003   
7          CatBoost_BAG_L3   -0.202405  -0.187554       51.980310   
8          CatBoost_BAG_L1   -0.202513  -0.184126        9.135972   
9           XGBoost_BAG_L2   -0.202557  -0.187122       22.896958   
10     WeightedEnsemble_L5   -0.202748  -0.186363       79.409585   
11   NeuralNetTorch_BAG_L4   -0.202915  -0.190347       65.944016   
12   NeuralNetTorch_BAG_L3   -0.202923  -0.189924       44.195225   
13       LightGBMXT_BAG_L2   -0.20

In [6]:
print("AutoGluon infers problem type is: ", predictor.problem_type)
print("AutoGluon identified the following types of features:")
print(predictor.feature_metadata)

AutoGluon infers problem type is:  regression
AutoGluon identified the following types of features:
('float', [])     :    1 | ['drag']
('int', ['bool']) : 9783 | ['dim_1', 'dim_2', 'dim_3', 'dim_4', 'dim_5', ...]


In [7]:
train_data_pred = predictor.predict(train_data)
test_data_pred = predictor.predict(test_data)

import numpy as np
#save np array y_train_hat to a csv file
np.savetxt(f'./{data_name}_vectors_y_test_hat_{label}.csv', test_data_pred, delimiter=',')
np.savetxt(f'./{data_name}_vectors_y_train_hat_{label}.csv', train_data_pred, delimiter=',')

Loading: ./agModels-10240_lift/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/LightGBM_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/WeightedEnsemble_L2/model.pkl
Loading: ./agModels-10240_lift/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/LightGBM_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-10240_lift/models/WeightedEnsemble_L2/model.pkl
