In [1]:
#surrogate models
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split

data_name = 'wings'

data_file = f'../{data_name}_vectors_drags_lifts.csv'
df = TabularDataset(data_file)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=777)

#exclue the first two columns of train data
train_data = train_df.drop(columns=['i', 'name'])
train_data.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,...,dim_121,dim_122,dim_123,dim_124,dim_125,dim_126,dim_127,dim_128,drag,lift
406,-0.496165,1.705992,0.197353,1.742359,0.467517,-1.339144,-0.647645,0.697645,0.994912,0.193609,...,-0.770262,-0.448141,-0.866822,-2.491501,0.482521,0.362971,0.389367,-0.237259,0.395,0.425
54,0.033569,1.133478,1.176343,-0.943174,-0.657064,-0.401192,0.747147,1.081339,-1.888797,-1.40757,...,0.767994,-1.674341,0.346039,0.3872,-0.006894,0.338835,0.207995,0.574507,0.645,0.885
241,-0.559282,0.308935,-0.047847,-1.081136,0.175241,-1.149952,0.71952,0.39092,-0.075074,-0.300515,...,1.81544,0.596989,1.033692,-0.362098,0.247644,0.190097,-1.081548,-0.536878,0.446,0.756
952,-0.222745,0.463468,0.462111,0.234607,-0.859618,-2.425318,0.684795,-1.679074,1.190695,-0.385462,...,-2.272429,0.947205,-0.633594,-0.539486,0.290193,-0.221363,-1.451571,0.765826,0.384,0.177
337,1.397874,0.460493,-1.091555,-0.442342,0.823289,-1.108367,0.877267,-0.00336,0.461756,-0.80428,...,-1.359217,-1.532384,0.735832,0.230598,0.350978,0.722657,-0.1557,-0.85193,0.381,0.074


In [2]:
#save train_df to a csv file
train_df.to_csv('./train_df.csv', index=False)


In [3]:
#save test_df to a csv file
test_df.to_csv('./test_df.csv', index=False)

In [4]:
label = 'lift'
print("Summary of class variable: \n", train_data[label].describe())

Summary of class variable: 
 count    837.000000
mean       0.378389
std        0.238650
min        0.002000
25%        0.184000
50%        0.344000
75%        0.540000
max        0.988000
Name: lift, dtype: float64


In [5]:
import os
save_path = f'./agModels-{data_name}_{label}'  # specifies folder to store trained models
if not os.path.exists(save_path):
    os.makedirs(save_path)

bag_folds = 5 #suggestion range [5, 10]
bag_sets = 3 #suggestion range [1, 20]
stack_levels = 3 #suggestion range [0, 3]
metric = 'root_mean_squared_error' #Regression:mean_absolute_error, mean_squared_error,root_mean_squared_error (default), r2
predictor = TabularPredictor(label=label, path=save_path, eval_metric=metric).fit(train_data, 
                                                                                  presets='best_quality', 
                                                                                  auto_stack="True", 
                                                                                  num_bag_folds=bag_folds, 
                                                                                  num_bag_sets=bag_sets,
                                                                                  num_stack_levels=stack_levels,
                                                                                  verbosity=4)

Presets specified: ['best_quality']
User Specified kwargs:
{'auto_stack': 'True',
 'num_bag_folds': 5,
 'num_bag_sets': 3,
 'num_stack_levels': 3,
 'verbosity': 4}
Full kwargs:
{'_feature_generator_kwargs': None,
 '_save_bag_folds': None,
 'ag_args': None,
 'ag_args_ensemble': None,
 'ag_args_fit': None,
 'auto_stack': 'True',
 'calibrate': 'auto',
 'excluded_model_types': None,
 'feature_generator': 'auto',
 'feature_prune_kwargs': None,
 'holdout_frac': None,
 'hyperparameter_tune_kwargs': None,
 'keep_only_best': False,
 'name_suffix': None,
 'num_bag_folds': 5,
 'num_bag_sets': 3,
 'num_stack_levels': 3,
 'pseudo_data': None,
 'refit_full': False,
 'save_space': False,
 'set_best_to_refit_full': False,
 'unlabeled_data': None,
 'use_bag_holdout': False,
 'verbosity': 4}
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=5, num_bag_sets=3
Saving ./agModels-wings_lift/learner.pkl
Saving ./agModels-wings_lift/predictor.pkl
Beginning AutoGluon training ...
AutoGlu

In [6]:
test_data = test_df.drop(columns=['i', 'name'])
# val_data.head()
y_val = test_data[label]
test_data_nolab = test_data.drop(columns=[label])  # delete label column to prove we're not cheating
test_data_nolab.head()

Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,...,dim_120,dim_121,dim_122,dim_123,dim_124,dim_125,dim_126,dim_127,dim_128,drag
1046,-0.242641,-0.544351,-0.785688,0.915798,0.159682,-0.134473,0.513026,-0.386129,-1.244955,0.406885,...,0.504696,-0.266971,0.339783,-0.708615,-0.384231,-0.775731,0.406653,-0.137757,1.354433,0.527
657,-0.086765,-0.885193,0.826741,-0.850657,0.465779,-1.419797,-0.319893,0.348212,0.397075,-1.688038,...,1.223718,-1.820433,0.109049,0.877518,-0.688451,-0.905782,0.322554,0.742038,-0.282902,0.363
127,0.993629,0.058908,0.058237,-0.193928,-0.465946,0.194533,0.438341,-0.264994,-0.371554,1.000921,...,-1.863156,-0.465709,0.748947,-1.5375,1.178252,0.856316,0.28775,0.008825,0.656394,0.316
951,-0.238668,0.338577,-0.576213,-0.241772,-0.971682,1.144228,1.083222,-2.30459,0.678272,-0.991069,...,1.038386,-0.669204,1.110296,0.581472,0.028183,0.67845,-0.005965,-0.314793,-0.241957,0.413
134,-1.254356,0.55711,0.38981,0.332288,-0.160778,-0.05589,-1.489394,-0.652277,-0.348175,-0.549951,...,0.401014,-2.166166,-0.27722,-0.422792,0.157305,0.493497,-1.234308,-0.001335,0.454736,0.495


In [10]:
# %%capture log_output
# %config InlineBackend.figure_format = 'retina'
# %config Application.log_level = 'DEBUG'
# %config IPCompleter.greedy = True

predictor = TabularPredictor.load(save_path)  # unnecessary, just demonstrates how to load previously-trained predictor from file
y_pred = predictor.predict(test_data_nolab)
for item in y_pred:
    print(item)
print("Predictions:  \n", y_pred)
perf = predictor.evaluate_predictions(y_true=y_val, y_pred=y_pred, auxiliary_metrics=True)
print(perf)

results = predictor.fit_summary(show_plot=True)
print(results)
print(predictor.leaderboard(test_data, silent=True))

# with open('./output_all_parts.log', 'w') as f:
#     f.write(log_output.stdout)

Loading: ./agModels-wings_lift/predictor.pkl
Loading: ./agModels-wings_lift/learner.pkl
Loading: ./agModels-wings_lift/models/trainer.pkl
Loading: ./agModels-wings_lift/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/ExtraTreesMSE_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/KNeighborsDist_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/KNeighborsUnif_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/LightGBMLarge_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/LightGBMXT_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/LightGBM_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/RandomForestMSE_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/XGBoost_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/CatBoost_BAG_L2/model.pkl
Loading: ./agModels-wings_lift/models/ExtraTreesMSE

0.3442482650279999
0.3341943323612213
0.24837888777256012
0.5390891432762146
0.32465222477912903
0.46712812781333923
0.2915904223918915
0.7035018801689148
0.2875654995441437
0.22906048595905304
0.36077025532722473
0.299252450466156
0.49676987528800964
0.41333383321762085
0.2809992730617523
0.38399356603622437
0.49942126870155334
0.27499938011169434
0.7440868616104126
0.3678441047668457
0.20136786997318268
0.5726386308670044
0.19072376191616058
0.41729411482810974
0.08388695865869522
0.5651516318321228
0.47351232171058655
0.453468918800354
0.3657052218914032
0.5460712909698486
0.1427127718925476
0.33554792404174805
0.5036322474479675
0.28792697191238403
0.20153550803661346
0.7893061637878418
0.5713226795196533
0.3251776397228241
0.35871103405952454
0.250104695558548
0.30711594223976135
0.594516396522522
0.23106613755226135
0.24658720195293427
0.516240119934082
0.6024110913276672
0.2534847855567932
0.28530585765838623
0.6087101101875305
0.5530984997749329
0.616531252861023
0.343616068363

Loading: ./agModels-wings_lift/models/KNeighborsDist_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/LightGBMXT_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/LightGBM_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/RandomForestMSE_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/ExtraTreesMSE_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/XGBoost_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/LightGBMLarge_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/WeightedEnsemble_L2/model.pkl
Loading: ./agModels-wings_lift/models/LightGBMXT_BAG_L2/model.pkl
Loading: ./agModels-wings_lift/models/LightGBM_BAG_L2/model.pkl
Loading: ./agModels-wings_lift/models/RandomForestMSE_BAG_L2/model.pkl
Loading: ./agModels-wings_lift/models/CatBoost_BAG_L2/

                     model  score_test  score_val  pred_time_test  \
0        LightGBMXT_BAG_L4   -0.190108  -0.189691        9.932374   
1      WeightedEnsemble_L5   -0.190346  -0.184764       11.448825   
2      WeightedEnsemble_L4   -0.190401  -0.182298        8.653435   
3          LightGBM_BAG_L4   -0.190968  -0.187521        9.835571   
4   RandomForestMSE_BAG_L4   -0.191115  -0.190105        9.830006   
5          CatBoost_BAG_L4   -0.191245  -0.189928        9.771147   
6     ExtraTreesMSE_BAG_L4   -0.191452  -0.187319        9.828883   
7     ExtraTreesMSE_BAG_L3   -0.191804  -0.188647        7.062682   
8     LightGBMLarge_BAG_L4   -0.191813  -0.189669       10.643529   
9           XGBoost_BAG_L4   -0.192089  -0.191984        9.855117   
10  RandomForestMSE_BAG_L3   -0.192288  -0.189529        7.021478   
11          XGBoost_BAG_L3   -0.192414  -0.189288        7.100477   
12         LightGBM_BAG_L3   -0.192568  -0.189368        7.110546   
13    LightGBMLarge_BAG_L3   -0.19

In [8]:
print("AutoGluon infers problem type is: ", predictor.problem_type)
print("AutoGluon identified the following types of features:")
print(predictor.feature_metadata)

AutoGluon infers problem type is:  regression
AutoGluon identified the following types of features:
('float', []) : 129 | ['dim_1', 'dim_2', 'dim_3', 'dim_4', 'dim_5', ...]


In [9]:
train_data_pred = predictor.predict(train_data)
test_data_pred = predictor.predict(test_data)

import numpy as np
#save np array y_train_hat to a csv file
np.savetxt(f'./{data_name}_vectors_y_test_hat_{label}.csv', test_data_pred, delimiter=',')
np.savetxt(f'./{data_name}_vectors_y_train_hat_{label}.csv', train_data_pred, delimiter=',')

Loading: ./agModels-wings_lift/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/ExtraTreesMSE_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/KNeighborsDist_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/KNeighborsUnif_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/LightGBMLarge_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/LightGBMXT_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/LightGBM_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/RandomForestMSE_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/XGBoost_BAG_L1/model.pkl
Loading: ./agModels-wings_lift/models/CatBoost_BAG_L2/model.pkl
Loading: ./agModels-wings_lift/models/ExtraTreesMSE_BAG_L2/model.pkl
Loading: ./agModels-wings_lift/models/LightGBMLarge_BAG_L2/model.pkl
Loading: ./agModels-wings_lift/models/LightGBMXT_BA