In [1]:
#surrogate models
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split

data_name = '5040'

data_file = f'../{data_name}_vectors_drags_lifts.csv'
df = TabularDataset(data_file)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=777)

#exclue the first two columns of train data
train_data = train_df.drop(columns=['i', 'name'])
train_data.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,...,dim_5033,dim_5034,dim_5035,dim_5036,dim_5037,dim_5038,dim_5039,dim_5040,drag,lift
406,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.395,0.425
54,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.645,0.885
241,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.446,0.756
952,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.384,0.177
337,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.381,0.074


In [2]:
label = 'lift'
print("Summary of class variable: \n", train_data[label].describe())

Summary of class variable: 
 count    837.000000
mean       0.378389
std        0.238650
min        0.002000
25%        0.184000
50%        0.344000
75%        0.540000
max        0.988000
Name: lift, dtype: float64


In [3]:
import os
save_path = f'./agModels-{data_name}_{label}'  # specifies folder to store trained models
if not os.path.exists(save_path):
    os.makedirs(save_path)

bag_folds = 5 #suggestion range [5, 10]
bag_sets = 3 #suggestion range [1, 20]
stack_levels = 3 #suggestion range [0, 3]
metric = 'root_mean_squared_error' #Regression:mean_absolute_error, mean_squared_error,root_mean_squared_error (default), r2
predictor = TabularPredictor(label=label, path=save_path, eval_metric=metric).fit(train_data, 
                                                                                  presets='best_quality', 
                                                                                  auto_stack="True", 
                                                                                  num_bag_folds=bag_folds, 
                                                                                  num_bag_sets=bag_sets,
                                                                                  num_stack_levels=stack_levels,
                                                                                  verbosity=4)

Presets specified: ['best_quality']
User Specified kwargs:
{'auto_stack': 'True',
 'num_bag_folds': 5,
 'num_bag_sets': 3,
 'num_stack_levels': 3,
 'verbosity': 4}
Full kwargs:
{'_feature_generator_kwargs': None,
 '_save_bag_folds': None,
 'ag_args': None,
 'ag_args_ensemble': None,
 'ag_args_fit': None,
 'auto_stack': 'True',
 'calibrate': 'auto',
 'excluded_model_types': None,
 'feature_generator': 'auto',
 'feature_prune_kwargs': None,
 'holdout_frac': None,
 'hyperparameter_tune_kwargs': None,
 'keep_only_best': False,
 'name_suffix': None,
 'num_bag_folds': 5,
 'num_bag_sets': 3,
 'num_stack_levels': 3,
 'pseudo_data': None,
 'refit_full': False,
 'save_space': False,
 'set_best_to_refit_full': False,
 'unlabeled_data': None,
 'use_bag_holdout': False,
 'verbosity': 4}
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=5, num_bag_sets=3
Saving ./agModels-5040_lift/learner.pkl
Saving ./agModels-5040_lift/predictor.pkl
Beginning AutoGluon training ...
AutoGluon

In [4]:
test_data = test_df.drop(columns=['i', 'name'])
# val_data.head()
y_val = test_data[label]
test_data_nolab = test_data.drop(columns=[label])  # delete label column to prove we're not cheating
test_data_nolab.head()

Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,...,dim_5032,dim_5033,dim_5034,dim_5035,dim_5036,dim_5037,dim_5038,dim_5039,dim_5040,drag
1046,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.527
657,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.363
127,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316
951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.413
134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.495


In [5]:
# %%capture log_output
# %config InlineBackend.figure_format = 'retina'
# %config Application.log_level = 'DEBUG'
# %config IPCompleter.greedy = True

predictor = TabularPredictor.load(save_path)  # unnecessary, just demonstrates how to load previously-trained predictor from file
y_pred = predictor.predict(test_data_nolab)
for item in y_pred:
    print(item)
print("Predictions:  \n", y_pred)
perf = predictor.evaluate_predictions(y_true=y_val, y_pred=y_pred, auxiliary_metrics=True)
print(perf)

results = predictor.fit_summary(show_plot=True)
print(results)
print(predictor.leaderboard(test_data, silent=True))

# with open('./output_all_parts.log', 'w') as f:
#     f.write(log_output.stdout)

Loading: ./agModels-5040_lift/predictor.pkl
Loading: ./agModels-5040_lift/learner.pkl
Loading: ./agModels-5040_lift/models/trainer.pkl
Loading: ./agModels-5040_lift/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/WeightedEnsemble_L2/model.pkl
Evaluation: root_mean_squared_error on test data: -0.19836444597725816
	Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
    "root_mean_squared_error": -0.19836444597725816,
    "mean_squared_error": -0.039348453427864576,
    "mean_absolute_error": -0.15665775403522308,
    "r2": 0.3365633993550394,
    "pearsonr": 0.5822806143495615,
    "median_absolute_error": -0.14193891912698747
}
Loading: ./agModels-5040_lift/models/KNeighborsUnif_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/KNeighborsDi

0.434601366519928
0.38144391775131226
0.2341814488172531
0.5718986392021179
0.24345624446868896
0.38411757349967957
0.3263978362083435
0.6769461631774902
0.36208415031433105
0.20437046885490417
0.32453641295433044
0.32044512033462524
0.34254518151283264
0.41828224062919617
0.360204815864563
0.4078839421272278
0.3431909680366516
0.29472625255584717
0.8118496537208557
0.6269351243972778
0.20629572868347168
0.3213784694671631
0.2298872172832489
0.2883269190788269
0.10987351834774017
0.5589358806610107
0.4724496603012085
0.3939610421657562
0.27881497144699097
0.5845784544944763
0.23175981640815735
0.4143196940422058
0.5065229535102844
0.26694586873054504
0.23399005830287933
0.45550939440727234
0.43043380975723267
0.1908467561006546
0.3253483176231384
0.27649712562561035
0.24184317886829376
0.5112299919128418
0.27731043100357056
0.3605896532535553
0.5932923555374146
0.6432680487632751
0.4511674642562866
0.23544904589653015
0.6026232838630676
0.6209136843681335
0.6326862573623657
0.265559405

Loading: ./agModels-5040_lift/models/KNeighborsUnif_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/KNeighborsDist_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/LightGBMXT_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/LightGBM_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/RandomForestMSE_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/ExtraTreesMSE_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/XGBoost_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/LightGBMLarge_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/WeightedEnsemble_L2/model.pkl
Loading: ./agModels-5040_lift/models/LightGBMXT_BAG_L2/model.pkl
Loading: ./agModels-5040_lift/models/LightGBM_BAG_L2/model.pkl
Loading: ./agModels-5040_lift/models/RandomForestMSE_BAG_L2/model.pkl

                     model  score_test  score_val  pred_time_test  \
0          LightGBM_BAG_L2   -0.196974  -0.187252       26.465887   
1     LightGBMLarge_BAG_L2   -0.198303  -0.188716       25.506973   
2      WeightedEnsemble_L2   -0.198364  -0.181852       13.793705   
3    NeuralNetTorch_BAG_L1   -0.198842  -0.189665        2.903978   
4      WeightedEnsemble_L3   -0.198966  -0.184022       32.293105   
5          CatBoost_BAG_L3   -0.199285  -0.187572       42.473668   
6          LightGBM_BAG_L3   -0.199462  -0.188678       38.456475   
7          CatBoost_BAG_L2   -0.199546  -0.185326       30.248079   
8        LightGBMXT_BAG_L2   -0.200064  -0.187252       26.723383   
9          LightGBM_BAG_L4   -0.200077  -0.191384       49.257061   
10          XGBoost_BAG_L2   -0.200130  -0.189151       25.501092   
11     WeightedEnsemble_L4   -0.200284  -0.186720       45.880813   
12         LightGBM_BAG_L1   -0.200496  -0.187029        1.828466   
13         CatBoost_BAG_L4   -0.20

In [6]:
print("AutoGluon infers problem type is: ", predictor.problem_type)
print("AutoGluon identified the following types of features:")
print(predictor.feature_metadata)

AutoGluon infers problem type is:  regression
AutoGluon identified the following types of features:
('float', [])     :    1 | ['drag']
('int', ['bool']) : 4807 | ['dim_1', 'dim_2', 'dim_3', 'dim_4', 'dim_5', ...]


In [7]:
train_data_pred = predictor.predict(train_data)
test_data_pred = predictor.predict(test_data)

import numpy as np
#save np array y_train_hat to a csv file
np.savetxt(f'./{data_name}_vectors_y_test_hat_{label}.csv', test_data_pred, delimiter=',')
np.savetxt(f'./{data_name}_vectors_y_train_hat_{label}.csv', train_data_pred, delimiter=',')

Loading: ./agModels-5040_lift/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/WeightedEnsemble_L2/model.pkl
Loading: ./agModels-5040_lift/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-5040_lift/models/WeightedEnsemble_L2/model.pkl
