In [1]:
#surrogate models
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split

data_name = '20000'

data_file = f'../{data_name}_vectors_drags_lifts.csv'
df = TabularDataset(data_file)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=777)

#exclue the first two columns of train data
train_data = train_df.drop(columns=['i', 'name'])
train_data.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,...,dim_19993,dim_19994,dim_19995,dim_19996,dim_19997,dim_19998,dim_19999,dim_20000,drag,lift
406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.395,0.425
54,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.645,0.885
241,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.446,0.756
952,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.384,0.177
337,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.381,0.074


In [2]:
label = 'lift'
print("Summary of class variable: \n", train_data[label].describe())

Summary of class variable: 
 count    837.000000
mean       0.378389
std        0.238650
min        0.002000
25%        0.184000
50%        0.344000
75%        0.540000
max        0.988000
Name: lift, dtype: float64


In [3]:
import os
save_path = f'./agModels-{data_name}_{label}'  # specifies folder to store trained models
if not os.path.exists(save_path):
    os.makedirs(save_path)

bag_folds = 5 #suggestion range [5, 10]
bag_sets = 3 #suggestion range [1, 20]
stack_levels = 3 #suggestion range [0, 3]
metric = 'root_mean_squared_error' #Regression:mean_absolute_error, mean_squared_error,root_mean_squared_error (default), r2
predictor = TabularPredictor(label=label, path=save_path, eval_metric=metric).fit(train_data, 
                                                                                  presets='best_quality', 
                                                                                  auto_stack="True", 
                                                                                  num_bag_folds=bag_folds, 
                                                                                  num_bag_sets=bag_sets,
                                                                                  num_stack_levels=stack_levels,
                                                                                  verbosity=4)

Presets specified: ['best_quality']
User Specified kwargs:
{'auto_stack': 'True',
 'num_bag_folds': 5,
 'num_bag_sets': 3,
 'num_stack_levels': 3,
 'verbosity': 4}
Full kwargs:
{'_feature_generator_kwargs': None,
 '_save_bag_folds': None,
 'ag_args': None,
 'ag_args_ensemble': None,
 'ag_args_fit': None,
 'auto_stack': 'True',
 'calibrate': 'auto',
 'excluded_model_types': None,
 'feature_generator': 'auto',
 'feature_prune_kwargs': None,
 'holdout_frac': None,
 'hyperparameter_tune_kwargs': None,
 'keep_only_best': False,
 'name_suffix': None,
 'num_bag_folds': 5,
 'num_bag_sets': 3,
 'num_stack_levels': 3,
 'pseudo_data': None,
 'refit_full': False,
 'save_space': False,
 'set_best_to_refit_full': False,
 'unlabeled_data': None,
 'use_bag_holdout': False,
 'verbosity': 4}
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=5, num_bag_sets=3
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of

In [4]:
test_data = test_df.drop(columns=['i', 'name'])
# val_data.head()
y_val = test_data[label]
test_data_nolab = test_data.drop(columns=[label])  # delete label column to prove we're not cheating
test_data_nolab.head()

Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,...,dim_19992,dim_19993,dim_19994,dim_19995,dim_19996,dim_19997,dim_19998,dim_19999,dim_20000,drag
1046,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.527
657,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.363
127,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316
951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.413
134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.495


In [5]:
# %%capture log_output
# %config InlineBackend.figure_format = 'retina'
# %config Application.log_level = 'DEBUG'
# %config IPCompleter.greedy = True

predictor = TabularPredictor.load(save_path)  # unnecessary, just demonstrates how to load previously-trained predictor from file
y_pred = predictor.predict(test_data_nolab)
for item in y_pred:
    print(item)
print("Predictions:  \n", y_pred)
perf = predictor.evaluate_predictions(y_true=y_val, y_pred=y_pred, auxiliary_metrics=True)
print(perf)

results = predictor.fit_summary(show_plot=True)
print(results)
print(predictor.leaderboard(test_data, silent=True))

# with open('./output_all_parts.log', 'w') as f:
#     f.write(log_output.stdout)

Loading: ./agModels-20000_lift/predictor.pkl
Loading: ./agModels-20000_lift/learner.pkl
Loading: ./agModels-20000_lift/models/trainer.pkl
Loading: ./agModels-20000_lift/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/LightGBM_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/WeightedEnsemble_L2/model.pkl
Evaluation: root_mean_squared_error on test data: -0.20034057315282744
	Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
    "root_mean_squared_error": -0.20034057315282744,
    "mean_squared_error": -0.0401363452512034,
    "mean_absolute_error": -0.15897293161834988,
    "r2": 0.32327910918820635,
    "pearsonr": 0.5709705345130752,
    "median_absolute_error": -0.13883552259206772
}
Loading: ./agModels-20000_lift/models/KNeighbors

0.44742727279663086
0.39136314392089844
0.2183297723531723
0.5614920854568481
0.21204295754432678
0.40233874320983887
0.3584778308868408
0.6853506565093994
0.41112905740737915
0.2288622260093689
0.36519336700439453
0.266526460647583
0.31221476197242737
0.4915744364261627
0.3933456540107727
0.39834171533584595
0.3513725996017456
0.30380406975746155
0.7916274070739746
0.614070475101471
0.1960880011320114
0.39819803833961487
0.2985221743583679
0.2968655228614807
0.11445683240890503
0.5341439843177795
0.4975288510322571
0.3832324743270874
0.34939324855804443
0.4630267024040222
0.23116999864578247
0.40025773644447327
0.5384572148323059
0.24421490728855133
0.22453248500823975
0.4665633738040924
0.43793708086013794
0.18590471148490906
0.2647109627723694
0.2729010283946991
0.21325284242630005
0.5454201698303223
0.263973206281662
0.38129502534866333
0.5732495784759521
0.6473870277404785
0.4651038348674774
0.25143736600875854
0.5124092102050781
0.6243715882301331
0.5872657895088196
0.30158805847

Loading: ./agModels-20000_lift/models/NeuralNetFastAI_BAG_L2/model.pkl
Loading: ./agModels-20000_lift/models/XGBoost_BAG_L2/model.pkl
Loading: ./agModels-20000_lift/models/NeuralNetTorch_BAG_L2/model.pkl
Loading: ./agModels-20000_lift/models/LightGBMLarge_BAG_L2/model.pkl
Loading: ./agModels-20000_lift/models/WeightedEnsemble_L3/model.pkl
Loading: ./agModels-20000_lift/models/LightGBMXT_BAG_L3/model.pkl
Loading: ./agModels-20000_lift/models/LightGBM_BAG_L3/model.pkl
Loading: ./agModels-20000_lift/models/RandomForestMSE_BAG_L3/model.pkl
Loading: ./agModels-20000_lift/models/CatBoost_BAG_L3/model.pkl
Loading: ./agModels-20000_lift/models/ExtraTreesMSE_BAG_L3/model.pkl
Loading: ./agModels-20000_lift/models/NeuralNetFastAI_BAG_L3/model.pkl
Loading: ./agModels-20000_lift/models/XGBoost_BAG_L3/model.pkl
Loading: ./agModels-20000_lift/models/NeuralNetTorch_BAG_L3/model.pkl
Loading: ./agModels-20000_lift/models/LightGBMLarge_BAG_L3/model.pkl
Loading: ./agModels-20000_lift/models/WeightedEnsemb

*** Summary of fit() ***
Estimated performance of each model:
                     model  score_val  pred_time_val      fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L2  -0.181871      40.829423   9269.639922                0.002664           0.493320            2       True         12
1      WeightedEnsemble_L3  -0.184241     107.537930  13200.058911                0.002346           0.442384            3       True         22
2          LightGBM_BAG_L1  -0.184631       2.273810    144.060642                2.273810         144.060642            1       True          4
3          CatBoost_BAG_L1  -0.184783      31.618476   8848.810605               31.618476        8848.810605            1       True          6
4          CatBoost_BAG_L2  -0.185754      92.531203  11684.248170               25.814616         698.133700            2       True         16
5      WeightedEnsemble_L4  -0.186426     166.917218  14712.526541  

Loading: ./agModels-20000_lift/models/KNeighborsUnif_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/KNeighborsDist_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/LightGBMXT_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/LightGBM_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/RandomForestMSE_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/ExtraTreesMSE_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/XGBoost_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/LightGBMLarge_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/WeightedEnsemble_L2/model.pkl
Loading: ./agModels-20000_lift/models/LightGBMXT_BAG_L2/model.pkl
Loading: ./agModels-20000_lift/models/LightGBM_BAG_L2/model.pkl
Loading: ./agModels-20000_lift/models/RandomForestMSE_B

                     model  score_test  score_val  pred_time_test  \
0          LightGBM_BAG_L4   -0.195664  -0.189755      101.067050   
1          CatBoost_BAG_L2   -0.197697  -0.185754       51.843962   
2          LightGBM_BAG_L3   -0.197978  -0.190819       69.438056   
3      WeightedEnsemble_L5   -0.198087  -0.187602      108.675401   
4      WeightedEnsemble_L3   -0.198690  -0.184241       64.549079   
5          CatBoost_BAG_L4   -0.198903  -0.190514      115.457536   
6        LightGBMXT_BAG_L4   -0.199087  -0.190534      101.484636   
7          LightGBM_BAG_L2   -0.199094  -0.186700       37.348567   
8          CatBoost_BAG_L3   -0.199366  -0.189096       83.848875   
9   RandomForestMSE_BAG_L2   -0.199373  -0.188826       34.514188   
10          XGBoost_BAG_L2   -0.199751  -0.188255       36.676431   
11       LightGBMXT_BAG_L2   -0.199978  -0.186915       36.723819   
12       LightGBMXT_BAG_L3   -0.200244  -0.190199       70.210652   
13     WeightedEnsemble_L2   -0.20

In [6]:
print("AutoGluon infers problem type is: ", predictor.problem_type)
print("AutoGluon identified the following types of features:")
print(predictor.feature_metadata)

AutoGluon infers problem type is:  regression
AutoGluon identified the following types of features:
('float', [])     :     1 | ['drag']
('int', ['bool']) : 19054 | ['dim_1', 'dim_2', 'dim_3', 'dim_4', 'dim_5', ...]


In [7]:
train_data_pred = predictor.predict(train_data)
test_data_pred = predictor.predict(test_data)

import numpy as np
#save np array y_train_hat to a csv file
np.savetxt(f'./{data_name}_vectors_y_test_hat_{label}.csv', test_data_pred, delimiter=',')
np.savetxt(f'./{data_name}_vectors_y_train_hat_{label}.csv', train_data_pred, delimiter=',')

Loading: ./agModels-20000_lift/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/LightGBM_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/WeightedEnsemble_L2/model.pkl
Loading: ./agModels-20000_lift/models/CatBoost_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/LightGBM_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/NeuralNetTorch_BAG_L1/model.pkl
Loading: ./agModels-20000_lift/models/WeightedEnsemble_L2/model.pkl
