In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics
from joblib import dump, load
from Soroosh_utilities import *
import tensorflow_probability as tfp
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


In [2]:
tfd = tfp.distributions
tfpl = tfp.layers
tfk = tf.keras
tfkl = tf.keras.layers


In [3]:
if tf.test.gpu_device_name() != '/device:GPU:0':
    print('WARNING: GPU device not found.')
else:
    print('SUCCESS: Found GPU: {}'.format(tf.test.gpu_device_name()))



## The purpose of the study:

- We intent to predict the critical current, determine the important features, and visual them.

- Previously, it has been done for the real-world data. In this study, not only, we will repeat those experiments, but also we use the synthetically generated data to increase our test samples. To this end, we persui the following framework.

    - 1) Train regressors on only real-world data;
    - 2) Train regressors on only synthetic data;
    - 3) Train regressors on combination of both (50%-50%).


- To this end we use, three regression algorithms: a)RF, b) GBR-LS, c) DNNR.

- Note: For more see the notion page below

https://www.notion.so/SuperOx-936b1b2ce7b14f20bd76578c82305e2b


Note: we studied and tunned the parameters in previous jupyter notebook. Thus we will use them as the default, and given here.

### Initializatin for comparison of methods:

In [4]:
reg_algs = ['RF', 'GBR-Ls', 'DNN-Reg',]  # Regression algorithms ('Bayes_Reg')
reg_results = ['MAE', 'MRAE', 'RMSE', 'R^2-Score',]  # 'Predictions', 'Ground Truth'
df_reg_real = pd.DataFrame(index=reg_algs, columns=reg_results) 
df_reg_real

Unnamed: 0,MAE,MRAE,RMSE,R^2-Score
RF,,,,
GBR-Ls,,,,
DNN-Reg,,,,


In [30]:
reg_algs = ['RF', 'GBR-Ls', 'DNN-Reg', ]  # Regression algorithms
reg_results = ['MAE', 'MRAE', 'RMSE', 'R^2-Score',]  # 'Predictions', 'Ground Truth'
df_reg_synthetic = pd.DataFrame(index=reg_algs, columns=reg_results) 
df_reg_synthetic

Unnamed: 0,MAE,MRAE,RMSE,R^2-Score
RF,,,,
GBR-Ls,,,,
DNN-Reg,,,,


In [29]:
reg_algs = ['RF', 'GBR-Ls', 'DNN-Reg',]  # Regression algorithms
reg_results = ['MAE', 'MRAE', 'RMSE','R^2-Score',]  # 'Predictions', 'Ground Truth'
df_reg_combined = pd.DataFrame(index=reg_algs, columns=reg_results) 
df_reg_combined

Unnamed: 0,MAE,MRAE,RMSE,R^2-Score
RF,,,,
GBR-Ls,,,,
DNN-Reg,,,,


### Load dataset

In [7]:
# pld_complete = catalog.load('pld_complete').dropna().sort_values('pos', ascending=True)
# pld_complete_zscore = pd.read_csv("/home/soroosh/SearchOX/data/pld_complete_zscore.csv", index_col=False)


pld_complete_range = pd.read_csv("/home/soroosh/Desktop/SearchOX/data/pld_complete_range.csv",
                                 index_col=False)

pld_complete_range_synthetic = np.loadtxt("/home/soroosh/Desktop/SearchOX/data/x_r_synthetic.npy")



In [8]:
pld_complete_range.head()

Unnamed: 0,median_Voltage_HSR_V_1025,median_Voltage_HSR_V_1027,median_Voltage_HSR_V_1030,median_Voltage_HSL_V_1025,median_Voltage_HSL_V_1027,median_Voltage_HSL_V_1030,median_Voltage_HF_V_1025,median_Voltage_HF_V_1027,median_Voltage_HF_V_1030,median_Voltage_HC_V_1025,...,std_Sigma_1030,pos,Speed,X FWHM,Y FWHM,R FWHM,Coolness,Coolness_neg,Ic,Ic_norm
0,0.129479,0.177414,0.005142,0.69562,-0.256257,-0.015406,-0.009215,-0.072807,-0.021102,0.057123,...,-0.390667,-0.501695,0.009841,0.039589,0.06747,0.015697,0.053006,-0.086369,496.2,1.767913
1,0.110059,0.340245,0.005142,0.69562,-0.321465,-0.015406,-0.009215,-0.173901,-0.021102,0.050178,...,-0.390667,-0.501652,-0.005266,0.039589,0.06747,0.015697,0.053006,-0.086369,494.7,1.762568
2,0.119769,0.340245,0.005142,0.69562,-0.321465,-0.015406,-0.007363,-0.173901,-0.021102,0.057123,...,-0.390667,-0.501646,-0.007197,0.039589,0.06747,0.015697,0.053006,-0.086369,494.2,1.760787
3,0.110059,0.348139,0.005142,0.69562,-0.322354,-0.015406,-0.009215,-0.21827,-0.021102,0.057123,...,-0.390667,-0.501635,-0.010946,0.039589,0.06747,0.015697,0.053006,-0.086369,495.5,1.765419
4,0.110059,0.264593,0.005142,0.69562,-0.30582,-0.015406,-0.009215,-0.129533,-0.021102,0.057123,...,-0.390667,-0.501619,-0.017147,0.042682,0.069442,0.017956,0.051965,-0.084836,497.9,1.773168


In [9]:
x_r_real = pld_complete_range.loc[:,~pld_complete_range.columns.isin(['Ic', 'Ic_norm', 'pos'])].to_numpy()

y_ic_real = pld_complete_range['Ic'].to_numpy()
y_ic_norm_real = pld_complete_range['Ic_norm'].to_numpy()
pos_real = pld_complete_range['pos'].to_numpy()

x_r_real.shape, y_ic_real.shape, y_ic_norm_real.shape, pos_real.shape

((18561, 376), (18561,), (18561,), (18561,))

In [10]:
pos_idx = pld_complete_range.columns.get_loc("pos")
ic_idx = pld_complete_range.columns.get_loc("Ic")
ic_norm_idx = pld_complete_range.columns.get_loc("Ic_norm")
pos_idx, ic_idx, ic_norm_idx

(370, 377, 378)

In [11]:
x_r_synthetic_1 = pld_complete_range_synthetic[:, : pos_idx]
x_r_synthetic_2 = pld_complete_range_synthetic[:, pos_idx+1:ic_idx]


x_r_synthetic = np.concatenate([x_r_synthetic_1, x_r_synthetic_2], axis=1)
x_r_synthetic_1.shape, x_r_synthetic_2.shape, x_r_synthetic.shape

((18561, 370), (18561, 6), (18561, 376))

In [12]:
y_ic_synthetic = pld_complete_range_synthetic[:, ic_idx]
y_ic_norm_synthetic = pld_complete_range_synthetic[:, ic_norm_idx] 
pos_synthetic = pld_complete_range_synthetic[:, pos_idx]

x_r_synthetic.shape, y_ic_synthetic.shape, y_ic_norm_synthetic.shape, pos_synthetic.shape

((18561, 376), (18561,), (18561,), (18561,))

In [13]:
x_r_combined = np.concatenate([x_r_real, x_r_synthetic], axis=0)
y_ic_combined = np.concatenate([y_ic_real, y_ic_synthetic], axis=0)
y_ic_norm_combined = np.concatenate([y_ic_norm_real, y_ic_norm_synthetic], axis=0)

x_r_combined.shape, y_ic_combined.shape, y_ic_norm_combined.shape

((37122, 376), (37122,), (37122,))

In [14]:
assert not np.any(np.isnan(x_r_real))
assert not x_r_real.shape != x_r_synthetic.shape
assert not x_r_combined.shape[0] != int(2*x_r_synthetic.shape[0])

## splitting the data

### Real data

In [15]:
x_r_train_real, x_r_test_real, \
y_train_real, y_test_real = train_test_split(x_r_real,
                                             y_ic_norm_real,
                                             test_size=0.40,
                                             random_state=43,)

x_r_val_real, x_r_test_real, \
y_val_real, y_test_real = train_test_split(x_r_test_real,
                                           y_test_real,
                                           test_size=0.5,
                                           random_state=43,)


x_r_train_real.shape, x_r_test_real.shape, \
x_r_val_real.shape, y_train_real.shape, \
y_test_real.shape, y_val_real.shape

((11136, 376), (3713, 376), (3712, 376), (11136,), (3713,), (3712,))

### Synthetic data


In [16]:
x_r_train_synthetic, x_r_test_synthetic,\
y_train_synthetic, y_test_synthetic = train_test_split(x_r_synthetic,
                                                       y_ic_norm_synthetic,
                                                       test_size=0.40,
                                                       random_state=43,)

x_r_val_synthetic, x_r_test_synthetic,\
y_val_synthetic, y_test_synthetic = train_test_split(x_r_test_synthetic,
                                                     y_test_synthetic,
                                                     test_size=0.5,
                                                     random_state=43,)


x_r_train_synthetic.shape, x_r_test_synthetic.shape, \
x_r_val_synthetic.shape, y_train_synthetic.shape,  y_test_synthetic.shape, y_val_synthetic.shape

((11136, 376), (3713, 376), (3712, 376), (11136,), (3713,), (3712,))

In [17]:
x_r_train_combined, x_r_test_combined, \
y_train_combined, y_test_combined = train_test_split(x_r_combined,
                                                     y_ic_norm_combined,
                                                     test_size=0.40,
                                                     random_state=43,)

x_r_val_combined, x_r_test_combined, \
y_val_combined, y_test_combined = train_test_split(x_r_test_combined,
                                                   y_test_combined,
                                                   test_size=0.5,
                                                   random_state=43,)


x_r_train_combined.shape, x_r_test_combined.shape, \
x_r_val_combined.shape, y_train_combined.shape, y_test_combined.shape, y_val_combined.shape

((22273, 376), (7425, 376), (7424, 376), (22273,), (7425,), (7424,))

## Training RF

- For more about tuning the parameters see the u01-*.ipynb 

In [18]:
def train_eval_rf(x_train, y_train, x_test, y_test, name):
    
    rf_reg = RandomForestRegressor(n_estimators=100, 
                                   n_jobs = -2, 
                                   criterion='mse', 
                                   min_samples_leaf=1,
                                   verbose=1)
    
    rf_reg.fit(x_train, y_train)
    
    y_preds_rf = rf_reg.predict(x_test)

    filename = "rf_reg-" + name +  "-.joblib"
    dump(rf_reg, "saved_model/"+ filename )
    
    return y_preds_rf


## Train Gradient Boosting Regressor


In [19]:
def train_eval_gbr(x_train, y_train, x_test, y_test, name):
    
    gbr_ls = GradientBoostingRegressor(loss='ls', verbose=1,)
    gbr_ls.fit(x_train, y_train)
    y_preds_gbr_ls = gbr_ls.predict(x_test)
    
    filename = "gbr_ls-" + name +  "-.joblib"
    dump(gbr_ls, "saved_model/"+ filename )
    
    return y_preds_gbr_ls



## DNN-Regressor



In [20]:
class DnnReg(tfk.Model):
    
    def __init__(self, n_units, n_features, name='dnn_reg', **kwargs):
        super(DnnReg, self).__init__(name=name, **kwargs)
        self.n_units = n_units
        self.n_features = n_features
        
        self.input_layer = tfkl.InputLayer(input_shape=self.n_features)
        self.cast_layer = tfkl.Lambda(lambda x: tf.cast(x, tf.float32))
        self.dense_1 = tfkl.Dense(units=int(.5*self.n_units), activation=tf.nn.leaky_relu)
        self.dense_2 = tfkl.Dense(units=self.n_units, activation=tf.nn.leaky_relu)
        self.dense_3 = tfkl.Dense(units=2*self.n_units, activation=tf.nn.leaky_relu,)
        self.regressor = tfkl.Dense(units=1)
        
    def call(self, inputs):
        x = self.input_layer(inputs)
        x = self.cast_layer(x)
        x = self.dense_1(x)
        x = self.dense_2(x)
        x = self.dense_3(x)
        regression = self.regressor(x)
        return regression
        

In [21]:
n_epochs = 1000  # previously, it was 500
learning_rate = 1e-5  # [1e-2, 1e-3, 1e-5] 
batch_size = 64  # [32, 64, 256]
n_units = 128

input_shape = (x_r_real.shape[1])  
n_units = 128


In [22]:
def train_eval_dnn_reg(dnn_reg, x_train, y_train, x_val, y_val, x_test, y_test, name):

    callback = tfk.callbacks.EarlyStopping(monitor='loss', patience=5)  # for early-stop

    dnn_reg.compile(optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
                    loss='mse',
                   )

    history_dnn_ref = dnn_reg.fit(x=x_train, y=y_train,
               epochs=n_epochs, batch_size=batch_size,
               validation_data=(x_val, y_val),
    #            callbacks=[callback],
               )


#     plot_loss(history=history_dnn_ref, name='DNN-Reg('+ 
#               str(batch_size)+ ", " + str(learning_rate)+ ')')

    # Saving the trained weights for future applications
    # !mkdir -p saved_model
    filename = "dnn-reg-" + name +  "-.joblib"
    dnn_reg.save_weights('saved_model/' + filename + '.h5')

    y_preds_dnn_reg = dnn_reg.predict(x_test)

    print("y_preds_dnn_reg:", y_preds_dnn_reg.shape)
    
    return y_preds_dnn_reg

# Real-only data 

In [23]:
%%time

dnn_reg_real = DnnReg(n_units=n_units, n_features=input_shape)

y_preds_dnn_reg_real = train_eval_dnn_reg(dnn_reg=dnn_reg_real,
                                          x_train=x_r_train_real,
                                          y_train=y_train_real, 
                                          x_val=x_r_val_real, 
                                          y_val=y_val_real,
                                          x_test=x_r_test_real,
                                          y_test=y_test_real,
                                          name='real',
                                         )



Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [24]:

df_reg_real = add_to_regression_comparison(df_reg_real,
                                           y_preds=y_preds_dnn_reg_real,
                                           y_trues=y_test_real, 
                                           name='DNN-Reg',
                                           data_name='real')
df_reg_real



Unnamed: 0,MAE,MRAE,RMSE,R^2-Score
RF,,,,
GBR-Ls,,,,
DNN-Reg,0.0372209,0.0217353,0.0474404,0.570674


In [25]:
%%time

y_preds_rf_real = train_eval_rf(x_train=x_r_train_real,
                                y_train=y_train_real, 
                                x_test=x_r_test_real,
                                y_test=y_test_real,
                                name='real'
                               )


df_reg_real = add_to_regression_comparison(df_reg_real,
                                           y_preds=y_preds_rf_real,
                                           y_trues=y_test_real, 
                                           name='RF',
                                           data_name='real'
                                          )
df_reg_real



[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=-2)]: Done  20 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   27.4s finished
[Parallel(n_jobs=15)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=15)]: Done 100 out of 100 | elapsed:    0.0s finished


CPU times: user 5min 42s, sys: 4.02 ms, total: 5min 42s
Wall time: 27.9 s


Unnamed: 0,MAE,MRAE,RMSE,R^2-Score
RF,0.010694,0.00628533,0.017858,0.747494
GBR-Ls,,,,
DNN-Reg,0.0372209,0.0217353,0.0474404,0.570674


In [26]:
%%time
y_preds_gbr_real = train_eval_gbr(x_train=x_r_train_real,
                              y_train=y_train_real, 
                              x_test=x_r_test_real,
                              y_test=y_test_real,
                              name='real',
                             )


df_reg_real = add_to_regression_comparison(df_reg_real,
                                           y_preds=y_preds_gbr_real,
                                           y_trues=y_test_real, 
                                           name='GBR-Ls',
                                           data_name='real'
                                          )
df_reg_real



      Iter       Train Loss   Remaining Time 
         1           0.0011            1.10m
         2           0.0010            1.08m
         3           0.0009            1.07m
         4           0.0008            1.06m
         5           0.0007            1.05m
         6           0.0007            1.04m
         7           0.0007            1.03m
         8           0.0006            1.02m
         9           0.0006            1.01m
        10           0.0006           59.96s
        20           0.0005           53.38s
        30           0.0004           47.06s
        40           0.0004           40.82s
        50           0.0004           34.21s
        60           0.0004           27.46s
        70           0.0004           20.66s
        80           0.0003           13.78s
        90           0.0003            6.90s
       100           0.0003            0.00s
CPU times: user 1min 9s, sys: 0 ns, total: 1min 9s
Wall time: 1min 9s


Unnamed: 0,MAE,MRAE,RMSE,R^2-Score
RF,0.010694,0.00628533,0.017858,0.747494
GBR-Ls,0.0141234,0.00829238,0.0211861,0.644608
DNN-Reg,0.0372209,0.0217353,0.0474404,0.570674


#### Conclusion over real-only data:

- All three algorithms obtained acceptable results w.r.t MAE, MRAE, RMSE.

- Although all of these three also obtain acceptable r^2 score, however, RF is the winner

# Synthetic-only data

In [27]:
%%time

dnn_reg_synthetic = DnnReg(n_units=n_units, n_features=input_shape)

y_preds_dnn_reg_synthetic = train_eval_dnn_reg(dnn_reg=dnn_reg_synthetic,
                                          x_train=x_r_train_synthetic,
                                          y_train=y_train_synthetic, 
                                          x_val=x_r_val_synthetic, 
                                          y_val=y_val_synthetic,
                                          x_test=x_r_test_synthetic,
                                          y_test=y_test_synthetic,
                                          name='synthetic',
                                         )



Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [31]:

df_reg_synthetic = add_to_regression_comparison(df_reg_synthetic,
                                           y_preds=y_preds_dnn_reg_synthetic,
                                           y_trues=y_test_synthetic, 
                                           name='DNN-Reg',
                                           data_name='synthetic')
df_reg_synthetic



Unnamed: 0,MAE,MRAE,RMSE,R^2-Score
RF,,,,
GBR-Ls,,,,
DNN-Reg,0.0459249,0.0269077,0.0587384,0.979433


In [32]:
%%time

y_preds_rf_synthetic = train_eval_rf(x_train=x_r_train_synthetic,
                                y_train=y_train_synthetic, 
                                x_test=x_r_test_synthetic,
                                y_test=y_test_synthetic,
                                name='synthetic'
                               )


df_reg_synthetic = add_to_regression_comparison(df_reg_synthetic,
                                           y_preds=y_preds_rf_synthetic,
                                           y_trues=y_test_synthetic, 
                                           name='RF',
                                           data_name='synthetic'
                                          )
df_reg_synthetic



[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=-2)]: Done  20 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   30.6s finished
[Parallel(n_jobs=15)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=15)]: Done 100 out of 100 | elapsed:    0.0s finished


CPU times: user 6min 16s, sys: 0 ns, total: 6min 16s
Wall time: 30.8 s


Unnamed: 0,MAE,MRAE,RMSE,R^2-Score
RF,0.0152971,0.00900766,0.0229524,0.697224
GBR-Ls,,,,
DNN-Reg,0.0459249,0.0269077,0.0587384,0.979433


In [33]:
%%time
y_preds_gbr_synthetic = train_eval_gbr(x_train=x_r_train_synthetic,
                              y_train=y_train_synthetic, 
                              x_test=x_r_test_synthetic,
                              y_test=y_test_synthetic,
                              name='synthetic',
                             )


df_reg_synthetic = add_to_regression_comparison(df_reg_synthetic,
                                           y_preds=y_preds_gbr_synthetic,
                                           y_trues=y_test_synthetic, 
                                           name='GBR-Ls',
                                           data_name='synthetic'
                                          )
df_reg_synthetic



      Iter       Train Loss   Remaining Time 
         1           0.0015            1.49m
         2           0.0014            1.48m
         3           0.0013            1.46m
         4           0.0012            1.45m
         5           0.0011            1.43m
         6           0.0011            1.42m
         7           0.0010            1.40m
         8           0.0010            1.39m
         9           0.0010            1.37m
        10           0.0009            1.36m
        20           0.0008            1.21m
        30           0.0007            1.06m
        40           0.0006           54.76s
        50           0.0006           45.73s
        60           0.0006           36.71s
        70           0.0005           27.57s
        80           0.0005           18.40s
        90           0.0005            9.22s
       100           0.0005            0.00s
CPU times: user 1min 32s, sys: 0 ns, total: 1min 32s
Wall time: 1min 32s


Unnamed: 0,MAE,MRAE,RMSE,R^2-Score
RF,0.0152971,0.00900766,0.0229524,0.697224
GBR-Ls,0.017798,0.0104725,0.025361,0.630344
DNN-Reg,0.0459249,0.0269077,0.0587384,0.979433


#### Conclusion over synthetic-only data:

- All three algorithms obtained acceptable results w.r.t MAE, MRAE, RMSE.

- Although all of these three also obtain acceptable r^2 score, however, DNN-Reg is the winner

# Combined data

In [34]:
%%time

dnn_reg_combined = DnnReg(n_units=n_units, n_features=input_shape)

y_preds_dnn_reg_combined = train_eval_dnn_reg(dnn_reg=dnn_reg_combined,
                                          x_train=x_r_train_combined,
                                          y_train=y_train_combined, 
                                          x_val=x_r_val_combined, 
                                          y_val=y_val_combined,
                                          x_test=x_r_test_combined,
                                          y_test=y_test_combined,
                                          name='combined',
                                         )



Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [35]:

df_reg_combined = add_to_regression_comparison(df_reg_combined,
                                           y_preds=y_preds_dnn_reg_combined,
                                           y_trues=y_test_combined, 
                                           name='DNN-Reg',
                                           data_name='combined')
df_reg_combined



Unnamed: 0,MAE,MRAE,RMSE,R^2-Score
RF,,,,
GBR-Ls,,,,
DNN-Reg,0.040592,0.0237572,0.0517857,0.703024


In [36]:
%%time

y_preds_rf_combined = train_eval_rf(x_train=x_r_train_combined,
                                y_train=y_train_combined, 
                                x_test=x_r_test_combined,
                                y_test=y_test_combined,
                                name='combined'
                               )


df_reg_combined = add_to_regression_comparison(df_reg_combined,
                                           y_preds=y_preds_rf_combined,
                                           y_trues=y_test_combined, 
                                           name='RF',
                                           data_name='combined'
                                          )
df_reg_combined



[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=-2)]: Done  20 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:  1.2min finished
[Parallel(n_jobs=15)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=15)]: Done 100 out of 100 | elapsed:    0.0s finished


CPU times: user 14min 50s, sys: 0 ns, total: 14min 50s
Wall time: 1min 11s


Unnamed: 0,MAE,MRAE,RMSE,R^2-Score
RF,0.0133423,0.00782976,0.0202903,0.715944
GBR-Ls,,,,
DNN-Reg,0.040592,0.0237572,0.0517857,0.703024


In [37]:
%%time
y_preds_gbr_combined = train_eval_gbr(x_train=x_r_train_combined,
                              y_train=y_train_combined, 
                              x_test=x_r_test_combined,
                              y_test=y_test_combined,
                              name='combined',
                             )


df_reg_combined = add_to_regression_comparison(df_reg_combined,
                                           y_preds=y_preds_gbr_combined,
                                           y_trues=y_test_combined, 
                                           name='GBR-Ls',
                                           data_name='combined'
                                          )
df_reg_combined



      Iter       Train Loss   Remaining Time 
         1           0.0013            2.84m
         2           0.0012            2.83m
         3           0.0011            2.81m
         4           0.0011            2.80m
         5           0.0010            2.78m
         6           0.0010            2.74m
         7           0.0009            2.71m
         8           0.0009            2.68m
         9           0.0009            2.65m
        10           0.0008            2.62m
        20           0.0007            2.32m
        30           0.0007            2.03m
        40           0.0006            1.75m
        50           0.0006            1.46m
        60           0.0006            1.17m
        70           0.0006           52.87s
        80           0.0005           35.31s
        90           0.0005           17.70s
       100           0.0005            0.00s
CPU times: user 2min 57s, sys: 0 ns, total: 2min 57s
Wall time: 2min 57s


Unnamed: 0,MAE,MRAE,RMSE,R^2-Score
RF,0.0133423,0.00782976,0.0202903,0.715944
GBR-Ls,0.0169401,0.00993664,0.0242529,0.594159
DNN-Reg,0.040592,0.0237572,0.0517857,0.703024


#### Conclusion over synthetic-only data:

- All three algorithms obtained acceptable results w.r.t MAE, MRAE, RMSE.

- Although all of these three also obtain acceptable r^2 score. Although RF is the winner, but DNN-Reg is a closer follower here.

## Overall Conclusion:


This study trained three regressors over a) real-data; b) synthetically generated data; c) their combination.

We used four metrics to evaluate and compare the obtained results.

- W.r.t MAE, MRAE, RMSE, all three algorithms, obtained excellent results over all three types of data sets.

- W.r.t R^2 score, DNN-Reg obtained outstanding results over synthetic only data. It is also a close follower of the combined data set winner, which is RF. 

- RF wins the combined and real-only data, with relatively acceptable results.



Future work: 

- I am going to re-train DNN-Reg with more epochs and smaller batch-size to improve its performance (hopefully). 

