### Libraries and Packages to be imported

In [1]:
import sys
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
import numpy as np

### Calculate MAPE (Mean absolute percentage error) value


The mean absolute percentage error (MAPE) is a statistical measure of how accurate a forecast system is.

It measures this accuracy as a percentage and can be calculated as the average absolute percent error for
each time period minus actual values divided by actual values. 


In [2]:
def MAPE(test, predict):
    mape = 0

    for i, j in zip(test['int_rate'].as_data_frame().values, predict.as_data_frame().values):
        mape += np.abs((i-j)/i)

    mape = (mape/test.shape[0])*100

    return mape


    

### Start H2O on same machine as running python process

In [3]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.111-b14, mixed mode)
  Starting server from C:\Users\Siddhi\Anaconda3\envs\Non-Tensor_Flow\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Siddhi\AppData\Local\Temp\tmpmrbjma79
  JVM stdout: C:\Users\Siddhi\AppData\Local\Temp\tmpmrbjma79\h2o_Siddhi_started_from_python.out
  JVM stderr: C:\Users\Siddhi\AppData\Local\Temp\tmpmrbjma79\h2o_Siddhi_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.1
H2O_cluster_version_age:,8 days
H2O_cluster_name:,H2O_from_python_Siddhi_a8jol8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.507 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


### Importing Data after mice file in two different data frames

In [4]:
df_linear = h2o.import_file('data_after_mice.csv')
df_linear.head()

Parse progress: |█████████████████████████████████████████████████████████| 100%


C1,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,acc_now_delinq,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,sub_grade_A2,sub_grade_A3,sub_grade_A4,sub_grade_A5,sub_grade_B1,sub_grade_B2,sub_grade_B3,sub_grade_B4,sub_grade_B5,sub_grade_C1,sub_grade_C2,sub_grade_C3,sub_grade_C4,sub_grade_C5,sub_grade_D1,sub_grade_D2,sub_grade_D3,sub_grade_D4,sub_grade_D5,sub_grade_E1,sub_grade_E2,sub_grade_E3,sub_grade_E4,sub_grade_E5,sub_grade_F1,sub_grade_F2,sub_grade_F3,sub_grade_F4,sub_grade_F5,sub_grade_G1,sub_grade_G2,sub_grade_G3,sub_grade_G4,sub_grade_G5,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,loan_status_Current,loan_status_Default,loan_status_Fully Paid,loan_status_In Grace Period,loan_status_Late (16-30 days),loan_status_Late (31-120 days),issue_month,issue_year
0,7000,7000,7000,0,6.62,214.93,0,48000,16.0,0,0,14,0,22301,80.9,25,0.0,0.0,7736.09,7736.09,7000.0,736.09,0.0,0,0,223.8,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,10,2011
1,7200,7200,7200,0,12.42,240.59,9,35000,6.14,0,0,6,0,6250,89.3,13,0.0,0.0,8680.72,8680.72,7200.0,1465.73,14.9906,0,0,8.82,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,10,2011
2,22000,22000,22000,0,14.65,758.88,1,192000,1.82,0,1,8,0,20795,65.8,10,0.0,0.0,23407.6,23407.6,22000.0,1407.62,0.0,0,0,859.49,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,10,2011
3,18750,18750,18750,0,7.9,586.7,3,75000,13.94,0,1,11,0,31463,49.6,24,0.0,0.0,21120.1,21120.1,18750.0,2370.14,0.0,0,0,591.04,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,11,2011
4,12500,12500,12500,0,7.9,391.13,3,32500,24.15,0,0,6,0,7458,84.7,10,0.0,0.0,14080.0,14080.0,12500.0,1580.04,0.0,0,0,394.54,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,10,2011
5,11000,11000,11000,0,7.51,342.22,7,65000,19.88,0,1,16,0,11968,21.0,35,0.0,0.0,12318.6,12318.6,11000.0,1318.62,0.0,0,0,350.37,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,10,2011
6,7000,7000,7000,0,7.9,219.04,2,36000,3.7,0,0,4,0,7850,47.3,13,0.0,0.0,7884.32,7884.32,7000.0,884.32,0.0,0,0,223.59,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,10,2011
7,31500,31500,31500,1,15.96,765.35,0,132000,6.01,0,6,7,0,33690,41.7,31,0.0,0.0,34329.6,34329.6,31500.0,2829.57,0.0,0,0,29750.9,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,10,2011
8,13600,13600,13575,1,17.58,342.26,0,81060,8.1,1,1,10,0,6720,42.5,15,3188.24,3182.31,17060.3,17029.1,10411.8,6648.59,0.0,0,0,342.26,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,10,2011
9,14400,14400,14400,1,17.58,362.39,0,89000,7.48,4,0,6,0,0,0.0,22,3376.31,3376.31,18097.7,18097.7,11023.7,7055.93,18.0952,0,0,362.39,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,10,2011




In [5]:
df = h2o.import_file('data_after_mice.csv')
df

Parse progress: |█████████████████████████████████████████████████████████| 100%


C1,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,acc_now_delinq,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,sub_grade_A2,sub_grade_A3,sub_grade_A4,sub_grade_A5,sub_grade_B1,sub_grade_B2,sub_grade_B3,sub_grade_B4,sub_grade_B5,sub_grade_C1,sub_grade_C2,sub_grade_C3,sub_grade_C4,sub_grade_C5,sub_grade_D1,sub_grade_D2,sub_grade_D3,sub_grade_D4,sub_grade_D5,sub_grade_E1,sub_grade_E2,sub_grade_E3,sub_grade_E4,sub_grade_E5,sub_grade_F1,sub_grade_F2,sub_grade_F3,sub_grade_F4,sub_grade_F5,sub_grade_G1,sub_grade_G2,sub_grade_G3,sub_grade_G4,sub_grade_G5,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,loan_status_Current,loan_status_Default,loan_status_Fully Paid,loan_status_In Grace Period,loan_status_Late (16-30 days),loan_status_Late (31-120 days),issue_month,issue_year
0,7000,7000,7000,0,6.62,214.93,0,48000,16.0,0,0,14,0,22301,80.9,25,0.0,0.0,7736.09,7736.09,7000.0,736.09,0.0,0,0,223.8,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,10,2011
1,7200,7200,7200,0,12.42,240.59,9,35000,6.14,0,0,6,0,6250,89.3,13,0.0,0.0,8680.72,8680.72,7200.0,1465.73,14.9906,0,0,8.82,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,10,2011
2,22000,22000,22000,0,14.65,758.88,1,192000,1.82,0,1,8,0,20795,65.8,10,0.0,0.0,23407.6,23407.6,22000.0,1407.62,0.0,0,0,859.49,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,10,2011
3,18750,18750,18750,0,7.9,586.7,3,75000,13.94,0,1,11,0,31463,49.6,24,0.0,0.0,21120.1,21120.1,18750.0,2370.14,0.0,0,0,591.04,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,11,2011
4,12500,12500,12500,0,7.9,391.13,3,32500,24.15,0,0,6,0,7458,84.7,10,0.0,0.0,14080.0,14080.0,12500.0,1580.04,0.0,0,0,394.54,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,10,2011
5,11000,11000,11000,0,7.51,342.22,7,65000,19.88,0,1,16,0,11968,21.0,35,0.0,0.0,12318.6,12318.6,11000.0,1318.62,0.0,0,0,350.37,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,10,2011
6,7000,7000,7000,0,7.9,219.04,2,36000,3.7,0,0,4,0,7850,47.3,13,0.0,0.0,7884.32,7884.32,7000.0,884.32,0.0,0,0,223.59,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,10,2011
7,31500,31500,31500,1,15.96,765.35,0,132000,6.01,0,6,7,0,33690,41.7,31,0.0,0.0,34329.6,34329.6,31500.0,2829.57,0.0,0,0,29750.9,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,10,2011
8,13600,13600,13575,1,17.58,342.26,0,81060,8.1,1,1,10,0,6720,42.5,15,3188.24,3182.31,17060.3,17029.1,10411.8,6648.59,0.0,0,0,342.26,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,10,2011
9,14400,14400,14400,1,17.58,362.39,0,89000,7.48,4,0,6,0,0,0.0,22,3376.31,3376.31,18097.7,18097.7,11023.7,7055.93,18.0952,0,0,362.39,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,10,2011




In [6]:
df.shape

(6000, 81)

### Generate training and testing datasets

In [7]:
train, valid, test = df.split_frame(
    ratios=[0.6,0.2], 
    seed=1234, 
    destination_frames=['train.hex','valid.hex','test.hex']
)

In [8]:
train_linear, valid_linear, test_linear = df_linear.split_frame(
    ratios=[0.6,0.2], 
    seed=1234, 
    destination_frames=['train.hex','valid.hex','test.hex']
)

In [9]:
df.columns

['C1',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'emp_length',
 'annual_inc',
 'dti',
 'delinq_2yrs',
 'inq_last_6mths',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_amnt',
 'collections_12_mths_ex_med',
 'acc_now_delinq',
 'grade_B',
 'grade_C',
 'grade_D',
 'grade_E',
 'grade_F',
 'grade_G',
 'sub_grade_A2',
 'sub_grade_A3',
 'sub_grade_A4',
 'sub_grade_A5',
 'sub_grade_B1',
 'sub_grade_B2',
 'sub_grade_B3',
 'sub_grade_B4',
 'sub_grade_B5',
 'sub_grade_C1',
 'sub_grade_C2',
 'sub_grade_C3',
 'sub_grade_C4',
 'sub_grade_C5',
 'sub_grade_D1',
 'sub_grade_D2',
 'sub_grade_D3',
 'sub_grade_D4',
 'sub_grade_D5',
 'sub_grade_E1',
 'sub_grade_E2',
 'sub_grade_E3',
 'sub_grade_E4',
 'sub_grade_E5',
 'sub_grade_F1',
 'sub_grade_F2',
 'sub_gr

### Defining Response and Predictors

The outcome variable is also called the response or dependent variable,
and the risk factors and confounders are called the predictors, or explanatory or independent variables.
In regression analysis, the dependent variable is denoted "Y" and the independent variables are denoted by "X".


In [10]:

predictors = ['grade_C','grade_D','grade_E',
        'grade_F','grade_G','total_rec_int',
        'total_pymnt_inv','funded_amnt_inv','sub_grade_B5',
        'sub_grade_C5','sub_grade_C4','sub_grade_C3','sub_grade_B4','sub_grade_D5']

predictors

['grade_C',
 'grade_D',
 'grade_E',
 'grade_F',
 'grade_G',
 'total_rec_int',
 'total_pymnt_inv',
 'funded_amnt_inv',
 'sub_grade_B5',
 'sub_grade_C5',
 'sub_grade_C4',
 'sub_grade_C3',
 'sub_grade_B4',
 'sub_grade_D5']

In [11]:
response = 'int_rate'

### Random Forest

### Builds a Distributed Random Forest (DRF) on a parsed dataset, for regression or classification

In [12]:
rf = H2ORandomForestEstimator(ntrees = 20, max_depth = 60, stopping_rounds = 2, 
                             score_each_iteration = True, seed = 1000000)
rf.train(x = predictors, y = response, training_frame = train, validation_frame = valid)
rf

drf Model Build progress: |███████████████████████████████████████████████| 100%
Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  DRF_model_python_1586666792402_1


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,20.0,20.0,359583.0,28.0,36.0,30.85,1257.0,1536.0,1427.4




ModelMetricsRegression: drf
** Reported on train data. **

MSE: 1.1405876343185952
RMSE: 1.067982974732554
MAE: 0.6995018465653386
RMSLE: 0.10772300173613361
Mean Residual Deviance: 1.1405876343185952

ModelMetricsRegression: drf
** Reported on validation data. **

MSE: 1.0117658731311383
RMSE: 1.0058657331528589
MAE: 0.6581933277587954
RMSLE: 0.10168302025834519
Mean Residual Deviance: 1.0117658731311383

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
0,,2020-04-12 00:46:58,0.024 sec,0.0,,,,,,
1,,2020-04-12 00:46:58,0.209 sec,1.0,1.287884,0.803306,1.658646,1.307968,0.801591,1.710781
2,,2020-04-12 00:46:58,0.272 sec,2.0,1.330194,0.815078,1.769417,1.152252,0.748628,1.327684
3,,2020-04-12 00:46:58,0.350 sec,3.0,1.390445,0.82332,1.933339,1.124111,0.732005,1.263625
4,,2020-04-12 00:46:59,0.412 sec,4.0,1.348996,0.802783,1.81979,1.104079,0.714057,1.218991
5,,2020-04-12 00:46:59,0.444 sec,5.0,1.28551,0.780254,1.652537,1.101555,0.705415,1.213424
6,,2020-04-12 00:46:59,0.475 sec,6.0,1.229936,0.766074,1.512742,1.071961,0.691202,1.1491
7,,2020-04-12 00:46:59,0.502 sec,7.0,1.20347,0.756276,1.448339,1.056941,0.685281,1.117124
8,,2020-04-12 00:46:59,0.532 sec,8.0,1.18249,0.753025,1.398281,1.042206,0.676265,1.086194
9,,2020-04-12 00:46:59,0.554 sec,9.0,1.165529,0.743788,1.358459,1.031949,0.672327,1.064918



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,total_rec_int,209721.28125,1.0,0.233386
1,grade_E,177777.34375,0.847684,0.197837
2,grade_D,159116.75,0.758706,0.177071
3,grade_F,95239.4375,0.454124,0.105986
4,grade_C,89662.210938,0.42753,0.09978
5,funded_amnt_inv,36551.917969,0.174288,0.040676
6,total_pymnt_inv,36251.582031,0.172856,0.040342
7,grade_G,34666.046875,0.165296,0.038578
8,sub_grade_B5,24310.974609,0.11592,0.027054
9,sub_grade_B4,17561.054688,0.083735,0.019543




In [13]:
test_predict = rf.predict(test)
test_predict

drf prediction progress: |████████████████████████████████████████████████| 100%


predict
8.251
16.9898
12.048
8.004
12.095
16.6985
11.188
10.5935
6.3085
15.668




The mean absolute percentage error (MAPE) is a statistical measure of how accurate a forecast system is.
It measures this accuracy as a percentage, and can be calculated as the average absolute percent error
for each time period minus actual values divided by actual values.

In [14]:
mape = 0

for i, j in zip(test['int_rate'].as_data_frame().values, test_predict.as_data_frame().values):
    mape += np.abs((i-j)/i)
    
mape = (mape/test.shape[0])*100

mape    

array([6.55582879])

### Linear Regression

Linear regression is a statistical approach for modelling 
the relationship between a dependent variable with a given set of independent variables. 

In [15]:
glm = H2OGeneralizedLinearEstimator(family = 'gaussian', lambda_search = True)
glm.train(x = predictors, y = response, training_frame = train_linear, validation_frame = valid_linear)
glm

glm Model Build progress: |███████████████████████████████████████████████| 100%
Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  GLM_model_python_1586666792402_2


GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,gaussian,identity,"Elastic Net (alpha = 0.5, lambda = 0.00368 )","nlambda = 100, lambda.max = 4.7529, lambda.min = 0.00368, lambda.1...",14,14,78,train.hex




ModelMetricsRegressionGLM: glm
** Reported on train data. **

MSE: 1.894520037924836
RMSE: 1.3764156486776935
MAE: 1.0342704515935814
RMSLE: 0.14402891040179663
R^2: 0.8938537725251856
Mean Residual Deviance: 1.894520037924836
Null degrees of freedom: 3618
Residual degrees of freedom: 3604
Null deviance: 64592.66787297567
Residual deviance: 6856.268017249981
AIC: 12614.693350304282

ModelMetricsRegressionGLM: glm
** Reported on validation data. **

MSE: 1.9022097835747582
RMSE: 1.37920621502905
MAE: 1.0366994166382688
RMSLE: 0.14416765679108418
R^2: 0.8960232815177703
Mean Residual Deviance: 1.9022097835747582
Null degrees of freedom: 1203
Residual degrees of freedom: 1189
Null deviance: 22048.895414315986
Residual deviance: 2290.260579424009
AIC: 4222.995558191123

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,iteration,lambda,predictors,deviance_train,deviance_test
0,,2020-04-12 00:47:09,0.000 sec,1,4.8,1,17.848209,18.313036
1,,2020-04-12 00:47:09,0.007 sec,2,4.3,2,17.535629,18.021055
2,,2020-04-12 00:47:09,0.010 sec,3,3.9,2,17.221528,17.727606
3,,2020-04-12 00:47:09,0.017 sec,4,3.6,3,16.881114,17.408664
4,,2020-04-12 00:47:09,0.020 sec,5,3.3,4,16.325578,16.874237
5,,2020-04-12 00:47:09,0.027 sec,6,3.0,4,15.678789,16.241197
6,,2020-04-12 00:47:09,0.030 sec,7,2.7,5,15.033285,15.604926
7,,2020-04-12 00:47:09,0.032 sec,8,2.5,5,14.312589,14.87463
8,,2020-04-12 00:47:09,0.034 sec,9,2.3,5,13.625039,14.177124
9,,2020-04-12 00:47:09,0.040 sec,10,2.1,5,12.973059,13.514915



See the whole table with table.as_data_frame()




In [16]:
test_predict = glm.predict(test)
test_predict

glm prediction progress: |████████████████████████████████████████████████| 100%


predict
8.55725
16.8533
11.802
8.61402
12.0654
17.2423
8.5403
8.81847
8.46958
15.9882




In [17]:
MAPE(test, test_predict)

array([11.59191902])

### Neural Network

Epochs: Specify the number of times to iterate (stream) the dataset. The value can be a fraction. In our case it is 15
    
batch_size: Total number of training examples present in a single batch
    
mape: The mean absolute percentage error (MAPE) is a statistical measure of how accurate a forecast system is.
It measures this accuracy as a percentage



In [18]:
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]

In [19]:
results = []

for ep in epochs:
    for batch in batch_size:
        print('epoch: {}, batch: {}'.format(ep, batch))
        dl = H2ODeepLearningEstimator(hidden = [50, 25], activation = 'Tanh', 
                              distribution = 'gaussian', mini_batch_size = batch, epochs = ep)
        dl.train(x = predictors, y = response, training_frame = train, validation_frame = valid)
        
        test_predict = dl.predict(test)
        
        mape = MAPE(test, test_predict)
        
        results.append('epoch: {}, batch: {}, MAPE: {}'.format(ep, batch, mape))
        

epoch: 10, batch: 10
deeplearning Model Build progress: |██████████████████████████████████████| 100%
deeplearning prediction progress: |███████████████████████████████████████| 100%
epoch: 10, batch: 20
deeplearning Model Build progress: |██████████████████████████████████████| 100%
deeplearning prediction progress: |███████████████████████████████████████| 100%
epoch: 10, batch: 40
deeplearning Model Build progress: |██████████████████████████████████████| 100%
deeplearning prediction progress: |███████████████████████████████████████| 100%
epoch: 10, batch: 60
deeplearning Model Build progress: |██████████████████████████████████████| 100%
deeplearning prediction progress: |███████████████████████████████████████| 100%
epoch: 10, batch: 80
deeplearning Model Build progress: |██████████████████████████████████████| 100%
deeplearning prediction progress: |███████████████████████████████████████| 100%
epoch: 10, batch: 100
deeplearning Model Build progress: |███████████████████████████

In [20]:
results

['epoch: 10, batch: 10, MAPE: [9.02675081]',
 'epoch: 10, batch: 20, MAPE: [9.68478336]',
 'epoch: 10, batch: 40, MAPE: [9.39739187]',
 'epoch: 10, batch: 60, MAPE: [8.37404729]',
 'epoch: 10, batch: 80, MAPE: [9.86354528]',
 'epoch: 10, batch: 100, MAPE: [8.11352469]',
 'epoch: 50, batch: 10, MAPE: [6.82697564]',
 'epoch: 50, batch: 20, MAPE: [7.1750922]',
 'epoch: 50, batch: 40, MAPE: [9.46691882]',
 'epoch: 50, batch: 60, MAPE: [7.46963353]',
 'epoch: 50, batch: 80, MAPE: [8.02671406]',
 'epoch: 50, batch: 100, MAPE: [7.24128858]',
 'epoch: 100, batch: 10, MAPE: [7.81808519]',
 'epoch: 100, batch: 20, MAPE: [7.5683049]',
 'epoch: 100, batch: 40, MAPE: [7.30660851]',
 'epoch: 100, batch: 60, MAPE: [7.22617611]',
 'epoch: 100, batch: 80, MAPE: [7.30852312]',
 'epoch: 100, batch: 100, MAPE: [7.38913359]']