In [None]:
# Walk through the Professor's code and perform a simple version in H2O
# Append Kaggle data with the Professor's set and perform the simple version again in H2O

# Use the original set and walk through the following two kernels:
# Walk through the comprehensive NN/CNN kernel
# https://www.kaggle.com/shivamb/a-very-comprehensive-tutorial-nn-cnn?scriptVersionId=3866404
# Walk through the suggested CNN code with Keras and perform the next version (will take a long time to run) to get accuracy up
# https://www.kaggle.com/yassineghouzam/introduction-to-cnn-keras-0-997-top-6/notebook
# Use the appended set and perform on the CNN kernel

# Kaggle Digit Recognizer Competition

In [1]:
import re
import time

import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.grid.grid_search import H2OGridSearch

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## First Attempt: Using H2O From Professor's Code

In [2]:
# Connect to a local H2O cluster, starting a new server if none is already running.
h2o.init()
# Turn off H2O's progress bars so long-running cells do not flood the output.
h2o.no_progress()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM (Zulu 8.20.0.5-win64) (build 25.121-b15, mixed mode)
  Starting server from C:\Users\Davee\Miniconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Davee\AppData\Local\Temp\tmph7jqanhf
  JVM stdout: C:\Users\Davee\AppData\Local\Temp\tmph7jqanhf\h2o_Davee_started_from_python.out
  JVM stderr: C:\Users\Davee\AppData\Local\Temp\tmph7jqanhf\h2o_Davee_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,05 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.18.0.11
H2O cluster version age:,24 days
H2O cluster name:,H2O_from_python_Davee_l7isf2
H2O cluster total nodes:,1
H2O cluster free memory:,1.747 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [3]:
# Load the Kaggle CSV files into H2O frames.

# H2O infers column types while parsing: strings become enums (nominal) and
# numbers become numeric. 'label' holds digits, so it is forced to enum here;
# every other column is left to be parsed as numeric.
col_types = dict(label='enum')

test = h2o.import_file('test.csv')
train = h2o.import_file('train.csv', col_types=col_types)

In [4]:
# Hold out 20% of the labelled data for validation; the other 80% stays in `train`.
# NOTE: `train` is overwritten here, so re-running this cell on its own splits the
# already-reduced frame again -- re-run the import cell first.

train, valid = train.split_frame(ratios=[0.8], seed=12345)

In [5]:
# H2O needs the target column and the predictor columns named explicitly,
# and the target must be a factor for classification.

y = 'label'
X = [col for col in train.columns if col != y]   # every column except the target

valid[y] = valid[y].asfactor()
train[y] = train[y].asfactor()

In [8]:
# Train several neural networks with randomly drawn hyperparameters and keep the
# one with the lowest validation error (approach taken from the Professor's code).

# 'hidden' lists the candidate architectures: each entry gives the number of units
# in every hidden layer, e.g. [250, 250] is two hidden layers of 250 units each.
# Extended from the Professor's code to include a few more models with more layers.

# The other parameters are unchanged from the Professor's code; they give the
# random search its variety (up to `max_models` models, see search_criteria).
# L1: lets only strong weights survive
# L2: prevents any single weight from getting too big.

# define random grid search parameters
hyper_parameters = {
    'hidden': [[250, 250], [500, 500], [1000, 1000], [250, 250, 250, 250], [500, 500, 500, 500]],
    'l1': [s/1e4 for s in range(0, 1000, 100)],               # 0.00, 0.01, ..., 0.09
    'l2': [s/1e5 for s in range(0, 1000, 100)],               # 0.000, 0.001, ..., 0.009
    'input_dropout_ratio': [s/1e2 for s in range(0, 20, 2)],  # 0.00, 0.02, ..., 0.18
}

# An earlier attempt capped the search at 28800 seconds (8 hours) and 25 models to
# keep it manageable, but only three models finished, so the limits were restored.

# define search strategy
# 'seed' fixes which hyperparameter combinations the random search draws, so a
# re-run explores the same candidates (same seed as the train/validation split).
search_criteria = {'strategy': 'RandomDiscrete',
                   'max_models': 100,
                   'max_runtime_secs': 60000,
                   'seed': 12345}

# initialize grid search
gsearch = H2OGridSearch(H2ODeepLearningEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

# execute training w/ grid search
# epochs is only an upper bound: stopping_rounds lets training stop early.
gsearch.train(x=X,
              y=y,
              training_frame=train,
              validation_frame=valid,
              activation='RectifierWithDropout',
              epochs=8000,
              stopping_rounds=20,
              sparse=True,
              ignore_const_cols=True,
              adaptive_rate=True)

# TODO: ask the Professor about epochs and stopping rounds
# epoch = one forward pass and one backward pass of all the training examples
# Open question: is the grid search performing too many epochs?

In [9]:
# show grid search results
gsearch.show()

# select best model
# Rank the models explicitly by logloss, lowest first, instead of relying on the
# grid's default ordering to put the best model at index 0.
sorted_grid = gsearch.get_grid(sort_by='logloss', decreasing=False)
mnist_model = sorted_grid[0]

# print model information
mnist_model

                       hidden input_dropout_ratio    l1     l2  \
0                  [500, 500]                0.16   0.0  0.009   
1                  [500, 500]                 0.1  0.08  0.002   
2    [1000, 1000, 1000, 1000]                0.16  0.09  0.003   

                                                              model_ids  \
0  Grid_DeepLearning_py_5_sid_adc6_model_python_1529252701935_2_model_2   
1  Grid_DeepLearning_py_5_sid_adc6_model_python_1529252701935_2_model_0   
2  Grid_DeepLearning_py_5_sid_adc6_model_python_1529252701935_2_model_1   

               logloss  
0  0.10461823120702544  
1   2.2995516068626864  
2    2.355231047764764  
Model Details
H2ODeepLearningEstimator :  Deep Learning
Model Key:  Grid_DeepLearning_py_5_sid_adc6_model_python_1529252701935_2_model_2


ModelMetricsMultinomial: deeplearning
** Reported on train data. **

MSE: 0.004587400182568444
RMSE: 0.06773034905098632
LogLoss: 0.017078130584356765
Mean Per-Class Error: 0.0053998199100237605


0,1,2,3,4,5,6,7,8,9,10,11
0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,Error,Rate
975.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0030675,3 / 978
0.0,1111.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0035874,"4 / 1,115"
0.0,0.0,933.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0042689,4 / 937
0.0,0.0,1.0,989.0,0.0,4.0,0.0,1.0,4.0,1.0,0.011,"11 / 1,000"
0.0,3.0,0.0,0.0,979.0,0.0,1.0,0.0,1.0,1.0,0.0060914,6 / 985
0.0,0.0,0.0,1.0,0.0,880.0,0.0,0.0,0.0,0.0,0.0011351,1 / 881
4.0,1.0,0.0,0.0,0.0,0.0,944.0,0.0,0.0,0.0,0.0052687,5 / 949
0.0,2.0,0.0,0.0,0.0,0.0,0.0,1025.0,0.0,0.0,0.0019474,"2 / 1,027"
3.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,974.0,0.0,0.0071356,7 / 981


Top-10 Hit Ratios: 


0,1
k,hit_ratio
1,0.994546
2,0.9990910
3,0.9997979
4,0.9998989
5,0.9999999
6,0.9999999
7,0.9999999
8,0.9999999
9,0.9999999



ModelMetricsMultinomial: deeplearning
** Reported on validation data. **

MSE: 0.022067447306029997
RMSE: 0.14855116056776532
LogLoss: 0.10461823120702544
Mean Per-Class Error: 0.02527432716827784
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7,8,9,10,11
0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,Error,Rate
856.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,1.0,0.0,0.0058072,5 / 861
0.0,903.0,3.0,2.0,0.0,0.0,1.0,1.0,6.0,2.0,0.0163399,15 / 918
2.0,2.0,784.0,2.0,1.0,3.0,4.0,9.0,4.0,0.0,0.0332922,27 / 811
0.0,3.0,2.0,850.0,0.0,9.0,1.0,3.0,7.0,3.0,0.0318907,28 / 878
0.0,4.0,1.0,0.0,824.0,0.0,3.0,2.0,3.0,9.0,0.0260047,22 / 846
2.0,3.0,0.0,13.0,0.0,784.0,7.0,1.0,1.0,2.0,0.0356704,29 / 813
2.0,1.0,1.0,0.0,1.0,1.0,777.0,0.0,3.0,0.0,0.0114504,9 / 786
0.0,3.0,4.0,0.0,0.0,0.0,0.0,860.0,0.0,4.0,0.0126292,11 / 871
5.0,12.0,4.0,4.0,0.0,2.0,3.0,3.0,792.0,6.0,0.0469314,39 / 831


Top-10 Hit Ratios: 


0,1
k,hit_ratio
1,0.9748815
2,0.992891
3,0.9969195
4,0.9991707
5,0.9996446
6,0.9997631
7,0.9998816
8,1.0
9,1.0


Scoring History: 


0,1,2,3,4,5,6,7,8,9,10,11,12
,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_logloss,training_classification_error,validation_rmse,validation_logloss,validation_classification_error
,2018-06-17 21:09:26,0.000 sec,,0.0,0,0.0,,,,,,
,2018-06-17 21:09:31,6:59:16.366,1169 obs/sec,0.1575983,1,5289.0,0.3057397,0.3506140,0.1085749,0.3100231,0.3726005,0.1099526
,2018-06-17 21:10:46,7:00:30.031,1529 obs/sec,3.2758939,21,109939.0,0.1748863,0.1108454,0.0350470,0.1950342,0.1465254,0.0433649
,2018-06-17 21:11:51,7:01:35.318,1561 obs/sec,6.0816150,39,204099.0,0.1438709,0.0760297,0.0235330,0.1783277,0.1290339,0.0355450
,2018-06-17 21:12:56,7:02:40.464,1572 obs/sec,8.8841180,57,298151.0,0.1319171,0.0640159,0.0189880,0.1692002,0.1190048,0.0330569
---,---,---,---,---,---,---,---,---,---,---,---,---
,2018-06-17 22:05:48,7:55:32.518,1439 obs/sec,130.9239869,841,4393809.0,0.0739811,0.0203306,0.0059590,0.1520606,0.1147137,0.0251185
,2018-06-17 22:06:54,7:56:38.303,1438 obs/sec,133.4070024,857,4477139.0,0.0739570,0.0194128,0.0065650,0.1507522,0.1088830,0.0264218
,2018-06-17 22:07:59,7:57:43.552,1438 obs/sec,135.8839988,873,4560267.0,0.0715282,0.0199833,0.0049490,0.1531072,0.1096553,0.0266588



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
pixel99,1.0,1.0,0.0018153
pixel710,0.9987874,0.9987874,0.0018131
pixel350,0.9871588,0.9871588,0.0017920
pixel477,0.9721351,0.9721351,0.0017647
pixel715,0.9716280,0.9716280,0.0017638
---,---,---,---
pixel495,0.4965284,0.4965284,0.0009013
pixel523,0.4878727,0.4878727,0.0008856
pixel629,0.4748994,0.4748994,0.0008621



See the whole table with table.as_data_frame()




In [10]:
# Timestamp for the output file name; ':' and ' ' are replaced with '_' so the
# value can be used inside a file name.
time_stamp = re.sub('[: ]', '_', time.asctime())

# score unlabeled test data and keep only the predicted class column
predicted_labels = mnist_model.predict(test)['predict']

# create ID column: 1..N with one ID per test row, taken from the frame itself
# rather than hard-coding the row count
image_ids = h2o.H2OFrame(np.arange(1, test.nrow + 1))

# assemble the two-column submission frame
sub = image_ids.cbind(predicted_labels)
sub.columns = ['ImageId', 'Label']

print(sub.head())

# save file for submission
# NOTE: machine-specific absolute path; adjust before running on another machine.
sub_fname = 'C:/Users/Davee/Google Drive/GWU MBA/Summer 2018/Machine Learning/DigitRecognizer/submission_' + str(time_stamp) + '.csv'
h2o.download_csv(sub, sub_fname)

ImageId,Label
1,2
2,0
3,9
4,9
5,3
6,7
7,0
8,3
9,0
10,3





In [None]:
# Initial submission 0.97500

## Second Attempt: Using H2O From Prof's Code + Using Additional Data to Expand Dataset