In [None]:
import h5py
import numpy as np
import pandas as pd
import json
import yaml
from ludwig import LudwigModel
import copy
import ray
from ludwig.utils.misc import merge_dict
import re

1. Start with the titanic.csv file from Kaggle.
2. Create a simple Ludwig model YAML file.
3. Run it through Ludwig to generate the HDF5 and metadata files.
4. Create a YAML file with all the parameters you want to search over.
5. Annotate each value to search over with a simple {{[1,2,3]}}-style annotation. This can probably be improved and better generalized.
6. Follow the rest of the notebook.


In [5]:
!cat titanic.yaml

combiner: {fc_size: "{{[24,48,64]}}", num_fc_layers: 1, type: concat}
input_features:
- {name: Pclass, type: category}
- {name: Sex, type: category}
- {name: Age, type: numerical,missing_value_strategy: fill_with_mean}
- {name: SibSp, type: numerical}
- {name: Parch, type: numerical}
- {name: Fare, type: numerical, missing_value_strategy: fill_with_mean}
- {name: Embarked, representation: "{{[sparse, dense]}}", type: category}
output_features:
- {name: Survived, type: binary}
training:
  batch_size: "{{[8,16,32,64]}}"
  bucketing_field: null
  decay: false
  decay_rate: 0.96
  decay_steps: 10000
  dropout_rate: 0.0
  early_stop: 5
  epochs: 100
  gradient_clipping: null
  increase_batch_size_on_plateau: 0
  increase_batch_size_on_plateau_max: 512
  increase_batch_size_on_plateau_patience: 5
  increase_batch_size_on_plateau_rate: 2
  learning_rate: 0.001
  learning_rate_warmup_epochs: 5
  optimizer: {beta1: 0.9, beta2: 0.999, epsilon: 1.0e-08, type: adam}
  r

get_keys_from_dict will return the following (path, value) pairs:
['combiner->fc_size', '{{[24,48,64]}}'], 
['input_features->[6]->representation', '{{[sparse, dense]}}'], 
['training->batch_size', '{{[8,16,32,64]}}']

Note that list indices are zero-based: input_features->[6] refers to index 6 of the input_features list, i.e. its seventh element (Embarked).



In [None]:
# A value is a search annotation when the whole string is wrapped in {{ ... }}.
search_pattern = "^{{.*}}$"


def get_keys_from_list(l, path=""):
    """Collect every ``{{...}}``-annotated string found in a (possibly nested) list.

    Args:
        l: List to scan. String elements are matched against ``search_pattern``;
            dict and list elements are scanned recursively; every other type
            is ignored.
        path: Prefix prepended to each generated path, e.g. ``"input_features->"``.

    Returns:
        A list of ``[path, value]`` pairs in encounter order. A list position
        is written as ``[index]`` and nesting levels are joined with ``->``.
    """
    parameters = []
    for index, v in enumerate(l):
        # The index must be converted with str(): concatenating the raw int
        # raised TypeError as soon as an annotated string sat directly in a list.
        item_path = path + "[" + str(index) + "]"

        if isinstance(v, str):
            if re.match(search_pattern, v) is not None:
                parameters.append([item_path, v])

        elif isinstance(v, dict):
            parameters.extend(get_keys_from_dict(v, item_path + "->"))

        elif isinstance(v, list):
            parameters.extend(get_keys_from_list(v, item_path + "->"))

    return parameters


def get_keys_from_dict(dct, path=""):
    """Walk a nested dict and list every value carrying a ``{{...}}`` annotation.

    Args:
        dct: Mapping to scan. String values are matched against
            ``search_pattern``; dict and list values are descended into;
            values of any other type are skipped.
        path: Prefix prepended to each generated path, e.g. ``"training->"``.

    Returns:
        A list of ``[path, value]`` pairs in iteration order, where ``path``
        is the chain of keys joined with ``->``.
    """
    found = []
    for key, value in dct.items():
        if isinstance(value, str):
            # Plain strings that are not search annotations are left alone.
            if re.match(search_pattern, value) is None:
                continue
            found.append([path + key, value])
        elif isinstance(value, dict):
            found += get_keys_from_dict(value, path + key + "->")
        elif isinstance(value, list):
            found += get_keys_from_list(value, path + key + "->")
    return found


# Path component that addresses a list position, e.g. "[6]".
# Raw string: "\[" and "\d" are invalid escape sequences in a normal literal.
list_index_pattern = r"^\[\d+\]$"


def set_search_param(dct, path, value):
    """Assign ``value`` at the location described by ``path`` inside a nested structure.

    Args:
        dct: Nested dict/list structure, modified in place.
        path: Sequence of path components. A component of the form ``[n]`` is
            used as integer index ``n`` when the container at that level is a
            list; otherwise the component is used as a key as-is.
        value: Value to store at the final component.

    The list-index handling also applies to the last component, so a path
    such as ``["layers", "[0]"]`` replaces an element of a list. Previously
    the last component was always used as a raw key, which failed on lists.
    """
    key = path[0]
    if isinstance(dct, list) and re.match(list_index_pattern, key) is not None:
        # Strip the surrounding brackets: "[6]" -> 6.
        key = int(key[1:-1])

    if len(path) == 1:
        dct[key] = value
    else:
        set_search_param(dct[key], path[1:], value)


def build_model(base_model, config):
    """Return a model definition with the values from ``config`` substituted in.

    Args:
        base_model: Annotated model definition (nested dicts/lists). It is
            left untouched; substitutions are applied to a deep copy so the
            ``{{...}}`` template can be reused for further configurations.
        config: Mapping from ``->``-separated parameter path (as produced by
            ``get_keys_from_dict``) to the concrete value to set.

    Returns:
        A new model definition with every path in ``config`` set to its value.
    """
    # Work on a copy: mutating the shared template would erase its annotations
    # after the first call.
    model = copy.deepcopy(base_model)
    for k, v in config.items():
        set_search_param(model, k.split('->'), v)
    return model

In [None]:
# Absolute paths, per the original note ("use full path for Ray").
# NOTE(review): presumably Ray workers do not share the notebook's working
# directory -- confirm. These paths are machine-specific.
data = '/Users/bm255022/Projects/Teradata/ludwig/examples/hyperparameters/titanic.hdf5'
metadata = '/Users/bm255022/Projects/Teradata/ludwig/examples/hyperparameters/titanic.json'


def train(base, config, reporter):
    """Train one Ludwig model for a single configuration and report its score.

    Args:
        base: Annotated model definition passed to ``build_model``.
        config: Mapping of parameter path to value for this trial.
        reporter: Callable invoked once with ``mean_accuracy`` and ``done=True``.

    Returns:
        Whatever ``reporter`` returns.
    """
    model_definition = build_model(base, config)
    ludwig_model = LudwigModel(model_definition)
    stats = ludwig_model.train(data_hdf5=data, train_set_metadata_json=metadata)

    # Sort ascending and take the last entry, i.e. the largest recorded
    # validation accuracy for the 'Survived' output.
    validation_accuracy = stats['validation']['Survived']['accuracy']
    best_accuracy = np.sort(validation_accuracy)[-1]
    return reporter(mean_accuracy=best_accuracy, done=True)


In [None]:
from ray.tune import grid_search

def build_search_space(annotated_model):
    """Turn the ``{{...}}`` annotations of a model definition into a Ray Tune grid.

    Args:
        annotated_model: Model definition (nested dicts/lists) whose searchable
            values are strings of the form ``{{<yaml list>}}``.

    Returns:
        A dict mapping each annotated parameter path to
        ``grid_search(<parsed list>)``.
    """
    pattern = "^{{(.*)}}$"
    keys = get_keys_from_dict(annotated_model)
    print(keys)

    grid_search_space = {}
    for param_path, annotation in keys:
        # Group 1 is the text between the braces, e.g. "[8,16,32,64]".
        # safe_load is used because yaml.load without an explicit Loader is
        # deprecated and fails on PyYAML >= 6; the annotation is plain data.
        candidates = yaml.safe_load(re.match(pattern, annotation)[1])
        grid_search_space[param_path] = grid_search(candidates)

    return grid_search_space


In [None]:
from ray.tune import register_trainable, grid_search, run_experiments

# Shut down any existing Ray runtime before initialising a new one.
# NOTE(review): presumably so this cell can be re-run in the same kernel -- confirm.
ray.shutdown()
ray.init()

# safe_load: yaml.load without an explicit Loader is deprecated and raises
# TypeError on PyYAML >= 6; the model definition is plain YAML data.
with open("titanic.yaml", 'r') as stream:
    annotated_model = yaml.safe_load(stream)

grid_search_space = build_search_space(annotated_model)

# The trainable closes over the annotated template; each trial receives its
# own config and reporter.
register_trainable('train', lambda cfg, rptr: train(annotated_model, cfg, rptr))
run_experiments({'titanic': {
    'run': 'train',
    'stop': {'mean_accuracy': 0.9},
    'config': grid_search_space}
    })
