In [3]:
#!/usr/bin/env python

"""Execution Time Experiment

Run a list of workloads in sequence and report the execution time for each one.
These are the flow ids, each with the number of its setups that end up being executed:
{5804: 18,
 5909: 9,
 5910: 1,
 5913: 1,
 5914: 1,
 5995: 1,
 6268: 3,
 6269: 1,
 6334: 1,
 6840: 341,
 6946: 2,
 6952: 31,
 6954: 7,
 6958: 1,
 6969: 1503,
 6970: 79
}
The complete list of setups is in the experiment result files.

"""
import errno
import os
import sys
import uuid
from datetime import datetime

from openml import config

# Make the project sources importable: the root is taken from the first
# command-line argument when one is given, otherwise from the author's checkout.
# NOTE(review): inside a Jupyter kernel sys.argv usually carries the kernel's own
# launch arguments, so the first branch may pick up something that is not a path
# — confirm how this cell is meant to be run.
if len(sys.argv) > 1:
    SOURCE_CODE_ROOT = sys.argv[1]
else:
    # The fallback previously ended in '/ ' (with a trailing blank), which points
    # at an entry named ' ' inside the checkout instead of the checkout itself.
    SOURCE_CODE_ROOT = '/Users/bede01/Documents/work/phd-papers/ml-workload-optimization/code/collaborative' \
                       '-optimizer/'
sys.path.append(SOURCE_CODE_ROOT)
# Somehow someone hard codes this to be on top of the sys path and I cannot get rid of it
if '/home/zeuchste/git/scikit-learn' in sys.path:
    sys.path.remove('/home/zeuchste/git/scikit-learn')

from paper.experiment_helper import Parser
from experiment_graph.data_storage import StorageManagerFactory, DedupedStorageManager
from experiment_graph.executor import CollaborativeExecutor
from experiment_graph.execution_environment import ExecutionEnvironment
from experiment_graph.materialization_algorithms.materialization_methods import TopNModelMaterializer, \
    AllMaterializer
from experiment_graph.optimizations.Reuse import LinearTimeReuse
from experiment_graph.storage_managers import storage_profiler
from experiment_graph.openml_helper.openml_connectors import get_setup_and_pipeline
from experiment_graph.workloads.openml_optimized import OpenMLOptimizedWorkload

# --- Experiment configuration -------------------------------------------------
# Short identifier for this run: the first 8 upper-cased hex characters of a random UUID.
e_id = uuid.uuid4().hex.upper()[0:8]
EXPERIMENT_TIMESTAMP = datetime.now()

# All options below are read from the command line through the project's Parser.
# NOTE(review): parser.get(key, default) presumably returns `default` when the
# key is absent — confirm against paper.experiment_helper.Parser.
parser = Parser(sys.argv)
verbose = parser.get('verbose', 0)

# Root of the project checkout; the data directory lives directly below it.
DEFAULT_ROOT = '/Users/bede01/Documents/work/phd-papers/ml-workload-optimization'
ROOT = parser.get('root', DEFAULT_ROOT)
ROOT_DATA_DIRECTORY = ROOT + '/data'

# NOTE(review): this storage manager is not referenced again in the visible code;
# the ExecutionEnvironment further down is built with its own DedupedStorageManager().
storage_manager = StorageManagerFactory.get_storage(parser.get('storage_type', 'dedup'))

EXPERIMENT = parser.get('experiment', 'openml')
# Upper bound passed to get_setup_and_pipeline and shown in the progress message.
limit = int(parser.get('limit', 100))
openml_task = int(parser.get('task', 31))
# OPENML_DIR already ends in '/', so the cache path below contains a double slash.
OPENML_DIR = ROOT_DATA_DIRECTORY + '/openml/'
config.set_cache_directory(OPENML_DIR + '/cache')

# File intended for the experiment results, and the cost profile handed to the executor.
result_file = parser.get('result', ROOT + '/experiment_results/local/alpha_impact/openml/test.csv')
profile = storage_profiler.get_profile(parser.get('profile', ROOT_DATA_DIRECTORY + '/profiles/local-dedup'))

# Make sure the directory that holds the result file exists.
result_dir = os.path.dirname(result_file)
# dirname is '' when result_file is a bare file name; os.makedirs('') would raise,
# and the current directory needs no creation, so that case is skipped.
if result_dir and not os.path.exists(result_dir):
    try:
        os.makedirs(result_dir)
    except OSError as exc:  # Guard against race condition
        if exc.errno != errno.EEXIST:
            raise

# Directory of the OpenML data and the sub-directory of the selected task.
OPENML_DIR = ROOT_DATA_DIRECTORY + '/openml/'
OPENML_TASK = ''.join([ROOT_DATA_DIRECTORY, '/openml/task_id=', str(openml_task)])
# Load at most `limit` (setup, pipeline) pairs from the task's run listing.
setup_and_pipelines = get_setup_and_pipeline(
    openml_dir=OPENML_DIR,
    runs_file=OPENML_TASK + '/all_runs.csv',
    limit=limit,
)

# Materialization strategy: 'best_n' keeps the top model (n=1) weighted by alpha,
# any other value falls back to AllMaterializer.
mat_type = parser.get('mat_type', 'best_n')
alpha = float(parser.get('alpha', '0.1'))
materializer = TopNModelMaterializer(n=1, alpha=alpha) if mat_type == 'best_n' else AllMaterializer()

# NOTE(review): the environment gets a fresh DedupedStorageManager() rather than
# the `storage_manager` selected through the 'storage_type' option — confirm intent.
ee = ExecutionEnvironment(DedupedStorageManager(), reuse_type=LinearTimeReuse.NAME)
executor = CollaborativeExecutor(ee, cost_profile=profile, materializer=materializer)


def get_workload(setup, pipeline):
    """Build the workload object for one OpenML setup/pipeline pair.

    :param setup: setup handed to ``OpenMLOptimizedWorkload`` unchanged.
    :param pipeline: pipeline handed to ``OpenMLOptimizedWorkload`` unchanged.
    :return: an ``OpenMLOptimizedWorkload`` bound to the module-level ``openml_task``.
    """
    workload = OpenMLOptimizedWorkload(setup, pipeline, task_id=openml_task)
    return workload


def run(executor, workload):
    """Execute ``workload`` on ``executor`` with the notebook-wide settings.

    :param executor: object providing ``run_workload``.
    :param workload: workload forwarded as the ``workload`` keyword.
    :return: whatever ``executor.run_workload`` returns.
    """
    # Settings shared by every run: the data root and the verbosity level.
    run_options = {'root_data': ROOT_DATA_DIRECTORY, 'verbose': verbose}
    return executor.run_workload(workload=workload, **run_options)


def is_best_model_materialized(executor):
    """Tell whether the best-scoring model in the experiment graph is materialized.

    Scans every node of ``executor.execution_environment.experiment_graph.graph``.
    Among the nodes whose ``type`` is ``'SK_Model'`` and whose ``score`` is
    positive, the one with the highest score is selected (the first one seen
    wins ties).

    :param executor: object exposing the experiment graph as described above.
    :return: the ``'mat'`` attribute of the selected node, or ``False`` when the
        graph holds no model with a positive score.
    """
    graph = executor.execution_environment.experiment_graph.graph
    best_node, best_score = None, 0
    for node_id, attrs in graph.nodes(data=True):
        if attrs['type'] == 'SK_Model' and attrs['score'] > best_score:
            best_node, best_score = node_id, attrs['score']
    if best_node is None:
        # No candidate found: nothing can be materialized. Previously a
        # placeholder id was looked up in the graph, which raised KeyError.
        return False
    return graph.nodes[best_node]['mat']



In [4]:
# Experiment loop: each iteration runs the next workload and then re-runs the
# best workload seen so far; `elapsed` covers both runs of the iteration.
best_workload = None
best_score = -1
best_setup = -1
best_pipeline = -1
i = 0
print('experiment with materializer: {}, alpha: {}'.format(mat_type, alpha))
for setup, pipeline in setup_and_pipelines:
    workload = get_workload(setup, pipeline)
    start = datetime.now()
    success = run(executor, workload)
    end_current = datetime.now()
    # Time spent on the new workload alone (without the re-run of the best one).
    run_time_current = (end_current - start).total_seconds()
    current_score = workload.get_score()
    if best_workload is None:
        # First workload: it is the best one so far by definition.
        best_workload = workload
        best_score = current_score
        best_setup = setup.setup_id
        best_pipeline = setup.flow_id
    else:
        # Re-run the incumbent. Its status is combined with the status of the
        # current run; overwriting it (as before) hid a failed current run
        # whenever the re-run itself succeeded.
        best_success = run(executor, best_workload)
        success = success and best_success
        if current_score > best_workload.get_score():
            best_score = current_score
            best_workload = workload
            best_setup = setup.setup_id
            best_pipeline = setup.flow_id
    end = datetime.now()

    elapsed = (end - start).total_seconds()

    # Post-run bookkeeping on the executor, kept outside the timed section.
    executor.local_process()
    executor.global_process()
    executor.cleanup()
    mat_status = 1 if is_best_model_materialized(executor) else 0
    i += 1
    if i % 50 == 0:
        print('run {} out of {} completed'.format(i, limit))
    if not success:
        elapsed = 'Failed!'
    # NOTE(review): `elapsed` and `mat_status` are computed but not written to
    # `result_file` anywhere in this cell — confirm whether that is intended.


{"Agg": 0.09642666666666667, "SK_Model": 5.86940990518854e-05, "Evaluation": 0.04334545454545455, "Feature": 0.03777813765182186, "Dataset": 0.0006677262236078122}


In [5]:
# Run the executor's heuristic computation over the experiment graph, using the
# executor's DEFAULT_PROFILE as the cost profile.
# NOTE(review): presumably this fills the load_cost / recreation_cost node
# attributes inspected in a later cell — confirm.
executor.compute_heuristics(ee.experiment_graph.graph, executor.DEFAULT_PROFILE)

In [4]:
# Shorthand for the experiment graph; the inspection cells below read it.
# NOTE(review): this cell carries the same execution count as the experiment loop
# cell, so the notebook was executed out of order — re-check with Restart & Run All.
graph = executor.execution_environment.experiment_graph.graph

In [8]:
# Scratch check: the product is (up to rounding) the 0.0105 load_cost printed
# below for the Agg nodes of size 0.1171875.
# NOTE(review): 0.0896 is presumably a per-unit load cost — confirm its origin.
0.1171875 * 0.08959999999999999

0.010499999999999999

In [6]:
# Print id, type, size, load cost and recreation cost of every node that has a
# load cost, and keep the (node id, load cost) pair with the highest load cost.
max_load = (0, 0)
for node_id, attrs in graph.nodes(data=True):
    if 'load_cost' not in attrs:
        continue
    fields = (node_id, attrs['type'], attrs['size'], attrs['load_cost'], attrs['recreation_cost'])
    print(' '.join(str(field) for field in fields))
    if attrs['load_cost'] > max_load[1]:
        max_load = (node_id, attrs['load_cost'])

91F663F204EC1B373BAA7D5F4ED259B3 Feature 1.9296875 4.9599200026e-05 53975020.108
ECDA0D538C2716511D31B918A0A196AB Agg 0.1171875 0.0105 6751138.855
6D67B47B385303BCE343069471903013 Feature 1921.9375 0.0494 4683.694
421BD661A585C601515158BB06A45BCE Feature 1921.9375 0.0494 36.653
2FDDD27929272F0872ED90A8782B3E9E Feature 1.8984375 4.87959741147e-05 341815.233
6AABC0FEF0CC9CEE313FEE2F88BCD237 Dataset 29309.6171875 14.7703 247.386
EDD0B28901B73F1F09CD86A338953753 Dataset 116759.671875 58.8399149147 265146.411
7DC7FD1982D2E6DFAC4689879CF74496 Feature 480.4921875 0.0123502008065 489.107
588E87EA4494D973E7E1D3023591BA38 Feature 13327.8173828 0.342567944437 0.159
1EBBC23A724A33FC623DFFE910804FFA Agg 0.1171875 0.0105 336292.717
5EDB6467754F13B3F2AAA2F921144194 Dataset 235437.414062 118.646422936 4850.955
B049A25988BB3F5CCB75BF7894073109 Agg 0.0234375 0.0021 4671.389
A1AC5FEB094CCC27E0FBE2C877097260 Feature 240.2421875 0.006175 82971.496
167A31E89EBD1BC62655FD7052DA2ABE Feature 1.921875 4.9398393

E37F048B899D318CD4BA87C7C9A44DDA Feature 11574.4785156 0.297501473733 4658.48
3F632739DF0CD1B6FBB40EE6E4992F10 Feature 1.9296875 4.9599200026e-05 26987509.639
31253C26E2FF11A3F9CBFC52C49D8E91 Agg 0.03125 0.0028 165948.338
03B5B1A6B7E6A529298BE6F96046608E Dataset 5291.34375 2.66652184812 248942.122
9E9BAF3CED5C7D225138BDFE337CD0C8 Dataset 244566.617188 123.246997146 6749764.148
2CDD5AAC44505D882479B8DB63ED5148 Dataset 3843.9453125 1.93711931091 82973.041
7B72196374C75F99BCAFE4009CEBCB64 Feature 480.4921875 0.0123502008065 12552.749
77977327B1E4C5F63B993B07FA74C52A Agg 0.0234375 0.0021 4671.816
666D049B5A66C9625117B3BB7CF3C0B5 Feature 1921.9375 0.0494 1498310.284
AE25B96EA42A278A305BCAEAA0DFC5A5 Dataset 42855.1259766 21.5964290206 20421.355
91CAB33ADB2480351D2AEDC0AB8F420A Feature 240.3984375 0.00617901612956 165979.624
354DE3595708334A81873F03DDA5E905 Feature 1921.9375 0.0494 82970.967
65BED3C9DAC84FCBBC03B2DFA8E39FCC Feature 13821.6757812 0.355261700026 4658.485
4733C453477DADC1096DD8B

In [None]:
# Show the 'data' attribute stored on one specific node (id hard-coded from an earlier run).
graph.nodes['B81B4D2E69928650629AD1266B67FD53']['data']

In [None]:
# Show every attribute stored on the same node.
graph.nodes['B81B4D2E69928650629AD1266B67FD53']

In [None]:
# List the edges that end in the node under inspection, with their attributes.
TARGET_NODE = 'B81B4D2E69928650629AD1266B67FD53'
for _, destination, edge_attrs in graph.edges(data=True):
    if destination != TARGET_NODE:
        continue
    print(' '.join(str(part) for part in (destination, edge_attrs)))

In [None]:
# Follow the node's data object down to its pandas_df attribute.
# NOTE(review): the node id 'application_train.csv{}' looks like a file name plus
# an empty-arguments suffix — confirm how root node ids are built.
graph.nodes['application_train.csv{}']['data'].underlying_data.pandas_df

In [None]:
# Keep a handle on the 'data' attribute of one node for interactive inspection.
obj = graph.nodes['E4211A925EA65EB27C7AB0E2EB202872']['data']

In [None]:
# Total size as reported by the experiment graph.
# NOTE(review): unit and exact meaning are defined by get_total_size — confirm.
ee.experiment_graph.get_total_size()

In [None]:
# "Real" size as reported by the experiment graph, to compare with get_total_size().
# NOTE(review): presumably the size after deduplication — confirm.
ee.experiment_graph.get_real_size()

In [None]:
from pympler.asizeof import asizeof

In [None]:
# Print the pandas series behind every Feature node of the graph.
# NOTE(review): `total` and `columns` are initialised here but not used in the
# visible code — leftovers of a size computation? Confirm before removing.
total = 0
columns = []
for n,d in graph.nodes(data=True):
    if d['type'] == 'Feature':
        print d['data'].underlying_data.pandas_series

            

In [None]:
# Total graph size divided by 1024 * 1024.
# NOTE(review): this is MiB only if get_total_size() reports bytes — confirm.
(ee.experiment_graph.get_total_size())/(1024 * 1024)

In [None]:
def utf8len(s):
    """Return the number of bytes ``s`` occupies when encoded as UTF-8.

    Text is encoded and the encoded length is returned. A byte string is
    counted as it is: calling ``.encode`` on bytes fails on Python 3, and on
    Python 2 it first decodes them as ASCII, which raises for any non-ASCII byte.

    :param s: text or byte string.
    :return: length in bytes.
    """
    if isinstance(s, bytes):
        return len(s)
    return len(s.encode('utf-8'))

In [None]:
# NOTE(review): `str_size` is not defined in any visible cell; this only works
# with state left over in the kernel.
str_size

In [None]:
# Positions where the AMT_ANNUITY column differs between the two results.
# NOTE(review): `np`, `optimized` and `baseline` are not imported/defined in any
# visible cell — this depends on hidden kernel state.
np.where(optimized['AMT_ANNUITY'] != baseline['AMT_ANNUITY'])

In [None]:
# Value of AMT_ANNUITY at index 688 in the optimized result (compare with the
# baseline value in the next cell).
#pd.concat([optimized['AMT_ANNUITY'],baseline['AMT_ANNUITY']],axis=1)
optimized['AMT_ANNUITY'][688]

In [None]:
# Value of AMT_ANNUITY at index 688 in the baseline result.
baseline['AMT_ANNUITY'][688]

In [None]:
# True only if every AMT_ANNUITY entry compares equal between the two results.
# NOTE(review): if these are pandas Series, NaN != NaN, so rows missing in both
# count as different — confirm.
(optimized['AMT_ANNUITY']==baseline['AMT_ANNUITY']).all()

In [None]:
# Element-wise comparison of the ENTRANCES_MODE column between the two results.
print optimized['ENTRANCES_MODE'] == baseline['ENTRANCES_MODE']

In [None]:
# Peek at the first entries of ENTRANCES_MODE in the baseline result.
baseline['ENTRANCES_MODE'].head()

In [None]:
# Manual workload on OpenML task 31: min-max scale the features, train a random
# forest (random_state=50, warm start enabled) and time the whole cell.
start = datetime.now()
ee.new_workload()
# Load Data
# ROOT_DATA_DIRECTORY replaces `root_data`, which no visible cell defines.
train = ee.load(ROOT_DATA_DIRECTORY + '/openml/task_id=31/train.csv')
test = ee.load(ROOT_DATA_DIRECTORY + '/openml/task_id=31/test.csv')

# NOTE(review): `drop` is called positionally for test and with columns=[...]
# for train — confirm both forms are accepted by the project's dataset type.
test_labels = test['class']
test = test.drop('class')

train_labels = train['class']
train = train.drop(columns=['class'])

# Variant without 'checking_status'. Its scaled outputs are not used by the
# model below.
# NOTE(review): presumably kept only to add extra nodes to the graph — confirm.
train2=train.drop('checking_status')
test2=test.drop('checking_status')
from experiment_graph.sklearn_helper.preprocessing import MinMaxScaler
scaler2 = MinMaxScaler(feature_range=(0, 1))
scaler2.fit(train2)
train2 = scaler2.transform(train2)
test2 = scaler2.transform(test2)


scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

# Random forest: random_state = 50, n_estimators = 10000, warm start enabled
from experiment_graph.sklearn_helper.ensemble import RandomForestClassifier
random_forest2 = RandomForestClassifier(should_warmstart=True, n_estimators=10000, random_state=50, verbose = 1)
random_forest2.fit(train, train_labels)
# NOTE(review): .data() presumably triggers the actual execution — confirm.
random_forest2.score(test, test_labels).data()
print('total time: {}'.format((datetime.now() - start).total_seconds()))

In [None]:
# Manual workload on OpenML task 31: min-max scale the features, train a random
# forest (random_state=60, warm start enabled) and time the whole cell.
start = datetime.now()
ee.new_workload()
# Load Data
# ROOT_DATA_DIRECTORY replaces `root_data`, which no visible cell defines.
train = ee.load(ROOT_DATA_DIRECTORY + '/openml/task_id=31/train.csv')
test = ee.load(ROOT_DATA_DIRECTORY + '/openml/task_id=31/test.csv')

# NOTE(review): `drop` is called positionally for test and with columns=[...]
# for train — confirm both forms are accepted by the project's dataset type.
test_labels = test['class']
test = test.drop('class')

train_labels = train['class']
train = train.drop(columns=['class'])

# Variant without 'checking_status'. Its scaled outputs are not used by the
# model below.
# NOTE(review): presumably kept only to add extra nodes to the graph — confirm.
train2=train.drop('checking_status')
test2=test.drop('checking_status')
from experiment_graph.sklearn_helper.preprocessing import MinMaxScaler
scaler2 = MinMaxScaler(feature_range=(0, 1))
scaler2.fit(train2)
train2 = scaler2.transform(train2)
test2 = scaler2.transform(test2)


scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

# Random forest: random_state = 60, n_estimators = 10000, warm start enabled
from experiment_graph.sklearn_helper.ensemble import RandomForestClassifier
random_forest2 = RandomForestClassifier(should_warmstart=True, n_estimators=10000, random_state=60, verbose = 1)
random_forest2.fit(train, train_labels)

# Execute
#random_forest2.trained_node.data()

random_forest2.score(test, test_labels).data()
print('total time: {}'.format((datetime.now() - start).total_seconds()))

In [None]:
# Manual workload on OpenML task 31: min-max scale the features, train a random
# forest (random_state=60, warm start disabled) and time the whole cell.
start = datetime.now()
ee.new_workload()
# Load Data
# ROOT_DATA_DIRECTORY replaces `root_data`, which no visible cell defines.
train = ee.load(ROOT_DATA_DIRECTORY + '/openml/task_id=31/train.csv')
test = ee.load(ROOT_DATA_DIRECTORY + '/openml/task_id=31/test.csv')

# NOTE(review): `drop` is called positionally for test and with columns=[...]
# for train — confirm both forms are accepted by the project's dataset type.
test_labels = test['class']
test = test.drop('class')

train_labels = train['class']
train = train.drop(columns=['class'])

# Variant without 'checking_status'. Its scaled outputs are not used by the
# model below.
# NOTE(review): presumably kept only to add extra nodes to the graph — confirm.
train2=train.drop('checking_status')
test2=test.drop('checking_status')
from experiment_graph.sklearn_helper.preprocessing import MinMaxScaler
scaler2 = MinMaxScaler(feature_range=(0, 1))
scaler2.fit(train2)
train2 = scaler2.transform(train2)
test2 = scaler2.transform(test2)


scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

# Random forest: random_state = 60, n_estimators = 10000, warm start disabled
from experiment_graph.sklearn_helper.ensemble import RandomForestClassifier
random_forest2 = RandomForestClassifier(should_warmstart=False, n_estimators=10000, random_state=60, verbose = 1)
random_forest2.fit(train, train_labels)

# Execute
#random_forest2.trained_node.data()

random_forest2.score(test, test_labels).data()
print('total time: {}'.format((datetime.now() - start).total_seconds()))