In [80]:
# read graphs and build graph pool 
import graph_tool.all as gt

import sys
sys.path.append('/local0/Kariz/code')
import utils.objectstore as object_store

def build_input_format(inputs_str):
    """Tokenise ``inputs_str`` on ':' and on whitespace.

    Every ':' is treated like a blank, then the string is split on runs of
    whitespace, so empty fields never appear in the result.

    Returns a list of the non-empty tokens (an empty list for '' input).
    """
    blank_separated = inputs_str.replace(':', ' ')
    return blank_separated.split()

def load_graph_templates(path):
    """Read a template file and return its graphs keyed by graph name.

    The file content is split on '#'. Whatever precedes the first '#' is
    discarded; each remaining piece is passed to ``build_graph_skeleton``.
    The returned dict maps ``graph.gp.name`` to the graph, so a later piece
    with the same name replaces an earlier one.
    """
    templates = {}
    with open(path, 'r') as template_file:
        # split() always yields at least one element, so the unpack is safe.
        _preamble, *records = template_file.read().split('#')
        for record in records:
            skeleton = build_graph_skeleton(record)
            templates[skeleton.gp.name] = skeleton
    return templates


def build_graph_skeleton(g_str):
    """Parse one template record into a directed graph_tool graph.

    ``g_str`` is split on newlines. The first line supplies the graph name
    (its second tab-separated field). Among the following lines, each one
    starting with 'v' is read as ``<tag>,<vertex id>,<operation>,<inputs>``
    and each one starting with 'e' as ``<tag>,<source id>,<dest id>``; any
    other line is skipped. All vertices are created before any edge, so an
    edge line may appear before the vertex lines it refers to.

    Graph properties set: 'name', 'id' ("0"), 'queue_time' (0) and
    'cur_stage' (-1). Vertex properties registered: 'id', 'color',
    'tables', 'remote_runtime', 'cache_runtime', 'status' and 'feature'.

    Prints ``Load query: <name>`` and returns the graph.
    """
    lines = g_str.split('\n')
    header, body = lines[0], lines[1:]
    g_name = header.split('\t')[1]

    g = gt.Graph(directed=True)
    g.gp['name'] = g.new_graph_property("string", g_name)
    g.gp['id'] = g.new_graph_property("string", str(0))
    g.gp['queue_time'] = g.new_graph_property("int", 0)
    g.gp['cur_stage'] = g.new_graph_property("int", -1)

    # Per-vertex property maps; `status` is created but never written here.
    status = g.new_vertex_property("int")
    tables = g.new_vertex_property("string")
    cache_runtime = g.new_vertex_property("int")
    remote_runtime = g.new_vertex_property("int")
    color = g.new_vertex_property("string")
    ops = g.new_vertex_property("string")
    vids = g.new_vertex_property("int")

    # First pass: create every vertex and remember it under its template id.
    vertex_by_id = {}
    for line in body:
        if not line.startswith('v'):
            continue
        vid, operation, inputs_str = line.split(',')[1:]
        vertex = g.add_vertex()
        key = int(vid)
        vids[vertex] = key
        vertex_by_id[key] = vertex
        tables[vertex] = inputs_str
        # Vertices with a non-empty inputs field get a different colour.
        if len(tables[vertex]) > 0:
            color[vertex] = '#fb8072'
        else:
            color[vertex] = '#80b1d3'
        cache_runtime[vertex] = 0
        remote_runtime[vertex] = 0
        # An empty operation field is recorded as 'SAVE'.
        ops[vertex] = operation or 'SAVE'

    # Second pass: connect vertices using the ids collected above.
    for line in body:
        if not line.startswith('e'):
            continue
        v_src, v_dest = line.split(',')[1:]
        g.add_edge(vertex_by_id[int(v_src)], vertex_by_id[int(v_dest)])

    # Register the maps as internal properties so they travel with the graph.
    g.vp['id'] = vids
    g.vp['color'] = color
    g.vp['tables'] = tables
    g.vp['remote_runtime'] = remote_runtime
    g.vp['cache_runtime'] = cache_runtime
    g.vp['status'] = status
    g.vp['feature'] = ops
    print('Load query:', g_name)
    return g

# Pool of template graphs, keyed by graph name (see load_graph_templates).
graphs_pool = load_graph_templates(
    '/local0/Kariz/expriments/simulator/multidag/%s' % 'pig.tpch.template'
)

# Object metadata from the project's object store helper; later code only
# relies on it offering .items().
inputs = object_store.load_object_meta(
    '/local0/Kariz/expriments/simulator/multidag/config/inputs.csv'
)

Load query: TPCH_Q12
Load query: TPCH_Q17
Load query: TPCH_Q9
Load query: TPCH_Q4
Load query: TPCH_Q16
Load query: TPCH_Q8
Load query: TPCH_Q7
Load query: TPCH_Q22
Load query: TPCH_Q19
Load query: TPCH_Q6
Load query: TPCH_Q3
Load query: TPCH_Q5
Load query: TPCH_Q15
Load query: TPCH_Q18
Load query: TPCH_Q13
Load query: TPCH_Q10
Load query: TPCH_Q14
Load query: TPCH_Q20
Load query: TPCH_Q21
Load query: TPCH_Q2
Load query: TPCH_Q11
Load query: TPCH_Q1


In [96]:
import random 

def choose_input(seen_objects):
    """Pick a table name and record the pick in ``seen_objects``.

    A 1/0 "reuse" flag is drawn with probability ``reuse_ratio`` of being 1
    (module-level setting). When the flag is 1 and more than five distinct
    tables have already been seen, the table is drawn uniformly from
    ``seen_objects``; otherwise it is drawn uniformly from the module-level
    ``inputs`` mapping.

    ``seen_objects`` maps table name -> number of times chosen and is
    updated in place. Returns the chosen table name.
    """
    reuse = random.choices([1, 0], cum_weights=(reuse_ratio, 1.00), k=1)[0]
    if reuse and len(seen_objects) > 5:
        candidates = seen_objects
    else:
        candidates = inputs
    table, _ = random.choice(list(candidates.items()))
    seen_objects[table] = seen_objects.get(table, 0) + 1
    return table

def random_query():
    """Return ``(name, graph_copy)`` for a uniformly chosen pool entry.

    The entry is drawn from the module-level ``graphs_pool``; the graph is
    returned via its ``copy()`` method, so the pooled object itself is not
    handed out.
    """
    chosen = random.choice(list(graphs_pool.items()))
    gid, template = chosen
    return gid, template.copy()
    
# --- Workload generation settings ---------------------------------------
reuse_ratio = 0.32        # read by choose_input(): chance of drawing the "reuse" flag
cfg_n_similar = 2         # n_similar starts from this value in every round
max_dag_concurrency = 10  # only referenced by the disabled randomised concurrency
min_dag_concurrency = 1
n_iterations = 10         # number of '%r' rounds to emit

# NOTE(review): no random seed is set in this cell, so each run is expected
# to produce a different workload.

seen_objects = {}    # table name -> pick count, maintained by choose_input()
g = None             # DAG being emitted; deliberately survives across rounds
workload_parts = []  # text fragments, joined into workload_str after the loop

for it in range(n_iterations):
    n_similar = cfg_n_similar
    # Concurrency is fixed; the randomised alternative would be
    # random.randint(min_dag_concurrency, max_dag_concurrency).
    n_concurrent_dags = 10
    workload_parts.append("%" + 'r,%d,%d\n' % (it, n_concurrent_dags))

    for i in range(n_concurrent_dags):
        # The previous DAG object is emitted again (with freshly chosen
        # tables) only while it has more than 4 vertices and n_similar is
        # still above 1; otherwise a new random query is drawn.
        if g and g.num_vertices() > 4 and n_similar > 1:
            n_similar -= 1
        else:
            gid, g = random_query()

        workload_parts.append('#t,%d%d,%s\n' % (it, i, gid))

        for v in g.vertices():
            t_compute = random.randint(5, 100)
            if len(g.vp.tables[v]) > 0:
                # Vertex reads an input: assign a table and a reduction factor.
                table = choose_input(seen_objects)
                g.vp.tables[v] = table
                t_reduction = random.uniform(0.3, 1)
            else:
                t_reduction = 1
            workload_parts.append(
                'v,%d,%s,%s,%d,%.2f\n' % (
                    g.vp.id[v], g.vp.feature[v], g.vp.tables[v],
                    t_compute, t_reduction,
                )
            )

        for e in g.edges():
            workload_parts.append(
                'e,%d,%d\n' % (g.vp.id[e.source()], g.vp.id[e.target()])
            )

workload_str = ''.join(workload_parts)
print(workload_str)

workload_file = '/local0/Kariz/expriments/simulator/multidag/config/synthetic_worload_2.g'
with open(workload_file, 'w') as fd:
    fd.write(workload_str)

%r,0,10
#t,00,TPCH_Q15
v,0,GROUP_BY:COMBINER,s12,46,0.51
v,1,GROUP_BY:COMBINER,,56,1.00
v,2,HASH_JOIN,m11,73,0.45
v,3,SAMPLER,,21,1.00
v,4,ORDER_BY,,30,1.00
e,0,1
e,0,2
e,1,2
e,2,3
e,3,4
#t,01,TPCH_Q15
v,0,GROUP_BY:COMBINER,j0,69,0.63
v,1,GROUP_BY:COMBINER,,83,1.00
v,2,HASH_JOIN,b1,27,0.52
v,3,SAMPLER,,90,1.00
v,4,ORDER_BY,,30,1.00
e,0,1
e,0,2
e,1,2
e,2,3
e,3,4
#t,02,TPCH_Q4
v,0,COGROUP,q16,28,0.57
v,1,GROUP_BY:COMBINER,,84,1.00
v,2,SAMPLER,,50,1.00
v,3,ORDER_BY,,98,1.00
e,0,1
e,1,2
e,2,3
#t,03,TPCH_Q16
v,0,HASH_JOIN,n14,5,0.43
v,1,HASH_JOIN,v12,93,0.49
v,2,GROUP_BY:COMBINER,,31,1.00
v,3,SAMPLER,,27,1.00
v,4,ORDER_BY,,8,1.00
e,0,1
e,1,2
e,2,3
e,3,4
#t,04,TPCH_Q6
v,0,HASH_JOIN,v12,87,0.89
#t,05,TPCH_Q14
v,0,HASH_JOIN,k16,28,0.49
v,1,MULTI_QUERY:COMBINER,,99,1.00
v,2,MAP_ONLY,,73,1.00
e,0,1
e,1,2
#t,06,TPCH_Q1
v,0,GROUP_BY:COMBINER,n14,65,0.50
v,1,SAMPLER,,53,1.00
v,2,ORDER_BY,,69,1.00
e,0,1
e,1,2
#t,07,TPCH_Q10
v,0,HASH_JOIN,l19,30,0.56
v,1,HASH_JOIN,i0,31,0.56
v,2,HASH_JOIN,b1,38,0.71
