In [2]:
import code.utils.graph
import graph_tool.all as gt
import os
import ast
import re

print(os.getcwd())

def build_input_format(inputs_str):
    res = re.search('\[(.*)\]', inputs_str)
    return dict.fromkeys(res.group(1).split(','), 0) if res else {}
    

def build_graph_skeleton(g_str):
    g_elements = g_str.split('\n')
    g_name = g_elements[0].split(',')[1:]
    g_id = 0
    g_queuetime = 0
    
    g = gt.Graph(directed=True)
    g.gp['name'] = g.new_graph_property("string", g_name)
    g.gp['id'] = g.new_graph_property("string", str(g_id))
    g.gp['queue_time'] = g.new_graph_property("int", g_queuetime)
    g.gp['cur_stage'] = g.new_graph_property("int", -1)
    status = g.new_vertex_property("int")
    inputs = g.new_vertex_property("object")
    cache_runtime = g.new_vertex_property("int")
    remote_runtime = g.new_vertex_property("int")
    ops = g.new_vertex_property("vector<string>")

    # build vertices
    for el in g_elements[1:]:
        if el.startswith('v'):
            vid, inputs_str, operation = el.split(',')[1:]
            v = g.add_vertex()
            
            inputs[v] = build_input_format(inputs_str)
            cache_runtime[v] = 0
            remote_runtime[v] = 0 
            ops[v] = operation.split('|')

    
    # build edges
    for el in g_elements[1:]:
        if el.startswith('e'):
            v_src, v_dest = el.split(',')[1:]
            e = g.add_edge(v_src, v_dest)
            
        g.vp['inputs'] = inputs
        g.vp['remote_runtime'] = remote_runtime
        g.vp['cache_runtime'] = cache_runtime
        g.vp['status'] = status
        g.vp['ops'] = ops
    return g
            

def load_graph_skeleton(path):
    graph_skeletons = {}
    with open(path, 'r') as fd:
        graph_strs = fd.read().split('#')[1:]

        for g_str in graph_strs:
            g= build_graph_skeleton(g_str)
            graph_skeletons[g.gp.name] = g
    return graph_skeletons
            
path = "../tpch_pig_dags.g"
graph_skeletons = load_graph_skeleton(path)

print(graph_skeletons)


/local0/Kariz/expriments/macrobenchmark/ipython
{"['Q1', 'sequential']": <Graph object, directed, with 4 vertices and 3 edges at 0x7f6cf4cf8630>, "['Q2', 'sequential']": <Graph object, directed, with 8 vertices and 7 edges at 0x7f6cf4d05198>, "['Q3', 'sequential']": <Graph object, directed, with 6 vertices and 5 edges at 0x7f6cf4d05278>}


In [39]:
import code.utils.graph
import graph_tool.all as gt
import os
import ast
import re

import workload.sequential as sqw
import sys

print(os.getcwd())

config_file = "../config.json"

workload = sqw.Sequential(config_file)

print(workload.graph_skeleton_pool)

# Load statistics
import utils.jobhistory as hist
import pandas as pd

print(workload.configs['stat_file_path'])

jobs_stats = pd.read_csv(workload.configs['stat_file_path'])

jobs_stats = jobs_stats.apply(hist.process_tasks, axis=1)

jobs_stats.to_csv(workload.configs['stat_file_path'], index=False, header=True)
#jobs = jobs.apply(process_tasks, axis=1)


/local0/Kariz/expriments/macrobenchmark/ipython
{
  "type": "sequential",
  "graph_src": "file",
  "graph_skeleton_path": "/local0/Kariz/expriments/macrobenchmark/tpch_pig_dags.g",
  "stat_file_path": "/local0/Kariz/expriments/macrobenchmark/pig_tpch_stats",
  "input_dir": "s3a://data/pig-tpch/64G",
  "output_dir": "/tpch-64G-output",
  "benchmark_path": "/local0/Kariz/expriments/benchmark/BenchmarkScripts/tpch/pig",
  "rgw_host": "192.168.35.41",
  "rgw_port": 80,
  "swift_user": "testuser:swift",
  "swift_key": "7Xqb6gdsCE5Vu0clmk2qL0yjjy1NCNiFuaPlGQvJ",
  "bucket_name": "data"
}
{'Q1': <Graph object, directed, with 3 vertices and 2 edges at 0x7f5eb80910f0>, 'Q2': <Graph object, directed, with 8 vertices and 7 edges at 0x7f5eb80912e8>, 'Q3': <Graph object, directed, with 6 vertices and 5 edges at 0x7f5eb8091da0>}
/local0/Kariz/expriments/macrobenchmark/pig_tpch_stats


In [40]:
import json
import d3n.metadata as md
import utils.yarn as yarn
import d3n.d3n_api as api


metadata = md.load_metadata(workload.configs['rgw_host'], workload.configs['rgw_port'],
                        workload.configs['swift_user'], workload.configs['swift_key'],
                        workload.configs['bucket_name'])

token = md.get_token(workload.configs['rgw_host'], workload.configs['rgw_port'],
                     workload.configs['swift_user'], workload.configs['swift_key'])


print(workload.configs['input_dir'])

input_dir = workload.configs['input_dir'].replace('s3a://data/', '')

ds_meta = api.get_dataset_metadata(metadata, input_dir)

for dsm in ds_meta:
    print(dsm, ds_meta[dsm]['size'])
    
    


RGW token: AUTH_rgwtk0e00000074657374757365723a7377696674b5c27b7cc1cf6b65feadcb5e810aa31ccfc49b8edc2bcd331cb513fd4688b031b9aa3291
s3a://data/pig-tpch/64G
customer 1563391596
lineitem 50361940251
nation 2199
orders 11245608539
part 1557974541
partsupp 7743443936
region 384
supplier 90756062


In [41]:
jobs_stats

Unnamed: 0,query,dataset_size,node_id,jobid,type,runtime,map_avg,map_min,map_max,queuetime
0,Q1,64,0,job_1590090272562_0037,remote,455.957,35.268,8.44,53.89,4.472
1,Q1,64,1,job_1590090272562_0038,remote,9.856,2.585,0.0,2.58,4.261
2,Q1,64,2,job_1590090272562_0039,remote,10.925,3.155,0.0,3.15,5.02
3,Q1,64,0,job_1590090272562_0040,cache,143.54,9.734,6.58,13.5,4.386
4,Q1,64,1,job_1590090272562_0041,cache,10.976,2.871,0.0,2.87,5.215
5,Q1,64,2,job_1590090272562_0042,cache,10.695,2.805,0.0,2.81,3.901
6,Q2,64,0,job_1590090272562_0043,remote,12.592,3.458,3.33,3.59,4.727
7,Q2,64,1,job_1590090272562_0044,remote,13.916,5.851,3.41,7.19,4.421
8,Q2,64,2,job_1590090272562_0045,remote,215.804,33.001,5.83,54.06,6.049
9,Q2,64,3,job_1590090272562_0046,remote,85.059,9.776,4.11,14.74,4.28


In [42]:
print(workload.graph_skeleton_pool)


for g_name in workload.graph_skeleton_pool:
    g = workload.graph_skeleton_pool[g_name]
    
    for v in g.vertices():
        remote_runtime = jobs_stats[(jobs_stats['query'] == g_name) & (jobs_stats['node_id'] == int(v)) & (jobs_stats['type'] == 'remote')]['runtime'].values[0]
        cache_runtime = jobs_stats[(jobs_stats['query'] == g_name) & (jobs_stats['node_id'] == int(v)) & (jobs_stats['type'] == 'cache')]['runtime'].values[0]
        
        g.vp.cache_runtime[v] = cache_runtime
        g.vp.remote_runtime[v] = remote_runtime
        
        
        for f in g.vp.inputs[v]:
            if f in ds_meta:
                g.vp.inputs[v][f] = ds_meta[f]['size']

    for v in g.vertices():
        print(g.gp.name, v, g.vp.remote_runtime[v], g.vp.cache_runtime[v], g.vp.inputs[v])
    print("\n\n")
        
        
        
        #print(g_name, jobs_stats[(jobs_stats['query'] == g_name) & (jobs_stats['node_id'] == int(v))])
    

{'Q1': <Graph object, directed, with 3 vertices and 2 edges at 0x7f5eb80910f0>, 'Q2': <Graph object, directed, with 8 vertices and 7 edges at 0x7f5eb80912e8>, 'Q3': <Graph object, directed, with 6 vertices and 5 edges at 0x7f5eb8091da0>}
Q1 0 455 143 {'lineitem': 50361940251}
Q1 1 9 10 {'': 0}
Q1 2 10 10 {'': 0}



Q2 0 12 10 {'nation': 2199, 'region': 384}
Q2 1 13 12 {'supplier': 90756062}
Q2 2 215 200 {'partsupp': 7743443936}
Q2 3 85 88 {'part': 1557974541}
Q2 4 12 13 {'': 0}
Q2 5 11 11 {'': 0}
Q2 6 10 10 {'': 0}
Q2 7 10 10 {'': 0}



Q3 0 235 171 {'customer': 1563391596, 'orders': 11245608539}
Q3 1 739 405 {'lineitem': 50361940251}
Q3 2 23 29 {'': 0}
Q3 3 12 12 {'': 0}
Q3 4 14 14 {'': 0}
Q3 5 9 10 {'': 0}



