## Analysis of pipeline execution time across different branches

In [77]:
# dependencies for this notebook that aren't already in the 'emission' environment
!pip install plotly
!pip install nbformat

# db_name = "openpath_prod_ca_ebike"
# %env DB_HOST=mongodb://localhost:27017/$db_name

from __future__ import annotations
import subprocess
import time
import pandas as pd
import plotly.express as px
import arrow
import emission.core.timer as ect
import emission.storage.decorations.stats_queries as esds
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.timeseries.timequery as estt



In [78]:
branches = {'master': {}, 'Dist_Optimize': {}}

day = "iphone_2016-02-22"
opcode = "nrelop_dev-emulator-study_69"

curr_git_branch_name = subprocess.check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).decode('utf-8').strip()

for branch_name in branches.keys():
    print(f"Checking out {branch_name} to measure pipeline runtime")
    !git checkout $branch_name
    
    !./e-mission-py.bash ./bin/reset_pipeline.py --all
    !./e-mission-py.bash bin/debug/load_timeline_for_day_and_user.py emission/tests/data/real_examples/$day $opcode
    with ect.Timer() as t:
        !./e-mission-py.bash bin/debug/intake_single_user.py -e $opcode
    
    now = time.time()
    tq = estt.TimeQuery("data.ts", now - t.elapsed, now)
    print(f"Ran pipeline on {branch_name}, {tq}")
    branches[branch_name]['tq'] = tq

    print(f"Switching back to {curr_git_branch_name}")
    !git checkout $curr_git_branch_name
    


# !./e-mission-py.bash ./bin/reset_pipeline.py --all
# if db_host:=os.environ.get('DB_HOST'):
#     with ect.Timer() as t:
#         !./e-mission-py.bash ./bin/intake_multiprocess.py 5
#     esds.store_pipeline_time(None, f"{curr_git_branch_name}_intake_multiprocess/{db_host}", time.time(), t.elapsed)

Checking out master to measure pipeline runtime
Switched to branch 'master'
Your branch is up to date with 'origin/master'.
Config file not found, returning a copy of the environment variables instead...
Retrieved config: {'DB_HOST': None, 'DB_RESULT_LIMIT': None}
URL not formatted, defaulting to "Stage_database"
Connecting to database URL localhost
Namespace(all=True, platform=None, user_list=None, email_list=None, date=None, dry_run=False)
INFO:root:About to delete 235 analysis results
INFO:root:About to delete entries with keys ['analysis/cleaned_place', 'analysis/cleaned_section', 'analysis/cleaned_stop', 'analysis/cleaned_trip', 'analysis/composite_trip', 'analysis/confirmed_place', 'analysis/confirmed_trip', 'analysis/expected_trip', 'analysis/inferred_labels', 'analysis/inferred_section', 'analysis/inferred_trip', 'analysis/recreated_location', 'analysis/smoothing', 'inference/labels', 'inference/prediction', 'segmentation/raw_place', 'segmentation/raw_section', 'segmentation/ra

In [79]:
# Fetch the 'stats/pipeline_time' entries that fall inside each branch's run
# window and tag every row with the branch it was measured on.
ts = esta.TimeSeries.get_aggregate_time_series()

pipeline_stats_dfs = [
    ts.get_data_df('stats/pipeline_time', time_query=branch_info['tq']).assign(branch_name=name)
    for name, branch_info in branches.items()
]
pipeline_stats_df = pd.concat(pipeline_stats_dfs)

# Keep only the stats whose name is all upper-case and contains no '/'
# (presumably the top-level pipeline stages — confirm against the stat names).
stat_names = pipeline_stats_df['name'].str
is_stage_row = stat_names.isupper() & ~stat_names.contains('/')
pipeline_stages_df = pipeline_stats_df[is_stage_row]

# Horizontal grouped bars: one bar per (stage, branch).
fig = px.bar(pipeline_stages_df, x="reading", y="name", color="branch_name", orientation="h")
fig.update_layout(
    title=f"Pipeline stage runtimes ({day})",
    barmode='group',
    yaxis={'dtick': 1},
    legend={'yanchor': "bottom", 'xanchor': "right"},
)
fig.show()

In [80]:
# Every stat whose name mentions TRIP_SEGMENTATION: the stage itself plus any
# other stat that carries the same text in its name.
mentions_trip_segmentation = pipeline_stats_df['name'].str.contains('TRIP_SEGMENTATION')
trip_segmentation_df = pipeline_stats_df[mentions_trip_segmentation]

# Horizontal grouped bars: one bar per (stat name, branch).
fig = px.bar(trip_segmentation_df, x="reading", y="name", color="branch_name", orientation="h")
fig.update_layout(
    title=f"Trip segmentation substage runtimes ({day})",
    barmode='group',
    legend={'yanchor': "bottom", 'xanchor': "right"},
)
fig.show()

In [None]:
# One row per recorded 'get_entries_for_timeseries' stat, i.e. per DB call.
# NOTE(review): the filter matches 'get_entries_for_timeseries' while the chart
# title says '_get_entries_for_timeseries' — confirm which is the stored name.
is_db_call = pipeline_stats_df['name'].eq('get_entries_for_timeseries')
db_calls_df = pipeline_stats_df[is_db_call]

# Number of calls per branch, in order of first appearance (no sorting).
db_calls_counts_df = db_calls_df['branch_name'].value_counts(sort=False)

fig = px.bar(db_calls_counts_df, color=db_calls_counts_df.index, orientation="h")
fig.update_layout(
    title=f"Calls to _get_entries_for_timeseries during pipeline ({day})",
    barmode='group',
    legend={'yanchor': "bottom", 'xanchor': "right"},
)
fig.show()

In [None]:
# Count the DB calls that were recorded while each branch was inside its
# TRIP_SEGMENTATION stage.
trip_segmentation_db_calls_dfs = []

for branch_name in branches.keys():
    trip_segmentation_rows = pipeline_stages_df[
        (pipeline_stages_df['name'] == 'TRIP_SEGMENTATION') &
        (pipeline_stages_df['branch_name'] == branch_name)
    ]
    if trip_segmentation_rows.empty:
        # Fail with a message that names the branch instead of a bare IndexError.
        raise ValueError(f"No TRIP_SEGMENTATION pipeline stat found for branch {branch_name}")
    trip_segmentation = trip_segmentation_rows.iloc[0]

    # assumes 'ts' marks the end of the stage and 'reading' is its duration in
    # the same unit, so the stage spans (ts - reading, ts) — TODO confirm
    stage_end_ts = trip_segmentation['ts']
    stage_start_ts = stage_end_ts - trip_segmentation['reading']

    # Restrict to this branch's own calls as well as to the stage's time window,
    # so the result does not depend on the branches' runs never overlapping.
    trip_segmentation_db_calls = db_calls_df[
        (db_calls_df['branch_name'] == branch_name)
        & (db_calls_df['ts'] < stage_end_ts)
        & (db_calls_df['ts'] > stage_start_ts)
    ]
    trip_segmentation_db_calls_dfs.append(trip_segmentation_db_calls)

# value_counts() omits branches with no matching rows; reindex so a branch that
# made zero calls still shows up (as 0) instead of silently disappearing.
trip_segmentation_db_calls_counts_df = (
    pd.concat(trip_segmentation_db_calls_dfs)['branch_name']
    .value_counts(sort=False)
    .reindex(list(branches.keys()), fill_value=0)
)
fig = px.bar(
    trip_segmentation_db_calls_counts_df,
    color=trip_segmentation_db_calls_counts_df.index,
    orientation="h",
)
fig.update_layout(
    title=f"Calls to _get_entries_for_timeseries during trip segmentation ({day})",
    barmode='group',
    legend=dict(
        yanchor="bottom",
        xanchor="right",
    )
)
fig.show()

In [81]:
# !./e-mission-py.bash bin/debug/purge_user.py -e $opcode