## Analysis of pipeline execution time across different branches

In [103]:
# dependencies for this notebook that aren't already in the 'emission' environment
!pip install plotly
!pip install nbformat

# db_name = "openpath_prod_ca_ebike"
# %env DB_HOST=mongodb://localhost:27017/$db_name

from __future__ import annotations
import subprocess
import time
import pandas as pd
import plotly.express as px
import emission.core.timer as ect
import emission.storage.decorations.stats_queries as esds
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.timeseries.timequery as estt



In [104]:
OPCODE = "nrelop_dev-emulator-study_0"
DAYS = [
    # "shankari_2016-07-22",
    # "shankari_2016-07-25",
    # "shankari_2016-07-27",
    "shankari_2016-08-04",
    # "shankari_2016-08-05",
]

print(f"Loading {DAYS} for {OPCODE}")
!./e-mission-py.bash bin/debug/purge_user.py -e $opcode
for DAY in DAYS:
    !./e-mission-py.bash bin/debug/load_timeline_for_day_and_user.py emission/tests/data/real_examples/$DAY $OPCODE

branches = {
    '47ab8be28a1682f841b3d6a03cbe0f9fe0515e0f': { 'label': 'baseline'},
    'master': { 'label': 'master' },
    'fe6035ab43fa539235ad8bff982b1db5af615c3f': { 'label': 'vectorized_segmentation (draft 1)' },
    'vectorized_segmentation': { 'label': 'vectorized_segmentation (current)' },
}

curr_branch_name_name = subprocess.check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).decode('utf-8').strip()

for branch_name in branches.keys():
    print(f"Checking out {branch_name} to measure pipeline runtime")
    !git checkout $branch_name
    !./e-mission-py.bash ./bin/reset_pipeline.py --all
    with ect.Timer() as t:
        !./e-mission-py.bash bin/debug/intake_single_user.py -e $OPCODE
    
    now = time.time()
    tq = estt.TimeQuery("data.ts", now - t.elapsed, now)
    print(f"Ran pipeline on {branch_name}, {tq}")
    branches[branch_name]['tq'] = tq

    confirmed_trips_df = esta.TimeSeries.get_aggregate_time_series().get_data_df("analysis/confirmed_trip")
    pd.set_option('display.float_format', '{:.2f}'.format)
    display(confirmed_trips_df[['start_ts', 'end_ts', ]])

print(f"Switching back to {curr_branch_name_name}")
!git checkout $curr_branch_name_name

Loading ['shankari_2016-08-04'] for nrelop_dev-emulator-study_0
Config file not found, returning a copy of the environment variables instead...
Retrieved config: {'DB_HOST': None, 'DB_RESULT_LIMIT': None}
URL not formatted, defaulting to "Stage_database"
Connecting to database URL localhost
usage: purge_user [-h] (-e USER_EMAIL | -u USER_UUID) [-p]
purge_user: error: argument -e/--user_email: expected one argument
Config file not found, returning a copy of the environment variables instead...
Retrieved config: {'DB_HOST': None, 'DB_RESULT_LIMIT': None}
URL not formatted, defaulting to "Stage_database"
Connecting to database URL localhost
emission/tests/data/real_examples/shankari_2016-08-04
Loading file emission/tests/data/real_examples/shankari_2016-08-04
After registration, nrelop_dev-emulator-study_0 -> bb6c7318-52f7-4e0f-a51b-c3f802f3b567
Finished loading 0 entries into the usercache and 2667 entries into the timeseries
Checking out 47ab8be28a1682f841b3d6a03cbe0f9fe0515e0f to measu

Unnamed: 0,start_ts,end_ts
0,1470341031.23,1470342912.0
1,1470343292.14,1470351348.72
2,1470352238.74,1470352844.0
3,1470354036.96,1470354172.71
4,1470354386.56,1470355132.0
5,1470355612.59,1470355893.16
6,1470356315.84,1470357252.57
7,1470357578.29,1470363534.71
8,1470364485.74,1470364718.35


Checking out master to measure pipeline runtime
M	.gitignore
M	emission/storage/timeseries/builtin_timeseries.py
Previous HEAD position was 47ab8be2 Merge pull request #1011 from shankari/go_back_to_confirmed_trips
Switched to branch 'master'
Your branch is up to date with 'origin/master'.
Config file not found, returning a copy of the environment variables instead...
Retrieved config: {'DB_HOST': None, 'DB_RESULT_LIMIT': None}
URL not formatted, defaulting to "Stage_database"
Connecting to database URL localhost
Namespace(all=True, platform=None, user_list=None, email_list=None, date=None, dry_run=False)
INFO:root:About to delete 796 analysis results
INFO:root:About to delete entries with keys ['analysis/cleaned_place', 'analysis/cleaned_section', 'analysis/cleaned_trip', 'analysis/composite_trip', 'analysis/confirmed_place', 'analysis/confirmed_trip', 'analysis/expected_trip', 'analysis/inferred_labels', 'analysis/inferred_section', 'analysis/inferred_trip', 'analysis/recreated_locat

Unnamed: 0,start_ts,end_ts
0,1470341031.23,1470342912.0
1,1470343292.14,1470351348.72
2,1470352238.74,1470352844.0
3,1470354036.96,1470354172.71
4,1470354386.56,1470355132.0
5,1470355612.59,1470355893.16
6,1470356315.84,1470357252.57
7,1470357578.29,1470363534.71
8,1470364485.74,1470364718.35


Checking out fe6035ab43fa539235ad8bff982b1db5af615c3f to measure pipeline runtime
M	.gitignore
M	emission/storage/timeseries/builtin_timeseries.py
Note: switching to 'fe6035ab43fa539235ad8bff982b1db5af615c3f'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at fe6035ab rename variables for clarity
Config file not found, returning a copy of the environment variables instead...
Retrieved config: {'DB_HOST': None, 'DB_RESULT_LIMIT': None}
URL not formatted, defaulting to "Stage_database"
Connecting to database URL lo

Unnamed: 0,start_ts,end_ts
0,1470341031.23,1470342912.0
1,1470343292.14,1470351348.72
2,1470352238.74,1470352844.0
3,1470354036.96,1470354172.71
4,1470354386.56,1470355132.0
5,1470355612.59,1470355893.16
6,1470356315.84,1470357252.57
7,1470357578.29,1470363534.71
8,1470364485.74,1470364718.35


Checking out vectorized_segmentation to measure pipeline runtime
M	.gitignore
M	emission/storage/timeseries/builtin_timeseries.py
Previous HEAD position was fe6035ab rename variables for clarity
Switched to branch 'vectorized_segmentation'
Your branch is up to date with 'jgreenlee/vectorized_segmentation'.
Config file not found, returning a copy of the environment variables instead...
Retrieved config: {'DB_HOST': None, 'DB_RESULT_LIMIT': None}
URL not formatted, defaulting to "Stage_database"
Connecting to database URL localhost
Namespace(all=True, platform=None, user_list=None, email_list=None, date=None, dry_run=False)
INFO:root:About to delete 796 analysis results
INFO:root:About to delete entries with keys ['analysis/cleaned_place', 'analysis/cleaned_section', 'analysis/cleaned_trip', 'analysis/composite_trip', 'analysis/confirmed_place', 'analysis/confirmed_trip', 'analysis/expected_trip', 'analysis/inferred_labels', 'analysis/inferred_section', 'analysis/inferred_trip', 'analysi

Unnamed: 0,start_ts,end_ts
0,1470341031.23,1470342912.0
1,1470343292.14,1470351348.72
2,1470352238.74,1470352844.0
3,1470354036.96,1470354172.71
4,1470354386.56,1470355132.0
5,1470355612.59,1470355893.16
6,1470356315.84,1470357252.57
7,1470357578.29,1470363534.71
8,1470364485.74,1470364718.35


Switching back to vectorized_segmentation
M	.gitignore
M	emission/storage/timeseries/builtin_timeseries.py
Already on 'vectorized_segmentation'
Your branch is up to date with 'jgreenlee/vectorized_segmentation'.


In [112]:
# Gather the 'stats/pipeline_time' entries written during each branch's run
# window, tag each row with the branch label, and chart the stage runtimes.
ts = esta.TimeSeries.get_aggregate_time_series()

pipeline_stats_dfs = []
for branch_info in branches.values():
    branch_stats_df = ts.get_data_df('stats/pipeline_time', time_query=branch_info['tq'])
    pipeline_stats_dfs.append(branch_stats_df.assign(branch=branch_info['label']))
pipeline_stats_df = pd.concat(pipeline_stats_dfs)

# Rows kept here have an all-uppercase name with no '/' in it.
stat_names = pipeline_stats_df['name']
is_stage_row = stat_names.str.isupper() & ~stat_names.str.contains('/')
pipeline_stages_df = pipeline_stats_df[is_stage_row]

is_trip_segmentation_row = pipeline_stages_df['name'] == 'TRIP_SEGMENTATION'
trip_segmentation_stage_df = pipeline_stages_df[is_trip_segmentation_row]

# Both charts share one layout: all stages first, then TRIP_SEGMENTATION alone.
stage_layout = {
    "title": f"Pipeline stage runtimes {DAYS}",
    "barmode": 'group',
    "yaxis": {"dtick": 1},
    "legend": {"yref": "container", "xanchor": "right", "x": 1, "y": 0},
}
for df in (pipeline_stages_df, trip_segmentation_stage_df):
    fig = px.bar(df, x="reading", y="name", color="branch", orientation="h")
    fig.update_layout(**stage_layout)
    fig.show()

In [106]:
# Chart the stats whose name contains 'TRIP_SEGMENTATION/', grouped by branch.
is_substage_row = pipeline_stats_df['name'].str.contains('TRIP_SEGMENTATION/')
trip_segmentation_df = pipeline_stats_df[is_substage_row]

fig = px.bar(trip_segmentation_df, x="reading", y="name", color="branch", orientation="h")
fig.update_layout(
    legend={"yref": "container", "xanchor": "right", "x": 1, "y": 0},
    barmode='group',
    title=f"Trip segmentation substage runtimes {DAYS}",
)
fig.show()

In [107]:
# Count the 'get_entries_for_timeseries' stat rows recorded for each branch
# and chart the counts.
is_db_call_row = pipeline_stats_df['name'] == 'get_entries_for_timeseries'
db_calls_df = pipeline_stats_df[is_db_call_row]
# sort=False keeps the branches in the order they appear in the data.
db_calls_counts_df = db_calls_df['branch'].value_counts(sort=False)

fig = px.bar(db_calls_counts_df, color=db_calls_counts_df.index, orientation="h")
fig.update_layout(
    legend={"yref": "container", "xanchor": "right", "x": 1, "y": 0},
    yaxis={"showticklabels": False},
    barmode='group',
    title=f"Calls to _get_entries_for_timeseries during pipeline {DAYS}",
)
fig.show()

In [108]:
# For each branch, select the db-call stat rows whose timestamp falls inside
# that branch's TRIP_SEGMENTATION window, display them, and chart the counts.
trip_segmentation_db_calls_dfs = []

# Never populated in this cell; the definition is kept only so that any other
# cell referring to this name keeps working.
branches_trip_segmentation_tqs = []
for branch_name, branch_info in branches.items():
    branch_trip_segmentation_df = pipeline_stages_df[
        (pipeline_stages_df['name'] == 'TRIP_SEGMENTATION') &
        (pipeline_stages_df['branch'] == branch_info['label'])
    ]
    # Explicit emptiness check instead of relying on .iloc[0] raising IndexError.
    if branch_trip_segmentation_df.empty:
        print(f"No trip segmentation for {branch_name}")
        continue
    # If there are several TRIP_SEGMENTATION rows for the branch, use the first.
    trip_segmentation = branch_trip_segmentation_df.iloc[0]

    # assumes 'ts' marks the end of the stage and 'reading' is its duration in
    # seconds, so the stage spans (ts - reading, ts) — TODO confirm
    stage_end_ts = trip_segmentation['ts']
    stage_start_ts = stage_end_ts - trip_segmentation['reading']
    trip_segmentation_db_calls = db_calls_df[
        (db_calls_df['ts'] < stage_end_ts)
        & (db_calls_df['ts'] > stage_start_ts)
    ]
    trip_segmentation_db_calls_dfs.append(trip_segmentation_db_calls)
    print(branch_name)
    display(trip_segmentation_db_calls)

# pd.concat raises ValueError on an empty list, so only build the chart when
# at least one branch contributed rows.
if trip_segmentation_db_calls_dfs:
    trip_segmentation_db_calls_counts_df = pd.concat(trip_segmentation_db_calls_dfs)['branch'].value_counts(sort=False)
    fig = px.bar(
        trip_segmentation_db_calls_counts_df,
        color=trip_segmentation_db_calls_counts_df.index,
        orientation="h",
    )
    fig.update_layout(
        title=f"Calls to _get_entries_for_timeseries during trip segmentation {DAYS}",
        barmode='group',
        legend=dict(yref="container", xanchor="right", x=1, y=0),
    )
    fig.show()
else:
    print("No trip segmentation stats found for any branch; nothing to plot")

47ab8be28a1682f841b3d6a03cbe0f9fe0515e0f


Unnamed: 0,name,ts,reading,_id,user_id,metadata_write_ts,branch
5,get_entries_for_timeseries,1739173753.09,find_entries,67a9af794d7168c4d9792106,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173753.09,baseline
6,get_entries_for_timeseries,1739173753.09,find_entries,67a9af794d7168c4d9792107,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173753.09,baseline
8,get_entries_for_timeseries,1739173753.13,find_entries,67a9af794d7168c4d9792109,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173753.13,baseline
9,get_entries_for_timeseries,1739173753.14,find_entries,67a9af794d7168c4d979210a,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173753.14,baseline
10,get_entries_for_timeseries,1739173753.17,find_entries,67a9af794d7168c4d979210b,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173753.17,baseline
...,...,...,...,...,...,...,...
3744,get_entries_for_timeseries,1739173761.99,find_entries,67a9af814d7168c4d9792fb8,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173761.99,baseline
3745,get_entries_for_timeseries,1739173762.00,find_entries,67a9af824d7168c4d9792fbb,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173762.00,baseline
3746,get_entries_for_timeseries,1739173762.00,find_entries,67a9af824d7168c4d9792fbc,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173762.00,baseline
3747,get_entries_for_timeseries,1739173762.01,find_entries,67a9af824d7168c4d9792fbf,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173762.01,baseline


master


Unnamed: 0,name,ts,reading,_id,user_id,metadata_write_ts,branch
430,get_entries_for_timeseries,1739173777.73,find_entries,67a9af91936c010546d01ca9,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173777.73,master
431,get_entries_for_timeseries,1739173777.73,find_entries,67a9af91936c010546d01ca8,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173777.73,master
432,get_entries_for_timeseries,1739173777.72,find_entries,67a9af91936c010546d01ca5,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173777.72,master
433,get_entries_for_timeseries,1739173777.71,find_entries,67a9af91936c010546d01ca4,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173777.72,master
434,get_entries_for_timeseries,1739173777.7,find_entries,67a9af91936c010546d01ca1,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173777.7,master
435,get_entries_for_timeseries,1739173777.7,find_entries,67a9af91936c010546d01ca0,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173777.7,master
436,get_entries_for_timeseries,1739173777.69,find_entries,67a9af91936c010546d01c9d,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173777.69,master
437,get_entries_for_timeseries,1739173777.69,find_entries,67a9af91936c010546d01c9c,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173777.69,master
438,get_entries_for_timeseries,1739173777.68,find_entries,67a9af91936c010546d01c99,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173777.68,master
439,get_entries_for_timeseries,1739173777.68,find_entries,67a9af91936c010546d01c98,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173777.68,master


fe6035ab43fa539235ad8bff982b1db5af615c3f


Unnamed: 0,name,ts,reading,_id,user_id,metadata_write_ts,branch
430,get_entries_for_timeseries,1739173790.83,find_entries,67a9af9ecfa05a741ce6a66c,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173790.83,vectorized_segmentation (draft 1)
431,get_entries_for_timeseries,1739173790.83,find_entries,67a9af9ecfa05a741ce6a66b,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173790.83,vectorized_segmentation (draft 1)
432,get_entries_for_timeseries,1739173790.82,find_entries,67a9af9ecfa05a741ce6a668,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173790.82,vectorized_segmentation (draft 1)
433,get_entries_for_timeseries,1739173790.82,find_entries,67a9af9ecfa05a741ce6a667,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173790.82,vectorized_segmentation (draft 1)
434,get_entries_for_timeseries,1739173790.8,find_entries,67a9af9ecfa05a741ce6a664,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173790.8,vectorized_segmentation (draft 1)
435,get_entries_for_timeseries,1739173790.8,find_entries,67a9af9ecfa05a741ce6a663,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173790.8,vectorized_segmentation (draft 1)
436,get_entries_for_timeseries,1739173790.79,find_entries,67a9af9ecfa05a741ce6a660,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173790.79,vectorized_segmentation (draft 1)
437,get_entries_for_timeseries,1739173790.79,find_entries,67a9af9ecfa05a741ce6a65f,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173790.79,vectorized_segmentation (draft 1)
438,get_entries_for_timeseries,1739173790.77,find_entries,67a9af9ecfa05a741ce6a65c,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173790.77,vectorized_segmentation (draft 1)
439,get_entries_for_timeseries,1739173790.77,find_entries,67a9af9ecfa05a741ce6a65b,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173790.77,vectorized_segmentation (draft 1)


vectorized_segmentation


Unnamed: 0,name,ts,reading,_id,user_id,metadata_write_ts,branch
430,get_entries_for_timeseries,1739173803.46,find_entries,67a9afab86a8515481eeb1ed,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173803.46,vectorized_segmentation (current)
431,get_entries_for_timeseries,1739173803.46,find_entries,67a9afab86a8515481eeb1ec,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173803.46,vectorized_segmentation (current)
432,get_entries_for_timeseries,1739173803.45,find_entries,67a9afab86a8515481eeb1e9,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173803.45,vectorized_segmentation (current)
433,get_entries_for_timeseries,1739173803.45,find_entries,67a9afab86a8515481eeb1e8,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173803.45,vectorized_segmentation (current)
434,get_entries_for_timeseries,1739173803.44,find_entries,67a9afab86a8515481eeb1e5,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173803.44,vectorized_segmentation (current)
435,get_entries_for_timeseries,1739173803.43,find_entries,67a9afab86a8515481eeb1e4,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173803.43,vectorized_segmentation (current)
436,get_entries_for_timeseries,1739173803.43,find_entries,67a9afab86a8515481eeb1e1,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173803.43,vectorized_segmentation (current)
437,get_entries_for_timeseries,1739173803.42,find_entries,67a9afab86a8515481eeb1e0,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173803.42,vectorized_segmentation (current)
438,get_entries_for_timeseries,1739173803.41,find_entries,67a9afab86a8515481eeb1dd,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173803.41,vectorized_segmentation (current)
439,get_entries_for_timeseries,1739173803.41,find_entries,67a9afab86a8515481eeb1dc,bb6c7318-52f7-4e0f-a51b-c3f802f3b567,1739173803.41,vectorized_segmentation (current)
