In [None]:
from tau_profile_parser import TauProfileParser
import pandas as pd
import plotly.express as px
import os
pd.options.plotting.backend = "plotly"

#### Instructions: 
Drag this notebook into the main folder that contains the TauProfile parent folders.  
  
Example:  
/documents/experimentName/folder1/profile.0.0.0   
/documents/experimentName/folder1/profile.1.0.0  
/documents/experimentName/ThisNotebook  
  
  
#### UI  
At minimum, the user need only change a few key variables listed throughout the notebook.  
The notebook will read the files from the local folder.  


In [None]:
# ---- User-tunable settings ----
# Processes per node (machine dependent); used later to turn a rank count
# into a node count.
processes_per_node = 42
# True  -> load counter data via atomic_data()
# False -> load timer data via interval_data()
counter_data = False
# True -> plot only a sample of the rows instead of every row.
sample_on = True
# print(os.getcwd())  # uncomment to verify the working directory

In [None]:
# Profiles are read from the directory the notebook is running in.
path_to_ExampleData = os.getcwd()
directory_contents = os.listdir(path_to_ExampleData)

# Keep only visible (non-dot) sub-directories. The isdir() check is made on the
# full path so it stays correct if path_to_ExampleData is ever pointed at a
# directory other than the working directory. os.listdir() returns entries in
# arbitrary order, so the result is sorted to make reruns reproducible.
folders = sorted(
    item
    for item in directory_contents
    if not item.startswith('.')
    and os.path.isdir(os.path.join(path_to_ExampleData, item))
)

print(folders)

In [None]:
# Multi Logic
# multi_bool becomes True as soon as any run folder contains a sub-directory
# whose name includes 'MULTI'; it stays False when none of them do.
# NOTE(review): presumably these are TAU's per-metric MULTI__<metric>
# directories -- confirm against the profile layout.

multi_bool = False
for fold in folders:
    fold_path = os.path.join(path_to_ExampleData, fold)
    # Inspect the entries *inside* this run folder (not the top-level listing)
    # and only count real directories.
    if any(
        'MULTI' in entry and os.path.isdir(os.path.join(fold_path, entry))
        for entry in os.listdir(fold_path)
    ):
        multi_bool = True
        break

# print(multi_bool)

In [None]:
# Parse each run folder into a profile object, forwarding the MULTI flag.
TauProfObjects = []
for fold in folders:
    run_dir = path_to_ExampleData + '/' + fold
    TauProfObjects.append(TauProfileParser.parse(run_dir, MULTI=multi_bool))

# print(TauProfObjects)

In [None]:
# To see experiment details, metadata will print out information

# TauProfObjects[1].metadata

Saving the dataframes from a tau profile object

Using the atomic data from counters: the TauProfileParser object uses `.atomic_data()` to return a DataFrame of the counters.
For timer data, `.interval_data()` is used instead.

In [None]:
# Counter data comes from atomic_data(); timer data comes from interval_data().
frame_getter = 'atomic_data' if counter_data else 'interval_data'
Dataframes1 = [getattr(profile, frame_getter)() for profile in TauProfObjects]

print(type(Dataframes1[0]))

In [None]:
# Unique timer names of the first frame, minus anything mentioning 'TAU application'.
timer_names = set(Dataframes1[0].index.get_level_values("Timer"))
timers = [name for name in timer_names if 'TAU application' not in name]


In [None]:
# List every timer next to its position so names can be copied below.
for position, timer_name in enumerate(timers):
    print((position, timer_name))

In [None]:
# Timers to visualize (names must match entries of the timer list exactly).
selected_timers = [
    "*** custom:Grid_updateRefinement:amrex_regrid",
    "MPI_Allgather()",
    '*** custom:RiemannState',
]

In [None]:
# Pair every frame with its node count (distinct 'Node' index values divided by
# processes_per_node, truncated to an int), order the pairs by that count, then
# turn the count into a string label.
dataframes = []
for frame in Dataframes1:
    distinct_ids = len(set(frame.index.get_level_values('Node')))
    dataframes.append((int(distinct_ids / processes_per_node), frame))
dataframes.sort(key=lambda pair: pair[0])
dataframes = [(str(count), frame) for count, frame in dataframes]
print(dataframes[0][0])
dataframes[0][1].head()

We want a DataFrame with one column per `papi_` counter; this DataFrame will contain only the mean values.

A readout of the correlated timers is returned.

In [None]:
def samples(df1, noutliers=5):
    """Return a reproducible 25% random sample of the rows of ``df1``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame to sample from; it is not modified.
    noutliers : int, optional
        Currently unused. Kept so existing calls keep working; it belonged to
        an optional max/min-outlier step that was never enabled.

    Returns
    -------
    pandas.DataFrame
        A new frame with the sampled rows, which keep their original index
        labels.
    """
    # DataFrame.sample() already returns a new frame, so neither a defensive
    # copy nor appending onto an empty frame is needed (DataFrame.append was
    # removed in pandas 2.0). random_state is fixed so reruns pick the same rows.
    return df1.sample(frac=.25, random_state=1)

In [None]:
def getMetric(dfs, metric, timers=None, sample_on=sample_on):
    """Build one per-node table of ``metric`` for each run and choose timers to plot.

    Parameters
    ----------
    dfs : list of tuple
        ``[(label, dataframe), ...]``; only element 1 (the frame) of each tuple
        is read here. Each frame is indexed so that ``frame.loc[(node, 0, 0), metric]``
        yields the per-timer values of one node.
    metric : str
        Column of the frames to extract.
    timers : list of str, optional
        Timers to plot. When empty or None, the 4 timers with the largest
        column sum in the first run are chosen instead; names containing
        '.TAU application' are excluded from that ranking.
    sample_on : bool
        When true, every table is reduced with ``samples()`` before returning.
        The default is the value the module-level ``sample_on`` had when this
        function was defined.

    Returns
    -------
    tuple
        ``(tables, selected_timers)``: one DataFrame per run (a row per node,
        a column per timer) and the list of timer names to plot.
    """
    et_df = []
    for entry in dfs:
        frame = entry[1]
        # Assumes level-0 ids are contiguous from 0 up to the maximum.
        nodemax = frame.index.get_level_values(0).max() + 1
        # Collect one row per node and build the table once; growing a frame
        # row-by-row is quadratic and DataFrame.append is gone in pandas 2.0.
        rows = [frame.loc[(node, 0, 0), metric] for node in range(nodemax)]
        et_df.append(pd.DataFrame(rows).reset_index(drop=True))

    sel_timers = timers
    if not timers:
        # Rank timers by their total over the first run only.
        sums = et_df[0].sum().sort_values(ascending=False)
        # Filter with a mask rather than drop(): no KeyError when the
        # '.TAU application' label is absent, and call paths that contain it
        # are removed in the same pass.
        keep = ['.TAU application' not in str(name) for name in sums.index]
        sel_timers = list(sums[keep].head(4).index)

    if sample_on:
        et_df = [samples(df) for df in et_df]

    return et_df, sel_timers

def doHistograms(non_normal_df, top_timers, m1):
    """Show one histogram per timer in ``top_timers``.

    Each figure plots that timer's column of ``non_normal_df`` coloured by the
    "Method" column, with a box-plot marginal, and uses ``m1`` as its title.
    """
    for timer_name in top_timers:
        figure = px.histogram(non_normal_df, x=timer_name, color="Method", marginal="box")
        figure.update_layout(title=m1, width=600, height=400)
        figure.show()

def doSplom(title, df, dims):
    """Show a scatter-plot matrix of the ``dims`` columns of ``df``.

    Points are coloured by the "Method" column and the "Node" column is added
    to the hover information; ``title`` becomes the figure title.
    """
    splom = px.scatter_matrix(df, dimensions=dims, color="Method", hover_data=['Node'])
    # if saving, use width=1500, else use width=1000
    layout_options = dict(height=1500, width=1000, title=title)
    splom.update_layout(**layout_options)
    #splom.write_html("/path/to/save/splom_orig_nocall_noMPI_non_norm.html") # uncomment to save to files for easier viewing and sharing
    splom.show()



In [None]:
# ---- Visualization parameters ----
# Set to False to plot the full data; that uses lots of RAM and slows the browser down.
sample_on = True
# Timers to plot.
timers = selected_timers
# Profile column to plot.
metric = 'Inclusive'

In [None]:
et_df, splom_timers = getMetric(dataframes, metric, timers, sample_on)

# Label every table with its run ("Method") and node id ("Node") so the
# non-normalized frames can be plotted together.
# Node has to be taken from the index *before* the index is reset: sampled
# rows keep their original index labels, and resetting first would replace
# them with 0..k-1, making the hover information wrong.
subdirs = [folder[0] for folder in dataframes]
print(subdirs)
et_df = [
    df.assign(Node=df.index, Method=sd).reset_index(drop=True)
    for df, sd in zip(et_df, subdirs)
]

non_normal_df = pd.concat(et_df)
doHistograms(non_normal_df, splom_timers, metric)

# Build the scatter-matrix title from the selected timers. Plotly renders
# "<br>" (not "\n") as a line break in titles. The "Top N" count is derived
# from the timers actually returned so the title cannot drift from the data.
if not timers:
    title = 'Top ' + str(len(splom_timers)) + ' Timers on Original Run sorted by ' + metric
else:
    title = 'Sorted Timers: ' + ',<br>'.join(timers) + '<br>by ' + metric

doSplom(title, non_normal_df, splom_timers)

    