In [None]:
from tau_profile_parser import TauProfileParser
import pandas as pd
import plotly.express as px
import os
pd.options.plotting.backend = "plotly"

#### Instructions: 
Drag this notebook into the main folder that contains the TauProfile parent folders.  
  
Example:  
/documents/experimentName/folder1/profile.0.0.0   
/documents/experimentName/folder1/profile.1.0.0  
/documents/experimentName/ThisNotebook  
  
  
#### UI  
At minimum, the user need only change a few key variables listed throughout the notebook.  
The notebook will read the files from the local folder.  



In [None]:
# Initial value only: the "Multi Logic" cell further down reassigns multi_bool before it is used.
multi_bool = False # if working with multiple timers, e.g. TIME, Papi_L1_DCM, etc.
# True -> the frames are read with .atomic_data() (counters); False -> with .interval_data() (timers).
counter_data = True
# Divisor applied to the number of distinct 'Node' index values to obtain each experiment's node count.
cores_per_node = 42
# print(os.getcwd()) # Verify working directory

In [None]:
# Root folder that holds one sub-folder per experiment (defaults to the notebook's working directory).
path_to_ExampleData = os.getcwd()
directory_contents = os.listdir(path_to_ExampleData)

# Keep visible directories only. The isdir() test is made against the full path so the
# cell keeps working if path_to_ExampleData is pointed somewhere other than the CWD.
folders = [
    item for item in directory_contents
    if not item.startswith('.') and os.path.isdir(os.path.join(path_to_ExampleData, item))
]

# No ordering is applied here; os.listdir() returns entries in arbitrary order.
# print(folders)

In [None]:
# Multi Logic
# multi_bool becomes True as soon as any experiment folder contains a sub-directory whose
# name includes 'MULTI' (presumably TAU's MULTI__<metric> layout -- confirm against your data).
# If no experiment folder has such a sub-directory it stays False.

multi_bool = False
for fold in folders:
    fold_path = os.path.join(path_to_ExampleData, fold)
    # Inspect the experiment folder's own children, not the top-level listing.
    subfolders = [
        item for item in os.listdir(fold_path)
        if os.path.isdir(os.path.join(fold_path, item)) and not item.startswith('.')
    ]
    if any('MULTI' in subfold for subfold in subfolders):
        multi_bool = True
        break
    

In [None]:
# Parse every experiment folder into a TauProfileParser object, in the order of `folders`.
TauProfObjects = [
    TauProfileParser.parse(f"{path_to_ExampleData}/{fold}", MULTI=multi_bool)
    for fold in folders
]

# A single folder can be parsed the same way, e.g.:
# orig = TauProfileParser.parse(path_to_ExampleData+'/orig',MULTI=multi_bool)
# nocall = TauProfileParser.parse(path_to_ExampleData+'/nocall',MULTI=multi_bool)

print(TauProfObjects)

In [None]:
# To see details of the run, metadata will print out information

# TauProfObjects[1].metadata

Saving the dataframes from a tau profile object

Using the Atomic data from counters. The TauProfileParser object uses .atomic_data() to return a dataframe of the counters.
For timer data, .interval_data() is used.

In [None]:
# One frame per parsed profile: counter (atomic) data when counter_data is set,
# otherwise timer (interval) data.
Dataframes1 = [
    obj.atomic_data() if counter_data else obj.interval_data()
    for obj in TauProfObjects
]

print(type(Dataframes1[0]))

In [None]:
# Pair every frame with its node count (distinct 'Node' index values / cores_per_node),
# then order the (node_count, frame) pairs from fewest to most nodes.
Dataframes = sorted(
    (
        (len(set(frame.index.get_level_values('Node'))) / cores_per_node, frame)
        for frame in Dataframes1
    ),
    key=lambda pair: pair[0],
)


In [None]:
# Add a 'Total' column (Count * Mean) to every frame, in place.
for _node_count, frame in Dataframes:
    frame['Total'] = frame['Count'] * frame['Mean']

# Alphabetically sorted distinct timer names of the first frame that contain 'PAPI_'.
papi_timers = sorted(
    name
    for name in set(Dataframes[0][1].index.get_level_values("Timer"))
    if 'PAPI_' in name
)


In [None]:
# Preview the frame of the last entry in Dataframes (the largest node count after sorting).
# Indexing with -1 instead of a fixed 2 keeps the cell working with fewer than three
# experiments; .head() avoids rendering the entire frame.
Dataframes[-1][1].head()

In [None]:
Metric = "Maximum"  # User can change to one of the column names

def get_non_correlated_timers(dataframe, papi_timers, metric=Metric, corr_threshold=0.9,
                              always_exclude=('PAPI_SP_OPS', 'PAPI_HW_INT')):
    """Build a node-by-timer table of one metric and drop highly correlated timers.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame indexed by the levels 'Node', 'Context', 'Thread' plus a fourth level
        holding the timer names. Only context 0 / thread 0 of each node is read.
    papi_timers : list of str
        Timer names to extract and compare.
    metric : str
        Column of ``dataframe`` to extract (defaults to the module-level ``Metric``
        as it was when this function was defined).
    corr_threshold : float
        A timer is dropped when its correlation (``DataFrame.corr()``) with another
        still-kept timer is greater than or equal to this value.
    always_exclude : iterable of str
        Timers removed from the kept set unconditionally. Names that are absent
        (or were already dropped as correlated) are ignored instead of raising.

    Returns
    -------
    tuple
        ``(dataframe_of_metric, kept_timers, corr_report)`` where
        ``dataframe_of_metric`` has one row per node (ascending node id) and one
        column per timer, ``kept_timers`` is the set of timers that survived, and
        ``corr_report`` is a Series mapping ``(kept_timer, dropped_timer)`` to the
        correlation that caused the drop.
    """
    # Collect one Series per node and build the frame once; DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic anyway.
    per_node_rows = []
    # sorted(): set iteration order is not guaranteed, and row i should mean the same
    # node on every run.
    for node in sorted(set(dataframe.index.get_level_values('Node'))):
        values = dataframe.loc[(node, 0, 0, papi_timers), metric]
        per_node_rows.append(
            values.reset_index(level=['Node', 'Context', 'Thread'], drop=True)
        )
    dataframe_of_metric = pd.DataFrame(per_node_rows).reset_index(drop=True)

    correlation_frame = dataframe_of_metric.corr()
    remaining_rows = list(correlation_frame.index)
    interesting_pairs = list(papi_timers)
    corr_report = {}
    for col in correlation_frame.columns:
        if col in interesting_pairs:
            for row in remaining_rows:
                if row == col or row not in interesting_pairs:
                    continue
                corr_value = correlation_frame.loc[row, col]
                if corr_value >= corr_threshold:
                    # Drop the column timer and remember which kept timer it duplicated.
                    interesting_pairs.remove(col)
                    corr_report[(row, col)] = corr_value
                    break  # col is gone; no further row can match it
        # Each unordered pair is examined once: later columns never revisit this timer as a row.
        remaining_rows.remove(col)

    kept_timers = set(interesting_pairs).difference(always_exclude)

    return dataframe_of_metric, kept_timers, pd.Series(corr_report, dtype=float)


In [None]:
# Correlation threshold (0-1): timers correlated at or above this value are dropped.
threshold = 0.9

# each entry in corr_dfs:
#   (node count, (metric df, set of non-correlated timers, Series of dropped timer pairs -> correlation))
# The threshold is passed explicitly so that changing it above actually affects the result
# (and matches the value shown in the plot title).
corr_dfs = [
    (pair[0], get_non_correlated_timers(pair[1], papi_timers, metric=Metric, corr_threshold=threshold))
    for pair in Dataframes
]

In [None]:
# Display the metric table (first item of the function's result) of the first entry in corr_dfs.
corr_dfs[0][1][0]

In [None]:
def samples(df1, noutliers=5, frac=.25, random_state=1):
    """Return a random subset of the rows of ``df1``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame to sample from; it is not modified.
    noutliers : int
        Currently unused. Kept for interface compatibility; adding the n largest /
        smallest rows per column is not implemented.
    frac : float
        Fraction of rows to draw (passed to ``DataFrame.sample``).
    random_state : int
        Seed passed to ``DataFrame.sample`` so the subset is reproducible.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the sampled rows with their original index labels.
    """
    # DataFrame.append was removed in pandas 2.0; sample() already returns a new frame.
    # copy() makes sure callers can add columns without touching df1.
    return df1.sample(frac=frac, random_state=random_state).copy()

In [None]:
# Same layout as corr_dfs, flattened to 4-tuples, with the metric table replaced by a sample of it.
sampled_dfs = [
    (node_count, samples(metric_df), kept_timers, corr_report)
    for node_count, (metric_df, kept_timers, corr_report) in corr_dfs
]

In [None]:
# Tag each row of the full metric tables with its index label ('Node') and a
# per-run label ('Method') so the runs can be told apart in the graphs.

for node_count, (metric_df, _kept, _report) in corr_dfs:
    metric_df['Node'] = list(metric_df.index)
    metric_df['Method'] = [str(node_count) + ' nodes'] * len(metric_df.index)

In [None]:
# Same tagging for the sampled tables: 'Node' is the row's index label, 'Method' the run label.
for node_count, sampled_df, _kept, _report in sampled_dfs:
    sampled_df['Node'] = list(sampled_df.index)
    sampled_df['Method'] = [str(node_count) + ' nodes'] * len(sampled_df.index)

In [None]:
# Sets of non-correlated timers, one per run.
sets = [tup[1][1] for tup in corr_dfs]
# sorted() returns the ordered list; list.sort() sorts in place and returns None,
# which previously left both names set to None.
intersection_timers = sorted(set.intersection(*sets)) if sets else [] # whats common between the runs
union_timers = sorted(set().union(*sets))

In [None]:
# Stack the per-run tables into one frame each.
# non-sampled, very large dataset
combined_df_splom = pd.concat(
    [metric_df for _node_count, (metric_df, _kept, _report) in corr_dfs]
)
# sampled, much smaller dataset
sampled_comb_df = pd.concat(
    [sampled_df for _node_count, sampled_df, _kept, _report in sampled_dfs]
)

In [None]:
# Histogram of PAPI_STL_ICY over the full data, one colour per run, with a box plot in the margin.
hfig = px.histogram(
    data_frame=combined_df_splom,
    x="PAPI_STL_ICY",
    color="Method",
    marginal="box",
).update_layout(title="Unsampled PAPI_STL_ICY", width=800, height=600)
hfig.show()

In [None]:
# Histogram of PAPI_STL_ICY over the sampled data, one colour per run, with a box plot in the margin.
hfig = px.histogram(
    data_frame=sampled_comb_df,
    x="PAPI_STL_ICY",
    color="Method",
    marginal="box",
).update_layout(title="Sampled PAPI_STL_ICY", width=800, height=600)
hfig.show()

In [None]:
# With many data points, saving the figure to an HTML file (see the commented line below)
# makes it easier to view than rendering it inline.
dimensions = union_timers # can change to intersection timer set

title = 'Timers with correlated removed (>{:.2f}) and sampled data'.format(threshold)

# Scatter-plot matrix of the sampled data: one colour per run, node shown on hover.
fig2 = px.scatter_matrix(
    sampled_comb_df,
    dimensions=dimensions,
    color='Method',
    hover_data=['Node'],
).update_layout(title=title, width=3000, height=3000)
#fig2.write_html(path_to_ExampleData+'/1Combined_64v63_SPLOM.html') # uncomment to save to file for easier viewing
fig2.show()