In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Shared palette for every chart below: 20 shades of seaborn's "rocket" map.
color = sns.color_palette(palette="rocket", n_colors=20)


In [3]:
# Read the profiler CSV export of every profiled script, keyed by script name.
dfs = {
    "basic_full_flow": pd.read_csv('profile_results/basic_full_flow.py.csv'),
    "demo_mnist_convnet": pd.read_csv('profile_results/demo_mnist_convnet.py.csv'),
    "demo_functional": pd.read_csv('profile_results/demo_functional.py.csv'),
}

# Keep each frame reachable under its own name as well.
basic_full_flow_df = dfs["basic_full_flow"]
mnist_convnet_df = dfs["demo_mnist_convnet"]
functional_df = dfs["demo_functional"]

### Preprocessing

In [4]:
def _preprocess_profile(profile_df):
    """Return a cleaned copy of one profiler export.

    Steps:
      * drop duplicate rows and rows missing ``ncalls`` or ``tottime``;
      * convert ``ncalls`` to an integer (for ``"a/b"`` entries the part
        before the slash is kept);
      * expose ``filename:lineno(function)`` as a trailing ``function`` column;
      * sort by ``tottime``, largest first.

    The input frame is not modified, and calling this on an already
    preprocessed frame is a no-op apart from the re-sort, so the cell can be
    re-run safely.
    """
    raw_col = 'filename:lineno(function)'

    # .copy() gives us a frame we own, so the column assignments below cannot
    # trigger SettingWithCopyWarning.
    cleaned = (
        profile_df
        .drop_duplicates()
        .dropna(subset=['ncalls', 'tottime'])
        .copy()
    )

    calls = cleaned['ncalls']
    if pd.api.types.is_numeric_dtype(calls):
        # pandas parses the column as numbers when no entry contains "/"
        # (and it is already int on a re-run); there is nothing to split.
        cleaned['ncalls'] = calls.astype(int)
    else:
        # Entries look like "12" or "15/3"; keep the number before the slash.
        cleaned['ncalls'] = (
            calls.astype(str).str.split('/').str[0].str.strip().astype(int)
        )

    # Rename by "append then drop" so `function` ends up as the last column.
    if raw_col in cleaned.columns:
        cleaned['function'] = cleaned[raw_col]
        cleaned = cleaned.drop(columns=raw_col)

    return cleaned.sort_values('tottime', ascending=False)


# Replace every entry of `dfs` with its cleaned version (same dict, same keys).
for i in list(dfs):
    dfs[i] = _preprocess_profile(dfs[i])


In [None]:
# Preview the first rows of every profile, with blank lines between them.
for profile in dfs.values():
    print(profile.head(), '\n\n\n', sep='\n')


In [None]:
# Total profiled time per script: one row per entry in `dfs`, holding the sum
# of that script's `tottime` column.
# The frame is built in one go from a list of records rather than by appending
# rows to an empty frame, so `tottime` gets a proper numeric dtype.
total_time_df = pd.DataFrame(
    [{'name': name, 'tottime': df['tottime'].sum()} for name, df in dfs.items()],
    columns=['name', 'tottime'],
)

print(total_time_df)

In [None]:
# Bar chart of the total profiled time (in seconds) of each script.
fig, ax = plt.subplots()

sns.barplot(x='name', y='tottime', data=total_time_df, palette=color, ax=ax)
ax.set_title('Total Time of each file')
ax.set_ylabel('Seconds')
ax.set_xlabel('File')

# Save first, so the file is written from the fully drawn figure regardless of
# what the active backend does to the figure when it is shown.
fig.savefig('total_time.png')

# plt.show() (as used by the other plotting cells) instead of fig.show():
# Figure.show() does not manage a GUI event loop and only works with GUI
# backends.
plt.show()

In [17]:
# Add a `percentage` column to every profile: each row's tottime as a share
# (0-100) of the script total recorded in `total_time_df`.
script_totals = total_time_df.set_index('name')['tottime']
for script_name, profile in dfs.items():
    profile['percentage'] = profile['tottime'] / script_totals[script_name] * 100

In [None]:
# Use a subset of the data: the first 20 rows of each profile are enough.
# The preprocessing cell sorts by tottime (descending), so when the notebook is
# run top to bottom these are the 20 functions with the highest tottime.
top_n = 20
dfs_small = {k: v.head(top_n) for k, v in dfs.items()}

# One horizontal bar chart per script: share of total time per function.
for name, df in dfs_small.items():
    fig, ax = plt.subplots(figsize=(14, 9))

    # Plot
    sns.barplot(x='percentage', y='function', data=df, ax=ax, palette=color)
    ax.set_title(f'Top {top_n} functions with highest Total time in {name}')
    ax.set_xlabel('Total time in function as a percentage of total time in script')
    # Put the y_label on the right side of the graph
    ax.yaxis.tick_right()
    ax.yaxis.set_label_position('right')
    ax.set_ylabel('Function or file name')
    # One tick per plotted row: a profile with fewer than `top_n` rows would
    # otherwise get more ticks than labels and set_yticklabels would fail.
    ax.set_yticks(range(len(df)))
    ax.set_yticklabels(df['function'])

    # Annotate each bar with its percentage value, just right of the bar end.
    for p in ax.patches:
        ax.annotate(f'{p.get_width():.2f}%', (p.get_width() * 1.00, p.get_y() + p.get_height() / 2), ha='left', va='center')

    # Save before showing so the file is written from the fully drawn figure.
    fig.savefig(f'profile_results/{name}_top20_tottime.png')

    plt.show()
    # Release the figure; this loop creates one per script.
    plt.close(fig)
        

In [None]:
# Pie chart (matplotlib) of the share of total script time taken by each
# function in `dfs_small`, with a legend listing the function names.
for name, df in dfs_small.items():
    shares = list(df['percentage'])
    labels = list(df['function'])

    # ax.pie() rescales its input to fill the whole circle. Without a slice for
    # everything outside the top rows, the autopct labels would show each
    # function's share of the top-N subtotal instead of its share of total time.
    remainder = 100 - sum(shares)
    if remainder > 1e-9:  # skip when the subset already covers ~100 %
        shares.append(remainder)
        labels.append('Other functions')

    fig, ax = plt.subplots(figsize=(14, 9))
    ax.pie(shares, autopct='%1.1f%%')
    ax.legend(labels, title='Function', loc='center left', bbox_to_anchor=(1, 0, 0.5, 1))
    ax.set_title(f'Percentage of total time in {name}')

    # Save before showing. bbox_inches='tight' keeps the legend, which is
    # anchored outside the axes, inside the saved image.
    fig.savefig(f'profile_results/{name}_percentage_tottime.png', bbox_inches='tight')

    plt.show()
    # Release the figure; this loop creates one per script.
    plt.close(fig)

# Same as in the previous section, but now we look into the top 20 highest percall values.

In [None]:
# Sort by percall instead of tottime.
# NOTE: this re-orders the frames stored in `dfs` and rebinds `dfs_small`, so
# the tottime cells above must not be re-run after this one without first
# re-running the preprocessing cell.
for i, df in dfs.items():
    dfs[i] = df.sort_values('percall', ascending=False)

# Use a subset of the data. 20 of the highest percall functions are enough
top_n = 20
dfs_small = {k: v.head(top_n) for k, v in dfs.items()}

# One horizontal bar chart per script: time per call for each function.
for name, df in dfs_small.items():
    fig, ax = plt.subplots(figsize=(14, 9))

    # Plot
    sns.barplot(x='percall', y='function', data=df, ax=ax, palette=color)
    ax.set_title(f'Top {top_n} functions with highest percall in {name}')
    ax.set_xlabel('Average time spent in function per call (s)')
    # Put the y_label on the right side of the graph
    ax.yaxis.tick_right()
    ax.yaxis.set_label_position('right')
    ax.set_ylabel('Function or file name')
    # One tick per plotted row: a profile with fewer than `top_n` rows would
    # otherwise get more ticks than labels and set_yticklabels would fail.
    ax.set_yticks(range(len(df)))
    ax.set_yticklabels(df['function'])

    # Annotate each bar with its percall value, just right of the bar end.
    # Three decimals: per-call times can be well below 0.01 s, which a
    # two-decimal format would print as 0.00.
    for p in ax.patches:
        ax.annotate(f'{p.get_width():.3f}', (p.get_width() * 1.00, p.get_y() + p.get_height() / 2), ha='left', va='center')

    # Save before showing so the file is written from the fully drawn figure.
    fig.savefig(f'profile_results/{name}_top20_percall.png')

    plt.show()
    # Release the figure; this loop creates one per script.
    plt.close(fig)

<string>:1(<module>) is top-level code in the entry-point Python script invoked. So if all your code is top-level in "foo.py" and you do python foo.py, then you'll see a high tottime for <string>:1(<module>). By "top-level", I mean not inside any function definition - just run directly. – 
Jean-Paul Calderone
