In [None]:
# This notebook demonstrates key features of the TraceLens library
# We encourage users to walk through the notebook and play with the code
# to get a feel for how the library works.

# For production cases, we recommend using the TraceLens/examples/generate_perf_report.py script

from pprint import pprint
import json
import pandas as pd
from TraceLens import TreePerfAnalyzer

In [None]:
# Replace this with the path to your own profile trace. A single-rank profile
# taken from a multi-GPU run can be used as well.
path = '/path/to/profile.json'
# Build the analyzer from the trace file; every later cell queries this object.
perf_analyzer = TreePerfAnalyzer.from_file(path)

In [None]:
# Breakdown of the GPU timeline: busy time, idle time, communication time, etc.
df_gpu_timeline = perf_analyzer.get_df_gpu_timeline()
df_gpu_timeline

In [None]:
# One row per lowest-level CPU operation (from the call-stack perspective),
# together with the GPU time that operation "induces".
df_kernel_launchers = perf_analyzer.get_df_kernel_launchers(
    include_kernel_names=True,
)

# Show only the head of the table, rounded to 2 decimals for readability.
df_kernel_launchers_preview = df_kernel_launchers.round(2).head()
df_kernel_launchers_preview

In [None]:
# Group the kernel-launcher table by op name and summarize it,
# which yields an op-wise breakdown of GPU time.
df_kernel_launchers_summary = perf_analyzer.get_df_kernel_launchers_summary(
    df_kernel_launchers,
)

# Show only the head of the summary, rounded to 2 decimals.
df_kernel_launchers_summary_preview = df_kernel_launchers_summary.round(2).head()
df_kernel_launchers_summary_preview

In [None]:
# Detailed breakdown over unique argument combinations of all kernel-launching CPU ops.
# GPU time is grouped and aggregated per unique
# (op name + input dims/types/strides/concrete args) combination, which helps
# pinpoint which op, with which arguments, is the most time-consuming.
df_unique_args = perf_analyzer.get_df_kernel_launchers_unique_args(
    df_kernel_launchers,
    include_pct=True,
)
df_unique_args

In [None]:
# Unique-argument breakdown restricted to a single op type, here `aten::mm`.
# Handy for drilling into one op such as mm, addmm, convolution, etc.
df_unique_args_mm = perf_analyzer.get_df_kernel_launchers_unique_args(
    df_kernel_launchers,
    event_name="aten::mm",
    include_pct=True,
)
df_unique_args_mm

In [None]:
# Roofline for ops
# currently we have GEMM, CONV fwd+bwd, FA
# many more coming soon

# Example 1: GEMM
from TraceLens.PerfModel import dict_cat2names

# Select every event in the trace tree whose name belongs to the GEMM category.
gemm_op_names = dict_cat2names['GEMM']
gemm_events = [event for event in perf_analyzer.tree.events if event['name'] in gemm_op_names]
print(f"Found {len(gemm_events)} gemm events")

# Take one example event and compute its perf metrics.
# Guarded: a trace without any GEMM op would otherwise raise IndexError on
# `gemm_events[0]` and abort a "Run All" of the notebook.
if gemm_events:
    gemm_event = gemm_events[0]
    print("Event dict:")
    pprint(gemm_event)
    print("Perf metrics dict:")
    pprint(perf_analyzer.compute_perf_metrics(gemm_event))
else:
    print("No GEMM events found in this trace; skipping the single-event example.")


In [None]:
# Table of computed perf metrics, one row per GEMM event.
# include_kernel_names=True adds a column holding the list of kernel names
# launched by each CPU op.
df_gemm_ops = perf_analyzer.build_df_perf_metrics(
    gemm_events,
    include_kernel_names=True,
)
df_gemm_ops_preview = df_gemm_ops.head()
df_gemm_ops_preview

In [None]:
# Aggregate the GEMM table: group across the params M, K, N and bias,
# then compute the aggregate metrics (here: the mean).
df_gemm_summary = perf_analyzer.summarize_df_perf_metrics(
    df_gemm_ops,
    ['mean'],
)
df_gemm_summary

In [None]:
# Example 2a: SDPA forward
sdpa_op_names = dict_cat2names['SDPA']

# Keep only the trace events whose name is in the SDPA category.
sdpa_events = list(
    filter(lambda event: event['name'] in sdpa_op_names, perf_analyzer.tree.events)
)

# Per-event perf metrics, then a mean-aggregated summary as the cell output.
df_sdpa_fwd_ops = perf_analyzer.build_df_perf_metrics(sdpa_events)
perf_analyzer.summarize_df_perf_metrics(
    df_sdpa_fwd_ops,
    ['mean'],
)

In [None]:
# Example 2b: SDPA backward
# Note: the backward events belonging to a forward-pass event are located
# by traversing the autograd links.
df_sdpa_bwd_ops = perf_analyzer.build_df_perf_metrics(
    sdpa_events,
    bwd=True,
)
perf_analyzer.summarize_df_perf_metrics(
    df_sdpa_bwd_ops,
    ['mean'],
)

In [None]:
# Example 3a: convolution forward
conv_op_names = dict_cat2names['CONV']

# Keep only the trace events whose name is in the CONV category.
conv_events = list(
    filter(lambda event: event['name'] in conv_op_names, perf_analyzer.tree.events)
)

# Per-event perf metrics, then a mean-aggregated summary as the cell output.
df_conv_fwd_ops = perf_analyzer.build_df_perf_metrics(conv_events)
perf_analyzer.summarize_df_perf_metrics(
    df_conv_fwd_ops,
    ['mean'],
)

In [None]:
# Example 3b: convolution backward
# Reuses the forward conv events selected earlier and requests bwd metrics.
df_conv_bwd_ops = perf_analyzer.build_df_perf_metrics(
    conv_events,
    bwd=True,
)
perf_analyzer.summarize_df_perf_metrics(
    df_conv_bwd_ops,
    ['mean'],
)

In [None]:
# Example 4: unary elementwise ops
unary_elemwise_op_names = dict_cat2names['UnaryElementwise']

# Keep only the trace events whose name is in the UnaryElementwise category.
unary_elementwise_events = list(
    filter(
        lambda event: event['name'] in unary_elemwise_op_names,
        perf_analyzer.tree.events,
    )
)

# Per-event perf metrics, then a mean-aggregated summary as the cell output.
df_unary_elementwise_ops = perf_analyzer.build_df_perf_metrics(unary_elementwise_events)
perf_analyzer.summarize_df_perf_metrics(
    df_unary_elementwise_ops,
    ['mean'],
)

In [None]:
# Example 5: binary elementwise ops
binary_elemwise_op_names = dict_cat2names['BinaryElementwise']

# Keep only the trace events whose name is in the BinaryElementwise category.
binary_elementwise_events = list(
    filter(
        lambda event: event['name'] in binary_elemwise_op_names,
        perf_analyzer.tree.events,
    )
)

# Per-event perf metrics, then a mean-aggregated summary as the cell output.
df_binary_elementwise_ops = perf_analyzer.build_df_perf_metrics(binary_elementwise_events)
perf_analyzer.summarize_df_perf_metrics(
    df_binary_elementwise_ops,
    ['mean'],
)