In [1]:
import os
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
import torchvision.transforms as transforms
from importlib import reload

In [3]:
# from Legg, simple MLP
class MLPLazy(nn.Module):
    def __init__(self, nx, hidden_layer_dims, ny, device):
        super(MLPLazy, self).__init__()
        self.hidden_layer_dims = hidden_layer_dims
        linear_layers = []
        last_dim = nx
        for next_dim in hidden_layer_dims:
            linear_layer = nn.Linear(last_dim, next_dim).to(device)
            linear_layers.append(linear_layer)
            last_dim = next_dim
        # should push to ModuleList so that params stay on cuda
        self.linear_layers = nn.ModuleList(linear_layers)
        
        self.scorer = nn.Linear(last_dim, ny).to(device)

    def forward(self, X):
        '''
        X has shape (m, nx)
        '''
        last_X = X
        for i, linear_layer in enumerate(self.linear_layers):
            # shape (m, self.hidden_layer_dims[i])
            last_X = linear_layer(last_X)
            # shape (m, self.hidden_layer_dims[i])
            last_X = torch.relu(last_X)
        # shape (m, ny)

        z = self.scorer(last_X)
        # shape (m, ny)
        a = torch.softmax(z, dim=1)
        return z

In [60]:
def profileStuff():
     for i in range(10):
        with profiler.record_function("forward" + str(i)):
            y = model(x)
            mse = (y-x).pow(2.0).sum()
        with profiler.record_function("backward" + str(i)):
            mse.backward()

In [236]:
import torch.autograd.profiler as profiler

profiler_kind = torch.autograd.ProfilerState.CPU

record_shapes = True
profile_memory = True
with_stack = True
config = torch.autograd.ProfilerConfig(
    profiler_kind,
    record_shapes,
    profile_memory,
    with_stack)
torch.autograd._enable_profiler(config)

device = "cpu"
model = MLPLazy(nx=100, hidden_layer_dims=[100, 100, 100], ny=100, device=device)
x = torch.normal(0, 1, [100, 100])
y = torch.normal(0, 1, [100, 100])
#with profiler.profile(profile_memory=True, record_shapes=True, use_cuda=False, with_stack=True) as prof:
profileStuff()
records = torch.autograd._disable_profiler()


In [240]:

names = [(x.name(), x) for x in records[0]]
[x for x in names if "backward" in x]

[]

In [223]:
from torch.autograd.profiler import parse_event_records
ayy = parse_event_records(records)

In [None]:
[x.name for x in ayy if len(x.stack) == 0]

In [230]:
len([x for x in ayy if len(x.stack) == 0 and x.name == "torch::autograd::ReluBackward0"])

0

In [243]:
[x.name for x in prof.function_events if x.scope == 0]

['aten::zeros',
 'aten::empty',
 'aten::zero_',
 'aten::fill_',
 'aten::empty',
 'aten::t',
 'aten::transpose',
 'aten::as_strided',
 'aten::addmm',
 'aten::empty',
 'aten::expand',
 'aten::as_strided',
 'aten::copy_',
 'aten::stride',
 'aten::relu',
 'aten::threshold',
 'aten::empty',
 'aten::t',
 'aten::transpose',
 'aten::as_strided',
 'aten::addmm',
 'aten::empty',
 'aten::expand',
 'aten::as_strided',
 'aten::copy_',
 'aten::stride',
 'aten::relu',
 'aten::threshold',
 'aten::empty',
 'aten::t',
 'aten::transpose',
 'aten::as_strided',
 'aten::addmm',
 'aten::empty',
 'aten::expand',
 'aten::as_strided',
 'aten::copy_',
 'aten::stride',
 'aten::relu',
 'aten::threshold',
 'aten::empty',
 'aten::t',
 'aten::transpose',
 'aten::as_strided',
 'aten::addmm',
 'aten::empty',
 'aten::expand',
 'aten::as_strided',
 'aten::copy_',
 'aten::stride',
 'aten::softmax',
 'aten::_softmax',
 'aten::contiguous',
 'aten::empty',
 'aten::sub',
 'aten::empty',
 'aten::pow',
 'aten::result_type',
 'a

In [211]:
len([x for x in prof.function_events if len(x.stack) != 0])

2092

from importlib import reload
import modelInspector
reload(modelInspector)
from modelInspector import inspectModel, displayInspect
inspect = inspectModel(model, prof)
#print(displayInspect(inspectModel(model, prof), sort_key=lambda x: x.cpu_memory_usage, row_limit=1000))

In [162]:
def seeValues(arr, key):
    values = set([getattr(x, key) for x in arr])
    return values

seeValues(inspect, "key")

item = [vars(x) for x in inspect][10]
item['cpu_children'][0].stack[0:3], item

import re
import linecache as allLines
def stackLineToCode(stackLine):
    for (path, line, funcName) in re.findall("(.*)\((\d+)\)\: (.*)", stackLine):
        return allLines.getline(path, int(line))

from collections import defaultdict
def groupItemsByStack(events):
    allStacks = defaultdict(lambda: [])
    for e in events:
        for i in range(1, len(e.stack)):
            topOfStack = e.stack[-i:]
            topOfStackKey = tuple(topOfStack)
            allStacks[topOfStackKey].append(e)
    return allStacks

allStacks = groupItemsByStack(prof.function_events)
    
    
#print(item['cpu_children'][0].stack)
#stackLineToCode("<ipython-input-60-c852a2e05d02>(5): profileStuff")
#stackLineToCode("/home/azureuser/miniconda3/envs/sandbox1/lib/python3.8/site-packages/torch/nn/functional.py(1690): linear")


In [None]:
arr = list(allStacks.items())
arr.sort(key=lambda x: len(x[1]))

for k,v in arr:
    print(k)
    print(vars(v[0]))
    print("\n\n\n")

In [169]:
def populate_cpu_children(self):
    """Populates child events into each underlying FunctionEvent object.
    One event is a child of another if [s1, e1) is inside [s2, e2). Where
    s1 and e1 would be start and end of the child event's interval. And
    s2 and e2 start and end of the parent event's interval
    Example: In event list [[0, 10], [1, 3], [3, 4]] would have make [0, 10]
    be a parent of two other intervals.
    If for any reason two intervals intersect only partially, this function
    will not record a parent child relationship between then.
    """
    if self.cpu_children_populated:
        return

    # Some events can be async (i.e. start and end on different threads),
    # since it's generally undefined how to attribute children ranges to
    # async ranges, we do not use them when calculating nested ranges and stats
    sync_events = [evt for evt in self if not evt.is_async]
    events = sorted(
        sync_events,
        key=attrgetter("thread"),
    )
    # Group by both thread and node_id, so that events that happen to have
    # the same thread_id but are from different nodes aren't incorrectly
    # grouped together.
    threads = itertools.groupby(
        events, key=lambda event: (event.thread, event.node_id)
    )

    # For each thread we keep a stack of current nested parents.
    # We maintain the invariant that each interval is a subset of all other
    # intervals lower in the stack.
    #
    # First we sort the intervals by their start time. Then we iterate over them.
    # Every time we see a new interval we remove several parents from
    # the top until we restore the invariant. Then parent child relationship
    # if recorded if the stack is not empty.
    # Finally we add new interval to the list
    #
    # Algorithm has O(N * log(N)) complexity where N is number of
    # intervals
    for thread_id, thread_events in threads:
        thread_events_ = sorted(
            thread_events,
            key=lambda event: [event.cpu_interval.start, -event.cpu_interval.end],
        )
        current_events: List[FunctionEvent] = []
        cur_end = 0
        for event in thread_events_:
            while len(current_events) > 0:
                parent = current_events[-1]
                if event.cpu_interval.start >= parent.cpu_interval.end or \
                        event.cpu_interval.end > parent.cpu_interval.end:
                    # this can't be a parent
                    current_events.pop()
                else:
                    parent.append_cpu_child(event)
                    assert (
                        event.cpu_parent is None
                    ), "There is already a CPU parent event for {}".format(
                        event.key
                    )
                    event.set_cpu_parent(parent)
                    break

            current_events.append(event)

    self._cpu_children_populated = True

def set_backward_stacktraces(self):
    self.populate_cpu_children()

    def bw_parent(evt):
        if evt is None:
            return None
        elif evt.scope == 1:
            return evt
        else:
            return bw_parent(evt.cpu_parent)

    fwd_stacks = {}
    for evt in self:
        if bw_parent(evt) is None:
            t = (evt.sequence_nr, evt.thread)
            if t not in fwd_stacks:
                fwd_stacks[t] = evt.stack

    for evt in self:
        p = bw_parent(evt)
        if p is not None:
            assert p.fwd_thread is not None
            t = (p.sequence_nr, p.fwd_thread)
            if t in fwd_stacks:
                evt.stack = fwd_stacks[t]
            else:
                evt.stack = []

<FunctionEvent id=9916 node_id=-1 cpu_time=263.902us cpu_start=60.501 cpu_end=324.403 cpu_children=[9918, 9919] cuda_time=0.000us name=aten::zeros thread=1 input_shapes=[[], [], [], [], []] cpu_memory_usage=4 cuda_memory_usage=0 is_async=False is_remote=False seq_nr=-1>