In [253]:
import os
from collections import defaultdict
from importlib import reload
from typing import Dict, Tuple

import numpy as np
import torch
import torchvision.transforms as transforms
from torch import nn
from torch.nn import functional as F

In [254]:
# from Legg, simple MLP
class MLPLazy(nn.Module):
    """Simple multi-layer perceptron: Linear+ReLU hidden layers followed by a linear scorer.

    Args:
        nx: number of input features.
        hidden_layer_dims: hidden layer widths, in order. May be empty, in which
            case the scorer is applied directly to the input.
        ny: number of output scores per example.
        device: device every layer is moved to at construction time.
    """

    def __init__(self, nx, hidden_layer_dims, ny, device):
        super(MLPLazy, self).__init__()
        self.hidden_layer_dims = hidden_layer_dims
        linear_layers = []
        last_dim = nx
        for next_dim in hidden_layer_dims:
            linear_layers.append(nn.Linear(last_dim, next_dim).to(device))
            last_dim = next_dim
        # ModuleList (rather than a plain list) so the hidden layers are
        # registered as submodules and their parameters follow the model.
        self.linear_layers = nn.ModuleList(linear_layers)

        self.scorer = nn.Linear(last_dim, ny).to(device)

    def forward(self, X):
        '''
        X has shape (m, nx).

        Returns the raw scores (logits) of shape (m, ny). No softmax is applied:
        the previous version computed one but discarded it, so callers have
        always received logits.
        '''
        last_X = X
        for linear_layer in self.linear_layers:
            # shape (m, width of this hidden layer)
            last_X = torch.relu(linear_layer(last_X))
        # shape (m, ny)
        return self.scorer(last_X)

In [197]:
from torch.autograd.profiler import FunctionEventAvg, EventList
def key_averages(eventArr, group_by_input_shapes=False, stackKey=None, stackParse=None):
    """Averages all function events over their keys.

    Events are grouped by (event key, node id), optionally extended with the
    event's input shapes and with a key derived from its stack.

    Arguments:
        eventArr: the profiler event list to aggregate (e.g. ``prof.function_events``).
            It must provide ``populate_cpu_children()``, ``_use_cuda`` and
            ``_profile_memory``.
        group_by_input_shapes: group entries by
            (event name, input shapes) rather than just event name.
            This is useful to see which input shapes contribute to the runtime
            the most and may help with size-specific optimizations or
            choosing the best candidates for quantization (aka fitting a roof line)
        stackKey: optional callable applied to every entry of ``event.stack``.
            Entries it maps to ``None`` are ignored; the remaining results are
            joined with newlines and added to the grouping key.
        stackParse: optional callable applied to every stack entry of each
            averaged event. ``None`` results are dropped and the rest replace
            ``evt.stack``.
    Returns:
        An EventList containing FunctionEventAvg objects.
    """
    eventArr.populate_cpu_children()
    stats: Dict[Tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg)

    def get_key(event, group_by_input_shapes, stackKey):
        key = [str(event.key), str(event.node_id)]
        if group_by_input_shapes:
            key.append(str(event.input_shapes))
        if stackKey is not None:
            # Call stackKey once per stack entry, then drop the misses.
            mapped = (stackKey(entry) for entry in event.stack)
            stackKeys = [k for k in mapped if k is not None]
            if len(stackKeys) > 0:
                # append, not +=: extending a list with a string adds it one
                # character at a time, so an empty-string stack key added
                # nothing and merged with events that had no stack match.
                key.append("\n".join(stackKeys))
        return tuple(key)

    for evt in eventArr:
        stats[get_key(evt, group_by_input_shapes, stackKey)].add(evt)

    avg_list = EventList(stats.values(), use_cuda=eventArr._use_cuda, profile_memory=eventArr._profile_memory)

    for evt in avg_list:
        if not group_by_input_shapes:
            evt.input_shapes = ""
        if stackParse is not None:
            parsed = (stackParse(entry) for entry in evt.stack)
            evt.stack = [p for p in parsed if p is not None]
    return avg_list


In [240]:
import torch.autograd.profiler as profiler
# Target device string. NOTE(review): nothing below in this cell references it,
# so the model and inputs stay wherever they are created by default.
device = "cuda:0"

# Model under test: a Transformer built from an explicit config.
config = TransformerConfig(
    numHeads=2,
    vocabSize=128,
    embeddingDim=256,
    posEmbeddingDim=256,
    keyDim=512,
    valueDim=512,
    hiddenSize=512,
    numLayers=8,
    seqLen=100,
)
model = Transformer(config)

# Random integer inputs and a random target that is softmax-normalised over its last axis.
x = torch.randint(low=0, high=127, size=[1, 100])
desired = torch.softmax(torch.normal(0, 1, [1, 100, 128]), dim=2)

# Profile ten forward/backward passes, labelling each phase so it shows up in the trace.
with profiler.profile(
    profile_memory=True,
    record_shapes=True,
    use_cuda=False,
    with_stack=True,
) as prof:
    for i in range(10):
        with profiler.record_function("forward"):
            y, loss = model(x, desired)
        with profiler.record_function("backward"):
            loss.backward()

# Dump the recorded events to a Chrome trace file.
prof.export_chrome_trace("trace.json")

In [252]:
# modelInspector is not defined in this notebook; reload() re-imports it so that
# edits to the module on disk are picked up without restarting the kernel.
import modelInspector
reload(modelInspector)
# Re-bind the names after the reload so they refer to the freshly loaded functions.
from modelInspector import inspectModel, displayInspect
# Print the inspection report for the profiled model.
# NOTE(review): presumably sort_key orders rows by CPU memory usage and row_limit
# caps the number of rows shown; confirm against modelInspector.
print(displayInspect(inspectModel(model, prof), sort_key=lambda x: x.cpu_memory_usage, row_limit=10))

aten::empty: Self CPU time: 204364.46200000675 CPU Time: 204364.46200000675 CPU Memory Usage: 875.01 Mb
  model.encodingLayers.7.layerNorm2:(137)                             mu = x.mean((1,2,3), keepdim=True)

  model.encodingLayers.7:(563)                                        ui = self.layerNorm1(x+attentionOut) # todo: check to see if layer norm inside res net block is doing weird stuff, since we have a second res net thing below not attached

  model.encodingLayers:(117)                                              input = module(input)

  model:(224)                                                         forwardPass = self.encodingLayers(embeddings)

aten::resize_: Self CPU time: 49074.98000000351 CPU Time: 49074.98000000351 CPU Memory Usage: 570.00 Mb
  model.finalProjection2:(171)                                            res = (torch.einsum(einsumStr, x, self.weight)+self.bias)

  model.encodingLayers.7.attention:(462)                              q = self.Q(x, "blnd,dk->bln

In [218]:
import inspect
def getForwardPaths(module):
    """Yield one (path key, line number, source line) triple per line of ``module.forward``.

    The path key has the form ``"<source file>(<line number>): forward"``, so a
    yielded triple looks like
    ('pathToForwardFile.py(206): forward', 206, 'embeddings = torch.cat([embs, posEmbs], axis=3)')
    """
    sourceLines, firstLineNum = inspect.getsourcelines(module.forward)
    sourceFile = inspect.getsourcefile(module.forward)
    # enumerate(start=...) numbers the lines with their position in the source file.
    for currentLineNum, text in enumerate(sourceLines, start=firstLineNum):
        yield f"{sourceFile}({currentLineNum}): forward", currentLineNum, text

def makePathMapping(model):
    """Build a lookup from forward-path strings to (module name, line number, code on that line).

    Keys look like
    /home/azureuser/openai_learning/customTransformer.py(206): forward

    Module names are prefixed with "model." and the root module is called
    "model", so no entry ends up with an empty name. When several modules
    produce the same path key, the one visited last by ``named_modules()`` wins.
    """
    pathToInfo = {}
    for rawName, submodule in model.named_modules():
        qualifiedName = "model." + rawName if rawName != "" else "model"
        for pathKey, lineNo, codeText in getForwardPaths(submodule):
            pathToInfo[pathKey] = (qualifiedName, lineNo, codeText)
    return pathToInfo

def makePathModuleMapping(model):
    """Build a lookup from forward-path strings to the owning module's name.

    Keys look like
    /home/azureuser/openai_learning/customTransformer.py(206): forward

    Names are stored exactly as ``model.named_modules()`` yields them, so the
    root module maps to the empty string. When several modules produce the same
    path key, the one visited last wins.
    """
    return {
        pathKey: moduleName
        for moduleName, submodule in model.named_modules()
        for pathKey, _lineNo, _codeText in getForwardPaths(submodule)
    }



In [219]:

# Build both lookup tables once from the profiled model; the helpers below only
# do dictionary lookups against them.
mappingToCodeLine = makePathMapping(model)
mappingToModule = makePathModuleMapping(model)


def stackToModule(stackStr):
    """Return the module name recorded for ``stackStr`` in ``mappingToModule``, or None if it has no entry."""
    return mappingToModule.get(stackStr)
def stackToLine(stackStr):
    """Return the entry recorded for ``stackStr`` in ``mappingToCodeLine``, or None if it has no entry."""
    return mappingToCodeLine.get(stackStr)

# Aggregate the profiler events: stackToModule contributes to the grouping key,
# and stackToLine rewrites each averaged event's stack entries.
averages = key_averages(prof.function_events, stackKey=stackToModule, stackParse=stackToLine)

In [231]:
from torch.autograd.profiler import format_memory, format_time_share, format_time
events = list(averages)
events.sort(key=lambda x: x.cpu_memory_usage)

def stackItemToStr(stackItem):
    """Format a (module name, line number, code) stack item as "<module name>:(<line number>)"."""
    moduleName, lineNum, code = stackItem
    return f"{moduleName}:({lineNum})"

# Width of the widest label across all events. default=0 keeps this from
# raising when no event carries any stack entries.
largestModuleNameSize = max(
    (len(stackItemToStr(s)) for event in events for s in event.stack),
    default=0,
)

# events is sorted ascending by CPU memory usage, so walk it backwards to print
# the largest consumers first.
for event in events[::-1]:
    print(f"{event.key}: Self CPU time: {event.self_cpu_time_total} CPU Time: {event.cpu_time_total} CPU Memory Usage: {format_memory(event.cpu_memory_usage)}")
    for moduleName, lineNum, code in event.stack:
        # Pass `code` here: the previous `s` was never defined at cell scope
        # and only worked when a stale kernel variable happened to exist.
        label = stackItemToStr((moduleName, lineNum, code))
        # Pad against the label's own length (the quantity measured above) so
        # the code column lines up even when line numbers differ in digit count.
        padding = " " * (largestModuleNameSize - len(label))
        # Source lines keep their trailing newline; strip it so print() does
        # not emit a blank line after every row.
        print("  " + label + padding + "  " + code.rstrip("\n"))

aten::empty: Self CPU time: 237243.8650000534 CPU Time: 237243.8650000534 CPU Memory Usage: 875.01 Mb
  model.encodingLayers.7.layerNorm2:(137)                             mu = x.mean((1,2,3), keepdim=True)

  model.encodingLayers.7:(563)                                        ui = self.layerNorm1(x+attentionOut) # todo: check to see if layer norm inside res net block is doing weird stuff, since we have a second res net thing below not attached

  model.encodingLayers:(117)                                              input = module(input)

  model:(224)                                                         forwardPass = self.encodingLayers(embeddings)

aten::resize_: Self CPU time: 53733.82899999546 CPU Time: 53733.82899999546 CPU Memory Usage: 570.00 Mb
  model.finalProjection2:(171)                                            res = (torch.einsum(einsumStr, x, self.weight)+self.bias)

  model.encodingLayers.7.attention:(462)                              q = self.Q(x, "blnd,dk->blnk"

In [11]:
from torch.utils.tensorboard import SummaryWriter
# Create a TensorBoard writer, passing "runs" as its log directory.
writer = SummaryWriter("runs")

In [12]:
# Record the model graph, using the same (x, desired) pair the profiling cell feeds to the model.
writer.add_graph(model, (x, desired))
# Close the writer now that the graph has been added.
writer.close()

In [56]:
# things to sort by:
# cpu_time, cpu_time_total
# cuda_time, cuda_time_total
# self_cpu_memory_usage, cpu_memory_usage
# self_cuda_memory_usage, cuda_memory_usage
# count
#print(dir(list(prof.key_averages())[0]))
# Snapshot the raw profiler events into a new list, ordered by event name.
events = sorted(prof.function_events, key=lambda evt: evt.name)
from collections import defaultdict

class CustomEvent:
    """Wraps a profiler event together with its stack entries that contain "forward"."""

    def __init__(self, event):
        # Keep the original event reachable for later inspection.
        self.event = event
        # Only entries whose text mentions "forward" are kept, in their original order.
        self.stackItems = list(filter(lambda entry: "forward" in entry, event.stack))

def groupBy(arr, keyFunc):
    """Bucket the items of ``arr`` by ``keyFunc(item)``.

    Returns a defaultdict mapping each key to the list of items that produced
    it, in the order they were encountered.
    """
    buckets = defaultdict(list)
    for item in arr:
        bucket = buckets[keyFunc(item)]
        bucket.append(item)
    return buckets


# Wrap every event so that only its "forward" stack entries are kept alongside it.
goodEvents = list(map(CustomEvent, events))

# Events whose filtered stacks join to the same newline-separated string share a bucket.
groupedByStack = groupBy(goodEvents, lambda wrapped: "\n".join(wrapped.stackItems))

#print([x for x in prof.function_events][100].stack)
#print([x for x in prof.function_events][0].cpu_children)
#print(list(prof.key_averages())[0].cpu_children)
#print(prof.key_averages(group_by_input_shape=True).table(sort_by="self_cuda_memory_usage", row_limit=100))

In [None]:
# Show the attributes of one raw event from the first stack group.
# firstKey is not assigned in any visible cell, so derive it here instead of
# relying on leftover kernel state. Indexing the defaultdict with an unknown
# key would also silently create an empty bucket.
firstKey = next(iter(groupedByStack))
vars(groupedByStack[firstKey][0].event)