# Visualization helper functions

This file can help you visualize the order in which your batches are processed.

In [2]:
import asyncio

from lib import Model, Dist
from utils import write_chrome_trace, get_training_stats, get_model_theoretical_best_time

from part1 import part1_training_loop # Change the training call to visualize the training process

In [3]:
# Copy paste the main function from respective part of your code here.
async def main():
    world_size = 32
    num_layers = 64
    global_batch_size = 256
    batch_size = 8

    dist = Dist(world_size)
    models: List[Model] = [
        Model(rank, dist, num_layers, global_batch_size) for rank in range(world_size)
    ]

    theoretical_time = get_model_theoretical_best_time(models[0])

    out = await asyncio.gather(
        *(part1_training_loop(model, batch_size) for model in models)
    )

    execution_time, max_memory, max_memory_idx = get_training_stats(out)

    mfu = (theoretical_time / execution_time) * 100

    print(f"MFU: {mfu}")

    write_chrome_trace(out, "./debug_traces/part1.json")

In [4]:
trained_model = await main()

Execution time: 0.18436232169217012 
Maximum Memory: 2297954304.0 at GPU: 0 time: 0.06849864747465671
MFU: 19.084361579899916


In [5]:
get_training_stats(trained_model)
# Navigate to chrome://tracing and load the generated json file to visualize the training process
write_chrome_trace(trained_model, './debug_traces/test.json')

TypeError: 'NoneType' object is not subscriptable