In [1]:
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
import os
import torch
import sys
import time
from models import UNet
from dataloader import get_dataloaders

In [3]:
# Traced TorchScript checkpoints, one per UNet depth (3 through 7), in increasing depth order.
model_list = [f"unet_d{depth}_out1_gpu_L1_traced.pt" for depth in range(3, 8)]

In [4]:
def _sync(device):
    """Wait for queued CUDA work to finish so wall-clock timings are meaningful.

    Does nothing for non-CUDA devices, which also keeps the benchmark runnable
    on machines where ``torch.cuda`` is unavailable.
    """
    if device.type == 'cuda':
        torch.cuda.synchronize()


# Benchmark settings shared by every model/device combination.
in_len = 5
out_len = 1
num_runs = 100

# Devices to benchmark, GPU first and CPU last. When CUDA is unavailable only
# the CPU run is performed instead of failing inside torch.cuda.synchronize().
devices = [torch.device('cpu')]
if torch.cuda.is_available():
    devices.insert(0, torch.device('cuda'))

for i, model_name in enumerate(model_list):
    # The first entry of model_list is treated as depth 3, the next as depth 4, ...
    Depth = i + 3
    print(f"UNet Depth: {Depth}")

    for device in devices:
        print("=== Result ===")
        print(f"Using device: {device}")

        timings = {}

        # Data preparation: build the loaders, fetch one batch and move it to the device.
        _sync(device)
        data_prep_start = time.perf_counter()
        test_loader, _ = get_dataloaders(1,
                                         in_len=in_len,
                                         out_len=out_len,
                                         )
        test_input, test_target = next(iter(test_loader))
        test_input, test_target = test_input.to(device), test_target.to(device).float()
        _sync(device)
        data_prep_end = time.perf_counter()
        timings['data_preparation'] = data_prep_end - data_prep_start

        # Model load: deserialize the traced module directly onto the benchmarked device.
        _sync(device)
        start_time = time.perf_counter()
        model = torch.jit.load('./checkpoint/' + model_name, map_location=device)
        model.eval()
        _sync(device)
        model_prep_end = time.perf_counter()
        timings['model_load'] = model_prep_end - start_time

        with torch.no_grad():
            # Warm up: one untimed forward pass before measuring.
            _ = model(test_input)

            # Inference: time each forward pass individually, synchronizing
            # before and after so asynchronous GPU work is fully accounted for.
            inference_times = []
            for j in range(num_runs):
                _sync(device)
                start_inference = time.perf_counter()
                test_output = model(test_input)
                _sync(device)
                end_inference = time.perf_counter()
                inference_times.append(end_inference - start_inference)

        # Mean and population standard deviation of the per-run latencies.
        avg_inference = sum(inference_times) / num_runs
        std_inference = (sum((x - avg_inference) ** 2 for x in inference_times) / num_runs) ** 0.5
        timings['avg_inference'] = avg_inference
        timings['std_inference'] = std_inference

        print(f"Data preparation time: {timings['data_preparation']:.6f} sec")
        print(f"Model load time: {timings['model_load']:.6f} sec")
        print(f"Average inference ({num_runs} runs): {avg_inference:.6f} sec")

        batch_size = test_input.size(0)
        throughput = batch_size / avg_inference
        print(f"Throughput: {throughput:.2f} samples/sec")

    # Blank line separating the report of one depth from the next.
    print()

UNet Depth: 3
=== Result ===
Using device: cuda
Data preparation time: 0.392689 sec
Model load time: 0.037351 sec
Average inference (100 runs): 0.001300 sec
Throughput: 769.09 samples/sec
=== Result ===
Using device: cpu
Data preparation time: 0.327632 sec
Model load time: 0.007464 sec
Average inference (100 runs): 0.145327 sec
Throughput: 6.88 samples/sec

UNet Depth: 4
=== Result ===
Using device: cuda
Data preparation time: 0.413237 sec
Model load time: 0.058280 sec
Average inference (100 runs): 0.001704 sec
Throughput: 586.86 samples/sec
=== Result ===
Using device: cpu
Data preparation time: 0.472553 sec
Model load time: 0.011139 sec
Average inference (100 runs): 0.154578 sec
Throughput: 6.47 samples/sec

UNet Depth: 5
=== Result ===
Using device: cuda
Data preparation time: 0.552009 sec
Model load time: 0.048340 sec
Average inference (100 runs): 0.002204 sec
Throughput: 453.79 samples/sec
=== Result ===
Using device: cpu
Data preparation time: 0.511107 sec
Model load time: 0.0325

In [15]:
%%bash
# Compile cuda_sync.c into the object file cuda_sync.o (compile only, no link),
# with the environment's include directory added to the header search path.
nvcc -I/user-environment/env/default/include \
     -c cuda_sync.c \
     -o cuda_sync.o

In [7]:
%%bash
# Run the `run_infer_cuda` make target once per UNet depth tag (d3..d7), each
# time pointing MODEL at the matching traced checkpoint and DATADIR at ./data.
#
# The conda environment's lib directory and the FTorch lib64 directory are
# prepended to the runtime library search path.
# NOTE(review): the recorded output shows /bin/sh resolving libtinfo.so.6 from
# the conda lib directory ("no version information available"); presumably
# because that directory comes first here -- confirm before reordering.
export LD_LIBRARY_PATH=/users/class191/miniconda3/envs/weather-cnn/lib:/users/class191/FTorchbin/lib64:$LD_LIBRARY_PATH

# Remember whether any depth failed: the cell only reports the script's final
# exit status, so without this an early failure would be hidden by a later
# successful run. The sweep still continues through every depth.
status=0
for depth in d3 d4 d5 d6 d7; do
    echo "Running with depth: $depth"
    if ! make run_infer_cuda MODEL="checkpoint/unet_${depth}_out1_gpu_L1_traced.pt" DATADIR=data; then
        echo "run_infer_cuda failed for depth: $depth" >&2
        status=1
    fi
done
exit $status

Running with depth: d3
LD_LIBRARY_PATH=/user-environment/env/default/lib64:$LD_LIBRARY_PATH ./infer_fortran_cuda.x checkpoint/unet_d3_out1_gpu_L1_traced.pt data


/bin/sh: /users/class191/miniconda3/envs/weather-cnn/lib/libtinfo.so.6: no version information available (required by /lib64/libreadline.so.7)


 === Fortran (ftorch) Performance Results ===
 Device: CUDA
  Data preparation time:    3.00000003E-03  sec
  Model load time:          2.99999993E-02  sec
  Average inference (         100  runs):    1.28000020E-03  sec
  Throughput:              781.249878      samples/sec
 UNet inference ran successfully
Running with depth: d4
LD_LIBRARY_PATH=/user-environment/env/default/lib64:$LD_LIBRARY_PATH ./infer_fortran_cuda.x checkpoint/unet_d4_out1_gpu_L1_traced.pt data


/bin/sh: /users/class191/miniconda3/envs/weather-cnn/lib/libtinfo.so.6: no version information available (required by /lib64/libreadline.so.7)


 === Fortran (ftorch) Performance Results ===
 Device: CUDA
  Data preparation time:    2.00000009E-03  sec
  Model load time:          3.70000005E-02  sec
  Average inference (         100  runs):    1.63000030E-03  sec
  Throughput:              613.496826      samples/sec
 UNet inference ran successfully
Running with depth: d5
LD_LIBRARY_PATH=/user-environment/env/default/lib64:$LD_LIBRARY_PATH ./infer_fortran_cuda.x checkpoint/unet_d5_out1_gpu_L1_traced.pt data


/bin/sh: /users/class191/miniconda3/envs/weather-cnn/lib/libtinfo.so.6: no version information available (required by /lib64/libreadline.so.7)


 === Fortran (ftorch) Performance Results ===
 Device: CUDA
  Data preparation time:    2.00000009E-03  sec
  Model load time:          5.79999983E-02  sec
  Average inference (         100  runs):    2.12000078E-03  sec
  Throughput:              471.697937      samples/sec
 UNet inference ran successfully
Running with depth: d6
LD_LIBRARY_PATH=/user-environment/env/default/lib64:$LD_LIBRARY_PATH ./infer_fortran_cuda.x checkpoint/unet_d6_out1_gpu_L1_traced.pt data


/bin/sh: /users/class191/miniconda3/envs/weather-cnn/lib/libtinfo.so.6: no version information available (required by /lib64/libreadline.so.7)


 === Fortran (ftorch) Performance Results ===
 Device: CUDA
  Data preparation time:    2.00000009E-03  sec
  Model load time:         0.149000004      sec
  Average inference (         100  runs):    3.04000080E-03  sec
  Throughput:              328.947296      samples/sec
 UNet inference ran successfully
Running with depth: d7
LD_LIBRARY_PATH=/user-environment/env/default/lib64:$LD_LIBRARY_PATH ./infer_fortran_cuda.x checkpoint/unet_d7_out1_gpu_L1_traced.pt data


/bin/sh: /users/class191/miniconda3/envs/weather-cnn/lib/libtinfo.so.6: no version information available (required by /lib64/libreadline.so.7)


 === Fortran (ftorch) Performance Results ===
 Device: CUDA
  Data preparation time:    3.00000003E-03  sec
  Model load time:         0.495000005      sec
  Average inference (         100  runs):    5.95000014E-03  sec
  Throughput:              168.067230      samples/sec
 UNet inference ran successfully


In [8]:
%%bash
# Run the `run_infer_cpu` make target once per UNet depth tag (d3..d7), each
# time pointing MODEL at the matching traced checkpoint and DATADIR at ./data.
# Note that the checkpoint file names passed here carry the `_gpu_` tag.
#
# The conda environment's lib directory and the FTorch lib64 directory are
# prepended to the runtime library search path.
export LD_LIBRARY_PATH=/users/class191/miniconda3/envs/weather-cnn/lib:/users/class191/FTorchbin/lib64:$LD_LIBRARY_PATH

# Remember whether any depth failed: the cell only reports the script's final
# exit status, so without this an early failure would be hidden by a later
# successful run. The sweep still continues through every depth.
status=0
for depth in d3 d4 d5 d6 d7; do
    echo "Running with depth: $depth"
    if ! make run_infer_cpu MODEL="checkpoint/unet_${depth}_out1_gpu_L1_traced.pt" DATADIR=data; then
        echo "run_infer_cpu failed for depth: $depth" >&2
        status=1
    fi
done
exit $status

Running with depth: d3
./infer_fortran_cpu.x checkpoint/unet_d3_out1_gpu_L1_traced.pt data
 === Fortran (ftorch) Performance Results ===
 Device: CPU
  Data preparation time:    1.00000005E-03  sec
  Model load time:         0.148000002      sec
  Average inference (         100  runs):   0.261799991      sec
  Throughput:              3.81970978      samples/sec
 UNet inference ran successfully
Running with depth: d4
./infer_fortran_cpu.x checkpoint/unet_d4_out1_gpu_L1_traced.pt data
 === Fortran (ftorch) Performance Results ===
 Device: CPU
  Data preparation time:    2.00000009E-03  sec
  Model load time:         0.128999993      sec
  Average inference (         100  runs):   0.170700014      sec
  Throughput:              5.85823011      samples/sec
 UNet inference ran successfully
Running with depth: d5
./infer_fortran_cpu.x checkpoint/unet_d5_out1_gpu_L1_traced.pt data
 === Fortran (ftorch) Performance Results ===
 Device: CPU
  Data preparation time:    2.00000009E-03  sec
  Mo