In [2]:
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
import os
import torch
import sys
import time
from models import UNet
from dataloader import get_dataloaders

In [10]:
# Benchmark of the traced UNet on the GPU (CPU fallback when CUDA is missing):
# data preparation, model load and 100 timed forward passes, repeated 5 times
# so that run-to-run variation is visible in the printed results.
for i in range(5):
    # set the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # CUDA-only calls below are guarded so the CPU fallback actually runs
    # instead of raising inside torch.cuda.synchronize() / torch.jit.load().
    use_cuda = device.type == 'cuda'
    print(f"Using device: {device}")

    timings = {}
    in_len = 5
    out_len = 1

    # Data preparation: fetch one batch from the loader and move it to the device.
    # NOTE(review): the first positional argument is presumably the batch size —
    # confirm against dataloader.get_dataloaders.
    if use_cuda:
        torch.cuda.synchronize()  # drain pending GPU work before starting the clock
    data_prep_start = time.perf_counter()
    test_loader, _ = get_dataloaders(1,
                                in_len=in_len,
                                out_len=out_len,
                                )

    test_input, test_target = next(iter(test_loader))
    test_input, test_target = test_input.to(device), test_target.to(device).float()
    if use_cuda:
        torch.cuda.synchronize()  # make sure the transfers finished before stopping the clock
    data_prep_end = time.perf_counter()
    timings['data_preparation'] = data_prep_end - data_prep_start

    # Model load: deserialize the TorchScript module directly onto the selected device.
    if use_cuda:
        torch.cuda.synchronize()
    start_time = time.perf_counter()
    with torch.no_grad():
        model = torch.jit.load('./checkpoint/unet_d5_out1_gpu_L1_traced.pt', map_location=device)
        model.eval()
        if use_cuda:
            torch.cuda.synchronize()
        model_prep_end = time.perf_counter()
        timings['model_load'] = model_prep_end - start_time

    # Warm up: one untimed forward pass so one-off start-up costs are not measured.
    with torch.no_grad():
        _ = model(test_input)

    # Inference: time each forward pass individually.
    num_runs = 100
    inference_times = []

    for j in range(num_runs):
        if use_cuda:
            torch.cuda.synchronize()  # also waits for the warm-up / previous pass
        start_inference = time.perf_counter()

        with torch.no_grad():
            test_output = model(test_input)

        if use_cuda:
            torch.cuda.synchronize()  # kernels launch asynchronously; wait before reading the clock
        end_inference = time.perf_counter()
        inference_times.append(end_inference - start_inference)

    # Mean and population standard deviation of the per-pass durations.
    avg_inference = sum(inference_times) / num_runs
    std_inference = (sum((x - avg_inference) ** 2 for x in inference_times) / num_runs) ** 0.5
    timings['avg_inference'] = avg_inference
    timings['std_inference'] = std_inference

    print("\n=== Result ===")
    print(f"device: {device}")
    print(f"Data preparation time: {timings['data_preparation']:.6f} sec")
    print(f"Model load time: {timings['model_load']:.6f} sec")
    print(f"Average inference ({num_runs} runs): {avg_inference:.6f} sec")

    # Throughput in samples per second, using the size of the batch dimension.
    batch_size = test_input.size(0)
    throughput = batch_size / avg_inference
    print(f"Throughput: {throughput:.2f} samples/sec")

Using device: cuda

=== Result ===
device: cuda
Data preparation time: 0.528593 sec
Model load time: 0.036943 sec
Average inference (100 runs): 0.002110 sec
Throughput: 473.97 samples/sec
Using device: cuda

=== Result ===
device: cuda
Data preparation time: 0.502487 sec
Model load time: 0.029973 sec
Average inference (100 runs): 0.002122 sec
Throughput: 471.32 samples/sec
Using device: cuda

=== Result ===
device: cuda
Data preparation time: 0.594076 sec
Model load time: 0.030573 sec
Average inference (100 runs): 0.002126 sec
Throughput: 470.45 samples/sec
Using device: cuda

=== Result ===
device: cuda
Data preparation time: 0.578383 sec
Model load time: 0.029953 sec
Average inference (100 runs): 0.002120 sec
Throughput: 471.77 samples/sec
Using device: cuda

=== Result ===
device: cuda
Data preparation time: 0.538334 sec
Model load time: 0.029346 sec
Average inference (100 runs): 0.002131 sec
Throughput: 469.33 samples/sec


In [6]:
# Time how long it takes to read the raw input/target tensors from disk and
# place them on the device (GPU when available, otherwise CPU); repeated 5 times.
for i in range(5):
    # set the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Guard CUDA-only calls so the CPU fallback does not raise in synchronize().
    use_cuda = device.type == 'cuda'
    print(f"Using device: {device}")

    timings = {}
    in_len = 5
    out_len = 1

    # Data preparation
    if use_cuda:
        torch.cuda.synchronize()  # drain pending GPU work before starting the clock
    data_prep_start = time.perf_counter()
    test_input = np.fromfile("data/input_tensor.dat", dtype=np.float32)
    # NOTE(review): the trailing dimension is sized with out_len; it presumably
    # acts as the batch dimension (1) — confirm against the code that wrote the file.
    input_shape = (320, 320, in_len, out_len)
    test_input = test_input.reshape(input_shape)
    # transpose() without arguments reverses the axes: -> (out_len, in_len, 320, 320)
    test_input = test_input.transpose()
    test_target = np.fromfile("data/target_tensor.dat", dtype=np.float32)
    target_shape = (320, 320, 1, 1)
    test_target = test_target.reshape(target_shape)
    test_target = test_target.transpose()  # -> (1, 1, 320, 320)

    test_input, test_target = torch.from_numpy(test_input), torch.from_numpy(test_target)
    test_input, test_target = test_input.to(device), test_target.to(device).float()
    if use_cuda:
        torch.cuda.synchronize()  # make sure the transfers finished before stopping the clock
    data_prep_end = time.perf_counter()
    timings['data_preparation'] = data_prep_end - data_prep_start

    print("\n=== Result ===")
    print(f"device: {device}")
    print(f"Data preparation time: {timings['data_preparation']:.6f} sec")

Using device: cuda

=== Result ===
device: cuda
Data preparation time: 0.001819 sec
Using device: cuda

=== Result ===
device: cuda
Data preparation time: 0.001367 sec
Using device: cuda

=== Result ===
device: cuda
Data preparation time: 0.001186 sec
Using device: cuda

=== Result ===
device: cuda
Data preparation time: 0.001170 sec
Using device: cuda

=== Result ===
device: cuda
Data preparation time: 0.001458 sec


In [11]:
# CPU benchmark of the traced UNet: data preparation, model load and 100 timed
# forward passes, repeated 5 times so run-to-run variation shows in the output.
for i in range(5):
    # set the device
    device = torch.device('cpu')
    print(f"Using device: {device}")

    timings = {}
    in_len = 5
    out_len = 1

    # Data preparation: fetch one batch from the loader.
    # NOTE(review): the first positional argument is presumably the batch size —
    # confirm against dataloader.get_dataloaders.
    data_prep_start = time.perf_counter()
    test_loader, _ = get_dataloaders(1,
                                in_len=in_len,
                                out_len=out_len,
                                )

    test_input, test_target = next(iter(test_loader))
    test_input, test_target = test_input.to(device), test_target.to(device).float()
    data_prep_end = time.perf_counter()
    timings['data_preparation'] = data_prep_end - data_prep_start

    # Model load: deserialize the TorchScript module onto the CPU.
    start_time = time.perf_counter()
    with torch.no_grad():
        model = torch.jit.load('./checkpoint/unet_d5_out1_gpu_L1_traced.pt', map_location="cpu")
        model.eval()
        model_prep_end = time.perf_counter()
        timings['model_load'] = model_prep_end - start_time

    # Warm up: one untimed forward pass so one-off start-up costs are not measured.
    with torch.no_grad():
        _ = model(test_input)

    # Inference: time each forward pass individually.
    num_runs = 100
    inference_times = []

    # Use a separate index (j) so the outer repetition counter i is not overwritten.
    for j in range(num_runs):
        start_inference = time.perf_counter()

        with torch.no_grad():
            test_output = model(test_input)

        end_inference = time.perf_counter()
        inference_times.append(end_inference - start_inference)

    # Mean and population standard deviation of the per-pass durations.
    avg_inference = sum(inference_times) / num_runs
    std_inference = (sum((x - avg_inference) ** 2 for x in inference_times) / num_runs) ** 0.5
    timings['avg_inference'] = avg_inference
    timings['std_inference'] = std_inference

    print("\n=== Result ===")
    print(f"device: {device}")
    print(f"Data preparation time: {timings['data_preparation']:.6f} sec")
    print(f"Model load time: {timings['model_load']:.6f} sec")
    print(f"Average inference ({num_runs} runs): {avg_inference:.6f} sec")

    # Throughput in samples per second, using the size of the batch dimension.
    batch_size = test_input.size(0)
    throughput = batch_size / avg_inference
    print(f"Throughput: {throughput:.2f} samples/sec")

Using device: cpu

=== Result ===
device: cpu
Data preparation time: 0.525012 sec
Model load time: 0.025642 sec
Average inference (100 runs): 0.189187 sec
Throughput: 5.29 samples/sec
Using device: cpu

=== Result ===
device: cpu
Data preparation time: 0.644415 sec
Model load time: 0.031176 sec
Average inference (100 runs): 0.201618 sec
Throughput: 4.96 samples/sec
Using device: cpu

=== Result ===
device: cpu
Data preparation time: 0.577001 sec
Model load time: 0.029350 sec
Average inference (100 runs): 0.176266 sec
Throughput: 5.67 samples/sec
Using device: cpu

=== Result ===
device: cpu
Data preparation time: 0.589272 sec
Model load time: 0.032431 sec
Average inference (100 runs): 0.179104 sec
Throughput: 5.58 samples/sec
Using device: cpu

=== Result ===
device: cpu
Data preparation time: 0.749024 sec
Model load time: 0.032046 sec
Average inference (100 runs): 0.202761 sec
Throughput: 4.93 samples/sec


In [7]:
# Time how long it takes to read the raw input/target tensors from disk and
# wrap them as CPU torch tensors; repeated 5 times.
for i in range(5):
    # set the device
    device = torch.device('cpu')
    print(f"Using device: {device}")

    timings = {}
    in_len = 5
    out_len = 1

    # Data preparation (perf_counter: the measured interval is only ~1 ms, so a
    # high-resolution monotonic clock is used instead of the wall clock).
    data_prep_start = time.perf_counter()
    test_input = np.fromfile("data/input_tensor.dat", dtype=np.float32)
    # NOTE(review): the trailing dimension is sized with out_len; it presumably
    # acts as the batch dimension (1) — confirm against the code that wrote the file.
    input_shape = (320, 320, in_len, out_len)
    test_input = test_input.reshape(input_shape)
    # transpose() without arguments reverses the axes: -> (out_len, in_len, 320, 320)
    test_input = test_input.transpose()
    test_target = np.fromfile("data/target_tensor.dat", dtype=np.float32)
    target_shape = (320, 320, 1, 1)
    test_target = test_target.reshape(target_shape)
    test_target = test_target.transpose()  # -> (1, 1, 320, 320)

    test_input, test_target = torch.from_numpy(test_input), torch.from_numpy(test_target)
    test_input, test_target = test_input.to(device), test_target.to(device).float()
    data_prep_end = time.perf_counter()
    timings['data_preparation'] = data_prep_end - data_prep_start

    print("\n=== Result ===")
    print(f"device: {device}")
    print(f"Data preparation time: {timings['data_preparation']:.6f} sec")

Using device: cpu

=== Result ===
device: cpu
Data preparation time: 0.001596 sec
Using device: cpu

=== Result ===
device: cpu
Data preparation time: 0.001217 sec
Using device: cpu

=== Result ===
device: cpu
Data preparation time: 0.001234 sec
Using device: cpu

=== Result ===
device: cpu
Data preparation time: 0.001185 sec
Using device: cpu

=== Result ===
device: cpu
Data preparation time: 0.001256 sec


In [11]:
%%bash
# Compile cuda_sync.c into the object file cuda_sync.o (compile only, no link),
# adding the environment's include directory to the header search path.
nvcc -c cuda_sync.c \
    -o cuda_sync.o \
    -I/user-environment/env/default/include

In [8]:
%%bash
# Prepend the conda environment and FTorch library directories to the loader path.
export LD_LIBRARY_PATH=/users/class191/miniconda3/envs/weather-cnn/lib:/users/class191/FTorchbin/lib64:$LD_LIBRARY_PATH
# Invoke the run_infer_cuda make target five times in a row.
for run in 1 2 3 4 5
do
    make run_infer_cuda MODEL=checkpoint/unet_d5_out1_gpu_L1_traced.pt DATADIR=data
done

gfortran -I/users/class191/FTorchbin/include -I/users/class191/FTorchbin/include/ftorch -I/users/class191/Project/FTorch/build/modules -o infer_fortran_cuda.x infer_fortran_cuda.f90 cuda_sync.o -L/users/class191/FTorchbin/lib64 -L/user-environment/env/default/lib64 -lftorch -lcudart
LD_LIBRARY_PATH=/user-environment/env/default/lib64:$LD_LIBRARY_PATH ./infer_fortran_cuda.x checkpoint/unet_d5_out1_gpu_L1_traced.pt data


/bin/sh: /users/class191/miniconda3/envs/weather-cnn/lib/libtinfo.so.6: no version information available (required by /lib64/libreadline.so.7)


 === Fortran (ftorch) Performance Results ===
 Device: CUDA
  Data preparation time:    2.00000009E-03  sec
  Model load time:          5.99999987E-02  sec
  Average inference (         100  runs):    2.10000062E-03  sec
  Throughput:              476.190338      samples/sec
 UNet inference ran successfully
LD_LIBRARY_PATH=/user-environment/env/default/lib64:$LD_LIBRARY_PATH ./infer_fortran_cuda.x checkpoint/unet_d5_out1_gpu_L1_traced.pt data


/bin/sh: /users/class191/miniconda3/envs/weather-cnn/lib/libtinfo.so.6: no version information available (required by /lib64/libreadline.so.7)


 === Fortran (ftorch) Performance Results ===
 Device: CUDA
  Data preparation time:    3.00000003E-03  sec
  Model load time:          5.79999983E-02  sec
  Average inference (         100  runs):    2.11000093E-03  sec
  Throughput:              473.933441      samples/sec
 UNet inference ran successfully
LD_LIBRARY_PATH=/user-environment/env/default/lib64:$LD_LIBRARY_PATH ./infer_fortran_cuda.x checkpoint/unet_d5_out1_gpu_L1_traced.pt data


/bin/sh: /users/class191/miniconda3/envs/weather-cnn/lib/libtinfo.so.6: no version information available (required by /lib64/libreadline.so.7)


 === Fortran (ftorch) Performance Results ===
 Device: CUDA
  Data preparation time:    2.00000009E-03  sec
  Model load time:          5.70000000E-02  sec
  Average inference (         100  runs):    2.12000078E-03  sec
  Throughput:              471.697937      samples/sec
 UNet inference ran successfully
LD_LIBRARY_PATH=/user-environment/env/default/lib64:$LD_LIBRARY_PATH ./infer_fortran_cuda.x checkpoint/unet_d5_out1_gpu_L1_traced.pt data


/bin/sh: /users/class191/miniconda3/envs/weather-cnn/lib/libtinfo.so.6: no version information available (required by /lib64/libreadline.so.7)


 === Fortran (ftorch) Performance Results ===
 Device: CUDA
  Data preparation time:    2.00000009E-03  sec
  Model load time:          5.70000000E-02  sec
  Average inference (         100  runs):    2.10000062E-03  sec
  Throughput:              476.190338      samples/sec
 UNet inference ran successfully
LD_LIBRARY_PATH=/user-environment/env/default/lib64:$LD_LIBRARY_PATH ./infer_fortran_cuda.x checkpoint/unet_d5_out1_gpu_L1_traced.pt data


/bin/sh: /users/class191/miniconda3/envs/weather-cnn/lib/libtinfo.so.6: no version information available (required by /lib64/libreadline.so.7)


 === Fortran (ftorch) Performance Results ===
 Device: CUDA
  Data preparation time:    2.00000009E-03  sec
  Model load time:          5.70000000E-02  sec
  Average inference (         100  runs):    2.10000062E-03  sec
  Throughput:              476.190338      samples/sec
 UNet inference ran successfully


In [9]:
%%bash
# Prepend the conda environment and FTorch library directories to the loader path.
export LD_LIBRARY_PATH=/users/class191/miniconda3/envs/weather-cnn/lib:/users/class191/FTorchbin/lib64:$LD_LIBRARY_PATH
# Invoke the run_infer_cpu make target five times in a row.
for run in 1 2 3 4 5
do
    make run_infer_cpu MODEL=checkpoint/unet_d5_out1_gpu_L1_traced.pt DATADIR=data
done

gfortran -I/users/class191/FTorchbin/include -I/users/class191/FTorchbin/include/ftorch -I/users/class191/Project/FTorch/build/modules -o infer_fortran_cpu.x infer_fortran_cpu.f90 -L/users/class191/FTorchbin/lib64 -L/user-environment/env/default/lib64 -lftorch -lcudart
./infer_fortran_cpu.x checkpoint/unet_d5_out1_gpu_L1_traced.pt data
 === Fortran (ftorch) Performance Results ===
 Device: CPU
  Data preparation time:    1.00000005E-03  sec
  Model load time:         0.195999995      sec
  Average inference (         100  runs):   0.305449963      sec
  Throughput:              3.27385855      samples/sec
 UNet inference ran successfully
./infer_fortran_cpu.x checkpoint/unet_d5_out1_gpu_L1_traced.pt data
 === Fortran (ftorch) Performance Results ===
 Device: CPU
  Data preparation time:    2.00000009E-03  sec
  Model load time:          6.59999996E-02  sec
  Average inference (         100  runs):   0.186890006      sec
  Throughput:              5.35074091      samples/sec
 UNet infer