In [13]:
import torch
import intel_extension_for_pytorch as ipex

In [14]:
import torch

device = torch.device('cpu')  # Use CPU for loading the model
model = torch.load('pneumonia11.pth', map_location=device)
model = ipex.optimize(model)
model.eval()

GraphModule(
  (conv1): _IPEXConv2d()
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Module(
    (0): Module(
      (conv1): _IPEXConv2d()
      (relu): ReLU(inplace=True)
      (conv2): _IPEXConv2d()
    )
    (1): Module(
      (conv1): _IPEXConv2d()
      (relu): ReLU(inplace=True)
      (conv2): _IPEXConv2d()
    )
  )
  (layer2): Module(
    (0): Module(
      (conv1): _IPEXConv2d()
      (relu): ReLU(inplace=True)
      (conv2): _IPEXConv2d()
      (downsample): Module(
        (0): _IPEXConv2d()
      )
    )
    (1): Module(
      (conv1): _IPEXConv2d()
      (relu): ReLU(inplace=True)
      (conv2): _IPEXConv2d()
    )
  )
  (layer3): Module(
    (0): Module(
      (conv1): _IPEXConv2d()
      (relu): ReLU(inplace=True)
      (conv2): _IPEXConv2d()
      (downsample): Module(
        (0): _IPEXConv2d()
      )
    )
    (1): Module(
      (conv1): _IPEXConv2d()
      (relu): ReLU(inplace=True)
  

In [15]:
import pydicom
from pydicom import dcmread
import torchvision.transforms as transforms
import pydicom
from pydicom import dcmread
from PIL import Image
import numpy as np


In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.Resize(224),
    transforms.ToTensor()
])


In [19]:
model = torch.load("pneumonia11.pth", map_location=device)
model = ipex.optimize(model)

In [26]:
torch.save(model,"ipex_pneumonia.pth") 

In [20]:
%%time
test_image_path = 'test_dicom.dcm'
test_image = Image.fromarray((dcmread(test_image_path).pixel_array / 255.0 * 255).clip(0, 255).astype(np.uint8)).convert('RGB')
test_image = transform(test_image).unsqueeze(0).to(device)


CPU times: user 175 ms, sys: 0 ns, total: 175 ms
Wall time: 21 ms


In [30]:
%%time
model = torch.load("ipex_pneumonia.pth")
with torch.no_grad():
    prediction = model(test_image)

CPU times: user 900 ms, sys: 76 ms, total: 976 ms
Wall time: 90.7 ms


In [34]:
trace_model = torch.jit.trace(model, test_image)
script_model = torch.jit.script(trace_model)
print(script_model)

GraphModule(
  original_name=GraphModule
  (conv1): _IPEXConv2d(original_name=_IPEXConv2d)
  (relu): ReLU(original_name=ReLU)
  (maxpool): MaxPool2d(original_name=MaxPool2d)
  (layer1): Module(
    original_name=Module
    (0): Module(
      original_name=Module
      (conv1): _IPEXConv2d(original_name=_IPEXConv2d)
      (relu): ReLU(original_name=ReLU)
      (conv2): _IPEXConv2d(original_name=_IPEXConv2d)
    )
    (1): Module(
      original_name=Module
      (conv1): _IPEXConv2d(original_name=_IPEXConv2d)
      (relu): ReLU(original_name=ReLU)
      (conv2): _IPEXConv2d(original_name=_IPEXConv2d)
    )
  )
  (layer2): Module(
    original_name=Module
    (0): Module(
      original_name=Module
      (conv1): _IPEXConv2d(original_name=_IPEXConv2d)
      (relu): ReLU(original_name=ReLU)
      (conv2): _IPEXConv2d(original_name=_IPEXConv2d)
      (downsample): Module(
        original_name=Module
        (0): _IPEXConv2d(original_name=_IPEXConv2d)
      )
    )
    (1): Module(
      o

In [37]:
%%time
with torch.no_grad():
    prediction = trace_model(test_image)


CPU times: user 222 ms, sys: 0 ns, total: 222 ms
Wall time: 22 ms


In [40]:
freeze_model = torch.jit.freeze(script_model)

In [45]:
%%time
with torch.no_grad():
    prediction = freeze_model(test_image)

CPU times: user 281 ms, sys: 4.94 ms, total: 286 ms
Wall time: 27.2 ms


In [47]:
%%time
for i in range( 0 , 15 ) :
    with torch.no_grad():
        model_pred = model(test_image)
        script_pred = script_model(test_image)
        trace_pred = trace_model(test_image)
        freeze_pred = freeze_model(test_image)


CPU times: user 5.67 s, sys: 2.49 ms, total: 5.68 s
Wall time: 488 ms


In [48]:
%%time
with torch.no_grad():
    prediction = model(test_image)

CPU times: user 98.7 ms, sys: 209 µs, total: 98.9 ms
Wall time: 9.71 ms


In [49]:
%%time
with torch.no_grad():
    prediction = script_model(test_image)

CPU times: user 91.1 ms, sys: 3.69 ms, total: 94.8 ms
Wall time: 8.28 ms


In [50]:
%%time
with torch.no_grad():
    prediction = trace_model(test_image)

CPU times: user 97.2 ms, sys: 90 µs, total: 97.3 ms
Wall time: 7.7 ms


In [51]:
%%time
with torch.no_grad():
    prediction = freeze_model(test_image)

CPU times: user 176 ms, sys: 0 ns, total: 176 ms
Wall time: 17.2 ms


In [68]:
torch.jit.save(script_model, "script_model.pth")
torch.jit.save(trace_model, "trace_model.pth")
torch.jit.save(freeze_model, "freeze_model.pth")


In [75]:
# Quantization
import torch
import torchvision
from time import time
import os
import matplotlib.pyplot as plt
import intel_extension_for_pytorch as ipex
from intel_extension_for_pytorch.quantization import prepare, convert


In [76]:
def dynamicQuantize(model_fp32, data):
    # Acquire inference times for dynamic quantization INT8 model
    qconfig_dynamic = ipex.quantization.default_dynamic_qconfig
    print("Quantize Model with Dynamic Quantization ...")

    prepared_model_dynamic = prepare(model_fp32, qconfig_dynamic, example_inputs=data, inplace=False)

    converted_model_dynamic = convert(prepared_model_dynamic)
    with torch.no_grad():
        traced_model_dynamic = torch.jit.trace(converted_model_dynamic, data)
        traced_model_dynamic = torch.jit.freeze(traced_model_dynamic)

    # save the quantized dynamic model 
    traced_model_dynamic.save("dynamic_quantized_trace_model.pth")
    return traced_model_dynamic


In [77]:
data = test_image

model = torch.load("trace_model.pth") 
dynamicQuantize( model , data )

Quantize Model with Dynamic Quantization ...


RecursiveScriptModule(original_name=GraphModule)

In [80]:
for i in range( 0 , 50 ) :
    with torch.no_grad():
        quant_trace = model(data)


In [81]:
%%time 
import time 

with torch.no_grad():
    start_time = time.time() 
    quant_trace = model(data)
    end_time = time.time() 
    total += end_time - start_time 

CPU times: user 94 ms, sys: 4.18 ms, total: 98.2 ms
Wall time: 8.26 ms


In [None]:
"""
Function to perform inference on Resnet50 and BERT
"""
def runInference(model, data, modelName="resnet50", dataType="FP32", amx=True):
    """
    Input parameters
        model: the PyTorch model object used for inference
        data: a sample input into the model
        modelName: str representing the name of the model, supported values - resnet50, bert
        dataType: str representing the data type for model parameters, supported values - FP32, BF16, INT8
        amx: set to False to disable AMX on BF16, Default: True
    Return value
        inference_time: the time in seconds it takes to perform inference with the model
    """
    
    # Display run case
    if amx:
        isa_text = "AVX512_CORE_AMX"
    else:
        isa_text = "AVX512_CORE_VNNI"
    print("%s %s inference with %s" %(modelName, dataType, isa_text))

    # Configure environment variable
    if not amx:
        os.environ["ONEDNN_MAX_CPU_ISA"] = "AVX512_CORE_VNNI"
    else:
        os.environ["ONEDNN_MAX_CPU_ISA"] = "DEFAULT"

    # Special variables for specific models
    if "bert" == modelName:
        d = torch.randint(model.config.vocab_size, size=[BERT_BATCH_SIZE, BERT_SEQ_LENGTH]) # sample data input for torchscript and inference

    # Prepare model for inference based on precision (FP32, BF16, INT8)
    if "INT8" == dataType:
        # Quantize model to INT8 if needed (one time)
        model_filename = "quantized_model_%s.pt" %modelName
        if not os.path.exists(model_filename):
            qconfig = ipex.quantization.default_static_qconfig
            prepared_model = prepare(model, qconfig, example_inputs=data, inplace=False)
            converted_model = convert(prepared_model)
            with torch.no_grad():
                if "resnet50" == modelName:
                    traced_model = torch.jit.trace(converted_model, data)
                elif "bert" == modelName:
                    traced_model = torch.jit.trace(converted_model, (d,), check_trace=False, strict=False)
                else:
                    raise Exception("ERROR: modelName %s is not supported. Choose from %s" %(modelName, SUPPORTED_MODELS))
                traced_model = torch.jit.freeze(traced_model)
            traced_model.save(model_filename)

        # Load INT8 model for inference
        model = torch.jit.load(model_filename)
        model.eval()
        model = torch.jit.freeze(model)
    elif "BF16" == dataType:
        model = ipex.optimize(model, dtype=torch.bfloat16)
        with torch.no_grad():
            with torch.cpu.amp.autocast():
                if "resnet50" == modelName:
                    model = torch.jit.trace(model, data)
                elif "bert" == modelName:
                    model = torch.jit.trace(model, (d,), check_trace=False, strict=False)
                else:
                    raise Exception("ERROR: modelName %s is not supported. Choose from %s" %(modelName, SUPPORTED_MODELS))
                model = torch.jit.freeze(model)
    else: # FP32
        with torch.no_grad():
            if "resnet50" == modelName:
                model = torch.jit.trace(model, data)
            elif "bert" == modelName:
                model = torch.jit.trace(model, (d,), check_trace=False, strict=False)
            else:
                raise Exception("ERROR: modelName %s is not supported. Choose from %s" %(modelName, SUPPORTED_MODELS))
            model = torch.jit.freeze(model)

    # Run inference
    with torch.no_grad():
        if "BF16" == dataType:
            with torch.cpu.amp.autocast():
                # Warm up
                for i in range(20):
                    model(data)
                
                # Measure latency
                start_time = time()
                for i in range(NUM_SAMPLES):
                    model(data)
                end_time = time()
        else:
            # Warm up
            for i in range(20):
                model(data)
            
            # Measure latency
            start_time = time()
            for i in range(NUM_SAMPLES):
                model(data)
            end_time = time()
    inference_time = end_time - start_time
    print("Inference on %d samples took %.3f seconds" %(NUM_SAMPLES, inference_time))

    return inference_time