In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Data loading
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.5,), (0.5,))])

train_data = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_data = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# Model definition
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(28*28, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 10)
    
    def forward(self, x):
        x = x.view(-1, 28*28)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = Net()


def train(model):
     #Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

    # Training loop
    num_epochs = 5
    for epoch in range(num_epochs):
        for images, labels in train_loader:
            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    return model

model = train(model)

def evaluate(model):
    # Evaluation
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        print(f'Accuracy of the network on the 10000 test images: {100 * correct / total} %')
        
evaluate(model)


Epoch [1/5], Loss: 0.0819
Epoch [2/5], Loss: 0.0844
Epoch [3/5], Loss: 0.1049
Epoch [4/5], Loss: 0.0061
Epoch [5/5], Loss: 0.0391
Accuracy of the network on the 10000 test images: 97.28 %


In [11]:
from pytorch_nndct import Inspector

target = "DPUCVDX8G_ISA3_C32B6"
#DPUCVDX8G
# Initialize inspector with target
inspector = Inspector(target)
# Note: visualization of inspection results relies on the dot engine.If you don't install dot successfully, set 'image_format = None' when inspecting.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Net()

#dummy_data, labels = train_loader
#data_dummy = []

dummy_input = torch.randn(28,28,1)

inspector.inspect(model, dummy_input, device=device, output_dir="inspect", image_format=None) 


[0;32m[VAIQ_NOTE]: Inspector is on.[0m

[0;32m[VAIQ_NOTE]: =>Start to inspect model...[0m

[0;32m[VAIQ_NOTE]: =>Quant Module is in 'cpu'.[0m

[0;32m[VAIQ_NOTE]: =>Parsing Net...[0m

[0;32m[VAIQ_NOTE]: Start to trace and freeze model...[0m

[0;32m[VAIQ_NOTE]: The input model nndct_st_Net_ed is torch.nn.Module.[0m

[0;32m[VAIQ_NOTE]: Finish tracing.[0m

[0;32m[VAIQ_NOTE]: Processing ops...[0m


██████████████████████████████████████████████████| 8/8 [00:00<00:00, 3330.13it/s, OpInfo: name = return_0, type = Return]             


[0;32m[VAIQ_NOTE]: =>Doing weights equalization...[0m

[0;32m[VAIQ_NOTE]: =>Quantizable module is generated.(inspect/Net.py)[0m

[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_18:
node name:Net::Net/Linear[fc2]/ret.9, op type:nndct_dense, output shape: [1, 256]
node name:Net::Net/ret.11, op type:nndct_relu, output shape: [1, 256]

[0m

[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_18:
node name:Net::Net/Linear[fc1]/ret.5, op type:nndct_dense, output shape: [1, 512]
node name:Net::Net/ret.7, op type:nndct_relu, output shape: [1, 512]

[0m

[0;32m[VAIQ_NOTE]: Find subgraph for reshape_fix_1:
node name:Net::Net/ret.3, op type:nndct_reshape, output shape: [1, 784]

[0m

[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_20:
node name:Net::Net/Linear[fc3]/ret, op type:nndct_dense, output shape: [1, 10]

[0m

[0;32m[VAIQ_NOTE]: The operators assigned to the CPU are as follows(see more details in 'inspect/inspect_DPUCVDX8G_ISA3_C32B6.txt'):[0m



I20240329 17:56:57.064254 168544 compile_pass_manager.cpp:352] [UNILOG][INFO] Compile mode: dpu
I20240329 17:56:57.064272 168544 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240329 17:56:57.064276 168544 compile_pass_manager.cpp:357] [UNILOG][INFO] Target architecture: DPUCVDX8G_ISA3_C32B6
I20240329 17:56:57.064335 168544 compile_pass_manager.cpp:465] [UNILOG][INFO] Graph name: nndct_dense_nndct_relu_HAyVkj65JnpKqxlY, with op num: 9
I20240329 17:56:57.064339 168544 compile_pass_manager.cpp:478] [UNILOG][INFO] Begin to compile...
I20240329 17:56:57.070897 168544 compile_pass_manager.cpp:489] [UNILOG][INFO] Total device subgraph number 3, DPU subgraph number 1
I20240329 17:56:57.070916 168544 compile_pass_manager.cpp:504] [UNILOG][INFO] Compile done.
I20240329 17:56:57.075683 168544 compile_pass_manager.cpp:352] [UNILOG][INFO] Compile mode: dpu
I20240329 17:56:57.075695 168544 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240329 17:56:57.075699 168

node name       op Type        hardware constraints
--------------  -------------  -----------------------------------
Net::Net/ret.3  nndct_reshape  The input of reshape is not on DPU.

[0;32m[VAIQ_NOTE]: =>Finish inspecting.[0m


In [None]:
import pytorch_nndct
from pytorch_nndct.apis import torch_quantizer


quantizer = torch_quantizer("calib", model, dummy_input) #needs to be in test mode to generate x_model
quant_model = quantizer.quant_model # is quantize aware
qunat_model = train(quant_model)
evaluate(quant_model)


[0;33m[VAIQ_WARN][QUANTIZER_TORCH_CUDA_UNAVAILABLE]: CUDA (HIP) is not available, change device to CPU[0m

[0;32m[VAIQ_NOTE]: OS and CPU information:
               system --- Linux
                 node --- seakn-ThinkPad-P16s-Gen-1
              release --- 6.5.0-26-generic
              version --- #26~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Tue Mar 12 10:22:43 UTC 2
              machine --- x86_64
            processor --- x86_64[0m

[0;32m[VAIQ_NOTE]: Tools version information:
                  GCC --- GCC 7.5.0
               python --- 3.8.6
              pytorch --- 1.13.1
        vai_q_pytorch --- 3.5.0+60df3f1+torch1.13.1[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_CUDA_UNAVAILABLE]: CUDA (HIP) is not available, change device to CPU.[0m

[0;32m[VAIQ_NOTE]: Quant config file is empty, use default quant configuration[0m

[0;32m[VAIQ_NOTE]: Quantization calibration process start up...[0m

[0;32m[VAIQ_NOTE]: =>Quant Module is in 'cpu'.[0m

[0;32m[VAIQ_NOTE]: =>Parsing Net.

██████████████████████████████████████████████████| 8/8 [00:00<00:00, 3276.16it/s, OpInfo: name = return_0, type = Return]             


[0;32m[VAIQ_NOTE]: =>Doing weights equalization...[0m

[0;32m[VAIQ_NOTE]: =>Quantizable module is generated.(quantize_result/Net.py)[0m

[0;32m[VAIQ_NOTE]: =>Get module with quantization.[0m





In [10]:
%tb

SystemExit: 0