In [None]:
!pip install --upgrade pip
!git clone https://github.com/IntelLabs/FP8-Emulation-Toolkit.git
%cd FP8-Emulation-Toolkit
!pip install -r requirements.txt
!python setup.py install

In [None]:
# Rebuild the toolkit inside the cloned checkout.
# NOTE(review): the absolute /content/... path assumes a Colab-style working
# directory — confirm before running this notebook elsewhere.
%cd /content/FP8-Emulation-Toolkit
# Drop previous build artifacts, then build the extensions in place next to the sources.
!python setup.py clean --all
!python setup.py build_ext --inplace

In [None]:
# Print the versions of the CUDA compiler (nvcc) and gcc found on PATH.
!nvcc --version
!gcc --version

In [None]:
# Smoke test: the import raises if `mpemu` is not importable from this kernel,
# so the message below is only printed when the import succeeded.
import mpemu
print("FP8 Emulation Toolkit installed successfully!")

FP8 Emulation Toolkit installed successfully!


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from Conv2d import CustomConv2D  # Import custom convolution layer
from mpemu import mpt_emu  # Import FP8 Emulation Toolkit

class MNISTModel(nn.Module):
    """Small CNN: two 3x3 conv + max-pool stages followed by two linear layers.

    The first linear layer expects 64 * 7 * 7 features per sample, i.e. a
    1-channel 28x28 input after two stride-2 poolings. The output has 10 values
    per sample (raw logits; no softmax is applied here).

    Args:
        custom_conv: If True, ``conv1``/``conv2`` are built with ``CustomConv2D``
            (used for inference in this notebook); otherwise with ``nn.Conv2d``
            (used for training). Both variants use the same attribute names.
    """

    def __init__(self, custom_conv=False):
        super().__init__()
        # Same constructor arguments for both variants; only the layer class differs.
        # CustomConv2D is only looked up when custom_conv is True.
        conv_layer = CustomConv2D if custom_conv else nn.Conv2d
        self.conv1 = conv_layer(1, 32, kernel_size=3, padding=1)
        self.conv2 = conv_layer(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return the 10 class scores for each input image.

        Raises:
            RuntimeError: If the flattened per-sample feature count does not
                match ``fc1`` (for example, the input is not 28x28).
        """
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        # Flatten each sample's (C, H, W) block on its own. The previous
        # `x.view(-1, 64 * 7 * 7)` regrouped elements across samples whenever the
        # total element count happened to be divisible by 3136, silently
        # returning the wrong number of rows instead of failing.
        x = torch.flatten(x, start_dim=-3)
        # Keep a leading batch dimension (also for an unbatched (C, H, W) input).
        x = x.reshape(-1, x.shape[-1])
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

# Hyperparameters
batch_size = 64
learning_rate = 0.001
num_epochs = 5

# Seed PyTorch's global RNG before the model and loaders are created so weight
# initialisation and DataLoader shuffling repeat across "Restart & Run All".
# Without it every run trains a different model. (Some GPU kernels may still
# introduce small run-to-run differences.)
seed = 0
torch.manual_seed(seed)

# Convert images to tensors and normalise the single channel.
# NOTE(review): 0.1307 / 0.3081 are presumably the MNIST mean / std — confirm.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Downloads MNIST into ./data on first use.
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Model for training (using PyTorch's Conv2d)
model = MNISTModel(custom_conv=False).float()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
def train_model(model, criterion, optimizer, train_loader, device, num_epochs):
    """Train ``model`` in place and print the mean batch loss after each epoch.

    Args:
        model: Module to optimise; switched to training mode at the start of every epoch.
        criterion: Callable returning a scalar loss from ``(outputs, labels)``.
        optimizer: Optimiser already bound to ``model``'s parameters.
        train_loader: Iterable yielding ``(images, labels)`` batches. It does not
            need to implement ``__len__``; batches are counted while iterating.
        device: Device each batch is moved to before the forward pass.
        num_epochs: Number of full passes over ``train_loader``.

    Raises:
        ValueError: If an epoch yields no batches, so no average loss exists.
    """
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        # Average over the batches actually seen instead of len(train_loader):
        # this works for loaders without a length and avoids a bare
        # ZeroDivisionError when the loader is empty.
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; cannot compute an average loss")
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / num_batches:.4f}')

train_model(model, criterion, optimizer, train_loader, device, num_epochs)

# Switch to custom conv layers for inference: build the CustomConv2D variant and
# load the trained weights into it through the training model's state_dict.
model_custom = MNISTModel(custom_conv=True)
model_custom.load_state_dict(model.state_dict())
model_custom = model_custom.to(device)

# Quantize the model for inference using FP8 (after training is complete)
# NOTE(review): the exempt list names all four layers (both convs and both linears),
# not only the fully connected ones — confirm this is intended and what
# quantize_model still converts when every layer is exempt.
list_exempt_layers = ["conv1","conv2","fc1", "fc2"]  # Layer names passed to quantize_model as exempt from FP8 conversion
model_custom, emulator = mpt_emu.quantize_model(model_custom, dtype="E4M3", list_exempt_layers=list_exempt_layers)

# Function to evaluate the model after quantization
def evaluate_model(model, test_loader, device):
    """Print the top-1 accuracy of ``model`` over ``test_loader``.

    Args:
        model: Module to evaluate; it is switched to eval mode and left there.
        test_loader: Iterable yielding ``(images, labels)`` batches.
        device: Device each batch is moved to before the forward pass.

    Raises:
        ValueError: If ``test_loader`` yields no samples, so accuracy is undefined.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            # Predicted class = index of the largest score per row. argmax on the
            # output replaces torch.max(outputs.data, 1); `.data` is unnecessary
            # under no_grad.
            predicted = model(images).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Fail with a clear message instead of a bare ZeroDivisionError.
    if total == 0:
        raise ValueError("test_loader yielded no samples; accuracy is undefined")
    accuracy = 100 * correct / total
    print(f'Accuracy after quantization: {accuracy:.2f}%')

# Evaluate the quantized model on the test set (the accuracy is printed, not returned)
evaluate_model(model_custom, test_loader, device)


Epoch [1/5], Loss: 0.1362
Epoch [2/5], Loss: 0.0427
Epoch [3/5], Loss: 0.0283
Epoch [4/5], Loss: 0.0216
Epoch [5/5], Loss: 0.0168
e4m3 : quantizing model weights..
Accuracy after quantization: 98.96%


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from Conv2d import CustomConv2D
from mpemu import mpt_emu

class MNISTModel(nn.Module):
    """Small CNN: two 3x3 conv + max-pool stages followed by two linear layers.

    The first linear layer expects 64 * 7 * 7 features per sample, i.e. a
    1-channel 28x28 input after two stride-2 poolings. The output has 10 values
    per sample (raw logits; no softmax is applied here).

    Args:
        custom_conv: If True, ``conv1``/``conv2`` are built with ``CustomConv2D``
            (used for inference in this notebook); otherwise with ``nn.Conv2d``
            (used for training). Both variants use the same attribute names.
    """

    def __init__(self, custom_conv=False):
        super().__init__()
        # Same constructor arguments for both variants; only the layer class differs.
        # CustomConv2D is only looked up when custom_conv is True.
        conv_layer = CustomConv2D if custom_conv else nn.Conv2d
        self.conv1 = conv_layer(1, 32, kernel_size=3, padding=1)
        self.conv2 = conv_layer(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return the 10 class scores for each input image.

        Raises:
            RuntimeError: If the flattened per-sample feature count does not
                match ``fc1`` (for example, the input is not 28x28).
        """
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        # Flatten each sample's (C, H, W) block on its own. The previous
        # `x.view(-1, 64 * 7 * 7)` regrouped elements across samples whenever the
        # total element count happened to be divisible by 3136, silently
        # returning the wrong number of rows instead of failing.
        x = torch.flatten(x, start_dim=-3)
        # Keep a leading batch dimension (also for an unbatched (C, H, W) input).
        x = x.reshape(-1, x.shape[-1])
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

# Build a forward hook that prints what goes into and comes out of a layer
def hook_fn(layer_name):
    """Return a forward hook labelled with ``layer_name``.

    The returned callable takes ``(module, inputs, output)``. It prints the
    label, the shapes of the first positional input and of the output, and
    then both values in full. It returns None.
    """
    def _report(_module, inputs, result):
        print(f"Layer: {layer_name}")
        # Only the first positional input of the layer is reported.
        first_input = inputs[0]
        print(f"Input tensor shape: {first_input.shape}")
        print(f"Output tensor shape: {result.shape}")
        print(f"Input tensor: {first_input}")
        # The trailing newline leaves a blank line between layers.
        print(f"Output tensor: {result}\n")

    return _report

# Attach hooks to layers
def attach_hooks(model):
    """Register a printing forward hook on ``conv1``, ``conv2``, ``fc1`` and ``fc2``.

    Args:
        model: Module exposing those four attributes.

    Returns:
        list: The handles returned by ``register_forward_hook``, in registration
        order. Call ``.remove()`` on each to detach the hooks. Previously the
        handles were discarded, so hooks could not be removed and re-running
        the cell stacked duplicates.
    """
    return [
        getattr(model, layer_name).register_forward_hook(hook_fn(layer_name))
        for layer_name in ('conv1', 'conv2', 'fc1', 'fc2')
    ]

# Hyperparameters
batch_size = 64
learning_rate = 0.001
num_epochs = 5

# Seed PyTorch's global RNG before the model and loaders are created so weight
# initialisation and DataLoader shuffling repeat across "Restart & Run All".
# Without it every run trains a different model. (Some GPU kernels may still
# introduce small run-to-run differences.)
seed = 0
torch.manual_seed(seed)

# Convert images to tensors and normalise the single channel.
# NOTE(review): 0.1307 / 0.3081 are presumably the MNIST mean / std — confirm.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Downloads MNIST into ./data on first use.
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Model for training (using PyTorch's Conv2d)
model = MNISTModel(custom_conv=False).float()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
def train_model(model, criterion, optimizer, train_loader, device, num_epochs):
    """Train ``model`` in place and print the mean batch loss after each epoch.

    Args:
        model: Module to optimise; switched to training mode at the start of every epoch.
        criterion: Callable returning a scalar loss from ``(outputs, labels)``.
        optimizer: Optimiser already bound to ``model``'s parameters.
        train_loader: Iterable yielding ``(images, labels)`` batches. It does not
            need to implement ``__len__``; batches are counted while iterating.
        device: Device each batch is moved to before the forward pass.
        num_epochs: Number of full passes over ``train_loader``.

    Raises:
        ValueError: If an epoch yields no batches, so no average loss exists.
    """
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        # Average over the batches actually seen instead of len(train_loader):
        # this works for loaders without a length and avoids a bare
        # ZeroDivisionError when the loader is empty.
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; cannot compute an average loss")
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / num_batches:.4f}')

train_model(model, criterion, optimizer, train_loader, device, num_epochs)

# Switch to custom conv layers for inference: build the CustomConv2D variant and
# load the trained weights into it through the training model's state_dict.
model_custom = MNISTModel(custom_conv=True)
model_custom.load_state_dict(model.state_dict())
model_custom = model_custom.to(device)

# Attach hooks to custom model for printing intermediate tensors
# (registered before quantize_model is called on the same model below)
attach_hooks(model_custom)

# Quantize the model for inference using FP8 (after training is complete)
# NOTE(review): the exempt list names all four layers (both convs and both linears),
# not only the fully connected ones — confirm this is intended and what
# quantize_model still converts when every layer is exempt.
list_exempt_layers = ["conv1","conv2","fc1", "fc2"]  # Layer names passed to quantize_model as exempt from FP8 conversion
model_custom, emulator = mpt_emu.quantize_model(model_custom, dtype="E4M3", list_exempt_layers=list_exempt_layers)

# Run a single image through the model so attached hooks can print intermediates
def inference_with_hooks(model, test_loader, device):
    """Run the first image of the first batch through ``model`` and print the result.

    The model is put in eval mode and gradients are disabled. Only the first
    batch of ``test_loader`` is fetched; if the loader yields nothing, nothing
    is printed. Returns None.
    """
    model.eval()
    with torch.no_grad():
        first_batch = next(iter(test_loader), None)
        if first_batch is None:
            return
        images, labels = first_batch
        images, labels = images.to(device), labels.to(device)
        # Keep a batch dimension of 1 around the selected image.
        sample = images[0].unsqueeze(0)
        print(f"Input image shape: {sample.shape}")
        prediction = model(sample)
        print(f"Model output: {prediction}")

# Run one test image through the hooked model. Each hooked layer prints its full
# input and output tensors, so expect a large amount of output.
inference_with_hooks(model_custom, test_loader, device)

Epoch [1/5], Loss: 0.1319
Epoch [2/5], Loss: 0.0411
Epoch [3/5], Loss: 0.0291
Epoch [4/5], Loss: 0.0210
Epoch [5/5], Loss: 0.0160
e4m3 : quantizing model weights..
Input image shape: torch.Size([1, 1, 28, 28])
Layer: conv1
Input tensor shape: torch.Size([1, 1, 28, 28])
Output tensor shape: torch.Size([1, 32, 28, 28])
Input tensor: tensor([[[[-0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,

In [None]:
# Show the layer structure of the model that was trained with nn.Conv2d layers.
print(model)

MNISTModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=3136, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)


In [None]:
# Show the layer structure of the CustomConv2D inference model after quantize_model.
print(model_custom)

MNISTModel(
  (conv1): CustomConv2D()
  (conv2): CustomConv2D()
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=3136, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)
