In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim

# Import the ResNet18 class from your resnet18.py file

# Import the model from the script
import resnet18

# Set device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load ResNet-18 model
model = resnet18.resnet18(pretrained=False, device=device)  # Don't use default pretrained weights
model.to(device)

# Load the state dict from the .pt file
state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))

# Load the saved weights into the model, using strict=False to ignore mismatches
model.load_state_dict(state_dict, strict=False)

# Set the model to evaluation mode and move to device
model.eval()
model.to(device)

# Define the transformations for CIFAR-10
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Load the CIFAR-10 test dataset
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)

# Evaluate the model
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')

  state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))


Files already downloaded and verified
Accuracy of the network on the 10000 test images: 92 %


In [None]:
# Evaluate Model after Quantization
def evaluate(model, test_loader):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy after Quantization: {100 * correct / total:.2f}%")

In [None]:
NUM_BITS = 4
def uniform_quantization(tensor, num_bits=NUM_BITS):
    min_val, max_val = tensor.min(), tensor.max()
    scale = (max_val - min_val) / (2 ** num_bits - 1)
    quantized_tensor = torch.round((tensor - min_val) / scale) * scale + min_val
    return quantized_tensor


# Apply Uniform Quantization
for name, param in model.named_parameters():
    if "weight" in name:
        print(name)
        param.data = uniform_quantization(param.data)



conv1.weight
bn1.weight
layer1.0.conv1.weight
layer1.0.bn1.weight
layer1.0.conv2.weight
layer1.0.bn2.weight
layer1.1.conv1.weight
layer1.1.bn1.weight
layer1.1.conv2.weight
layer1.1.bn2.weight
layer2.0.conv1.weight
layer2.0.bn1.weight
layer2.0.conv2.weight
layer2.0.bn2.weight
layer2.0.downsample.0.weight
layer2.0.downsample.1.weight
layer2.1.conv1.weight
layer2.1.bn1.weight
layer2.1.conv2.weight
layer2.1.bn2.weight
layer3.0.conv1.weight
layer3.0.bn1.weight
layer3.0.conv2.weight
layer3.0.bn2.weight
layer3.0.downsample.0.weight
layer3.0.downsample.1.weight
layer3.1.conv1.weight
layer3.1.bn1.weight
layer3.1.conv2.weight
layer3.1.bn2.weight
layer4.0.conv1.weight
layer4.0.bn1.weight
layer4.0.conv2.weight
layer4.0.bn2.weight
layer4.0.downsample.0.weight
layer4.0.downsample.1.weight
layer4.1.conv1.weight
layer4.1.bn1.weight
layer4.1.conv2.weight
layer4.1.bn2.weight
fc.weight


In [None]:
# Evaluate Model after AdaRound Quantization
def evaluate(model, test_loader):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy after Uniform quantization: {100 * correct / total:.2f}%")
evaluate(model, testloader)

Test Accuracy after Uniform quantization: 89.89%


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.utils.data as data
import timm
import numpy as np

# Hyperparameters
BATCH_SIZE = 128
NUM_BITS = 2
BETA = 2.0
LAMBDA = 0.01  # Regularization weight

# Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = datasets.CIFAR10(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.CIFAR10(root='./data', train=False, transform=transform, download=True)

test_loader = data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Load Pretrained ResNet18 from timm
import resnet18

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = resnet18.resnet18(pretrained=False, device=device)
model.to(device)

state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))
model.load_state_dict(state_dict, strict=False)
model.eval()
model.to(device)
# Evaluate Model after Quantization
def evaluate(model, test_loader):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy after Quantization: {100 * correct / total:.2f}%")
evaluate(model, test_loader)
# AdaRound: Adaptive Rounding Quantization with V Optimization

def adaround_round(tensor, v, num_bits=NUM_BITS, beta=BETA, lambda_reg=LAMBDA):
    scale = (tensor.max() - tensor.min()) / (2 ** num_bits - 1)
    h = torch.sigmoid(beta * v)
    rounded_tensor = torch.floor(tensor / scale) + h
    quantized_tensor = (rounded_tensor * scale).clamp(tensor.min(), tensor.max())
    regularization = lambda_reg * torch.sum(1 - torch.abs(2 * h - 1))
    return quantized_tensor, regularization

def uniform_quantization(tensor, num_bits=NUM_BITS):
    min_val, max_val = tensor.min(), tensor.max()
    scale = (max_val - min_val) / (2 ** num_bits - 1)
    quantized_tensor = torch.round((tensor - min_val) / scale) * scale + min_val
    return quantized_tensor

def get_previous_layer_output(model, images, current_layer_name):
    layers = list(model.named_modules())
    prev_layer_name = None
    for i, (layer_name, _) in enumerate(layers):
        if layer_name == current_layer_name and i > 0:
            prev_layer_name = layers[i - 1][0]
            break
    if prev_layer_name is None:
        raise ValueError(f"Could not determine previous layer for {current_layer_name}")

    activation = {}
    def hook_fn(module, input, output):
        activation[prev_layer_name] = output.detach()

    handle = dict(model.named_modules())[prev_layer_name].register_forward_hook(hook_fn)
    with torch.no_grad():
        model(images)
    handle.remove()
    return activation[prev_layer_name]

def optimize_adaround(model, test_loader, num_iterations=1000, lr=0.01):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    first_conv = True  # Track first convolutional layer
    for name, param in model.named_parameters():
        if "conv" in name and "weight" in name:  # Only quantize convolutional layers
            if first_conv:
                first_conv = False
                continue  # Skip first convolutional layer
            v = torch.nn.Parameter(torch.zeros_like(param, device=device))
            original_weight = param.clone().detach()
            optimizer_v = optim.Adam([v], lr=lr)

            for images, _ in test_loader:
                images = images.to(device)
                with torch.no_grad():
                    x = get_previous_layer_output(model, images, name.replace("weight", ""))
                break  # Only need a single batch for optimization

            for _ in range(num_iterations):
                optimizer_v.zero_grad()
                quantized_weight, reg_loss = adaround_round(original_weight, v)

                # Ensure correct stride and padding based on original layer
                stride = model.state_dict()[name.replace("weight", "stride")]
                padding = model.state_dict()[name.replace("weight", "padding")]

                quantized_output = nn.functional.conv2d(x, quantized_weight, stride=stride, padding=padding)
                original_output = nn.functional.conv2d(x, original_weight, stride=stride, padding=padding)

                loss = torch.norm(original_output - quantized_output, p='fro') ** 2 + reg_loss
                loss.backward()
                optimizer_v.step()

            with torch.no_grad():
                param.copy_(adaround_round(original_weight, v)[0])

    print("AdaRound optimization complete.")

# Apply AdaRound Quantization
optimize_adaround(model, test_loader)

# # Apply Uniform Quantization to Conv Layers Only
# for name, param in model.named_parameters():
#     if "conv" in name and "weight" in name:
#         param.data = uniform_quantization(param.data)



evaluate(model, test_loader)

Files already downloaded and verified
Files already downloaded and verified


  state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))


Test Accuracy after Quantization: 86.53%


ValueError: Could not determine previous layer for layer1.0.conv1.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.utils.data as data
import timm
import numpy as np

# Hyperparameters
BATCH_SIZE = 128
NUM_BITS = 3
BETA = 2.0
LAMBDA = 0.01  # Regularization weight

# Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = datasets.CIFAR10(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.CIFAR10(root='./data', train=False, transform=transform, download=True)

test_loader = data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Load Pretrained ResNet18 from timm
import resnet18

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = resnet18.resnet18(pretrained=False, device=device)
model.to(device)

state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))
model.load_state_dict(state_dict, strict=False)
model.eval()
model.to(device)

# AdaRound: Adaptive Rounding Quantization with V Optimization

def adaround_round(tensor, v, num_bits=NUM_BITS, beta=BETA, lambda_reg=LAMBDA):
    scale = (tensor.max() - tensor.min()) / (2 ** num_bits - 1)
    h = torch.sigmoid(beta * v)
    rounded_tensor = torch.floor(tensor / scale) + h
    quantized_tensor = (rounded_tensor * scale).clamp(tensor.min(), tensor.max())
    regularization = lambda_reg * torch.sum(1 - torch.abs(2 * h - 1))
    return quantized_tensor, regularization

def uniform_quantization(tensor, num_bits=NUM_BITS):
    min_val, max_val = tensor.min(), tensor.max()
    scale = (max_val - min_val) / (2 ** num_bits - 1)
    quantized_tensor = torch.round((tensor - min_val) / scale) * scale + min_val
    return quantized_tensor

def get_previous_layer_output(model, images, current_layer_name):
    layers = list(model.named_modules())
    prev_layer_name = None
    for i, (layer_name, _) in enumerate(layers):
        if layer_name == current_layer_name and i > 0:
            prev_layer_name = layers[i - 1][0]
            break
    if prev_layer_name is None:
        raise ValueError(f"Could not determine previous layer for {current_layer_name}")

    activation = {}
    def hook_fn(module, input, output):
        activation[prev_layer_name] = output.detach()

    handle = dict(model.named_modules())[prev_layer_name].register_forward_hook(hook_fn)
    with torch.no_grad():
        model(images)
    handle.remove()
    return activation[prev_layer_name]

def optimize_adaround(model, test_loader, num_iterations=1000, lr=0.01):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    for name, param in model.named_parameters():
        if "conv" in name and "weight" in name and not "conv1":  # Only quantize convolutional layers
            v = torch.nn.Parameter(torch.zeros_like(param, device=device))
            original_weight = param.clone().detach()
            optimizer_v = optim.Adam([v], lr=lr)

            for images, _ in test_loader:
                images = images.to(device)
                with torch.no_grad():
                    x = get_previous_layer_output(model, images, name.replace("weight", ""))
                break  # Only need a single batch for optimization

            for _ in range(num_iterations):
                optimizer_v.zero_grad()
                quantized_weight, reg_loss = adaround_round(original_weight, v)

                # Ensure correct stride and padding based on original layer
                stride = model.state_dict()[name.replace("weight", "stride")]
                padding = model.state_dict()[name.replace("weight", "padding")]

                quantized_output = nn.functional.conv2d(x, quantized_weight, stride=stride, padding=padding)
                original_output = nn.functional.conv2d(x, original_weight, stride=stride, padding=padding)

                loss = torch.norm(original_output - quantized_output, p='fro') ** 2 + reg_loss
                loss.backward()
                optimizer_v.step()

            with torch.no_grad():
                param.copy_(adaround_round(original_weight, v)[0])

    print("AdaRound optimization complete.")

# Apply AdaRound Quantization
#optimize_adaround(model, test_loader)

# Apply Uniform Quantization to Conv Layers Only
for name, param in model.named_parameters():
    if "conv" in name and "weight" in name:
        param.data = uniform_quantization(param.data)

# Evaluate Model after Quantization
def evaluate(model, test_loader):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy after Quantization: {100 * correct / total:.2f}%")

evaluate(model, test_loader)

Files already downloaded and verified
Files already downloaded and verified


  state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))


Test Accuracy after Quantization: 22.76%


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.utils.data as data
import timm
import numpy as np

# Hyperparameters
BATCH_SIZE = 128
NUM_BITS = 4
BETA = 2.0
LAMBDA = 0.01  # Regularization weight

# Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = datasets.CIFAR10(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.CIFAR10(root='./data', train=False, transform=transform, download=True)

test_loader = data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Load Pretrained ResNet18 from timm
import resnet18

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = resnet18.resnet18(pretrained=False, device=device)
model.to(device)

state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))
model.load_state_dict(state_dict, strict=False)
model.eval()
model.to(device)

# AdaRound: Adaptive Rounding Quantization with V Optimization

def adaround_round(tensor, v, num_bits=NUM_BITS, beta=BETA, lambda_reg=LAMBDA):
    scale = (tensor.max() - tensor.min()) / (2 ** num_bits - 1)
    h = torch.sigmoid(beta * v)
    rounded_tensor = torch.floor(tensor / scale) + h
    quantized_tensor = (rounded_tensor * scale).clamp(tensor.min(), tensor.max())
    regularization = lambda_reg * torch.sum(1 - torch.abs(2 * h - 1))
    return quantized_tensor, regularization

def uniform_quantization(tensor, num_bits=NUM_BITS):
    min_val, max_val = tensor.min(), tensor.max()
    scale = (max_val - min_val) / (2 ** num_bits - 1)
    quantized_tensor = torch.round((tensor - min_val) / scale) * scale + min_val
    return quantized_tensor

def get_previous_layer_output(model, images, current_layer_name):
    layers = list(model.named_modules())
    prev_layer_name = None
    for i, (layer_name, _) in enumerate(layers):
        if layer_name == current_layer_name and i > 0:
            prev_layer_name = layers[i - 1][0]
            break
    if prev_layer_name is None:
        raise ValueError(f"Could not determine previous layer for {current_layer_name}")

    activation = {}
    def hook_fn(module, input, output):
        activation[prev_layer_name] = output.detach()

    handle = dict(model.named_modules())[prev_layer_name].register_forward_hook(hook_fn)
    with torch.no_grad():
        model(images)
    handle.remove()
    return activation[prev_layer_name]

def optimize_adaround(model, test_loader, num_iterations=1000, lr=0.01):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    updated_state_dict = model.state_dict()

    for name, param in model.named_parameters():
        if "conv" in name and "weight" in name and not 'conv1':  # Only quantize convolutional layers
            print(name)
            v = torch.nn.Parameter(torch.zeros_like(param, device=device))
            original_weight = param.clone().detach()
            optimizer_v = optim.Adam([v], lr=lr)

            for images, _ in test_loader:
                images = images.to(device)
                with torch.no_grad():
                    x = get_previous_layer_output(model, images, name.replace("weight", ""))
                break  # Only need a single batch for optimization

            for _ in range(num_iterations):
                optimizer_v.zero_grad()
                quantized_weight, reg_loss = adaround_round(original_weight, v)

                stride = model.state_dict()[name.replace("weight", "stride")]
                padding = model.state_dict()[name.replace("weight", "padding")]

                quantized_output = nn.functional.conv2d(x, quantized_weight, stride=stride, padding=padding)
                original_output = nn.functional.conv2d(x, original_weight, stride=stride, padding=padding)

                loss = torch.norm(original_output - quantized_output, p='fro') ** 2 + reg_loss
                loss.backward()
                optimizer_v.step()

            with torch.no_grad():
                updated_state_dict[name] = adaround_round(original_weight, v)[0]

    model.load_state_dict(updated_state_dict)  # Ensure model actually uses quantized weights
    print("AdaRound optimization complete. Model weights updated.")

# Apply AdaRound Quantization
optimize_adaround(model, test_loader)

# # Apply Uniform Quantization to Conv Layers Only
# for name, param in model.named_parameters():
#     if "conv" in name and "weight" in name:
#         param.data = uniform_quantization(param.data)

# Reload state dict to ensure updated weights are used
model.load_state_dict(model.state_dict())

# Evaluate Model after Quantization
def evaluate(model, test_loader):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy after Quantization: {100 * correct / total:.2f}%")

evaluate(model, test_loader)


Files already downloaded and verified
Files already downloaded and verified


  state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))


AdaRound optimization complete. Model weights updated.
Test Accuracy after Quantization: 86.53%


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.utils.data as data
import timm
import numpy as np

# Hyperparameters
BATCH_SIZE = 128
NUM_BITS = 4
BETA = 2.0
LAMBDA = 0.0001  # Regularization weight

# Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = datasets.CIFAR10(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.CIFAR10(root='./data', train=False, transform=transform, download=True)

test_loader = data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Load Pretrained ResNet18 from timm
import resnet18

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = resnet18.resnet18(pretrained=False, device=device)
model.to(device)

state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))
model.load_state_dict(state_dict, strict=False)
model.eval()
model.to(device)
# Evaluate Model after Quantization
def evaluate(model, test_loader):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy after Quantization: {100 * correct / total:.2f}%")

evaluate(model, test_loader)



# Store layer activations
temp_activations = {}

def activation_hook(layer_name):
    def hook(module, input, output):
        temp_activations[layer_name] = input[0].detach()
    return hook

# Register hooks for all convolutional layers, skipping the first conv layer
first_conv = True
for name, layer in model.named_modules():
    if isinstance(layer, nn.Conv2d):
        if first_conv:
            first_conv = False
            continue  # Skip the first convolutional layer
        layer.register_forward_hook(activation_hook(name))

# AdaRound: Adaptive Rounding Quantization with V Optimization

def adaround_round(tensor, v, num_bits=NUM_BITS, beta=BETA, lambda_reg=LAMBDA):
    scale = (tensor.max() - tensor.min()) / (2 ** num_bits - 1)
    h = torch.sigmoid(beta * v)
    rounded_tensor = torch.floor(tensor / scale) + h
    quantized_tensor = (rounded_tensor * scale).clamp(tensor.min(), tensor.max())
    regularization = lambda_reg * torch.sum(1 - torch.abs(2 * h - 1))
    return quantized_tensor, regularization

def uniform_quantization(tensor, num_bits=NUM_BITS):
    min_val, max_val = tensor.min(), tensor.max()
    scale = (max_val - min_val) / (2 ** num_bits - 1)
    quantized_tensor = torch.round((tensor - min_val) / scale) * scale + min_val
    return quantized_tensor

def optimize_adaround(model, test_loader, num_iterations=100, lr=0.01):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    updated_state_dict = model.state_dict()
    print("optimize started")
    for name, param in model.named_parameters():
        if "conv" in name and "weight" in name:
            print(name)
            name = name.replace(".weight", "")
            if name not in temp_activations:
                continue  # Skip if activation was not stored
            print(name)
            v = torch.nn.Parameter(torch.zeros_like(param, device=device))
            original_weight = param.clone().detach()
            optimizer_v = optim.Adam([v], lr=lr)

            #x = temp_activations[name.replace("weight", "")]  # Get stored activation
            x = temp_activations[name]
            prev_loss = 100
            for iter in range(num_iterations):
                optimizer_v.zero_grad()
                quantized_weight, reg_loss = adaround_round(original_weight, v)

                quantized_output = nn.functional.conv2d(x, quantized_weight, stride=param.shape[2], padding=param.shape[3])
                original_output = nn.functional.conv2d(x, original_weight, stride=param.shape[2], padding=param.shape[3])

                loss = torch.norm(original_output - quantized_output, p='fro') ** 2 + reg_loss
                if prev_loss < torch.norm(original_output - quantized_output, p='fro') ** 2:
                  break
                else :
                  prev_loss = torch.norm(original_output - quantized_output, p='fro') ** 2
                if iter%10==0:
                  print(loss)
                  print(torch.norm(original_output - quantized_output, p='fro') ** 2)
                loss.backward()
                optimizer_v.step()

            with torch.no_grad():
                layer_name = name+".weight"
                print(layer_name)
                updated_state_dict[layer_name] = adaround_round(original_weight, v)[0]

    model.load_state_dict(updated_state_dict)  # Ensure model actually uses quantized weights
    print("AdaRound optimization complete. Model weights updated.")

# Run a forward pass to store activations
with torch.no_grad():
    for images, _ in test_loader:
        images = images.to(device)
        model(images)
        break  # Only need a single batch

# Apply AdaRound Quantization
optimize_adaround(model, test_loader)

# # Apply Uniform Quantization to Conv Layers Only
# for name, param in model.named_parameters():
#     if "conv" in name and "weight" in name:
#         param.data = uniform_quantization(param.data)

# Reload state dict to ensure updated weights are used
model.load_state_dict(model.state_dict())

# Evaluate Model after Quantization
def evaluate(model, test_loader):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy after Quantization: {100 * correct / total:.2f}%")

evaluate(model, test_loader)


Files already downloaded and verified
Files already downloaded and verified


  state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))


Test Accuracy after Quantization: 86.53%
optimize started
conv1.weight
layer1.0.conv1.weight
layer1.0.conv1
tensor(4.7359, grad_fn=<AddBackward0>)
tensor(1.0495, grad_fn=<PowBackward0>)
tensor(3.8761, grad_fn=<AddBackward0>)
tensor(0.3731, grad_fn=<PowBackward0>)
tensor(3.6310, grad_fn=<AddBackward0>)
tensor(0.2345, grad_fn=<PowBackward0>)
tensor(3.4613, grad_fn=<AddBackward0>)
tensor(0.1884, grad_fn=<PowBackward0>)
tensor(3.3241, grad_fn=<AddBackward0>)
tensor(0.1641, grad_fn=<PowBackward0>)
tensor(3.2053, grad_fn=<AddBackward0>)
tensor(0.1511, grad_fn=<PowBackward0>)
tensor(3.0989, grad_fn=<AddBackward0>)
tensor(0.1457, grad_fn=<PowBackward0>)
layer1.0.conv1.weight
layer1.0.conv2.weight
layer1.0.conv2
tensor(4.0245, grad_fn=<AddBackward0>)
tensor(0.3381, grad_fn=<PowBackward0>)
layer1.0.conv2.weight
layer1.1.conv1.weight
layer1.1.conv1
tensor(4.7363, grad_fn=<AddBackward0>)
tensor(1.0499, grad_fn=<PowBackward0>)
tensor(3.9523, grad_fn=<AddBackward0>)
tensor(0.4341, grad_fn=<PowBackwa

In [None]:
print(temp_activations.keys())

NameError: name 'temp_activations' is not defined

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.utils.data as data
import timm
import numpy as np

# Hyperparameters
BATCH_SIZE = 128
NUM_BITS = 3
BETA = 2.0
LAMBDA = 0.01  # Regularization weight

# Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = datasets.CIFAR10(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.CIFAR10(root='./data', train=False, transform=transform, download=True)

test_loader = data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Load Pretrained ResNet18 from timm
import resnet18

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = resnet18.resnet18(pretrained=False, device=device)
model.to(device)

state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))
model.load_state_dict(state_dict, strict=False)
model.eval()
model.to(device)



def scaled_uniform_quantization(tensor, num_bits=NUM_BITS):
    min_val, max_val = tensor.min(), tensor.max()

    # Apply power function scaling (x^0.55)
    tensor_sign = torch.sign(tensor)
    tensor_scaled = tensor.abs() ** 0.55 * tensor_sign  # Preserve sign

    scale = (tensor_scaled.max() - tensor_scaled.min()) / (2 ** num_bits - 1)
    quantized_tensor = torch.round((tensor_scaled - tensor_scaled.min()) / scale) * scale + tensor_scaled.min()

    # Descale back to original range
    quantized_tensor = (quantized_tensor.abs() ** (1/0.55)) * torch.sign(quantized_tensor)

    return quantized_tensor





# Apply AdaRound Quantization
#optimize_adaround(model, test_loader)

# Apply Uniform Quantization to Conv Layers Only
for name, param in model.named_parameters():
    if "conv" in name and "weight" in name:
        param.data = scaled_uniform_quantization(param.data)

# Evaluate Model after Quantization
def evaluate(model, test_loader):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy after Quantization: {100 * correct / total:.2f}%")

evaluate(model, test_loader)

Files already downloaded and verified
Files already downloaded and verified


  state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))


Test Accuracy after Quantization: 80.88%


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.utils.data as data
import timm
import numpy as np

# Hyperparameters
BATCH_SIZE = 128
NUM_BITS = 3
BETA = 2.0
LAMBDA = 0.01  # Regularization weight

# Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = datasets.CIFAR10(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.CIFAR10(root='./data', train=False, transform=transform, download=True)

test_loader = data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Load Pretrained ResNet18 from timm
import resnet18

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = resnet18.resnet18(pretrained=False, device=device)
model.to(device)

state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))
model.load_state_dict(state_dict, strict=False)
model.eval()
model.to(device)



def offset_scaled_uniform_quantization(tensor, num_bits=NUM_BITS):
    min_val, max_val = tensor.min(), tensor.max()

    # Compute offset to center weights if necessary
    offset = tensor.mean()
    print(offset)
    tensor_shifted = tensor - offset

    # Apply power function scaling (x^0.55)
    tensor_sign = torch.sign(tensor_shifted)
    tensor_scaled = tensor_shifted.abs() ** 0.55 * tensor_sign  # Preserve sign

    scale = (tensor_scaled.max() - tensor_scaled.min()) / (2 ** num_bits - 1)
    quantized_tensor = torch.round((tensor_scaled - tensor_scaled.min()) / scale) * scale + tensor_scaled.min()

    # Descale back to original range and apply offset correction
    quantized_tensor = ((quantized_tensor.abs() ** (1/0.55)) * torch.sign(quantized_tensor)) + offset

    return quantized_tensor





# Apply AdaRound Quantization
#optimize_adaround(model, test_loader)

# Apply Uniform Quantization to Conv Layers Only
for name, param in model.named_parameters():
    if "conv" in name and "weight" in name:
        param.data = offset_scaled_uniform_quantization(param.data)

# Evaluate Model after Quantization
def evaluate(model, test_loader):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy after Quantization: {100 * correct / total:.2f}%")

evaluate(model, test_loader)

Files already downloaded and verified
Files already downloaded and verified


  state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))


tensor(-0.0006)
tensor(-0.0005)
tensor(-0.0004)
tensor(-0.0004)
tensor(-0.0005)
tensor(-0.0006)
tensor(-0.0005)
tensor(-0.0005)
tensor(-0.0005)
tensor(-0.0002)
tensor(-0.0002)
tensor(-0.0002)
tensor(-0.0003)
tensor(-3.3380e-05)
tensor(4.6454e-05)
tensor(9.0216e-05)
tensor(2.5724e-05)
Test Accuracy after Quantization: 83.07%


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.utils.data as data
import timm
import numpy as np
from sklearn.cluster import KMeans

# Hyperparameters
BATCH_SIZE = 128
NUM_BITS = 3
BETA = 2.0
LAMBDA = 0.01  # Regularization weight

def compute_fisher_information(model, dataloader, device):
    model.eval()
    fisher_information = {}
    for name, param in model.named_parameters():
        if "weight" in name:
            fisher_information[name] = torch.zeros_like(param, device=device)
    counter = 0
    for images, labels in dataloader:
        images, labels = images.to(device), labels.to(device)
        model.zero_grad()
        outputs = model(images)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()

        for name, param in model.named_parameters():
            if "weight" in name and param.grad is not None:
                fisher_information[name] += (param.grad ** 2)
        counter+=1
        if counter==2:
          break #only need 1 batch

    return fisher_information

# Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = datasets.CIFAR10(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.CIFAR10(root='./data', train=False, transform=transform, download=True)

test_loader = data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

train_loader = data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Load Pretrained ResNet18 from timm
import resnet18

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = resnet18.resnet18(pretrained=False, device=device)
model.to(device)

state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))
model.load_state_dict(state_dict, strict=False)
model.eval()
model.to(device)

# Compute Fisher Information
fisher_information = compute_fisher_information(model, train_loader, device)


def uniform_quantization(tensor, num_bits=NUM_BITS, name=None):
    min_val, max_val = tensor.min(), tensor.max()

    # Compute Fisher Information sensitivity mask
    if name and name in fisher_information:
        fisher_info = fisher_information[name]
        sensitivity_mask = fisher_info > torch.quantile(fisher_info, 0.3)
        sensitive_weights = tensor[sensitivity_mask]
    else:
        sensitivity_mask = torch.abs(tensor) > torch.quantile(torch.abs(tensor), 0.95)
        sensitive_weights = tensor[sensitivity_mask]

    offset = sensitive_weights.mean() if sensitive_weights.numel() > 0 else tensor.mean()
    print(offset)
    tensor_shifted = tensor - offset

    # # Apply sensitivity-aware clustering (k-means)
    # tensor_flat = tensor_shifted.view(-1, 1).cpu().numpy()
    # kmeans = KMeans(n_clusters=2 ** num_bits, n_init=10).fit(tensor_flat)
    # clustered_tensor = torch.tensor(kmeans.cluster_centers_[kmeans.labels_], device=tensor.device).view(tensor.shape)

    # Apply power function scaling (x^0.55)
    # tensor_sign = torch.sign(clustered_tensor)
    # tensor_scaled = clustered_tensor.abs() ** 0.55 * tensor_sign  # Preserve sign

    tensor_sign = torch.sign(tensor_shifted)
    tensor_scaled = tensor_shifted.abs() ** 0.55 * tensor_sign  # Preserve sign

    scale = (tensor_scaled.max() - tensor_scaled.min()) / (2 ** num_bits - 1)
    quantized_tensor = torch.round((tensor_scaled - tensor_scaled.min()) / scale) * scale + tensor_scaled.min()

    # Descale back to original range and apply offset correction
    quantized_tensor = ((quantized_tensor.abs() ** (1/0.55)) * torch.sign(quantized_tensor)) + offset

    return quantized_tensor

# Apply Uniform Quantization to Conv Layers Only
for name, param in model.named_parameters():
    if "conv" in name and "weight" in name:
        param.data = uniform_quantization(param.data, name=name)

# # Reload state dict to ensure updated weights are used
# model.load_state_dict(model.state_dict())

# Evaluate Model after Quantization
def evaluate(model, test_loader):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy after Quantization: {100 * correct / total:.2f}%")

evaluate(model, test_loader)


Files already downloaded and verified
Files already downloaded and verified


  state_dict = torch.load('/content/resnet18.pt', map_location=torch.device('cpu'))


tensor(-0.0016)
tensor(-0.0007)
tensor(4.6154e-05)
tensor(-0.0001)
tensor(-0.0003)
tensor(-0.0002)
tensor(-0.0001)
tensor(-0.0001)
tensor(-0.0002)
tensor(-2.7181e-05)
tensor(-1.7936e-05)
tensor(-0.0001)
tensor(-0.0003)
tensor(-1.2501e-05)
tensor(9.9470e-05)
tensor(0.0001)
tensor(6.9517e-05)
Test Accuracy after Quantization: 82.50%
