In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.random as random
from torch.utils import data
import time
import matplotlib.pyplot as plt
import numpy as np
import wandb
import quant_lib.Logit_MLP as mlp
import quant_lib.figure as figure
import copy

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
def MLPLogitNet(bit_width=1):
    """Build the model used by every experiment in this notebook.

    Args:
        bit_width: Forwarded unchanged to ``mlp.MLP`` (defaults to 1).

    Returns:
        An ``mlp.MLP`` instance constructed with ``input_class=10`` and
        ``num_classes=1``.
    """
    model = mlp.MLP(input_class=10, num_classes=1, bit_width=bit_width)
    return model

In [12]:
def load_array(features, labels, batch_size, is_train=True):
    """Wrap paired tensors in a mini-batch ``DataLoader``.

    Args:
        features: Tensor of inputs; indexed along its first dimension.
        labels: Tensor of targets with the same first dimension as ``features``.
        batch_size: Number of samples per mini-batch.
        is_train: When True the samples are reshuffled every epoch; when False
            they are served in their original order.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding ``(features, labels)`` batches.
    """
    # Imported locally under a private alias so the function keeps working even
    # if the notebook-level name `data` has been rebound to something else
    # (for example by a loop variable called `data`).
    from torch.utils import data as torch_data

    dataset = torch_data.TensorDataset(features, labels)
    return torch_data.DataLoader(dataset, batch_size, shuffle=is_train)

In [None]:
# activate CUDA
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed PyTorch's global generator before anything random happens, so that the
# synthetic dataset, the initial weights of `base_net` and the loader shuffling
# are the same on every Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# Set hyperparameter
bit_width = 1
EPOCH = 25  # the training loops run over range(pre_epoch, EPOCH)
ANNEAL_EPOCH_AS = 5  # ASkewSGD: epochs run with epsilon fixed at 1
ANNEAL_EPOCH_PQ = 5  # ProxQuant: epochs run with the prox strength at 0
pre_epoch = 0  # first epoch index of every training loop
BATCH_SIZE = 50
LR = 1
# ASkewSGD
DECAY_CONST = 0.88  # base of the exponential epsilon decay
alpha = 0.1
# ProxQuant
reg_lambda = 5e-2  # value the prox strength ramps up to

# Generate training and testing dataset
n_train = 5000
n_test = 1000
n = n_train + n_test
# Ground-truth weight vector with entries in {-1, +1}.
w_star = (torch.rand(10) - 0.5).sign()
X = torch.rand(n, 10)
# Logistic model: P(y = 1 | x) = sigmoid(<x, w_star>), labels sampled from it.
probs = torch.sigmoid(torch.mv(X, w_star))
Y = torch.bernoulli(probs)
X_train, Y_train = X[:n_train], Y[:n_train]
X_test, Y_test = X[n_train:], Y[n_train:]
trainloader = load_array(X_train, Y_train, batch_size=BATCH_SIZE, is_train=True)
testloader = load_array(X_test, Y_test, batch_size=BATCH_SIZE, is_train=False)
# Define base net
# Every optimizer cell deep-copies this model, so all runs share one start point.
base_net = MLPLogitNet(bit_width=bit_width).to(device)
# Define loss
criterion = nn.BCELoss()

In [14]:
def init(project_name, opt_name, batch_size, architecture, dataset_name, lr, alpha=None, reg_lambda=None):
    """Open a wandb run and build a fresh model/optimizer pair for it.

    Args:
        project_name: wandb project the run is logged under.
        opt_name: Display name of the run.
        batch_size, architecture, dataset_name, lr, alpha, reg_lambda:
            Recorded in the run config; ``lr`` is also the SGD learning rate.

    Returns:
        ``(net, optimizer)`` where ``net`` is a deep copy of the global
        ``base_net`` placed on ``device`` and ``optimizer`` is SGD with two
        parameter groups tagged ``"weights"`` and ``"bias"``.
    """
    # track hyperparameters and run metadata
    run_config = {
        "batch_size": batch_size,
        "architecture": architecture,
        "dataset": dataset_name,
        "lr": lr,
        "alpha": alpha,
        "reg_lambda": reg_lambda,
        "bit_width": base_net.bit_width,
    }
    # set the wandb project where this run will be logged
    wandb.init(project=project_name, name=opt_name, config=run_config)

    net = copy.deepcopy(base_net)
    net.to(device)

    # Split parameters by name so callers can treat weights and biases differently.
    weights, bias = [], []
    for name, p in net.named_parameters():
        if 'bias' in name:
            bias.append(p)
        else:
            weights.append(p)

    parameters = [
        {"params": weights, "tag": "weights"},
        {"params": bias, "tag": "bias"},
    ]
    return net, optim.SGD(parameters, lr=lr)

In [15]:
# SGD
# Baseline run: plain full-precision SGD on a fresh copy of `base_net`.
net, optimizer = init(
    project_name="LogisticRegression_binary",
    opt_name="SGD",
    batch_size=BATCH_SIZE,
    architecture="MLP",
    dataset_name="LogisticRegression",
    lr=LR,
)

lr_decay_epochs = [30]
# When starting past a decay milestone, apply the halvings that were skipped.
for decay_epoch in lr_decay_epochs:
    if pre_epoch > decay_epoch:
        for param_group in optimizer.param_groups:
            param_group["lr"] *= 0.5

length = len(trainloader)  # mini-batches per epoch

# Train
for epoch in range(pre_epoch, EPOCH):
    print("\nEpoch: %d" % (epoch + 1))
    net.train()
    sum_loss = 0.0
    correct = 0.0
    total = 0.0

    if epoch in lr_decay_epochs:
        for param_group in optimizer.param_groups:
            param_group["lr"] *= 0.5

    # The loop variable is not called `data`: that name is the torch.utils.data
    # module imported at the top of the notebook and used by `load_array`.
    for i, batch in enumerate(trainloader, 0):
        # prepare dataset
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        # forward & backward
        outputs = net(inputs).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        sum_loss += loss.item()
        # Threshold the predicted probability at 0.5 to get a hard label.
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
        print(
            "[Epoch:%d, Iter:%d] Loss: %.03f | Acc: %.3f%% "
            % (
                epoch + 1,
                (i + 1 + (epoch) * length),
                sum_loss / (i + 1),
                100.0 * correct / total,
            )
        )
    print("Waiting Test...")
    # calculate weight_dist and batch_gradient
    # Work on a copy so the probe's backward passes never touch `net`.
    model_copy = copy.deepcopy(net)
    weight_dist = torch.norm(w_star - model_copy.fc1.weight.cpu()).item()
    model_copy.to(device)
    model_copy.train()
    # Average the mini-batch gradients of fc1.weight at fixed parameters.
    # No optimizer is stepped here: the weights must not move during the probe.
    grad = torch.zeros(model_copy.fc1.weight.shape, device=device)
    for batch in trainloader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model_copy(inputs).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        grad += model_copy.fc1.weight.grad
        model_copy.zero_grad()
    grad = grad / len(trainloader)
    batch_gradient = torch.norm(grad).item()
    # Evaluation
    with torch.no_grad():
        train_loss, train_acc = net.evaluate(
            trainloader, criterion, device, eval=False, qt=False
        )
        qtrain_loss, qtrain_acc = net.evaluate(
            trainloader, criterion, device, eval=False, qt=True
        )
        test_loss, test_acc = net.evaluate(
            testloader, criterion, device, eval=True, qt=False
        )
        qtest_loss, qtest_acc = net.evaluate(
            testloader, criterion, device, eval=True, qt=True
        )
        wandb.log(
            {
                "test_loss": test_loss,
                "quantized_test_loss": qtest_loss,
                "accuracy": train_acc,
                "quantized_accuracy": qtrain_acc,
                "test_accuracy": test_acc,
                "quantized_test_accuracy": qtest_acc,
                "weight_distance": weight_dist,
                "batch_gradient": batch_gradient,
            }
        )
        print(
            "Train Loss: %.03f | Train Acc: %.3f%% | Test Loss: %.03f | Test Acc: %.3f%% "
            % (
                train_loss,
                train_acc,
                test_loss,
                test_acc,
            )
        )
        print(
            "Quantized Train Loss: %.03f | Quantized Train Acc: %.3f%% | Quantized Test Loss: %.03f | Quantized Test Acc: %.3f%% "
            % (
                qtrain_loss,
                qtrain_acc,
                qtest_loss,
                qtest_acc,
            )
        )
wandb.finish()


Epoch: 1
[Epoch:1, Iter:1] Loss: 0.713 | Acc: 46.000% 
[Epoch:1, Iter:2] Loss: 0.694 | Acc: 54.000% 
[Epoch:1, Iter:3] Loss: 0.684 | Acc: 56.000% 
[Epoch:1, Iter:4] Loss: 0.665 | Acc: 59.500% 
[Epoch:1, Iter:5] Loss: 0.649 | Acc: 61.600% 
[Epoch:1, Iter:6] Loss: 0.632 | Acc: 64.000% 
[Epoch:1, Iter:7] Loss: 0.624 | Acc: 65.143% 
[Epoch:1, Iter:8] Loss: 0.623 | Acc: 65.250% 
[Epoch:1, Iter:9] Loss: 0.623 | Acc: 65.111% 
[Epoch:1, Iter:10] Loss: 0.621 | Acc: 65.200% 
[Epoch:1, Iter:11] Loss: 0.617 | Acc: 65.636% 
[Epoch:1, Iter:12] Loss: 0.618 | Acc: 65.500% 
[Epoch:1, Iter:13] Loss: 0.616 | Acc: 66.000% 
[Epoch:1, Iter:14] Loss: 0.613 | Acc: 66.286% 
[Epoch:1, Iter:15] Loss: 0.614 | Acc: 66.133% 
[Epoch:1, Iter:16] Loss: 0.616 | Acc: 66.125% 
[Epoch:1, Iter:17] Loss: 0.617 | Acc: 66.000% 
[Epoch:1, Iter:18] Loss: 0.615 | Acc: 66.111% 
[Epoch:1, Iter:19] Loss: 0.610 | Acc: 66.526% 
[Epoch:1, Iter:20] Loss: 0.610 | Acc: 66.400% 
[Epoch:1, Iter:21] Loss: 0.609 | Acc: 66.857% 
[Epoch:1, It

VBox(children=(Label(value='0.001 MB of 0.135 MB uploaded\r'), FloatProgress(value=0.009875410298944693, max=1…

0,1
accuracy,▅██▄▇▇▇▇█▇█▁▇▆▆▆▅▇█▆▄▄▇█▇
batch_gradient,▃▁▁▇▁▃▄▁▁▂▃█▂▄▄▅▅▂▂▅▆█▄▁▁
quantized_accuracy,█▇█▂▆▂▂▃▁▂▃▁▂▆▂▂▄▂▃▁▂▇▂▁▂
quantized_test_accuracy,█▇▅▁▃▁▂▁▁▁▁▁▁▃▁▁▃▁▁▂▁▅▁▁▁
quantized_test_loss,▁▁▂▅▃▅▆▄▅▅▄█▅▃▇█▄▆▄▇█▂▇▅▅
test_accuracy,▆▇█▃▆▅▄█▆▆▆▁▅█▄▅▇▆▆▄▃▅▄▆▅
test_loss,▂▁▁▅▁▂▂▁▁▁▂▆▁▃▂▃▄▂▂▃▃█▂▁▁
weight_distance,█▂▁▃▄▂▃▃▄▄▄▂▃▅▃▂▄▃▄▂▂▆▃▃▃

0,1
accuracy,72.93999
batch_gradient,0.01134
quantized_accuracy,71.86
quantized_test_accuracy,70.9
quantized_test_loss,0.55399
test_accuracy,72.10001
test_loss,0.54401
weight_distance,0.40814


In [16]:
# ASkewSGD
# `alpha` is passed through so the run config records the value used below.
net, optimizer = init(
    project_name="LogisticRegression_binary",
    opt_name="ASkewSGD",
    batch_size=BATCH_SIZE,
    architecture="MLP",
    dataset_name="LogisticRegression",
    lr=LR,
    alpha=alpha,
)

lr_decay_epochs = [30]
# When starting past a decay milestone, apply the halvings that were skipped.
for decay_epoch in lr_decay_epochs:
    if pre_epoch > decay_epoch:
        for param_group in optimizer.param_groups:
            param_group["lr"] *= 0.5

length = len(trainloader)  # mini-batches per epoch

# Train
for epoch in range(pre_epoch, EPOCH):
    print("\nEpoch: %d" % (epoch + 1))
    net.train()
    sum_loss = 0.0
    correct = 0.0
    total = 0.0

    if epoch in lr_decay_epochs:
        for param_group in optimizer.param_groups:
            param_group["lr"] *= 0.5

    # The loop variable is not called `data`: that name is the torch.utils.data
    # module imported at the top of the notebook and used by `load_array`.
    for i, batch in enumerate(trainloader, 0):
        # epsilon stays at 1 during the first ANNEAL_EPOCH_AS epochs, then
        # decays geometrically with the (fractional) number of epochs elapsed.
        if epoch < ANNEAL_EPOCH_AS:
            epsilon = 1
        else:
            epsilon = DECAY_CONST ** ((epoch - ANNEAL_EPOCH_AS) + (i / length))
        # prepare dataset
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        # forward & backward
        outputs = net(inputs).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        # Rewrite the weight gradients element-wise before the SGD step.
        with torch.no_grad():
            for param_group in optimizer.param_groups:
                if param_group["tag"] != "weights":
                    continue
                for p in param_group["params"]:
                    # Constraint value epsilon - (w^2 - 1)^2 and its derivative in w.
                    constr = epsilon - (p**2 - 1) ** 2
                    Kx = -4 * (p**2 - 1) * p
                    # Entries that keep their raw gradient: constraint satisfied,
                    # derivative zero, or the descent step already meets
                    # -Kx * grad >= -alpha * constr.
                    direct_grad = torch.logical_or(
                        torch.logical_or(constr >= 0, Kx == 0),
                        torch.logical_and(
                            constr < 0, (-Kx * p.grad) >= -alpha * constr
                        ),
                    )
                    # Everywhere else the gradient is replaced by the clipped
                    # value alpha * constr / Kx. Kx is non-zero on those
                    # entries because Kx == 0 is part of the keep-mask above.
                    replacement = torch.clip(alpha * constr / Kx, -1, 1)
                    p.grad[~direct_grad] = replacement[~direct_grad]
        optimizer.step()
        optimizer.zero_grad()
        sum_loss += loss.item()
        # Threshold the predicted probability at 0.5 to get a hard label.
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
        print(
            "[Epoch:%d, Iter:%d] Loss: %.03f | Acc: %.3f%% "
            % (
                epoch + 1,
                (i + 1 + (epoch) * length),
                sum_loss / (i + 1),
                100.0 * correct / total,
            )
        )
    print("Waiting Test...")
    # calculate weight_dist and batch_gradient
    # Work on a copy so the probe's backward passes never touch `net`.
    model_copy = copy.deepcopy(net)
    weight_dist = torch.norm(w_star - model_copy.fc1.weight.cpu()).item()
    model_copy.to(device)
    model_copy.train()
    # Average the mini-batch gradients of fc1.weight at fixed parameters.
    # No optimizer is stepped here: the weights must not move during the probe.
    grad = torch.zeros(model_copy.fc1.weight.shape, device=device)
    for batch in trainloader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model_copy(inputs).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        grad += model_copy.fc1.weight.grad
        model_copy.zero_grad()
    grad = grad / len(trainloader)
    batch_gradient = torch.norm(grad).item()
    # Evaluation
    with torch.no_grad():
        train_loss, train_acc = net.evaluate(trainloader, criterion, device, eval=False, qt=False)
        qtrain_loss, qtrain_acc = net.evaluate(trainloader, criterion, device, eval=False, qt=True)
        test_loss, test_acc = net.evaluate(testloader, criterion, device, eval=True, qt=False)
        qtest_loss, qtest_acc = net.evaluate(testloader, criterion, device, eval=True, qt=True)
        wandb.log(
            {
                "test_loss": test_loss,
                "quantized_test_loss": qtest_loss,
                "accuracy": train_acc,
                "quantized_accuracy": qtrain_acc,
                "test_accuracy": test_acc,
                "quantized_test_accuracy": qtest_acc,
                "weight_distance": weight_dist,
                "batch_gradient": batch_gradient,
            }
        )
        print(
            "Train Loss: %.03f | Train Acc: %.3f%% | Test Loss: %.03f | Test Acc: %.3f%% "
            % (
                train_loss,
                train_acc,
                test_loss,
                test_acc,
            )
        )
        print(
            "Quantized Train Loss: %.03f | Quantized Train Acc: %.3f%% | Quantized Test Loss: %.03f | Quantized Test Acc: %.3f%% "
            % (
                qtrain_loss,
                qtrain_acc,
                qtest_loss,
                qtest_acc,
            )
        )
wandb.finish()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888884685, max=1.0…


Epoch: 1
[Epoch:1, Iter:1] Loss: 0.760 | Acc: 28.000% 
[Epoch:1, Iter:2] Loss: 0.665 | Acc: 51.000% 
[Epoch:1, Iter:3] Loss: 0.632 | Acc: 59.333% 
[Epoch:1, Iter:4] Loss: 0.603 | Acc: 64.000% 
[Epoch:1, Iter:5] Loss: 0.593 | Acc: 66.000% 
[Epoch:1, Iter:6] Loss: 0.597 | Acc: 66.667% 
[Epoch:1, Iter:7] Loss: 0.590 | Acc: 68.000% 
[Epoch:1, Iter:8] Loss: 0.588 | Acc: 68.500% 
[Epoch:1, Iter:9] Loss: 0.595 | Acc: 68.000% 
[Epoch:1, Iter:10] Loss: 0.600 | Acc: 67.200% 
[Epoch:1, Iter:11] Loss: 0.602 | Acc: 66.727% 
[Epoch:1, Iter:12] Loss: 0.606 | Acc: 66.000% 
[Epoch:1, Iter:13] Loss: 0.604 | Acc: 66.769% 
[Epoch:1, Iter:14] Loss: 0.600 | Acc: 67.143% 
[Epoch:1, Iter:15] Loss: 0.599 | Acc: 67.467% 
[Epoch:1, Iter:16] Loss: 0.591 | Acc: 68.375% 
[Epoch:1, Iter:17] Loss: 0.586 | Acc: 68.941% 
[Epoch:1, Iter:18] Loss: 0.584 | Acc: 69.222% 
[Epoch:1, Iter:19] Loss: 0.576 | Acc: 69.895% 
[Epoch:1, Iter:20] Loss: 0.580 | Acc: 69.700% 
[Epoch:1, Iter:21] Loss: 0.580 | Acc: 69.810% 
[Epoch:1, It

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▆▆▅▅▃▆▆▃▇▆▅▆▄▅▂▆▁█▇▄▃▅▁▄▁
batch_gradient,▂▂▃▂▆▁▃▆▁▂▄▁▄▄█▁█▂▁▇▆▃▇▂█
quantized_accuracy,█▇▇▆▇▄▄▂▃▂▁▁▁▁▇▄█▆▅▁▂▆▆▆▃
quantized_test_accuracy,▇██▄▅▂▄▁▂▁▁▁▂▂▅▃▇▅▅▂▁▅█▅▂
quantized_test_loss,▁▁▂▄▂▄▄█▅▅▇▆▇▇▃▄▂▃▃▆▅▃▂▄▅
test_accuracy,▇▇█▇▇▅█▁▅▅▄▅▄▂▇▇▇█▇▂▂▆▇▆▁
test_loss,▃▂▂▂▅▂▂▅▁▂▃▂▂▃█▂█▁▁▅▄▂▇▂▇
weight_distance,█▃▃▃▄▄▄▃▄▄▃▄▃▃▅▅▅▃▃▂▂▂▃▁▁

0,1
accuracy,72.06
batch_gradient,0.10386
quantized_accuracy,72.12
quantized_test_accuracy,71.10001
quantized_test_loss,0.55075
test_accuracy,71.4
test_loss,0.55438
weight_distance,0.21251


In [17]:
# Deterministic BinaryConnect
net, optimizer = init(
    project_name="LogisticRegression_binary",
    opt_name="Deterministic BinaryConnect",
    batch_size=BATCH_SIZE,
    architecture="MLP",
    dataset_name="LogisticRegression",
    lr=LR,
)

# Holds the latent full-precision weights between iterations. It is kept
# separate from the per-epoch gradient-probe copy (`model_copy`) so that the
# probe can never replace or disturb the latent state.
latent_net = copy.deepcopy(net)

lr_decay_epochs = [30]
# When starting past a decay milestone, apply the halvings that were skipped.
for decay_epoch in lr_decay_epochs:
    if pre_epoch > decay_epoch:
        for param_group in optimizer.param_groups:
            param_group["lr"] *= 0.5

length = len(trainloader)  # mini-batches per epoch

# Train
for epoch in range(pre_epoch, EPOCH):
    print("\nEpoch: %d" % (epoch + 1))
    net.train()
    sum_loss = 0.0
    correct = 0.0
    total = 0.0

    if epoch in lr_decay_epochs:
        for param_group in optimizer.param_groups:
            param_group["lr"] *= 0.5

    # The loop variable is not called `data`: that name is the torch.utils.data
    # module imported at the top of the notebook and used by `load_array`.
    for i, batch in enumerate(trainloader, 0):
        # prepare dataset
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        # forward & backward
        with torch.no_grad():
            # Forward pass with the latent weights; only used for the running
            # training accuracy printed below.
            outputs = net(inputs).squeeze()
            # Swap the non-bias parameters for their quantized values.
            for net_name, net_param in net.named_parameters():
                if not net_name.endswith(".bias"):
                    net_param.data = net.quantize(net_param.data, net.bit_width)
        # Loss and gradient are taken at the quantized weights.
        outputs2 = net(inputs).squeeze()
        loss = criterion(outputs2, labels)
        loss.backward()
        optimizer.step()
        for (net_name, net_param), (_, latent_param) in zip(
            net.named_parameters(), latent_net.named_parameters()
        ):
            if not net_name.endswith(".bias"):
                # `delta` is the change the optimizer made to the quantized
                # weights; apply it to the latent weights and clamp to [-1, 1].
                delta = net_param.data - latent_net.quantize(latent_param.data, latent_net.bit_width)
                updated = torch.clamp(latent_param.data + delta, -1, 1)
                net_param.data = updated
                # Separate storage: `net` is modified in place by the optimizer.
                latent_param.data = updated.clone()
        sum_loss += loss.item()
        optimizer.zero_grad()
        # Threshold the predicted probability at 0.5 to get a hard label.
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
        print(
            "[epoch:%d, iter:%d] Loss: %.03f | Acc: %.3f%% "
            % (
                epoch + 1,
                (i + 1 + (epoch) * length),
                sum_loss / (i + 1),
                100.0 * correct / total,
            )
        )
    print("Waiting Test...")
    # calculate weight_dist and batch_gradient
    # Work on a copy so the probe's backward passes never touch `net`.
    model_copy = copy.deepcopy(net)
    weight_dist = torch.norm(w_star - model_copy.fc1.weight.cpu()).item()
    model_copy.to(device)
    model_copy.train()
    # Average the mini-batch gradients of fc1.weight at fixed parameters.
    # No optimizer is stepped here: the weights must not move during the probe.
    grad = torch.zeros(model_copy.fc1.weight.shape, device=device)
    for batch in trainloader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model_copy(inputs).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        grad += model_copy.fc1.weight.grad
        model_copy.zero_grad()
    grad = grad / len(trainloader)
    batch_gradient = torch.norm(grad).item()
    # Evaluation
    with torch.no_grad():
        train_loss, train_acc = net.evaluate(trainloader, criterion, device, eval=False, qt=False)
        qtrain_loss, qtrain_acc = net.evaluate(trainloader, criterion, device, eval=False, qt=True)
        test_loss, test_acc = net.evaluate(testloader, criterion, device, eval=True, qt=False)
        qtest_loss, qtest_acc = net.evaluate(testloader, criterion, device, eval=True, qt=True)
        wandb.log(
            {
                "test_loss": test_loss,
                "quantized_test_loss": qtest_loss,
                "accuracy": train_acc,
                "quantized_accuracy": qtrain_acc,
                "test_accuracy": test_acc,
                "quantized_test_accuracy": qtest_acc,
                "weight_distance": weight_dist,
                "batch_gradient": batch_gradient,
            }
        )
        print(
            "Train Loss: %.03f | Train Acc: %.3f%% | Test Loss: %.03f | Test Acc: %.3f%% "
            % (
                train_loss,
                train_acc,
                test_loss,
                test_acc,
            )
        )
        print(
            "Quantized Train Loss: %.03f | Quantized Train Acc: %.3f%% | Quantized Test Loss: %.03f | Quantized Test Acc: %.3f%% "
            % (
                qtrain_loss,
                qtrain_acc,
                qtest_loss,
                qtest_acc,
            )
        )
wandb.finish()


Epoch: 1
[epoch:1, iter:1] Loss: 0.739 | Acc: 36.000% 
[epoch:1, iter:2] Loss: 1.095 | Acc: 53.000% 
[epoch:1, iter:3] Loss: 1.355 | Acc: 42.000% 
[epoch:1, iter:4] Loss: 1.455 | Acc: 48.000% 
[epoch:1, iter:5] Loss: 1.475 | Acc: 52.000% 
[epoch:1, iter:6] Loss: 1.341 | Acc: 48.667% 
[epoch:1, iter:7] Loss: 1.232 | Acc: 52.286% 
[epoch:1, iter:8] Loss: 1.153 | Acc: 49.750% 
[epoch:1, iter:9] Loss: 1.082 | Acc: 48.889% 
[epoch:1, iter:10] Loss: 1.070 | Acc: 50.200% 
[epoch:1, iter:11] Loss: 1.072 | Acc: 49.091% 
[epoch:1, iter:12] Loss: 1.062 | Acc: 50.667% 
[epoch:1, iter:13] Loss: 1.028 | Acc: 50.923% 
[epoch:1, iter:14] Loss: 0.996 | Acc: 52.429% 
[epoch:1, iter:15] Loss: 0.968 | Acc: 51.200% 
[epoch:1, iter:16] Loss: 0.954 | Acc: 52.375% 
[epoch:1, iter:17] Loss: 0.948 | Acc: 50.824% 
[epoch:1, iter:18] Loss: 0.929 | Acc: 51.889% 
[epoch:1, iter:19] Loss: 0.913 | Acc: 52.421% 
[epoch:1, iter:20] Loss: 0.898 | Acc: 51.900% 
[epoch:1, iter:21] Loss: 0.879 | Acc: 51.048% 
[epoch:1, it

VBox(children=(Label(value='0.001 MB of 0.134 MB uploaded\r'), FloatProgress(value=0.009976111527079544, max=1…

0,1
accuracy,▃▂▁▁▃▁▂▃▁▁█▂▄▁▃▁▅▂▁▁▄▃▂▅▁
batch_gradient,▁▄▇▇▆▆▆▄▇█▁▆▄█▅▇▄▆▆█▄▅▆▃▇
quantized_accuracy,███████▇█▁▇█▇███▇███▇██▇█
quantized_test_accuracy,▇▇█▆▇█▇▇█▁▅▇▇▇▇█▇▇▇▇▇▇▇▇▇
quantized_test_loss,▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_accuracy,▂▁▁▁▂▂▂▂▁▁█▂▃▁▂▁▄▂▁▁▂▂▂▄▁
test_loss,▂▂▅▅▄▅▅▃▆█▁▄▂▇▃▆▂▄▄█▃▃▄▂▆
weight_distance,█▄▄▃▂▂▃▃▃▃▁▂▂▃▂▃▂▂▃▃▂▂▁▂▃

0,1
accuracy,69.12
batch_gradient,0.25292
quantized_accuracy,73.02
quantized_test_accuracy,73.2
quantized_test_loss,0.54253
test_accuracy,69.4
test_loss,0.6501
weight_distance,1.86721


In [18]:
# ProxQuant
# `reg_lambda` is passed through so the run config records the value used below.
net, optimizer = init(
    project_name="LogisticRegression_binary",
    opt_name="ProxQuant",
    batch_size=BATCH_SIZE,
    architecture="MLP",
    dataset_name="LogisticRegression",
    lr=LR,
    reg_lambda=reg_lambda,
)

lr_decay_epochs = [30]
# NOTE(review): `lr` is halved in step with the optimizer's learning rate but is
# never read in this cell, and the optimizer itself was built with LR rather
# than LR / 10 - confirm which learning rate was intended for ProxQuant.
lr = LR / 10
for decay_epoch in lr_decay_epochs:
    if pre_epoch > decay_epoch:
        for param_group in optimizer.param_groups:
            param_group["lr"] *= 0.5
            lr *= 0.5

# Train
it = -1  # number of post-annealing iterations seen so far, minus one
total_it = (EPOCH - ANNEAL_EPOCH_PQ) * len(trainloader)
length = len(trainloader)  # mini-batches per epoch

for epoch in range(pre_epoch, EPOCH):
    print("\nEpoch: %d" % (epoch + 1))
    net.train()
    sum_loss = 0.0
    correct = 0.0
    total = 0.0

    if epoch in lr_decay_epochs:
        for param_group in optimizer.param_groups:
            param_group["lr"] *= 0.5
            lr *= 0.5

    # The loop variable is not called `data`: that name is the torch.utils.data
    # module imported at the top of the notebook and used by `load_array`.
    for i, batch in enumerate(trainloader, 0):
        # Prox strength: 0 for the first ANNEAL_EPOCH_PQ epochs, then a linear
        # ramp from 0 towards reg_lambda over the remaining iterations.
        if epoch < ANNEAL_EPOCH_PQ:
            epsilon = 0
        else:
            it += 1
            epsilon = reg_lambda * it / total_it
        # prepare dataset
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        # forward & backward
        outputs = net(inputs).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            for name, param in net.named_parameters():
                if not name.endswith(".bias"):
                    # Prox Step
                    # Blend each weight with its own sign; larger epsilon moves
                    # it further towards sign(w).
                    param.data = (param.data + epsilon * torch.sign(param.data)) / (1 + epsilon)
        optimizer.zero_grad()
        sum_loss += loss.item()
        # Threshold the predicted probability at 0.5 to get a hard label.
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
        print(
            "[Epoch:%d, Iter:%d] Loss: %.03f | Acc: %.3f%% "
            % (
                epoch + 1,
                (i + 1 + (epoch) * length),
                sum_loss / (i + 1),
                100.0 * correct / total,
            )
        )
    print("Waiting Test...")
    # calculate weight_dist and batch_gradient
    # Work on a copy so the probe's backward passes never touch `net`.
    model_copy = copy.deepcopy(net)
    weight_dist = torch.norm(w_star - model_copy.fc1.weight.cpu()).item()
    model_copy.to(device)
    model_copy.train()
    # Average the mini-batch gradients of fc1.weight at fixed parameters.
    # No optimizer is stepped here: the weights must not move during the probe.
    grad = torch.zeros(model_copy.fc1.weight.shape, device=device)
    for batch in trainloader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model_copy(inputs).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        grad += model_copy.fc1.weight.grad
        model_copy.zero_grad()
    grad = grad / len(trainloader)
    batch_gradient = torch.norm(grad).item()
    # Evaluation
    with torch.no_grad():
        train_loss, train_acc = net.evaluate(trainloader, criterion, device, eval=False, qt=False)
        qtrain_loss, qtrain_acc = net.evaluate(trainloader, criterion, device, eval=False, qt=True)
        test_loss, test_acc = net.evaluate(testloader, criterion, device, eval=True, qt=False)
        qtest_loss, qtest_acc = net.evaluate(testloader, criterion, device, eval=True, qt=True)
        wandb.log(
            {
                "test_loss": test_loss,
                "quantized_test_loss": qtest_loss,
                "accuracy": train_acc,
                "quantized_accuracy": qtrain_acc,
                "test_accuracy": test_acc,
                "quantized_test_accuracy": qtest_acc,
                "weight_distance": weight_dist,
                "batch_gradient": batch_gradient,
            }
        )
        print(
            "Train Loss: %.03f | Train Acc: %.3f%% | Test Loss: %.03f | Test Acc: %.3f%% "
            % (
                train_loss,
                train_acc,
                test_loss,
                test_acc,
            )
        )
        print(
            "Quantized Train Loss: %.03f | Quantized Train Acc: %.3f%% | Quantized Test Loss: %.03f | Quantized Test Acc: %.3f%% "
            % (
                qtrain_loss,
                qtrain_acc,
                qtest_loss,
                qtest_acc,
            )
        )
wandb.finish()


Epoch: 1
[Epoch:1, Iter:1] Loss: 0.744 | Acc: 34.000% 
[Epoch:1, Iter:2] Loss: 0.679 | Acc: 52.000% 
[Epoch:1, Iter:3] Loss: 0.648 | Acc: 58.000% 
[Epoch:1, Iter:4] Loss: 0.638 | Acc: 61.000% 
[Epoch:1, Iter:5] Loss: 0.627 | Acc: 63.600% 
[Epoch:1, Iter:6] Loss: 0.637 | Acc: 63.000% 
[Epoch:1, Iter:7] Loss: 0.634 | Acc: 63.714% 
[Epoch:1, Iter:8] Loss: 0.643 | Acc: 62.750% 
[Epoch:1, Iter:9] Loss: 0.642 | Acc: 64.000% 
[Epoch:1, Iter:10] Loss: 0.638 | Acc: 64.400% 
[Epoch:1, Iter:11] Loss: 0.631 | Acc: 65.091% 
[Epoch:1, Iter:12] Loss: 0.630 | Acc: 65.167% 
[Epoch:1, Iter:13] Loss: 0.629 | Acc: 65.385% 
[Epoch:1, Iter:14] Loss: 0.621 | Acc: 66.286% 
[Epoch:1, Iter:15] Loss: 0.612 | Acc: 67.067% 
[Epoch:1, Iter:16] Loss: 0.609 | Acc: 67.375% 
[Epoch:1, Iter:17] Loss: 0.609 | Acc: 67.294% 
[Epoch:1, Iter:18] Loss: 0.609 | Acc: 67.111% 
[Epoch:1, Iter:19] Loss: 0.610 | Acc: 66.947% 
[Epoch:1, Iter:20] Loss: 0.608 | Acc: 67.100% 
[Epoch:1, Iter:21] Loss: 0.610 | Acc: 66.857% 
[Epoch:1, It

VBox(children=(Label(value='0.001 MB of 0.135 MB uploaded\r'), FloatProgress(value=0.00983987330635228, max=1.…

0,1
accuracy,▆▁▇▅▇▃▁▆█▄▃▆▇▅▇▇▆▅▃▃▇▅▇██
batch_gradient,▁█▂▄▂▇▇▂▃▆▇▄▃▅▂▂▂▅▆▆▄▅▁▁▁
quantized_accuracy,█▅▆▃▂▇▁▂▇▂▃▅▆█▇▇▆▆▇███▇█▇
quantized_test_accuracy,▇▄▅▂▁▅▁▁▅▁▁▅▇▇▇▇▇▅▇█▇▇▇▇▇
quantized_test_loss,▁▃▃▄▅▂█▅▂▅▅▃▂▁▂▂▂▃▁▁▁▂▁▁▁
test_accuracy,▆▂▅▄▅▆▁▅▇▃▂▃▇▆▆▆▅▄▅▅▆▂█▇▇
test_loss,▂▇▁▃▂█▇▂▂▅▅▂▂▃▁▁▁▃▅▆▂▃▁▁▁
weight_distance,█▅▄▄▄▆▄▃▄▂▃▂▃▃▂▂▂▁▃▃▂▂▂▁▁

0,1
accuracy,73.02
batch_gradient,0.00731
quantized_accuracy,72.9
quantized_test_accuracy,73.3
quantized_test_loss,0.54311
test_accuracy,73.2
test_loss,0.54236
weight_distance,0.12398


In [19]:
# Show the fc1 weights of the most recent `model_copy` next to the ground-truth
# vector `w_star`, one per line, for a visual comparison.
print(model_copy.fc1.weight, w_star, sep="\n")

Parameter containing:
tensor([[-1.0510, -1.0311,  0.9678, -1.0093, -0.9910,  1.0555, -1.0553, -1.0067,
          0.9820,  0.9360]], device='cuda:0', requires_grad=True)
tensor([-1., -1.,  1., -1., -1.,  1., -1., -1.,  1.,  1.])
