In [1]:
from torchvision import datasets
from torchvision.transforms import ToTensor
import torch
import torch.nn as nn

dataset = "emnist"

if dataset == "cifar10":
    size = 32
    ch = 3
    num_classes = 10

    train_data = datasets.CIFAR10(
        root="datac10",
        train=True,
        download=True,
        transform=ToTensor()
    )
    
    test_data = datasets.CIFAR10(
        root="datac10",
        train=True,
        transform=ToTensor()
    )

elif dataset == "emnist":
    size = 28
    ch = 1
    num_classes = 47

    train_data = datasets.EMNIST(
        root="datae",
        split="balanced",
        train=True,
        download=True,
        transform=ToTensor()
    )
    
    test_data = datasets.EMNIST(
        root="datae",
        split="balanced",
        train=True,
        transform=ToTensor()
    )

elif dataset == "mnist":

    size = 28
    ch = 1
    num_classes = 10

    train_data = datasets.MNIST(
        root="data",
        train=True,
        download=True,
        transform=ToTensor()
    )
    
    test_data = datasets.MNIST(
        root="data",
        train=True,
        transform=ToTensor()
    )

elif dataset == "cifar100":
    size = 32
    ch = 3
    num_classes = 100

    train_data = datasets.CIFAR100(
        root="datac100",
        train=True,
        download=True,
        transform=ToTensor()
    )
    
    test_data = datasets.CIFAR100(
        root="datac100",
        train=True,
        transform=ToTensor()
    )

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def get_data(train_ds, valid_ds, bs):
    return (
        DataLoader(train_ds, batch_size=bs, shuffle=True),
        DataLoader(valid_ds, batch_size=bs),
    )

class DatasetStandardizer:

    def __init__(self, eps=1e-9):
        self.eps=1e-9
        self.trained=False

    def fit(self, ds):
        x_std, x_mean = torch.std_mean(x, dim=0)
        self.std = x_std + self.eps
        self.mean = x_mean
        self.trained = True

    def transform(self, ds):
        if self.trained == False:
            raise Exception()
        return 

    def fit_transform(self, ds):
        self.fit(ds)
        return self.transform(ds)


def preprocess_transform(x, y):
    return (x.view(-1, ch, size, size)).to(device=device), y.to(device=device)

def preprocess_classification(x, y):
    return x.to(device=device), y.to(device=device)

def preprocess_classification_norm(x, y):
    x, y = x.to(device=device), y.to(device=device)
    x_std, x_mean = torch.std_mean(x, dim=0)
    eps = 1e-6
    x_std += eps
    return (x - x_mean) / x_std, y 


def preprocess_test(x, y):
    return x.view(-1, size*size*ch).to(device=device), y.to(device=device)

class WrappedDataLoader:
    def __init__(self, dl, func):
        self.dl = dl
        self.func = func

    def __len__(self):
        return len(self.dl)

    def __iter__(self):
        batches = iter(self.dl)
        for b in batches:
            yield (self.func(*b))

# Dataset Reduction

In [3]:
train_data

Dataset EMNIST
    Number of datapoints: 112800
    Root location: datae
    Split: Train
    StandardTransform
Transform: ToTensor()

In [4]:
max(train_data.targets)

tensor(46)

In [5]:
from torch.utils.data import TensorDataset

number_of_samples = 20000

train_ds = TensorDataset(torch.tensor(train_data.data[:number_of_samples], dtype=torch.float32)/255, torch.tensor(train_data.targets[:number_of_samples]))
val_ds = TensorDataset(torch.tensor(test_data.data[:number_of_samples//2], dtype=torch.float32)/255, torch.tensor(test_data.targets[:number_of_samples//2]))

  train_ds = TensorDataset(torch.tensor(train_data.data[:number_of_samples], dtype=torch.float32)/255, torch.tensor(train_data.targets[:number_of_samples]))
  val_ds = TensorDataset(torch.tensor(test_data.data[:number_of_samples//2], dtype=torch.float32)/255, torch.tensor(test_data.targets[:number_of_samples//2]))


# R2D2

In [6]:
import tqdm

def transform_dataset(transformer, dl, desc):
    progress_bar = tqdm.tqdm(total=len(dl))
    progress_bar.set_description(desc)
    lx = []
    ly = []
    for xb, yb in dl:
        xb_t = transformer(xb)
        lx.append(xb_t.to("cpu"))
        ly.append(yb.to("cpu"))
        progress_bar.update(1)
    progress_bar.close()
    return TensorDataset(torch.cat(lx), torch.cat(ly))

In [7]:
import torch
import torch.nn as nn
import r2d2
import tqdm
import importlib
importlib.reload(r2d2)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Initializing... ", end="")
transform = r2d2.R2D2(ch=ch, h=size, w=size, threshold=1e-7, n1=1, n2=10000, auto_pca=True, device=device).to(device=device)
print("done")

train_dl, val_dl = get_data(train_ds, val_ds, 1024)
train_dl = WrappedDataLoader(train_dl, func=preprocess_transform)
val_dl = WrappedDataLoader(val_dl, func=preprocess_transform)

train_ds = transform_dataset(transform, train_dl, "Training")
#torch.save(train_ds, "train_ds.pt")
val_ds = transform_dataset(transform, val_dl, "Validation")
#torch.save(val_ds, "val_ds.pt")

Initializing... done


Training: 100%|██████████| 20/20 [01:00<00:00,  3.03s/it]
Validation: 100%|██████████| 10/10 [00:30<00:00,  3.03s/it]


In [8]:
#torch.save(train_ds, "train_ds.pt")
#torch.save(val_ds, "val_ds.pt")

In [9]:
#train_ds = torch.load("train_ds.pt")
#val_ds = torch.load("val_ds.pt")

In [10]:
train_ds[0][0].shape

torch.Size([7967])

# Ridge Test

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pca_flag = False

scaler = StandardScaler()

if pca_flag == True:
    pca = PCA(0.99999)
    x, y = train_ds.tensors
    x = pca.fit_transform(x.numpy())
    x = scaler.fit_transform(x)
    train_ds = TensorDataset(torch.tensor(x), y)
else:
    x, y = train_ds.tensors
    x = scaler.fit_transform(x.numpy())
    train_ds = TensorDataset(torch.tensor(x), y)

if pca_flag == True:
    x, y = val_ds.tensors
    x = pca.transform(x.numpy())
    x = scaler.transform(x)
    val_ds = TensorDataset(torch.tensor(x), y)
    print(pca.n_components_)
else:
    x, y = val_ds.tensors
    x = scaler.transform(x.numpy())
    val_ds = TensorDataset(torch.tensor(x), y)

In [13]:
from sklearn.linear_model import RidgeClassifier

X_train, y_train = train_ds.tensors
X_train, y_train = X_train.numpy(), y_train.numpy()

X_val, y_val = val_ds.tensors
X_val, y_val = X_val.numpy(), y_val.numpy()

clf = RidgeClassifier(alpha=1)
clf.fit(X_train, y_train)
clf.score(X_val, y_val)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


0.9848

# R2D2 + Logistic Regression

In [None]:
import r2d2
import importlib

importlib.reload(r2d2)

number_of_features = train_ds[0][0].shape[0]
clf = r2d2.LogisticRegression(inputs=number_of_features, outputs=num_classes).to(device=device)
cg = torch.zeros_like(clf.linear.weight).to(device=device)

train_dl, val_dl = get_data(train_ds, val_ds, 1024)
train_dl = WrappedDataLoader(train_dl, func=preprocess_classification)
val_dl = WrappedDataLoader(val_dl, func=preprocess_classification)

optimizer = torch.optim.NAdam(
                    clf.parameters(), 
                    lr=1e-4,
                    weight_decay=1
                )

r2d2.fit_model( epochs=300, 
                model=clf, 
                loss_func=nn.NLLLoss(), 
                opt=optimizer, 
                train_dl=train_dl, 
                val_dl=val_dl,
                cumulative_gradient=cg,
                #scheduler=scheduler,
                num_classes=num_classes)

# R2D2 + Ridge

In [11]:
import r2d2
import importlib

importlib.reload(r2d2)

number_of_features = train_ds[0][0].shape[0]
clf = r2d2.RidgeClassifier(inputs=number_of_features, outputs=num_classes).to(device=device)
cg = torch.zeros_like(clf.linear.weight).to(device=device)

train_dl, val_dl = get_data(train_ds, val_ds, 1024)
train_dl = WrappedDataLoader(train_dl, func=preprocess_classification)
val_dl = WrappedDataLoader(val_dl, func=preprocess_classification)

optimizer = torch.optim.NAdam(
                    clf.parameters(), 
                    lr=1e-4,
                    weight_decay=1
                )

r2d2.fit_model( epochs=200, 
                model=clf, 
                loss_func=nn.MSELoss(), 
                opt=optimizer, 
                train_dl=train_dl, 
                val_dl=val_dl,
                cumulative_gradient=cg,
                #scheduler=scheduler,
                is_ridge=True,
                num_classes=num_classes)

Epoch 0 norm: 107.96 lr: 0
train loss: 13.825814    acc: 2.08 %
valid loss: 4.169221    acc: 1.79 %
---------------------------
Epoch 1 norm: 66.91 lr: 0
train loss: 6.343453    acc: 1.74 %
valid loss: 2.446138    acc: 1.76 %
---------------------------
Epoch 2 norm: 49.21 lr: 0
train loss: 4.219252    acc: 1.78 %
valid loss: 1.823254    acc: 1.63 %
---------------------------
Epoch 3 norm: 39.08 lr: 0
train loss: 3.299834    acc: 1.70 %
valid loss: 1.497423    acc: 1.83 %
---------------------------
Epoch 4 norm: 32.21 lr: 0
train loss: 2.794650    acc: 1.85 %
valid loss: 1.308506    acc: 1.76 %
---------------------------
Epoch 5 norm: 27.35 lr: 0
train loss: 2.494684    acc: 1.93 %
valid loss: 1.191474    acc: 2.49 %
---------------------------
Epoch 6 norm: 23.84 lr: 0
train loss: 2.307527    acc: 2.32 %
valid loss: 1.116699    acc: 2.92 %
---------------------------
Epoch 7 norm: 21.24 lr: 0
train loss: 2.184103    acc: 3.17 %
valid loss: 1.068266    acc: 3.31 %
------------------

KeyboardInterrupt: 

# One-vs-one Ridge

In [18]:
class BatchGenerator:
    def __init__(self, tensors_list, batch_size, device):
        self.l = tensors_list
        self.bs = batch_size
        self.device = device
        self.maxlen = max([len(t) for t in tensors_list])

    def __len__(self):
        return (self.maxlen + 1) // self.bs

    def __iter__(self):
        for i in range(0, self.maxlen, self.bs):
            yield [t[i:i+self.bs].to(device=self.device) for t in self.l]

In [19]:
def onevsone_step(list_by_class, batch_size, device, clf_dict, opt_dict, loss_func, positive_tensor, negative_tensor, epoch, train, verbose=False):    
    batch_generator = BatchGenerator(list_by_class, batch_size, device)
    global_loss = 0
    average_accuracy = []
    for batch_tensors in batch_generator:
        for i in range(num_classes):
            for j in range(i+1, num_classes):
                # this is downsampling so as to obtain perfectly balanced classes
                # as well as for optimization reasons so we can use only two tensors for labels across all the dataset 
                if not batch_tensors[i].shape[0]==batch_tensors[j].shape[0]==batch_size:
                    continue

                indices = f"{i}_{j}"
                clf = clf_dict[indices]
                cumulative_gradient = torch.zeros_like(clf.linear.weight).to(device=device)
                if train == True:
                    opt = opt_dict[indices]
                    clf.train()
                else:
                    clf.eval()
                loss = 0
                correct = 0
                # positive class
                pos_pred = clf(batch_tensors[i])
                pos_loss = loss_func(pos_pred, positive_tensor)
                # negative class
                neg_pred = clf(batch_tensors[j])
                neg_loss = loss_func(neg_pred, negative_tensor)
                # final training stage
                total_loss = pos_loss + neg_loss
                if train == True:
                    total_loss.backward()
                    opt.step()
                    cumulative_gradient += clf.linear.weight.grad
                    opt.zero_grad()
                loss += total_loss.item()
                correct += (torch.sign(pos_pred.data).flatten() == positive_tensor).sum()
                correct += (torch.sign(neg_pred.data).flatten() == negative_tensor).sum()
                correct_percent = (correct/(2*batch_size))*100
                global_loss += loss
                average_accuracy.append(correct_percent)
                if verbose == True:
                    print(f"""Epoch {epoch} classes: {indices} norm: {cumulative_gradient.norm():.2f}\n{"train" if train == True else "valid"} loss: {loss:.6f}    acc: {correct_percent:.2f} %\n---------------------------""")
                if train == True:
                    cumulative_gradient -= cumulative_gradient
    l = len(average_accuracy)
    average_accuracy = sum(average_accuracy) / l
    if verbose == False:
        print(f"Epoch: {epoch}\ntotal_loss: {global_loss:6f} mean_accuracy: {average_accuracy:.2f} %\n---------------------------------")

In [20]:
import r2d2
import importlib

importlib.reload(r2d2)

# create tensors by class
# maybe shuffle them
x_train, y_train = train_ds.tensors
x_val, y_val = val_ds.tensors

number_of_features = train_ds[0][0].shape[0]

train_by_class = []
for i in range(num_classes):
    train_by_class.append(x_train[y_train == i])

val_by_class = []
for i in range(num_classes):
    val_by_class.append(x_val[y_val == i])

clf_dict = {}
opt_dict = {}
labels_train_dict = {}
labels_val_dict = {}
loss_func = nn.MSELoss()
for i in range(num_classes):
    for j in range(i+1, num_classes):
        clf = r2d2.RidgeClassifier(inputs=number_of_features, outputs=1).to(device=device)
        opt = torch.optim.RMSprop(
            clf.parameters(),
            lr=1e-4 * 5,
            weight_decay=1
        )
        indices = f"{i}_{j}"
        clf_dict[indices] = clf
        opt_dict[indices] = opt
        #labels_train_dict[indices] = torch.cat((torch.tensor([1.0]*train_by_class[indices].shape[0]), torch.tensor([-1.0]*train_by_class[indices].shape[0])), dim=0)
        #labels_val_dict[indices] = torch.cat((torch.tensor([1.0]*val_by_class[indices].shape[0]), torch.tensor([-1.0]*val_by_class[indices].shape[0])), dim=0)

epochs = 50
batch_size = 128

positive_tensor = torch.tensor([1.0]*batch_size).to(device=device)
negative_tensor = torch.tensor([-1.0]*batch_size).to(device=device)

for epoch in range(epochs):
    # train
    onevsone_step(
        list_by_class=train_by_class,
        batch_size=batch_size,
        clf_dict=clf_dict,
        opt_dict=opt_dict,
        loss_func=loss_func,
        positive_tensor=positive_tensor,
        negative_tensor=negative_tensor,
        epoch=epoch,
        device=device,
        train=True
    )
    # validate
    with torch.no_grad():
        onevsone_step(
            list_by_class=train_by_class,
            batch_size=batch_size,
            clf_dict=clf_dict,
            opt_dict=opt_dict,
            loss_func=loss_func,
            positive_tensor=positive_tensor,
            negative_tensor=negative_tensor,
            epoch=epoch,
            device=device,
            train=False
        )


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 0
total_loss: 13201.039679 mean_accuracy: 49.97 %
---------------------------------
Epoch: 0
total_loss: 13133.465624 mean_accuracy: 99.63 %
---------------------------------
Epoch: 1
total_loss: 13133.465624 mean_accuracy: 99.63 %
---------------------------------
Epoch: 1
total_loss: 8642.940781 mean_accuracy: 87.38 %
---------------------------------
Epoch: 2
total_loss: 8642.940781 mean_accuracy: 87.38 %
---------------------------------
Epoch: 2
total_loss: 6356.431804 mean_accuracy: 99.37 %
---------------------------------
Epoch: 3
total_loss: 6356.431804 mean_accuracy: 99.37 %
---------------------------------
Epoch: 3
total_loss: 4554.201294 mean_accuracy: 96.95 %
---------------------------------
Epoch: 4
total_loss: 4554.201294 mean_accuracy: 96.95 %
---------------------------------
Epoch: 4
total_loss: 3176.786151 mean_accuracy: 99.42 %
---------------------------------
Epoch: 5
total_loss: 3176.786151 mean_accuracy: 99.42 %
---------------------------------
Epoch: 

In [21]:
def get_onevsone_predictions(clf_dict, num_classes, xb):
    preds = torch.zeros(xb.shape[0], num_classes).to(device=device)
    for i in range(num_classes):
        for j in range(i+1, num_classes):
            indices = f"{i}_{j}"
            clf = clf_dict[indices]
            pred = clf(xb).flatten()
            class_i = torch.where(pred >= 0, 1, 0)
            class_j = torch.where(pred < 0, 1, 0)
            preds[:,i] += class_i
            preds[:,j] += class_j

    return torch.argmax(preds, dim=1)

def get_onevsone_soft_predictions(clf_dict, num_classes, xb):
    preds = torch.zeros(xb.shape[0], num_classes).to(device=device)
    for i in range(num_classes):
        for j in range(i+1, num_classes):
            indices = f"{i}_{j}"
            clf = clf_dict[indices]
            pred = clf(xb).flatten()
            class_i = torch.where(pred >= 0, pred, -pred)
            class_j = torch.where(pred < 0, -pred, pred)
            preds[:,i] += class_i
            preds[:,j] += class_j

    return torch.argmax(preds, dim=1)

In [22]:
with torch.no_grad():

    _, val_dl = get_data(train_ds, val_ds, 128)
    val_dl = WrappedDataLoader(val_dl, func=preprocess_classification)
    val_length = 0
    val_correct = 0
    for xb, yb in val_dl:
        #pred = torch.stack([el.forward(xb).flatten() for el in l], dim=-1)
        preds = get_onevsone_predictions(clf_dict, num_classes, xb)
        #print(pred[:1], yb[:1])
        val_length += len(xb)
        val_correct += (preds == yb).sum()
    print(f"Final validation accuracy (Hard): {val_correct * 100 / val_length:.2f} %")

Final validation accuracy (Hard): 67.04 %


In [23]:
with torch.no_grad():

    _, val_dl = get_data(train_ds, val_ds, 128)
    val_dl = WrappedDataLoader(val_dl, func=preprocess_classification)
    val_length = 0
    val_correct = 0
    for xb, yb in val_dl:
        #pred = torch.stack([el.forward(xb).flatten() for el in l], dim=-1)
        preds = get_onevsone_soft_predictions(clf_dict, num_classes, xb)
        #print(pred[:1], yb[:1])
        val_length += len(xb)
        val_correct += (preds == yb).sum()
    print(f"Final validation accuracy (Soft): {val_correct * 100 / val_length:.2f} %")

Final validation accuracy (Soft): 66.34 %


# One-vs-all Ridge

In [48]:
class Trainer():

    def __init__(self, num_classes):
        self.num_classes = num_classes
        self.losses = []
        self.correct = []
        self.lengths = []
        for _ in range(num_classes):
            self.losses.append(0)
            self.correct.append(0)
            self.lengths.append(0)
        
    def onevsall_step(self, clf, optimizer, xb, yb, label, train=True):
        if train == True:
            clf.train()
        else:
            clf.eval()
        # single batch
        pred = clf.forward(xb).flatten()
        yb = torch.where(yb == label, 1.0, -1.0).flatten()
        #print(yb.shape, pred.shape)
        loss = loss_func(pred, yb)
        if train == True:
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        self.losses[label] += loss.item()
        self.correct[label] += (torch.sign(pred.data) == yb).sum()
        self.lengths[label] += len(xb)
        #print(f"""Epoch {epoch} class: {label} norm: {cumulative_gradient.norm():.2f}\n {"train" if train == True else "valid"} loss: {loss:.6f}    acc: {(correct/length)*100:.2f} %\n---------------------------""")
        
    def print_stats(self, epoch, train):
        print(f"Epoch {epoch}")
        for label in range(self.num_classes):
            loss = self.losses[label]
            correct = self.correct[label]
            length = self.lengths[label]
            self.losses[label] = 0
            self.correct[label] = 0
            self.lengths[label] = 0
            print(f"""class: {label} {"train" if train == True else "valid"} loss: {loss:.6f}    acc: {(correct/length)*100:.2f} %""")
        print(f"end epoch {epoch} -----------------------------------------")



In [49]:
import r2d2
import importlib

importlib.reload(r2d2)

number_of_features = train_ds[0][0].shape[0]

train_dl, val_dl = get_data(train_ds, val_ds, 1024)
train_dl = WrappedDataLoader(train_dl, func=preprocess_classification)
val_dl = WrappedDataLoader(val_dl, func=preprocess_classification)

epochs = 100

clf_list = []
opt_list = []
loss_func = torch.nn.MSELoss()

for label in range(num_classes):
    clf = r2d2.RidgeClassifier(inputs=number_of_features, outputs=1).to(device=device)
    optimizer = torch.optim.Adam(
        clf.parameters(),
        lr=1e-4,
        weight_decay=2
    )
    clf_list.append(clf)
    opt_list.append(optimizer)

cumulative_gradient = torch.zeros_like(clf.linear.weight).to(device=device)
trainer = Trainer(num_classes=num_classes)

for epoch in range(epochs):
    # train step
    for xb, yb in train_dl:
        for label in range(num_classes):
            clf = clf_list[label]
            optimizer = opt_list[label]
            # train step
            trainer.onevsall_step(
                clf=clf,
                optimizer=optimizer,
                xb=xb,
                yb=yb,
                label=label,
                train=True
            )
    
    trainer.print_stats(epoch=epoch, train=True)
    # validation step
    with torch.no_grad():
        for xb, yb in val_dl:
            for label in range(num_classes):
                clf = clf_list[label]
                optimizer = opt_list[label]
                trainer.onevsall_step(
                    clf=clf,
                    optimizer=optimizer,
                    xb=xb,
                    yb=yb,
                    label=label,
                    train=False
                )
    trainer.print_stats(epoch=epoch, train=False)

Epoch 0
class: 0 train loss: 26.306479    acc: 50.47 %
class: 1 train loss: 26.319952    acc: 49.97 %
class: 2 train loss: 26.454225    acc: 49.42 %
class: 3 train loss: 26.239714    acc: 50.30 %
class: 4 train loss: 25.935467    acc: 50.15 %
class: 5 train loss: 25.861327    acc: 50.76 %
class: 6 train loss: 25.885553    acc: 50.04 %
class: 7 train loss: 26.480304    acc: 49.31 %
class: 8 train loss: 25.991639    acc: 50.37 %
class: 9 train loss: 26.521261    acc: 49.17 %
end epoch 0 -----------------------------------------
Epoch 0
class: 0 valid loss: 12.616972    acc: 50.93 %
class: 1 valid loss: 12.417427    acc: 50.71 %
class: 2 valid loss: 12.655237    acc: 50.39 %
class: 3 valid loss: 12.581924    acc: 51.06 %
class: 4 valid loss: 12.236708    acc: 51.66 %
class: 5 valid loss: 12.316279    acc: 51.69 %
class: 6 valid loss: 12.107962    acc: 51.63 %
class: 7 valid loss: 12.531495    acc: 50.75 %
class: 8 valid loss: 12.335482    acc: 51.34 %
class: 9 valid loss: 12.706175    acc

In [50]:
with torch.no_grad():
    val_length = 0
    val_correct = 0
    for xb, yb in val_dl:
        pred = torch.stack([el.forward(xb).flatten() for el in clf_list], dim=-1)
        #print(pred[:1], yb[:1])
        val_length += len(xb)
        val_correct += (torch.argmax(pred.data, 1) == yb).sum()
    print(f"Final validation accuracy: {val_correct * 100 / val_length:.2f} %")

Final validation accuracy: 92.24 %


In [51]:
with torch.no_grad():
    preds = []
    for xb, yb in train_dl:
        pred = torch.stack([el.forward(xb).flatten() for el in clf_list], dim=-1)
        preds.append(pred)
    total = torch.cat(preds, dim=0)

std, mean = torch.std_mean(total, dim=0)

with torch.no_grad():
    val_length = 0
    val_correct = 0
    for xb, yb in val_dl:
        pred = torch.stack([clf.forward(xb).flatten() for clf in clf_list], dim=-1)
        #print(pred[:1], yb[:1])
        val_length += len(xb)
        val_correct += (torch.argmax((pred.data - mean) / std, 1) == yb).sum()
    print(f"Final validation (normalized) accuracy: {val_correct * 100 / val_length:.2f} %")

Final validation (normalized) accuracy: 92.36 %


# Multiple Ridge Classifiers (One-vs-all)

In [34]:
import r2d2
import importlib

importlib.reload(r2d2)

number_of_features = train_ds[0][0].shape[0]

train_dl, val_dl = get_data(train_ds, val_ds, 1024)
train_dl = WrappedDataLoader(train_dl, func=preprocess_classification)
val_dl = WrappedDataLoader(val_dl, func=preprocess_classification)

epochs = 100

l = []

for label in range(num_classes):
    clf = r2d2.RidgeClassifier(inputs=number_of_features, outputs=1).to(device=device)
    cumulative_gradient = torch.zeros_like(clf.linear.weight).to(device=device)
    optimizer = torch.optim.Adam(
        clf.parameters(),
        lr=1e-4,
        weight_decay=2
    )
    loss_func = torch.nn.MSELoss()

    for epoch in range(epochs):
        clf.train()
        train_loss = 0
        train_length = 0
        train_correct = 0
        for xb, yb in train_dl:
            pred = clf.forward(xb).flatten()
            yb = torch.where(yb == label, 1.0, -1.0).flatten()
            #print(yb.shape, pred.shape)
            loss = loss_func(pred, yb)
            loss.backward()
            optimizer.step()
            cumulative_gradient += clf.linear.weight.grad
            optimizer.zero_grad()
            train_loss += loss.item()
            train_length += len(xb)
            train_correct += (torch.sign(pred.data) == yb).sum()

        clf.eval()
        val_loss = 0
        val_length = 0
        val_correct = 0
        with torch.no_grad():
            for xb, yb in val_dl:
                pred = clf.forward(xb).flatten()
                yb = torch.where(yb == label, 1.0, -1.0).flatten()
                loss = loss_func(pred, yb)
                val_loss += loss.item() 
                val_length += len(xb)
                val_correct += (torch.sign(pred.data) == yb).sum()
        print(f"""Epoch {epoch} class: {label} norm: {cumulative_gradient.norm():.2f}\ntrain loss: {train_loss:.6f}    acc: {(train_correct/train_length)*100:.2f} %\nvalid loss: {val_loss:.6f}    acc: {(val_correct/val_length)*100:.2f} %\n---------------------------""")
        cumulative_gradient -= cumulative_gradient

    l.append(clf)


Epoch 0 class: 0 norm: 22.90
train loss: 25.835045    acc: 50.59 %
valid loss: 12.376352    acc: 51.22 %
---------------------------
Epoch 1 class: 0 norm: 20.89
train loss: 24.557817    acc: 50.99 %
valid loss: 11.836413    acc: 51.63 %
---------------------------
Epoch 2 class: 0 norm: 19.00
train loss: 23.543110    acc: 51.42 %
valid loss: 11.382172    acc: 52.08 %
---------------------------
Epoch 3 class: 0 norm: 17.16
train loss: 22.652791    acc: 51.81 %
valid loss: 11.001751    acc: 52.32 %
---------------------------
Epoch 4 class: 0 norm: 15.48
train loss: 21.896911    acc: 52.22 %
valid loss: 10.680191    acc: 52.86 %
---------------------------
Epoch 5 class: 0 norm: 14.11
train loss: 21.311223    acc: 52.81 %
valid loss: 10.410090    acc: 53.51 %
---------------------------
Epoch 6 class: 0 norm: 12.72
train loss: 20.788090    acc: 53.53 %
valid loss: 10.180999    acc: 54.15 %
---------------------------
Epoch 7 class: 0 norm: 11.57
train loss: 20.355532    acc: 54.25 %
va

In [35]:
with torch.no_grad():
    val_length = 0
    val_correct = 0
    for xb, yb in val_dl:
        pred = torch.stack([el.forward(xb).flatten() for el in l], dim=-1)
        #print(pred[:1], yb[:1])
        val_length += len(xb)
        val_correct += (torch.argmax(pred.data, 1) == yb).sum()
    print(f"Final validation accuracy: {val_correct * 100 / val_length:.2f} %")

Final validation accuracy: 92.95 %


In [36]:
with torch.no_grad():
    preds = []
    for xb, yb in train_dl:
        pred = torch.stack([el.forward(xb).flatten() for el in l], dim=-1)
        preds.append(pred)
    total = torch.cat(preds, dim=0)

std, mean = torch.std_mean(total, dim=0)

with torch.no_grad():
    val_length = 0
    val_correct = 0
    for xb, yb in val_dl:
        pred = torch.stack([clf.forward(xb).flatten() for clf in l], dim=-1)
        #print(pred[:1], yb[:1])
        val_length += len(xb)
        val_correct += (torch.argmax((pred.data - mean) / std, 1) == yb).sum()
    print(f"Final validation accuracy: {val_correct * 100 / val_length:.2f} %")

Final validation accuracy: 94.08 %


# One-vs-all Logistic Regression

In [None]:
import r2d2
import importlib

importlib.reload(r2d2)

number_of_features = train_ds[0][0].shape[0]

train_dl, val_dl = get_data(train_ds, val_ds, 1024)
train_dl = WrappedDataLoader(train_dl, func=preprocess_classification)
val_dl = WrappedDataLoader(val_dl, func=preprocess_classification)

epochs = 100

l = []

for label in range(num_classes):
    clf = r2d2.LogisticRegression(inputs=number_of_features, outputs=1).to(device=device)
    cumulative_gradient = torch.zeros_like(clf.linear.weight).to(device=device)
    optimizer = torch.optim.NAdam(
        clf.parameters(),
        lr=1e-4,
        weight_decay=2
    )
    loss_func = torch.nn.BCELoss()

    for epoch in range(epochs):
        clf.train()
        train_loss = 0
        train_length = 0
        train_correct = 0
        for xb, yb in train_dl:
            pred = clf.forward(xb).flatten()
            yb = torch.where(yb == label, 1.0, 0).flatten()
            #print(yb.shape, pred.shape)
            #print(pred.dtype, yb.dtype)
            #print(pred.shape, yb.shape)
            loss = loss_func(pred, yb)
            loss.backward()
            optimizer.step()
            cumulative_gradient += clf.linear.weight.grad
            optimizer.zero_grad()
            train_loss += loss.item()
            train_length += len(xb)
            train_correct += (torch.round(pred.data) == yb).sum()

        clf.eval()
        val_loss = 0
        val_length = 0
        val_correct = 0
        with torch.no_grad():
            for xb, yb in val_dl:
                pred = clf.forward(xb).flatten()
                yb = torch.where(yb == label, 1.0, 0).flatten()
                loss = loss_func(pred, yb)
                val_loss += loss.item() 
                val_length += len(xb)
                val_correct += (torch.round(pred.data) == yb).sum()
        print(f"""Epoch {epoch} class: {label} norm: {cumulative_gradient.norm():.2f}\ntrain loss: {train_loss:.6f}    acc: {(train_correct/train_length)*100:.2f} %\nvalid loss: {val_loss:.6f}    acc: {(val_correct/val_length)*100:.2f} %\n---------------------------""")
        cumulative_gradient -= cumulative_gradient

    l.append(clf)

In [None]:
with torch.no_grad():
    val_length = 0
    val_correct = 0
    for xb, yb in val_dl:
        pred = torch.stack([clf.forward(xb).flatten() for clf in l], dim=-1)
        print(pred[:1], yb[:1])
        val_length += len(xb)
        val_correct += (torch.argmax(pred.data, 1) == yb).sum()
    print(f"Final validation accuracy: {val_correct * 100 / val_length:.2f} %")

# Ridge

In [None]:
import r2d2
import importlib

importlib.reload(r2d2)

number_of_features = num_classes
clf = r2d2.RidgeClassifier(inputs=number_of_features, outputs=num_classes).to(device=device)
cumulative_gradient = torch.zeros_like(clf.linear.weight).to(device=device)

optimizer = torch.optim.NAdam(
    clf.parameters(),
    lr=1e-3,
    weight_decay=1
)

epochs = 300
loss_func = nn.MSELoss()

for epoch in range(epochs):
        clf.train()
        train_loss = 0
        train_length = 0
        train_correct = 0
        for xb, yb in train_dl:
            xb = torch.stack([el.forward(xb).flatten() for el in l], dim=-1)
            pred = clf.forward(xb)
            #yb = torch.where(yb == label, 1.0, -1.0).flatten()
            #print(yb.shape, pred.shape)
            binarized_labels = (nn.functional.one_hot(yb, num_classes=num_classes)*2)-1
            binarized_labels = binarized_labels.type(torch.float32)
            loss = loss_func(pred, binarized_labels)
            loss.backward()
            optimizer.step()
            cumulative_gradient += clf.linear.weight.grad
            optimizer.zero_grad()
            train_loss += loss.item()
            train_length += len(xb)
            train_correct += (torch.argmax(pred.data, 1) == yb).sum()

        clf.eval()
        val_loss = 0
        val_length = 0
        val_correct = 0
        with torch.no_grad():
            for xb, yb in val_dl:
                xb = torch.stack([el.forward(xb).flatten() for el in l], dim=-1)
                pred = clf.forward(xb)
                #yb = torch.where(yb == label, 1.0, -1.0).flatten()
                binarized_labels = (nn.functional.one_hot(yb, num_classes=num_classes)*2)-1
                binarized_labels = binarized_labels.type(torch.float32)
                loss = loss_func(pred, binarized_labels)
                val_loss += loss.item() 
                val_length += len(xb)
                val_correct += (torch.argmax(pred.data, 1) == yb).sum()
        print(f"""Epoch {epoch} norm: {cumulative_gradient.norm():.2f}\ntrain loss: {train_loss:.6f}    acc: {(train_correct/train_length)*100:.2f} %\nvalid loss: {val_loss:.6f}    acc: {(val_correct/val_length)*100:.2f} %\n---------------------------""")
        cumulative_gradient -= cumulative_gradient

In [None]:
import r2d2
import importlib

importlib.reload(r2d2)

number_of_features = train_ds[0][0].shape[0]
clf = r2d2.MultiRidgeClassifier(inputs=number_of_features, outputs=num_classes).to(device=device)
#cg = torch.zeros_like(clf.linear.weight).to(device=device)

train_dl, val_dl = get_data(train_ds, val_ds, 1024)
train_dl = WrappedDataLoader(train_dl, func=preprocess_classification)
val_dl = WrappedDataLoader(val_dl, func=preprocess_classification)

optimizer = torch.optim.SGD(
                    clf.parameters(), 
                    lr=1e-7,
                    weight_decay=1
                )

r2d2.fit_model_multi(   epochs=100, 
                        model=clf, 
                        loss_func=nn.MSELoss(), 
                        opt=optimizer, 
                        train_dl=train_dl, 
                        val_dl=val_dl,
                        #cumulative_gradient=cg,
                        #scheduler=scheduler,
                        num_classes=num_classes)

In [None]:
clf.layer

In [None]:
import r2d2
import importlib

importlib.reload(r2d2)

number_of_features = train_ds[0][0].shape[0]
clf = r2d2.RidgeClassifier(inputs=number_of_features, outputs=num_classes).to(device=device)
#cg = torch.zeros_like(clf.linear.weight).to(device=device)

train_dl, val_dl = get_data(train_ds, val_ds, 1024)
train_dl = WrappedDataLoader(train_dl, func=preprocess_classification)
val_dl = WrappedDataLoader(val_dl, func=preprocess_classification)

optimizer = torch.optim.NAdam(
                    clf.parameters(), 
                    lr=1e-4,
                    weight_decay=1
                )

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                    optimizer=optimizer,
                    mode="max",
                    patience=10,
                    threshold=1e-3,
                    cooldown=40
                )

r2d2.fit_model( epochs=50, 
                model=clf, 
                loss_func=nn.MSELoss(), 
                opt=optimizer, 
                train_dl=train_dl, 
                val_dl=val_dl,
                cumulative_gradient=cg,
                #scheduler=scheduler,
                is_multi_ridge=True,
                num_classes=num_classes)

# Multistart + Averaging SGD

In [None]:
k = 10

for i in range(k):
    clf = r2d2.RidgeClassifier(inputs=number_of_features, outputs=num_classes).to(device=device)
    cg = torch.zeros_like(clf.linear.weight).to(device=device)

    print("--------------------------------------")
    print(f"        -         {i}       -         ")
    print("--------------------------------------")


    optimizer = torch.optim.NAdam(
                    clf.parameters(), 
                    lr=1e-4,
                    weight_decay=1
                )

    r2d2.fit_model( 
        epochs=50, 
        model=clf, 
        loss_func=nn.MSELoss(), 
        opt=optimizer, 
        train_dl=train_dl, 
        val_dl=val_dl,
        cumulative_gradient=cg,
        #scheduler=scheduler,
        is_ridge=True,
        num_classes=num_classes
    )

    d = clf.state_dict()
    torch.save(d, f"m_{i}_state_dict.pt")

In [None]:
l = []

for i in range(k):
    d = torch.load(f"m_{i}_state_dict.pt")
    l.append(d)

print(l[0]["linear.weight"])

for el in l[0]:
    total = l[0][el]
    for i in range(1, len(l)):
        total += l[i][el]
    total /= len(l)

print(l[0]["linear.weight"])

In [None]:
clf = r2d2.RidgeClassifier(inputs=number_of_features, outputs=num_classes).to(device=device)
cg = torch.zeros_like(clf.linear.weight).to(device=device)

clf.load_state_dict(l[0])

optimizer = torch.optim.SGD(
                    clf.parameters(), 
                    lr=1e-2,
                    weight_decay=1
                )

r2d2.fit_model( 
    epochs=50, 
    model=clf, 
    loss_func=nn.MSELoss(), 
    opt=optimizer, 
    train_dl=train_dl, 
    val_dl=val_dl,
    cumulative_gradient=cg,
    #scheduler=scheduler,
    is_ridge=True,
    num_classes=num_classes
)