In [None]:
import sys, os
# Add the parent directory of the current working directory to the import path
# (presumably the repository root that contains the IPDL package — confirm).
project_dir = os.path.split(os.getcwd())[0]
if project_dir not in sys.path:
    sys.path.append(project_dir)

import torch
from torch import Tensor, nn
from IPDL import MatrixEstimator, ClassificationInformationPlane, AutoEncoderInformationPlane
from IPDL.optim import AligmentOptimizer, SilvermanOptimizer

import torchvision
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, ToTensor, Lambda
from torch.nn.functional import one_hot
from tqdm import tqdm

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Introduction

The purpose of this notebook is to explain how to use the IPDL framework. Suggestions are welcome.

# Network Design

To obtain the Information Plane, it is necessary to generate the matrix $A_T$, which is an RKHS representation of the output of layer $T$. In this framework, this task is performed by the `MatrixEstimator` class, which requires an initial $\sigma$ value because the framework applies an RBF kernel to obtain the RKHS representation.

The following cell shows an example network design in which a `MatrixEstimator` is applied at the end of each layer:

In [None]:
class MLP(nn.Module):
    """Fully connected classifier: 784 -> 1024 -> 128 -> 64 -> 32 -> 10.

    Each of the five layers is an ``nn.Sequential`` made of a linear map, a
    LeakyReLU, a BatchNorm1d (all layers except the last) and a final
    ``MatrixEstimator(0.1)``.
    """

    def __init__(self):
        super().__init__()

        self.layer1 = self._dense_block(784, 1024)
        self.layer2 = self._dense_block(1024, 128)
        self.layer3 = self._dense_block(128, 64)
        self.layer4 = self._dense_block(64, 32)
        # The output layer is the only one without batch normalisation.
        self.layer5 = self._dense_block(32, 10, batch_norm=False)

        # Re-initialise the weights of every Linear/Conv2d sub-module.
        for submodule in self.modules():
            self.weight_init(submodule)

    @staticmethod
    def _dense_block(n_in, n_out, batch_norm=True):
        """Build Linear -> LeakyReLU -> [BatchNorm1d] -> MatrixEstimator."""
        stages = [nn.Linear(n_in, n_out), nn.LeakyReLU(inplace=True)]
        if batch_norm:
            stages.append(nn.BatchNorm1d(n_out, affine=True))
        stages.append(MatrixEstimator(0.1))
        return nn.Sequential(*stages)

    def forward(self, x: Tensor) -> Tensor:
        """Pass ``x`` through the five layers in order and return the result."""
        for block in (self.layer1, self.layer2, self.layer3, self.layer4, self.layer5):
            x = block(x)
        return x

    def weight_init(self, module):
        """Apply Kaiming-normal initialisation to Linear and Conv2d weights.

        Any other module type is left untouched.
        """
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            nn.init.kaiming_normal_(module.weight.data, nonlinearity='relu')

# Training

During training we have to define an optimizer for the *MatrixEstimator* layers, which is independent of the optimizer used to train the network itself. This additional optimizer, whose base class is called *MatrixOptimizer*, updates the sigma value used in the RBF kernel.

For the Information Plane, a dedicated class, *InformationPlane*, has been implemented. It provides the *computeMutualInformation()* method which, given the input matrix $A_x$ and the output matrix $A_y$, computes the mutual information $I(A_x, A_t)$ and $I(A_t, A_y)$ used to generate the Information Plane.

For this example, the MNIST dataset will be used.

In [None]:
# Images are only converted to tensors; flattening happens later, per batch.
transformToTensor = Compose([ToTensor()])
dataset = torchvision.datasets.MNIST(
    "../datasets/MNIST/",
    train=True,
    download=True,
    transform=transformToTensor,
)
# Hold out 512 samples for validation; the remaining 59488 are used for training.
train_set, val_set = torch.utils.data.random_split(dataset, [59488, 512])
train_dataloader = DataLoader(
    train_set,
    batch_size=256,
    shuffle=True,
    num_workers=0,
)
eval_dataloader = DataLoader(
    val_set,
    batch_size=150,
    shuffle=False,
    num_workers=0,
)

The first step is to generate the matrices $A_x$ and $A_y$. In this case the matrices are generated directly, but they could also be generated by applying the MatrixEstimator class. The sigma values used are the ones proposed in [reference].

In [None]:
from IPDL.functional import matrix_estimator

# Take the first batch of the evaluation loader as the fixed validation batch.
val_inputs, val_targets = next(iter(eval_dataloader))
# Flatten everything after the batch dimension and move the batch to `device`.
val_inputs = val_inputs.flatten(1).to(device)
# One-hot encode the labels (10 classes) as floats.
val_targets = one_hot(val_targets, num_classes=10).float().to(device) 
    
# matrix_estimator returns a pair; for the inputs only the second element is kept.
# NOTE(review): the first element is presumably the kernel (Gram) matrix and the
# second its normalised version — confirm against IPDL.functional.
_, Ax = matrix_estimator(val_inputs, sigma=8)
Ky, Ay = matrix_estimator(val_targets, sigma=.1)

Next, we build the model and use it to create the matrix optimizer and the information plane.

In [None]:
model = MLP().to(device)
matrix_optimizer = AligmentOptimizer(model, beta=0.9, n_sigmas=200)
ip = ClassificationInformationPlane(model, use_softmax=True)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.009, momentum=0.9)

# Mean training loss of every consecutive window of 50 batches.
loss_record = []

for epoch in range(1):  # loop over the dataset multiple times
    running_loss = 0.0

    # Forward the validation batch once in eval mode before training starts.
    with torch.no_grad():
        model.eval()
        model(val_inputs)

    for i, (inputs, labels) in enumerate(tqdm(train_dataloader)):
        inputs = inputs.flatten(1).to(device)
        labels = labels.to(device)

        model.train()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # record statistics
        running_loss += loss.item()
        if (i + 1) % 50 == 0:
            loss_record.append(running_loss / 50)
            running_loss = 0.0

        loss.backward()
        optimizer.step()

        # After every training step, forward the validation batch in eval mode,
        # then update the estimator sigmas and record the mutual information.
        with torch.no_grad():
            model.eval()
            model(val_inputs)
            matrix_optimizer.step(Ky.to(device))
            ip.computeMutualInformation(Ax.to(device), Ay.to(device))

        # NOTE: a former `if i > 500:` branch was removed here. It referenced the
        # undefined name `net`, called a method the model does not define, and
        # rebound `ip` as its loop variable, which would have destroyed the
        # information-plane object used above and in the plotting cell.

print('Finished Training')

In [None]:
from IPDL.utils import showMutualInformation
# Plot the mutual information recorded by `ip` during the training loop.
# NOTE(review): the meaning of the second argument (10) is not visible in this
# notebook — presumably a moving-average window; confirm against IPDL.utils.
showMutualInformation(ip, 10)

# Autoencoder

Optimization by *kernel alignment* is not possible in the case of an autoencoder. In such cases it is more appropriate to use *Silverman's rule of thumb*. For high-dimensional kernels, such as those encountered in neural networks, Nicolás I. Tapia et al. propose a simplification of this rule, with an optional normalization that takes the dimensionality into account. The *SilvermanOptimizer* is an implementation of this proposed method.

In [None]:
class AutoEncoder(nn.Module):
    """Fully connected autoencoder: 784 -> 512 -> 256 -> 128 -> 256 -> 512 -> 784.

    Hidden layers are Linear -> LeakyReLU -> BatchNorm1d(affine=False) ->
    ``MatrixEstimator(0.1)``; the first one is preceded by a dropout of 0.2.
    The output layer is Linear -> Sigmoid -> ``MatrixEstimator(0.1)``.
    """

    def __init__(self):
        super().__init__()

        # Only the first layer applies dropout to its input.
        self.layer1 = self._hidden_block(784, 512, dropout=0.2)
        self.layer2 = self._hidden_block(512, 256)
        self.layer3 = self._hidden_block(256, 128)
        self.layer4 = self._hidden_block(128, 256)
        self.layer5 = self._hidden_block(256, 512)

        # Output layer: sigmoid activation and no batch normalisation.
        self.layer6 = nn.Sequential(
            nn.Linear(512, 784),
            nn.Sigmoid(),
            MatrixEstimator(0.1),
        )

        # Re-initialise the weights of every Linear/Conv2d sub-module.
        for submodule in self.modules():
            self.weight_init(submodule)

    @staticmethod
    def _hidden_block(n_in, n_out, dropout=None):
        """Build [Dropout] -> Linear -> LeakyReLU -> BatchNorm1d -> MatrixEstimator."""
        stages = []
        if dropout is not None:
            stages.append(nn.Dropout(p=dropout))
        stages += [
            nn.Linear(n_in, n_out),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm1d(n_out, affine=False),
            MatrixEstimator(0.1),
        ]
        return nn.Sequential(*stages)

    def forward(self, x: Tensor) -> Tensor:
        """Pass ``x`` through the six layers in order and return the reconstruction."""
        for block in (self.layer1, self.layer2, self.layer3,
                      self.layer4, self.layer5, self.layer6):
            x = block(x)
        return x

    def weight_init(self, module):
        """Apply Kaiming-normal initialisation to Linear and Conv2d weights.

        Any other module type is left untouched.
        """
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            nn.init.kaiming_normal_(module.weight.data, nonlinearity='relu')

In [None]:
transformToTensor = Compose([
    ToTensor(),             # first, convert the image to a PyTorch tensor
    Lambda(torch.flatten),  # then flatten it into a 1-D vector
])

dataset = torchvision.datasets.MNIST(
    "../datasets/MNIST/",
    train=True,
    download=True,
    transform=transformToTensor,
)
# Hold out 512 samples for validation; the remaining 59488 are used for training.
train_set, val_set = torch.utils.data.random_split(dataset, [59488, 512])
train_dataloader = DataLoader(
    train_set,
    batch_size=256,
    shuffle=True,
    num_workers=0,
)
# The whole validation split fits in a single batch of 512.
eval_dataloader = DataLoader(
    val_set,
    batch_size=512,
    shuffle=False,
    num_workers=0,
)

In [None]:
import math
from IPDL.functional import matrix_estimator

# Take the first batch of the evaluation loader as the fixed validation batch.
val_inputs, val_targets = next(iter(eval_dataloader))
val_inputs = val_inputs.flatten(1).to(device)

# Silverman-style bandwidth: sigma = gamma * n^(-1 / (4 + d)) * sqrt(d),
# where n is the number of samples and d the number of features per sample.
n = val_inputs.size(0)
# Elements per sample, valid for any number of trailing dimensions.
# (The previous expression relied on `reduce`, which is never imported.)
d = val_inputs[0].numel()
gamma = 0.8
sigma = gamma * n ** (-1 / (4 + d)) * math.sqrt(d)

# matrix_estimator returns a pair; only the second element is kept as Ax.
_, Ax = matrix_estimator(val_inputs, sigma=sigma)
Ax = Ax.to(device)

In [None]:
# Instantiate the autoencoder and move it to `device`.
# This rebinds `model`, which previously referred to the MLP classifier.
model = AutoEncoder().to(device)

In [None]:
n_epoch = 25
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
# NOTE: with step_size=100 and n_epoch=25 the learning rate is never decayed in this run.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 100, gamma=0.1)
matrix_optimizer = SilvermanOptimizer(model, gamma=0.8, normalize_dim=True)

# Fixed validation batch: the loader is not shuffled, so this is the same
# first batch every time it is drawn. Flatten and move it to the device once.
eval_inputs, _ = next(iter(eval_dataloader))
eval_inputs = eval_inputs.flatten(1).to(device)

epoch_iterator = tqdm(
    range(n_epoch),
    leave=True,
    unit="epoch",
    postfix={"lss": "%.6f" % 0.0, "vls": "%.6f" % -1},
)

ip = AutoEncoderInformationPlane(model)

for epoch in epoch_iterator:
    model.train()

    for idx, (inputs, _) in enumerate(train_dataloader):
        inputs = inputs.flatten(1).to(device)

        optimizer.zero_grad()
        outputs = model(inputs)

        # Reconstruction loss: the target is the input itself.
        loss = criterion(outputs, inputs)
        loss_value = float(loss.item())
        loss.backward()
        optimizer.step()

        # Report the validation loss every 25 batches.
        if idx % 25 == 0:
            model.eval()
            with torch.no_grad():
                outputs = model(eval_inputs)
                eval_loss_value = float(criterion(outputs, eval_inputs).item())
            model.train()
            epoch_iterator.set_postfix(
                lss="%.6f" % loss_value, vls="%.6f" % eval_loss_value,
            )

    scheduler.step()

    # Forward the validation batch in eval mode with the end-of-epoch weights
    # before touching the estimators, as the classifier training loop does.
    # Without this, the sigma update and the mutual information below would
    # rely on whatever forward pass happened last (presumably the most recent
    # periodic evaluation, several optimizer steps earlier — confirm against
    # MatrixEstimator).
    model.eval()
    with torch.no_grad():
        model(eval_inputs)

    if epoch == 0:  # only needed once
        matrix_optimizer.step()

    Ixt, Ity = ip.computeMutualInformation(Ax)

In [None]:
# Show the values returned by the most recent `ip.computeMutualInformation(Ax)`
# call (the last epoch, when the cells are run in order).
print(Ixt)
print(Ity)

In [None]:
from matplotlib import pyplot as plt
from IPDL import MatrixBasedRenyisEntropy
# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in Matplotlib 3.6 and
# the old name was later removed; use whichever one this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'

# Build, label and show the figure inside the style context so that every
# element is created under the same style.
with plt.style.context(seaborn_style):
    f, ax = plt.subplots(figsize=(12, 12))
    # The entropy of the input matrix is used as the upper limit of both axes.
    reference = MatrixBasedRenyisEntropy.entropy(Ax).cpu()
    ax.set(xlim=(0, reference), ylim=(0, reference))

    Ixt, Ity = ip.getMutualInformation(moving_average_n=6)

    # One trajectory per layer.
    for idx, (current_Ixt, current_Ity) in enumerate(zip(Ixt, Ity)):
        ax.scatter(current_Ixt, current_Ity, label="layer {}".format(idx+1))
        ax.plot(current_Ixt, current_Ity)

    ax.set_xlabel("I(X,T)")
    ax.set_ylabel("I(T,Y)")
    # Dashed diagonal from the lower-left to the upper-right corner of the axes.
    ax.plot([0, 1], [0, 1], transform=ax.transAxes, linestyle='dashed')
    ax.legend()
    plt.show()

In [None]:
import matplotlib as mpl
import numpy as np
from IPDL.utils import gen_log_space

def show_information_plane(ip: AutoEncoderInformationPlane, reference) -> mpl.figure.Figure:
    """Draw the information plane of an autoencoder as encoder and decoder panels.

    The series come from ``ip.getMutualInformation(moving_average_n=2)``.
    Layers ``0 .. len(Ixt) // 2`` are drawn on the "Encoder" panel and layers
    ``len(Ixt) // 2 .. len(Ixt) - 1`` on the "Decoder" panel. The layer at
    index ``len(Ixt) // 2`` is labelled "Bottleneck" and appears on both
    panels with the same marker. Every series is sub-sampled at the indices
    returned by ``gen_log_space`` and coloured by position in that sub-sample.

    Args:
        ip: information-plane object providing ``getMutualInformation``.
        reference: upper limit used for both axes of both panels.

    Returns:
        The created figure (encoder panel, decoder panel and a colour bar).
    """
    markers = "o^spdP*"
    cmap = mpl.cm.Blues
    line_color = (0, 0, 0.75, 0.3)

    Ixt, Ity = ip.getMutualInformation(moving_average_n=2)
    middle = len(Ixt) // 2

    # The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in Matplotlib 3.6
    # and the old name was later removed; use whichever one is available.
    style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'

    def _setup_panel(axis, title):
        # Title, labels, limits and dashed diagonal shared by both panels.
        axis.set_title(title)
        axis.set_xlabel("I(X, T)")
        axis.set_ylabel("I(T, Y)")
        axis.set(xlim=(0, reference), ylim=(0, reference))
        axis.plot([0, 1], [0, 1], transform=axis.transAxes, linestyle='dashed')

    def _draw_series(axis, layer_idx, label, marker):
        # Plot one layer's trajectory and return the length of its full series.
        current_Ixt = np.array(Ixt[layer_idx])
        current_Ity = np.array(Ity[layer_idx])

        log_spaced = gen_log_space(len(current_Ixt), math.ceil(len(current_Ixt) * 0.1))
        # Colours are computed per series, so they always match the number of
        # plotted points instead of reusing another layer's values.
        iterations = np.arange(len(log_spaced))

        axis.scatter(current_Ixt[log_spaced], current_Ity[log_spaced], c=iterations,
                     vmin=0, vmax=iterations.max(), label=label, marker=marker,
                     cmap=cmap, edgecolors='black')
        axis.plot(current_Ixt[log_spaced], current_Ity[log_spaced], color=line_color)
        return len(current_Ixt)

    with plt.style.context(style):
        fig = plt.figure(constrained_layout=True, figsize=(16, 8))
        gs1 = fig.add_gridspec(nrows=10, ncols=2, left=0.05, right=0.84, wspace=0.05, hspace=10)

        # Left panel: encoder layers up to and including the bottleneck.
        f8_ax1 = fig.add_subplot(gs1[0:9, 0])
        _setup_panel(f8_ax1, "Encoder")
        for idx in range(middle + 1):
            label = "Bottleneck" if idx == middle else "Encoder {}".format(idx+1)
            _draw_series(f8_ax1, idx, label, markers[idx])
        f8_ax1.legend()

        # Right panel: bottleneck and decoder layers, y axis on the right.
        f8_ax2 = fig.add_subplot(gs1[0:9, 1])
        _setup_panel(f8_ax2, "Decoder")
        f8_ax2.yaxis.tick_right()
        f8_ax2.yaxis.set_label_position("right")

        # Reverse the encoder markers so the bottleneck keeps its marker and
        # the decoder layers mirror the encoder ones.
        decode_markers = markers[:middle + 1][::-1]
        n_points = 0
        for marker_idx, idx in enumerate(range(middle, len(Ixt))):
            label = "Bottleneck" if idx == middle else "Decoder {}".format(idx+1)
            n_points = _draw_series(f8_ax2, idx, label, decode_markers[marker_idx])
        f8_ax2.legend()

        # Bottom strip: colour bar scaled to the length of the last series.
        f8_ax3 = fig.add_subplot(gs1[9, :])
        f8_ax3.set_title("Iterations")
        norm = mpl.colors.Normalize(vmin=0, vmax=n_points)
        mpl.colorbar.ColorbarBase(f8_ax3, cmap=cmap,
                                  norm=norm,
                                  orientation='horizontal')

    return fig

In [None]:
# Use the entropy of the input matrix as the axis limit of both panels.
reference = MatrixBasedRenyisEntropy.entropy(Ax).cpu()
fig = show_information_plane(ip, reference)
plt.show()

In [None]:
# Display the model as the cell output.
model

In [None]:
# Inspect the I(T, Y) series at index 4 (the fifth layer in the plots above).
Ity[4]

In [None]:
# Display the model as the cell output (repeats an earlier cell).
model

In [None]:
from torch.nn import ReLU6

# BatchNorm1d without learnable affine parameters. The earlier affine instance
# was overwritten before being used, so it has been dropped.
m = nn.BatchNorm1d(100, affine=False)
relu = ReLU6()
# Named `activations` rather than `input` so the builtin input() is not shadowed.
activations = relu(torch.randn(20, 100))
output = m(activations)

In [None]:
# Display the batch-normalised tensor as the cell output.
output

In [None]:
# Bind the functional batch-norm under a short name; the function is not called here.
test = torch.nn.functional.batch_norm

In [None]:
# BatchNorm1d without learnable affine parameters. The earlier affine instance
# was overwritten before being used, so it has been dropped.
m = nn.BatchNorm1d(100, affine=False)
x = torch.randn(20, 100)
relu = nn.ReLU()
# Named `activations` rather than `input` so the builtin input() is not shadowed.
activations = relu(x)
# The module has never been switched to eval mode, so it uses batch statistics.
output = m(activations)
# training=True makes the functional form use batch statistics as well. With
# the default training=False and no running statistics the call raises.
output_2 = torch.nn.functional.batch_norm(
    activations, running_mean=None, running_var=None, training=True
)

In [None]:
a = torch.rand(10, 100)
norm = nn.BatchNorm1d(100)

# Per-feature statistics over the batch dimension (std with its default settings).
mean, std = a.mean(dim=0), a.std(dim=0)

# Manual standardisation, to be compared with the BatchNorm1d output.
b = (a - mean).div(std)

In [None]:
# Compare the first 10 features of row 2: manual standardisation vs BatchNorm1d.
print(b[2, :10])
print(norm(a)[2, :10])

In [None]:
a = torch.rand((10, 100))
# Standardise each feature using the biased (population) variance.
b = (a - a.mean(dim=0)) / a.var(dim=0, unbiased=False).sqrt()

In [None]:
# Per-feature mean and standard deviation of the standardised tensor.
print(b.mean(dim=0))
print(b.std(dim=0))

# Mean plus standard deviation of the first two rows (per sample, across features).
print(b[0].mean() + b[0].std())
print(b[1].mean() + b[1].std())

In [None]:
# Apply a fresh BatchNorm1d without affine parameters to `a`.
m = nn.BatchNorm1d(100, affine=False)
c = m(a)

In [None]:
# Per-feature mean and standard deviation of the BatchNorm1d output.
print(c.mean(dim=0))
print(c.std(dim=0))

In [None]:
# Zero-filled tensors passed as the second and third arguments of the functional
# batch norm (its running mean and running variance).
j = torch.zeros(100)
k = torch.zeros(100)

# NOTE(review): with training=True the call presumably normalises with the batch
# statistics and updates `j` and `k` in place — confirm against the PyTorch docs.
d = torch.nn.functional.batch_norm(a, j, k, training=True)
# Compare the first row of the module output with the functional output.
print(c[0])
print(d[0])

In [None]:
# Standardise each feature of `a` using the biased (population) variance.
b = (a - a.mean(dim=0)) / torch.sqrt(a.var(dim=0, unbiased=False))

In [None]:
a = torch.zeros(10)
# Copy to the GPU when one is available and stay on the CPU otherwise, so the
# cell also runs on machines without CUDA. clone() makes `b` independent of `a`.
b = a.to('cuda' if torch.cuda.is_available() else 'cpu').clone()
b[0] = 1
# `a` is unchanged; only the copy was modified.
print(a)
print(b)