In [1]:
# `os` and `sys` are needed by this very first cell, so they are imported here
# rather than in the later import cell.
import os
import sys

# Make the parent of the current working directory importable, so modules that
# live one level above the notebook can be imported by the following cells.
project_dir = os.path.split(os.getcwd())[0]
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [2]:
import torch
import numpy as np
from torch import nn
from torch import Tensor
from InformationPlane import TensorKernel, MatrixBasedRenyisEntropy, RKHSMatrixOptimizer, InformationPlane
from numpy import linalg as LA




from torchvision.transforms import Compose, ToTensor, ToPILImage, Normalize
from torch.utils.data import DataLoader
import torch.optim as optim
import torchvision
from tqdm import tqdm
import math

In [84]:
class TensorKernel:
    '''
        Tensor based Radial Basis Function (RBF) kernel.

        Provides a single static helper that builds the Gram matrix of a batch
        of samples against itself.
    '''
    @staticmethod
    def RBF(x: Tensor, sigma: float) -> Tensor:
        '''
            @param x: batch of samples, forwarded to torch.cdist(x, x)
            @param sigma: kernel width
            @return matrix whose (i, j) entry is exp(-d(x_i, x_j)^2 / sigma^2),
                    d being the pairwise distance returned by torch.cdist
        '''
        pairwise = torch.cdist(x, x)
        squared = pairwise ** 2
        bandwidth = sigma ** 2
        return torch.exp(-squared / bandwidth)

class MatrixBasedRenyisEntropy():
    '''
        Entropy functionals computed from the eigenvalue spectrum of kernel
        (Gram) matrices. All helpers are static; the class only serves as a
        namespace.
    '''
    @staticmethod
    def _eigenvalues(A: Tensor) -> Tensor:
        '''
            Eigenvalues of the symmetric matrix A.

            torch.symeig is deprecated in favour of torch.linalg.eigvalsh and is
            absent from recent PyTorch releases, so the modern routine is used
            when available and torch.symeig only as a fallback for old installs.
        '''
        linalg = getattr(torch, 'linalg', None)
        if linalg is not None and hasattr(linalg, 'eigvalsh'):
            # UPLO='U' mirrors torch.symeig's default of reading the upper triangle.
            return linalg.eigvalsh(A, UPLO='U')
        return torch.symeig(A)[0]

    @staticmethod
    def entropy(A: Tensor) -> Tensor:
        '''
            @param A: symmetric matrix
            @return 0-dim tensor holding -sum(l * log2(l)) over the absolute
                    eigenvalues l of A
        '''
        eigv = MatrixBasedRenyisEntropy._eigenvalues(A).abs()
        # Zero eigenvalues contribute nothing (0 * log 0 := 0); evaluating them
        # directly would yield 0 * -inf = NaN and poison the whole sum.
        eigv = eigv[eigv > 0]
        return -torch.sum(eigv * torch.log2(eigv))

    @staticmethod
    def jointEntropy(*args: Tensor) -> Tensor:
        '''
            Entropy of the trace-normalised element-wise product of the given
            matrices.

            @param args: one or more matrices of identical shape
            @return 0-dim tensor with the entropy of the normalised product
            @raise ValueError when called without any matrix
        '''
        if not args:
            raise ValueError('jointEntropy requires at least one matrix')

        A = args[0].clone()
        for K in args[1:]:
            A = A * K

        A = A / A.trace()
        return MatrixBasedRenyisEntropy.entropy(A)

    @staticmethod
    def mutualInformation(Kx: Tensor, Ky: Tensor) -> Tensor:
        '''
            @param Kx: first matrix
            @param Ky: second matrix
            @return entropy(Kx) + entropy(Ky) - jointEntropy(Kx, Ky)
        '''
        entropy_Ax = MatrixBasedRenyisEntropy.entropy(Kx)
        entropy_Ay = MatrixBasedRenyisEntropy.entropy(Ky)
        joint_entropy = MatrixBasedRenyisEntropy.jointEntropy(Kx, Ky)
        return (entropy_Ax + entropy_Ay - joint_entropy)

    @staticmethod
    def tensorRBFMatrix(x, sigma):
        '''
            Generates the 'A' matrix based on the RBF kernel.

            @param x: batch of samples
            @param sigma: kernel width
            @return TensorKernel.RBF(x, sigma) divided by the number of samples
        '''
        return TensorKernel.RBF(x, sigma) / len(x)


In [4]:
class RKHSMatrixOptimizer():
    '''
        Picks an RBF kernel width (sigma) for a layer output by maximising the
        kernel alignment with a label kernel, and smooths the choice across
        successive calls with an exponential moving average.
    '''
    def __init__(self, beta=0.5):
        '''
            @param beta: weight of the newest sigma in the moving average
            @raise ValueError (an Exception subclass) if beta is outside [0, 1]
        '''
        if not (0 <= beta <= 1):
            raise ValueError('beta must be in the range [0, 1]')

        self.beta = beta
        self.sigma = None
        self.sigma_tmp = []  # Just for saving sigma values

    # Temporal, just for testing
    def getSigmaValues(self):
        '''@return list with the sigma selected by every optimize() call so far'''
        return self.sigma_tmp

    def getSigma(self):
        '''@return current smoothed sigma, or None before the first step()'''
        return self.sigma

    def step(self, layer_output: Tensor, Ky: Tensor, sigma_values: list) -> float:
        '''
            Selects the best sigma for this batch and folds it into the running
            estimate.

            @param layer_output: the output of a specific layer
            @param Ky: label kernel matrix
            @param sigma_values: candidate sigma values
            @return the updated smoothed sigma
        '''
        sigma_t = self.optimize(layer_output, Ky, sigma_values)
        if self.sigma is None:
            self.sigma = sigma_t
        else:
            self.sigma = (self.beta * sigma_t) + ((1 - self.beta) * self.sigma)
        return self.getSigma()

    def optimize(self, x: Tensor, Ky: Tensor, sigma_values: list) -> float:
        '''
            Obtains the kernel width whose RBF kernel of `x` is best aligned with
            `Ky`. The chosen value is also appended to the history returned by
            getSigmaValues().

            @param x: layer output
            @param Ky: label kernel matrix
            @param sigma_values: candidate sigma values
            @return the first candidate reaching the highest alignment
            @raise ValueError if there are no candidates or no finite alignment
        '''
        if len(sigma_values) == 0:
            raise ValueError('sigma_values must contain at least one candidate')

        best_sigma = None
        best_loss = None
        # Candidates are evaluated one at a time so that only a single kernel
        # matrix is alive at any moment.
        for sigma in sigma_values:
            Kt = TensorKernel.RBF(x, sigma).detach()
            # float() copies the scalar to the host; this also works for CUDA
            # tensors and tensors that require grad, which np.array() rejects.
            loss = float(self.kernelAligmentLoss(Kt, Ky))
            if loss != loss:
                # NaN alignment: this candidate cannot be ranked.
                continue
            # Strict '>' keeps the first candidate when several tie.
            if best_loss is None or loss > best_loss:
                best_sigma, best_loss = sigma, loss

        if best_sigma is None:
            raise ValueError('kernel alignment is NaN for every candidate sigma')

        self.sigma_tmp.append(best_sigma)
        return self.sigma_tmp[-1]

    def kernelAligmentLoss(self, x, y):
        '''
            Kernel Aligment Loss Function.

            This function is used in order to obtain the optimal sigma parameter
            from the RBF kernel.

            @return 0-dim tensor: sum(x * y) / (norm(x) * norm(y))
        '''
        return (torch.sum(x*y))/(torch.norm(x) * torch.norm(y))

In [5]:
class InformationPlane(torch.nn.Module):
    '''
        Pass-through module that records mutual-information estimates for the
        tensor flowing through it.

        In training mode forward() returns its input untouched. In evaluation
        mode it splits the flattened input into mini-batches and, for each one,
        appends estimates of I(X,T) and I(T,Y) to internal lists, X and Y being
        the input and label batches registered through setInputLabel().

        @param mini_batch_size: number of samples per mini-batch
        @param beta: smoothing factor handed to RKHSMatrixOptimizer
        @param n_sigmas: number of possible sigma values for the optimizing process
    '''
    # Fixed RBF kernel widths used for the label kernel and the input kernel.
    LABEL_SIGMA = 0.1
    INPUT_SIGMA = 8

    def __init__(self, mini_batch_size, beta=0.5, n_sigmas=75):
        super(InformationPlane, self).__init__()

        self.mini_batch_size = mini_batch_size
        self.sigma_optimizer = RKHSMatrixOptimizer(beta)
        self.Ixt = []
        self.Ity = []

        self.input_batch = None
        self.label_batch = None
        self.n_sigmas = n_sigmas

    def setNumberOfSigma(self, n_sigmas):
        '''@param n_sigmas: new number of candidate sigma values'''
        self.n_sigmas = n_sigmas

    def setInputLabel(self, inputs: Tensor, labels: Tensor):
        '''
            It's necessary to update X and Y, input and label, in each iteration.

            @param inputs: batch with the original input
            @param labels: label of the data
        '''
        self.input_batch = inputs
        self.label_batch = labels

    def forward(self, x: Tensor) -> Tensor:
        '''
            Records I(X,T) and I(T,Y) for every mini-batch of `x` when the module
            is in evaluation mode; does nothing in training mode.

            @param x: layer output
            @return `x`, unchanged
            @raise RuntimeError in evaluation mode if setInputLabel() has not
                   provided both the inputs and the labels
        '''
        if self.training:
            return x

        if self.input_batch is None or self.label_batch is None:
            raise RuntimeError('setInputLabel() must be called before an evaluation-mode forward pass')

        # Only detached statistics are stored, so no autograd graph is needed.
        with torch.no_grad():
            flat = x.flatten(1)
            sigma_values = self.getPossibleSigmaValues(flat)

            for idx in range(0, len(flat), self.mini_batch_size):
                stop = idx + self.mini_batch_size
                batch = flat[idx:stop]
                input_batch = self.input_batch[idx:stop].flatten(1)
                label_batch = self.label_batch[idx:stop]

                # The label kernel is needed twice: raw for the sigma search and
                # divided by the batch length for the entropy estimates.
                label_kernel_matrix = TensorKernel.RBF(label_batch, self.LABEL_SIGMA)
                self.sigma_optimizer.step(batch, label_kernel_matrix, sigma_values)

                A = MatrixBasedRenyisEntropy.tensorRBFMatrix(batch, self.sigma_optimizer.getSigma())
                Ay = label_kernel_matrix / len(label_batch)
                Ax = MatrixBasedRenyisEntropy.tensorRBFMatrix(input_batch, self.INPUT_SIGMA)

                self.Ixt.append(MatrixBasedRenyisEntropy.mutualInformation(Ax, A))
                self.Ity.append(MatrixBasedRenyisEntropy.mutualInformation(A, Ay))

        return x

    def getPossibleSigmaValues(self, x: Tensor) -> list:
        '''
            Defines a 1-D list with the possible sigma values, spanning from 0.1x
            to 10x the mean off-diagonal pairwise distance of `x`. The number of
            values can be modified using setNumberOfSigma().

            @param x: batch tensor
            @return list of candidate sigma values in increasing order
        '''
        distance = torch.cdist(x, x)
        # Build the diagonal mask on the same device as the distances.
        off_diagonal = ~torch.eye(len(distance), dtype=bool, device=distance.device)
        mean_distance = distance[off_diagonal].mean().item()
        start = (mean_distance*0.1)
        end = (mean_distance*10)
        return torch.arange(start, end, (end - start)/self.n_sigmas).tolist()

    @staticmethod
    def moving_average(x: Tensor, n=10) -> Tensor:
        '''
            Simple moving average along the first dimension.

            @param x: tensor of values
            @param n: window length
            @return tensor with len(x) - n + 1 window means (float64)
        '''
        ret = torch.cumsum(x, dim=0, dtype=float)
        # Subtracting the cumulative sum lagged by n leaves the sum of each window.
        ret[n:] = ret[n:] - ret[:-n]
        return ret[n - 1:] / n

    def getMutualInformation(self):
        '''@return Mutual Information lists {I(X,T), I(T,Y)}'''
        return self.Ixt, self.Ity

In [6]:
class MLP(nn.Module):
    '''
        Fully connected classifier (784 -> 1024 -> 20 -> 20 -> 20 -> 10) whose
        soft-maxed output is passed through an InformationPlane probe.
    '''
    def __init__(self):
        super(MLP, self).__init__()

        # self.layer1_IP = InformationPlane(30, beta=0.5)
        # self.layer2_IP = InformationPlane(30, beta=0.5)
        # self.layer3_IP = InformationPlane(30, beta=0.5)
        # self.layer4_IP = InformationPlane(30, beta=0.5)
        self.layer5_IP = InformationPlane(25, beta=0.5)

        self.layer1 = nn.Sequential(
            nn.Linear(784, 1024),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(1024)
            # self.layer1_IP,
        )

        self.layer2 = nn.Sequential(
            nn.Linear(1024, 20),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(20)
            # self.layer2_IP,
        )

        self.layer3 = nn.Sequential(
            nn.Linear(20, 20),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(20)
            # self.layer3_IP,
        )

        self.layer4 = nn.Sequential(
            nn.Linear(20, 20),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(20)
            # self.layer4_IP,
        )

        self.layer5 = nn.Sequential(
            nn.Linear(20, 10)
        )

        # The logits are (batch, 10), so the class axis is dim 1. Naming it
        # explicitly avoids the deprecated implicit-dim behaviour of Softmax().
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x: Tensor, labels=None) -> Tensor:
        '''
            @param x: batch of flattened inputs with 784 features per sample
            @param labels: labels handed to the InformationPlane probes; only
                           used when the module is in evaluation mode
            @return the raw output of the last linear layer (no softmax applied)
        '''
        if not self.training:
            for ip in self.getInformationPlaneLayers():
                ip.setInputLabel(x, labels)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        # The probe observes the class probabilities; its return value is not used.
        self.layer5_IP(self.softmax(x))

        return x

    def getInformationPlaneLayers(self):
        '''@return list with the InformationPlane probes attached to the model'''
        return [self.layer5_IP]
        # return [self.layer1_IP, self.layer2_IP, self.layer3_IP, self.layer4_IP, self.layer5_IP]

In [21]:
# Instantiate the model and move its parameters and buffers to the CUDA device.
net = MLP().cuda()

In [25]:
from torchvision.transforms import Compose, ToTensor, ToPILImage, Normalize
from torch.utils.data import DataLoader
import torch.optim as optim
import torchvision
from tqdm import tqdm
import math

# --- Data -------------------------------------------------------------------
# MNIST training images converted to tensors; the normalising variant of the
# transform is kept below, commented out.
# transformToTensor = Compose([ ToTensor(), Normalize((0.1307,), (0.3081,))])
transformToTensor = Compose([ToTensor()])
dataset = torchvision.datasets.MNIST(
    "../datasets/MNIST/",
    train=True,
    download=True,
    transform=transformToTensor,
)
train_set, val_set = torch.utils.data.random_split(dataset, [45000, 15000])
dataloader = DataLoader(train_set, batch_size=200, shuffle=True, num_workers=0)

# --- Objective and optimiser --------------------------------------------------
criterion = nn.CrossEntropyLoss()
# optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9)
optimizer = optim.Adam(net.parameters(), lr=0.09)

# --- Training -----------------------------------------------------------------
for epoch in range(1):  # loop over the dataset multiple times
    net.train()
    running_loss = 0.0
    i = 0
    for inputs, labels in tqdm(dataloader):
        # Flatten every image to a vector and move the batch to the GPU.
        inputs = inputs.flatten(1).cuda()
        # inputs = inputs.cuda()
        labels = labels.cuda()

        # Gradients accumulate by default, so clear them before each step.
        optimizer.zero_grad()

        # The labels are additionally passed as a float column vector.
        # outputs = net(inputs, one_hot(labels, num_classes=10).float())
        label_column = labels.reshape((len(labels), 1)).float()
        outputs = net(inputs, label_column)

        loss = criterion(outputs, labels)

        # Report the mean loss of the last 25 mini-batches.
        running_loss += loss.item()
        batches_done = i + 1
        if batches_done % 25 == 0:
            print('[%d, %5d] loss: %.3f' %
                (epoch + 1, batches_done, running_loss / 25))
            running_loss = 0.0

        loss.backward()
        optimizer.step()

        i += 1

print('Finished Training')

 14%|█▍        | 32/225 [00:00<00:05, 35.76it/s][1,    25] loss: 0.981
 25%|██▍       | 56/225 [00:01<00:04, 36.11it/s][1,    50] loss: 0.344
 36%|███▌      | 80/225 [00:02<00:04, 35.75it/s][1,    75] loss: 0.300
 46%|████▌     | 104/225 [00:02<00:03, 36.36it/s][1,   100] loss: 0.241
 59%|█████▊    | 132/225 [00:03<00:02, 37.19it/s][1,   125] loss: 0.231
 69%|██████▉   | 156/225 [00:04<00:01, 36.61it/s][1,   150] loss: 0.198
 80%|████████  | 180/225 [00:04<00:01, 37.39it/s][1,   175] loss: 0.206
 91%|█████████ | 204/225 [00:05<00:00, 37.19it/s][1,   200] loss: 0.198
100%|██████████| 225/225 [00:06<00:00, 36.52it/s][1,   225] loss: 0.171
Finished Training



In [12]:
# from datasets.MyMNIST import MyMNIST

# dataset = MyMNIST()
# dataloader = DataLoader(dataset, batch_size=100, shuffle=False, num_workers=0)

# dataset.eval()
# net.eval()
# with torch.no_grad():
#     i = 0
#     for inputs, labels in tqdm(dataloader):
        # inputs = inputs.flatten(1).cuda()
        # net(inputs, labels)
        

  0%|          | 0/150 [00:00&lt;?, ?it/s]


In [None]:
# from matplotlib import pyplot as plt

# ip = net.getInformationPlaneLayers()[0]
# a, b = ip.getMutualInformation()
# # print()
# plt.plot(ip.sigma_optimizer.getSigmaValues())
# plt.show()
# # print(a)
# # print(b)

# Testing new Information Plane

In [26]:
# Load a stored evaluation batch and push it through the network one layer at
# a time, so that every intermediate representation stays available.
# NOTE(review): torch.load unpickles the files -- only load trusted fixtures.
# NOTE(review): `net` is not switched to eval() here, so BatchNorm presumably
# still runs with batch statistics -- confirm this is intended.
inputs = torch.load("tests/eval_inputs_2.pt").flatten(1)
labels = torch.load("tests/eval_labels_2.pt")
x_1 = net.layer1(inputs.cuda())
x_2 = net.layer2(x_1)
x_3 = net.layer3(x_2)
pre_output = net.layer4(x_3)
output = net.layer5(pre_output)
# `output` is (batch, classes): normalise over the class axis. Passing dim
# explicitly avoids the deprecated implicit-dim behaviour of Softmax().
softmax = torch.nn.Softmax(dim=1)
softmax_output = softmax(output)

In [46]:
# Candidate kernel widths for the soft-max output: n_sigmas values evenly
# spaced between 0.1x and 10x the mean off-diagonal pairwise distance.
# dim=1 is the class axis of the (batch, classes) logits; passing it explicitly
# avoids the deprecated implicit-dim behaviour of Softmax().
softmax = torch.nn.Softmax(dim=1)
softmax_output = softmax(output)
n_sigmas = 75
distance = torch.cdist(softmax_output, softmax_output)
# distance /= distance.max()
# Mask out the diagonal (self-distances) before averaging.
mean_distance = distance[~torch.eye(len(distance), dtype=bool)].mean().item()
start = (mean_distance*0.1)
end = (mean_distance*10)

possible_sigma_values = torch.linspace(start, end, n_sigmas).tolist()
print(possible_sigma_values)

[0.1240960955619812, 0.290116548538208, 0.4561370015144348, 0.6221574544906616, 0.7881779074668884, 0.9541983604431152, 1.1202187538146973, 1.2862393856048584, 1.4522597789764404, 1.6182801723480225, 1.7843005657196045, 1.9503211975097656, 2.1163415908813477, 2.2823619842529297, 2.448382616043091, 2.614403009414673, 2.780423402786255, 2.946443796157837, 3.112464189529419, 3.27848482131958, 3.444505214691162, 3.610525608062744, 3.7765462398529053, 3.9425666332244873, 4.10858678817749, 4.274607181549072, 4.440627574920654, 4.606647968292236, 4.772668838500977, 4.938689231872559, 5.104709625244141, 5.270730018615723, 5.436750411987305, 5.602770805358887, 5.768791198730469, 5.934811592102051, 6.100831985473633, 6.266852855682373, 6.432873249053955, 6.598893642425537, 6.764914035797119, 6.930934429168701, 7.096954822540283, 7.262975215911865, 7.4289960861206055, 7.5950164794921875, 7.7610368728637695, 7.927057266235352, 8.093077659606934, 8.259098052978516, 8.425118446350098, 8.591138839721

In [61]:
def kernel_loss(k_x, k_y, k_l, beta=1.0):
    '''
        Log-alignment of the kernel `k_l` with `k_y` and `k_x`, weighted by
        `beta` and `1 - beta` respectively.

        @param k_x: square kernel matrix weighted by (1 - beta)
        @param k_y: square kernel matrix weighted by beta
        @param k_l: square kernel matrix being scored
        @param beta: trade-off exponent; the default 1.0 scores `k_l` against
                     `k_y` only
        @return 0-dim tensor:
                2 * log2( tr(k_l k_y)^beta * tr(k_l k_x)^(1-beta)
                          / (|k_l| * |k_y|^beta * |k_x|^(1-beta)) )
    '''
    L = torch.norm(k_l)
    Y = torch.norm(k_y) ** beta
    X = torch.norm(k_x) ** (1 - beta)

    # trace(A @ B) equals sum(A * B^T); the element-wise form avoids building
    # the full matrix product just to read its diagonal.
    LY = torch.sum(k_l * k_y.t()) ** beta
    LX = torch.sum(k_l * k_x.t()) ** (1 - beta)

    return 2 * torch.log2((LY * LX) / (L * Y * X))

def kernelAligmentLoss(x, y) -> float:
        '''
            Alignment between two kernel matrices as a Python float:
            sum(x * y) / (norm(x) * norm(y)).
        '''
        inner = (x * y).sum()
        scale = x.norm() * y.norm()
        return (inner / scale).item()

In [56]:
# Kernel matrices over the whole evaluation batch, each divided by 100.
# NOTE(review): 100 is presumably the number of samples in the batch -- confirm.
Kx = TensorKernel.RBF(inputs, 8) / 100
Ky = TensorKernel.RBF(labels, 0.1) / 100
Kt_1 = TensorKernel.RBF(x_1, 15) / 100
Kt_2 = TensorKernel.RBF(x_2, 1.5) / 100
Kt = TensorKernel.RBF(softmax_output, 0.1) / 100

# Move every kernel to the GPU once instead of at each call site.
Kx_gpu = Kx.cuda()
Ky_gpu = Ky.cuda()
Kt_gpu = Kt.cuda()
Kt_1_gpu = Kt_1.cuda()
Kt_2_gpu = Kt_2.cuda()

# Marginal entropies.
entropy_kx = MatrixBasedRenyisEntropy.entropy(Kx_gpu)
entropy_ky = MatrixBasedRenyisEntropy.entropy(Ky_gpu)
entropy_kt = MatrixBasedRenyisEntropy.entropy(Kt_gpu)
entropy_kt_1 = MatrixBasedRenyisEntropy.entropy(Kt_1_gpu)
entropy_kt_2 = MatrixBasedRenyisEntropy.entropy(Kt_2_gpu)

# Joint entropies of every representation with the input and with the label.
joint_entropy_kxkt = MatrixBasedRenyisEntropy.jointEntropy(Kx_gpu, Kt_gpu)
joint_entropy_kxkt_1 = MatrixBasedRenyisEntropy.jointEntropy(Kx_gpu, Kt_1_gpu)
joint_entropy_kxkt_2 = MatrixBasedRenyisEntropy.jointEntropy(Kx_gpu, Kt_2_gpu)
joint_entropy_ktky = MatrixBasedRenyisEntropy.jointEntropy(Kt_gpu, Ky_gpu)
joint_entropy_kt_1ky = MatrixBasedRenyisEntropy.jointEntropy(Kt_1_gpu, Ky_gpu)
joint_entropy_kt_2ky = MatrixBasedRenyisEntropy.jointEntropy(Kt_2_gpu, Ky_gpu)

# For every representation T print I(X,T) followed by I(T,Y).
for title, h_t, h_xt, h_ty in (
    ("Mutual Information Kt_1", entropy_kt_1, joint_entropy_kxkt_1, joint_entropy_kt_1ky),
    ("Mutual Information Kt_2", entropy_kt_2, joint_entropy_kxkt_2, joint_entropy_kt_2ky),
    ("Mutual Information Kt", entropy_kt, joint_entropy_kxkt, joint_entropy_ktky),
):
    print(title)
    print(entropy_kx + h_t - h_xt)
    print(h_t + entropy_ky - h_ty)


Mutual Information Kt_1
tensor(5.4736, device='cuda:0', grad_fn=<SubBackward0>)
tensor(3.2971, device='cuda:0', grad_fn=<SubBackward0>)
Mutual Information Kt_2
tensor(5.3874, device='cuda:0', grad_fn=<SubBackward0>)
tensor(3.2959, device='cuda:0', grad_fn=<SubBackward0>)
Mutual Information Kt
tensor(3.4205, device='cuda:0', grad_fn=<SubBackward0>)
tensor(3.2223, device='cuda:0', grad_fn=<SubBackward0>)


In [57]:
# Same estimate as above, restricted to the first `length` samples.
length = 30
head = slice(0, length)
Kx = TensorKernel.RBF(inputs[head], 8) / length
Ky = TensorKernel.RBF(labels[head], 0.1) / length
Kt_1 = TensorKernel.RBF(x_1[head], 15.8) / length
Kt = TensorKernel.RBF(softmax_output[head], 0.6) / length

# Move every kernel to the GPU once instead of at each call site.
Kx_gpu = Kx.cuda()
Ky_gpu = Ky.cuda()
Kt_gpu = Kt.cuda()
Kt_1_gpu = Kt_1.cuda()

# The letters are progress markers printed before each entropy computation.
print("A")
entropy_kx = MatrixBasedRenyisEntropy.entropy(Kx_gpu)
print("B")
entropy_ky = MatrixBasedRenyisEntropy.entropy(Ky_gpu)
print("C")
entropy_kt = MatrixBasedRenyisEntropy.entropy(Kt_gpu)
print("D")
entropy_kt_1 = MatrixBasedRenyisEntropy.entropy(Kt_1_gpu)

joint_entropy_kxkt = MatrixBasedRenyisEntropy.jointEntropy(Kx_gpu, Kt_gpu)
joint_entropy_kxkt_1 = MatrixBasedRenyisEntropy.jointEntropy(Kx_gpu, Kt_1_gpu)
joint_entropy_ktky = MatrixBasedRenyisEntropy.jointEntropy(Kt_gpu, Ky_gpu)
joint_entropy_kt_1ky = MatrixBasedRenyisEntropy.jointEntropy(Kt_1_gpu, Ky_gpu)

# For every representation T print I(X,T) followed by I(T,Y).
for title, h_t, h_xt, h_ty in (
    ("Mutual Information Kt_1", entropy_kt_1, joint_entropy_kxkt_1, joint_entropy_kt_1ky),
    ("Mutual Information Kt", entropy_kt, joint_entropy_kxkt, joint_entropy_ktky),
):
    print(title)
    print(entropy_kx + h_t - h_xt)
    print(h_t + entropy_ky - h_ty)

A
B
C
D
Mutual Information Kt_1
tensor(4.3471, device='cuda:0', grad_fn=<SubBackward0>)
tensor(3.1829, device='cuda:0', grad_fn=<SubBackward0>)
Mutual Information Kt
tensor(3.0835, device='cuda:0', grad_fn=<SubBackward0>)
tensor(3.0981, device='cuda:0', grad_fn=<SubBackward0>)


In [92]:
# Pick the sigma whose RBF kernel of the soft-max output is best aligned with
# the label kernel, then report matrix ranks and mutual information for the
# first 30 samples.
# dim=1 is the class axis of the (batch, classes) logits; passing it explicitly
# avoids the deprecated implicit-dim behaviour of Softmax().
softmax = torch.nn.Softmax(dim=1)
softmax_output = softmax(output)

# torch.matrix_rank is deprecated in favour of torch.linalg.matrix_rank; use the
# latter when the installed PyTorch provides it.
matrix_rank = getattr(getattr(torch, "linalg", None), "matrix_rank", None) or torch.matrix_rank

with torch.no_grad():
    Ky = TensorKernel.RBF(labels[0:30], 0.1).cuda()
    # Kx is only needed by the alternative `kernel_loss` criterion below.
    Kx = TensorKernel.RBF(inputs[0:30], 8).cuda()
    Kts = [TensorKernel.RBF(softmax_output[0:30], sigma).cuda() for sigma in possible_sigma_values]
    loss = np.array([kernelAligmentLoss(k, Ky) for k in Kts])
    # loss = np.array( list( map(lambda k: kernel_loss(Kx, Ky, k), Kts) ) )
    # np.argmax returns the first maximum, so tied candidates no longer make
    # the lookup fail the way argwhere(...).item() did.
    best_sigma = possible_sigma_values[int(np.argmax(loss))]
    # print(loss)
    # print(possible_sigma_values)
    print(best_sigma)
    # self.sigma_tmp.append(sigma_values[ np.argwhere(loss == loss.max()).item(0) ])

    # Ay = MatrixBasedRenyisEntropy.tensorRBFMatrix(labels[0:30], 0.1)
    # Ax = MatrixBasedRenyisEntropy.tensorRBFMatrix(inputs[0:30], 8)
    # A = MatrixBasedRenyisEntropy.tensorRBFMatrix(softmax_output[0:30], best_sigma)

    Ay = TensorKernel.RBF(labels[0:30], 0.1) / 30
    Ax = TensorKernel.RBF(inputs[0:30], 8) / 30
    A = TensorKernel.RBF(softmax_output[0:30], best_sigma) / 30

    print(Ay.shape)
    print(matrix_rank(Ay))
    print(Ax.shape)
    print(matrix_rank(Ax))
    print(A.shape)
    print(matrix_rank(A))

    print(MatrixBasedRenyisEntropy.mutualInformation(Ax.cuda(), A.cuda()))
    print(MatrixBasedRenyisEntropy.mutualInformation(A.cuda(), Ay.cuda()))

0.1240960955619812
torch.Size([30, 30])
tensor(30)
torch.Size([30, 30])
tensor(30)
torch.Size([30, 30])
tensor(28, device='cuda:0')
tensor(3.1974, device='cuda:0')
tensor(3.1661, device='cuda:0')


In [47]:
def dist_mat(x, y=None):
    '''
        Pairwise Euclidean distance matrix of the rows of `x`, scaled so that
        the largest distance is 1.

        @param x: 2-D NumPy array or tensor, one sample per row
        @param y: unused; accepted so existing calls keep working
        @return (n, n) tensor of normalised distances; all zeros when every
                row is identical
    '''
    try:
        x = torch.from_numpy(x)
    except TypeError:
        # Not a NumPy array (e.g. already a tensor): use it as given.
        pass

    dist = torch.norm(x[:, None] - x, dim=2, p=2)
    max_dist = dist.max()
    if max_dist == 0:
        # Identical rows: dividing by the zero maximum would give NaN everywhere.
        return dist
    return dist / max_dist

In [None]:
# Sanity check: the hand-written normalised distance matrix should match
# torch.cdist once the latter is divided by its own maximum.
distance_1 = dist_mat(pre_output)
distance_2 = torch.cdist(pre_output, pre_output)
distance_2_scaled = distance_2 / distance_2.max()
print(torch.allclose(distance_1, distance_2_scaled))

In [None]:
# Same comparison with cdist's matrix-multiplication compute mode: show the
# first two rows of each normalised distance matrix.
raw_distance = torch.cdist(
    pre_output,
    pre_output,
    compute_mode='use_mm_for_euclid_dist',
)
distance = raw_distance / raw_distance.max()
print(distance[:2])
print(distance_1[:2])

In [None]:
# Sigma estimate: mean of the 50 smallest normalised distances of every row.
k = dist_mat(softmax_output)
print(k.shape)
nearest = torch.sort(k)[0][:, :50]  # sort once, keep the 50 smallest per row
print(nearest.shape)
sigma = nearest.mean()
print(sigma)

In [41]:
# Quick check that .item() on a one-element CUDA tensor returns a Python number.
Tensor([10]).cuda().item()

10.0