In [None]:
import torch
from torchvision import datasets
from torchvision import transforms
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap
# For presentation purposes only (not needed in Colab)
#plt.style.use('notebook.mplstyle')
# Keeps the kernel from dying in notebooks on Windows machines (not needed in Colab)
#import os
#os.environ['KMP_DUPLICATE_LIB_OK']='True'

# Custom colormap: entry 0 of "tab10" -> mid gray -> entry 1 of "tab10"
colors = plt.get_cmap("tab10")
gray = 0.5
# Every RGB channel is built the same way: a 128-step ramp from the first
# tab10 colour to gray, followed by a 127-step ramp from gray to the second
# tab10 colour (255 entries in total).
red, green, blue = (
    np.concatenate([
        np.linspace(colors(0)[channel], gray, 128),
        np.linspace(gray, colors(1)[channel], 127),
    ])
    for channel in range(3)
)
# One row per colormap entry, columns are (R, G, B)
rgb = np.column_stack([red, green, blue])
my_cmap = ListedColormap(rgb)

### Generate fake data to work with
We assume that the data lies on a 1-D manifold in a 2-D input space. In simpler terms, this means that we assume that the data can be approximated with a curve in the 2-D input space.

In [None]:
# Generate 2-D input data with a quadratic relationship between x1 and x2
n_data_points = 7
x_lim = [-1, 1]
# Seeded generator so that "Restart Kernel -> Run All" reproduces the same data
rng = np.random.default_rng(0)
x1 = np.linspace(x_lim[0], x_lim[1], n_data_points)
# Downward-opening parabola plus Gaussian noise with standard deviation 0.15
x2 = -x1**2 + 0.15*rng.standard_normal(x1.size)
# One row per data point, columns are (x1, x2)
X = np.vstack([x1, x2]).T

# Visualize what we have
fig, ax = plt.subplots(1, 1)
ax.plot(X[:, 0], X[:, 1], 'o', alpha=0.5)
ax.set(xlabel='$x_1$', ylabel='$x_2$');

### Define an autoencoder in PyTorch
We define a simple autoencoder with two feed-forward neural networks: one encoder network and one decoder network. We make both networks "symmetric", each with a single hidden layer containing equally many hidden neurons. There is no specific reason why the encoder and decoder would have to be symmetric, but people tend to use this as a default starting point for historical reasons.

In [None]:
# Creating a PyTorch AE class
class AE(torch.nn.Module):
    """Small fully connected autoencoder with a 1-D bottleneck.

    The encoder maps 2 inputs -> ``n_hidden_neurons`` -> 1 latent value and
    the decoder maps 1 latent value -> ``n_hidden_neurons`` -> 2 outputs.
    Each network is Linear -> Sigmoid -> Linear, i.e. the sigmoid is applied
    to the hidden layer only and the output layers are purely linear.

    Args:
        n_hidden_neurons: Width of the single hidden layer used in both the
            encoder and the decoder.
    """

    def __init__(self, n_hidden_neurons):
        super().__init__()
        self.n_hidden_neurons = n_hidden_neurons

        nn = torch.nn
        width = self.n_hidden_neurons

        # Encoder: 2 ==> n_hidden_neurons ==> 1
        self.encoder = nn.Sequential(
            nn.Linear(2, width),
            nn.Sigmoid(),
            nn.Linear(width, 1),
        )

        # Decoder: 1 ==> n_hidden_neurons ==> 2
        self.decoder = nn.Sequential(
            nn.Linear(1, width),
            nn.Sigmoid(),
            nn.Linear(width, 2),
        )

    def forward(self, x):
        """Encode ``x`` to the latent space and decode it back again."""
        return self.decoder(self.encoder(x))

### Train the network
1. In order to train the network we need to attach our data to a dataloader. The batch size determines how many data points we use to estimate the gradient before updating the model's parameters.
2. We use the mean squared error (MSE) as our loss function, as this is equivalent to minimizing the reconstruction error between inputs and outputs.
3. The optimizer takes care of updating the model's parameters. The simplest optimizer is stochastic gradient descent (SGD), which is essentially the same as normal gradient descent but with the gradient estimated from mini-batches (whose size is the batch size) instead of all available data. However, learning can often be accelerated using additional tricks (momentum and adaptive learning rates), and the easiest way to include these is to use the Adam optimizer instead of SGD.
4. The learning rate you will have to set yourself, based on trial and error.
5. Training duration is normally measured in "Epochs", where one epoch corresponds to using all training data once to update the model parameters. For example: if your batch size is 16 and you have 64 data points, then one epoch corresponds to 4 batches or parameter updates.

In [None]:
# Fix the PyTorch seed so that weight initialization and batch shuffling
# (and therefore the loss curve below) are reproducible across runs.
torch.manual_seed(0)

# Add the data to a DataLoader
# PyTorch wants all values as floats, hence the conversion below.
loader = torch.utils.data.DataLoader(dataset = torch.from_numpy(X).float(),
                                     batch_size = 16,
                                     shuffle = True)

# Use MSE as the loss function
loss_function = torch.nn.MSELoss()

# Model initialization
n_hidden_neurons = 50
model = AE(n_hidden_neurons)

# Using the Adam Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

# Train the network
epochs = 1500
mse_values = []
for epoch in range(epochs):
    for x_batch in loader:

        # Autoencoder output
        reconstructed = model(x_batch)

        # The target is the input itself: the loss is the reconstruction error
        loss = loss_function(reconstructed, x_batch)

        # Set old gradients to zero
        optimizer.zero_grad()
        # Compute the gradient
        loss.backward()
        # Perform the parameter update
        optimizer.step()

        # Store the MSE as a plain Python float for plotting; .item() also
        # works when the loss tensor does not live on the CPU.
        mse_values.append(loss.item())

# Visualize training progress (one value per parameter update)
fig, ax = plt.subplots(1, 1)
ax.plot(mse_values)
ax.set(xlabel='Iterations', ylabel='Loss (MSE)');

In [None]:
# Send our data through the trained model.
# We first convert our numpy array to a float torch tensor and convert the
# results back to numpy arrays; torch.no_grad() disables gradient tracking,
# which is not needed for inference.
X_tensor = torch.from_numpy(X).float()
with torch.no_grad():
    X_hat = model(X_tensor).numpy()
    # We can also just run the encoder network to see where each data point
    # lives in the 1-D latent space.
    latent_var = model.encoder(X_tensor).numpy()
    # Interpolate in the latent space between the smallest and largest encoded
    # value and decode the interpolated points back to the input space.
    latent_var_interp = np.linspace(latent_var.min(), latent_var.max(), 100)
    X_hat_interp = model.decoder(torch.from_numpy(latent_var_interp[:, np.newaxis]).float()).numpy()

# Visualize input -> latent space -> output conversion
# We color code each data point based on where it lives in the latent
# space so as to be able to follow how single data points pass through the autoencoder.
marker_size = 100
# NOTE(review): c_map is not used by any of the visible plotting code (my_cmap is used instead)
c_map = cm.viridis
fig, axs = plt.subplots(1, 3, figsize=[15, 4])

# Input
axs[0].scatter(X[:, 0], X[:, 1], marker_size, latent_var, cmap=my_cmap, alpha=0.5)
axs[0].set(xticks=[], yticks=[], xlabel='$x_1$', ylabel='$x_2$', title='Input')

# Latent space: the black line spans the range of the encoded data
axs[1].plot(latent_var_interp, np.zeros(latent_var_interp.size), 'k-')
axs[1].scatter(latent_var, np.zeros(latent_var.size), marker_size, latent_var, cmap=my_cmap, alpha=0.5)
axs[1].set(xticks=[], yticks=[], xlabel='Latent variable', title='Latent space')

# Output: dotted lines connect each input point to its reconstruction and the
# solid line is the decoded latent-space interpolation.
axs[2].plot(np.stack([X[:, 0], X_hat[:, 0]]), np.stack([X[:, 1], X_hat[:, 1]]), 'k:')
axs[2].scatter(X[:, 0], X[:, 1], marker_size, latent_var, cmap=my_cmap, alpha=0.5)
axs[2].plot(X_hat_interp[:, 0], X_hat_interp[:, 1], 'k-')
#axs[2].scatter(X_hat[:, 0], X_hat[:, 1], marker_size, latent_var, cmap=my_cmap, alpha=0.5)
axs[2].set(xticks=[], yticks=[], xlabel='$x_1$', ylabel='$x_2$', title='Output');

### Define a variational autoencoder

In [None]:
# Creating a PyTorch variational AE class
class VariationalEncoder(torch.nn.Module):
    """Encoder of a variational autoencoder with a 1-D latent space.

    Layout: 2 ==> n_hidden_neurons (sigmoid) ==> 1 + 1, i.e. the hidden layer
    feeds two separate linear heads, one producing ``mu`` and one producing
    ``log_sigma`` of a Gaussian over the latent variable.

    Args:
        n_hidden_neurons: Width of the hidden layer.

    Attributes set by ``forward``:
        kl: KL divergence between N(mu, sigma^2) and the standard normal
            prior N(0, 1), summed over every element of the last batch.
    """

    def __init__(self, n_hidden_neurons):
        super().__init__()

        self.n_hidden_neurons = n_hidden_neurons
        # Standard normal used to draw the noise for the latent sample
        self.N = torch.distributions.Normal(0, 1)
        self.linear1 = torch.nn.Linear(2, self.n_hidden_neurons)
        # Head producing mu
        self.linear2 = torch.nn.Linear(self.n_hidden_neurons, 1)
        # Head producing log(sigma)
        self.linear3 = torch.nn.Linear(self.n_hidden_neurons, 1)
        self.kl = 0

    def forward(self, x):
        """Encode ``x`` and draw one latent sample per input row.

        Returns:
            Tuple ``(z, mu, log_sigma)`` where ``z = mu + sigma * eps`` with
            ``eps`` drawn from N(0, 1). As a side effect ``self.kl`` is
            updated with the KL divergence of the current batch.
        """
        # Network that outputs mu and log_sigma
        hidden = torch.sigmoid(self.linear1(x))
        mu = self.linear2(hidden)
        log_sigma = self.linear3(hidden)
        sigma = torch.exp(log_sigma)
        # Sample from N(mu, sigma^2); the noise itself does not depend on the
        # parameters, so gradients flow through mu and sigma.
        z = mu + sigma*self.N.sample(mu.shape)
        # Closed-form KL(N(mu, sigma^2) || N(0, 1))
        #   = 0.5*(sigma^2 + mu^2 - 1) - log(sigma)
        # which is zero exactly when mu = 0 and sigma = 1.
        self.kl = (0.5*(sigma**2 + mu**2 - 1) - log_sigma).sum()
        return z, mu, log_sigma

class VariationalDecoder(torch.nn.Module):
    """Decoder network: 1 ==> n_hidden_neurons (sigmoid) ==> 2.

    The sigmoid is applied to the hidden layer only; the output layer is
    purely linear.

    Args:
        n_hidden_neurons: Width of the hidden layer.
    """

    def __init__(self, n_hidden_neurons):
        super().__init__()

        self.n_hidden_neurons = n_hidden_neurons
        self.linear1 = torch.nn.Linear(1, self.n_hidden_neurons)
        self.linear2 = torch.nn.Linear(self.n_hidden_neurons, 2)

    def forward(self, x):
        """Map latent values ``x`` (last dimension 1) to 2-D outputs."""
        hidden_activation = torch.sigmoid(self.linear1(x))
        return self.linear2(hidden_activation)

class VariationalAutoencoder(torch.nn.Module):
    """Variational autoencoder: a VariationalEncoder followed by a VariationalDecoder.

    Args:
        n_hidden_neurons: Hidden-layer width passed on to both sub-networks.
    """

    def __init__(self, n_hidden_neurons):
        super().__init__()
        self.encoder = VariationalEncoder(n_hidden_neurons)
        self.decoder = VariationalDecoder(n_hidden_neurons)

    def forward(self, x):
        """Decode the first element (the sampled ``z``) of the encoder output for ``x``."""
        # The encoder returns (z, mu, log_sigma); only the sample z is decoded.
        z, _mu, _log_sigma = self.encoder(x)
        return self.decoder(z)

In [None]:
# Fix the PyTorch seed so that weight initialization, batch shuffling and the
# latent-space sampling noise are reproducible across runs.
torch.manual_seed(0)

# Model initialization
# alpha weights the KL term relative to the reconstruction error
alpha = 0.02
n_hidden_neurons = 50
model = VariationalAutoencoder(n_hidden_neurons)

# Using the Adam Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

# Train the network (reuses the DataLoader `loader` created for the plain autoencoder)
epochs = 1500
# NOTE: despite its name, this list holds the total loss
# (squared error + alpha*KL) of every parameter update.
mse_values = []
for epoch in range(epochs):
    for x_batch in loader:

        # Autoencoder output
        reconstructed = model(x_batch)

        # Calculating the loss function: summed squared error + weighted KL
        # divergence (model.encoder.kl is updated by the forward pass above)
        reconstruction_error = ((x_batch - reconstructed)**2).sum()
        loss = reconstruction_error + alpha*model.encoder.kl

        # Set old gradients to zero
        optimizer.zero_grad()
        # Compute the gradient
        loss.backward()
        # Perform the parameter update
        optimizer.step()

        # Store the total loss as a plain Python float for plotting
        mse_values.append(loss.item())

# Visualize training progress (one value per parameter update)
fig, ax = plt.subplots(1, 1)
ax.plot(mse_values)
ax.set(xlabel='Iterations', ylabel='Loss (squared error + alpha*KL)');

In [None]:
# Send our data through the encoder and the z_mu values through the decoder.
# torch.no_grad() disables gradient tracking, which is not needed for inference.
X_tensor = torch.from_numpy(X).float()
with torch.no_grad():
    z, z_mu, log_sigma = model.encoder(X_tensor)
    X_hat = model.decoder(z_mu).numpy()

# Convert the tensors back to numpy arrays
z = z.numpy()
z_mu = z_mu.numpy()
sigma = np.exp(log_sigma.numpy())

# Interpolate in the latent space between the smallest and largest z_mu and
# decode the interpolated points back to the input space.
latent_var_interp = np.linspace(z_mu.min(), z_mu.max(), 200)
with torch.no_grad():
    X_hat_interp = model.decoder(torch.from_numpy(latent_var_interp[:, np.newaxis]).float()).numpy()

# Visualize input -> latent space -> output conversion
# We color code each data point based on where it lives in the latent
# space so as to be able to follow how single data points pass through the autoencoder.
# The colors come from this model's own latent means (z_mu), not from the
# latent variable of a previously trained autoencoder.
point_colors = z_mu.ravel()
fig, axs = plt.subplots(1, 3, figsize=[15, 4])

# Input
axs[0].scatter(X[:, 0], X[:, 1], marker_size, point_colors, cmap=my_cmap, alpha=0.5)
axs[0].set(xticks=[], yticks=[], xlabel='$x_1$', ylabel='$x_2$', title='Input')

# Latent space: one (unnormalized, peak value 1) Gaussian bump per data point,
# centred at its z_mu with width sigma.
for i in range(X.shape[0]):
    axs[1].plot(latent_var_interp, np.exp(-(latent_var_interp-z_mu[i])**2 / 2 / sigma[i]**2), '-', color='gray')
axs[1].plot(latent_var_interp, np.zeros(latent_var_interp.size), 'k-')
axs[1].scatter(z_mu, np.zeros(z_mu.size), marker_size, point_colors, cmap=my_cmap, alpha=0.5)
axs[1].set(xticks=[], yticks=[], xlabel='Latent variable, z', title='Latent space')

# Output: dotted lines connect each input point to its reconstruction and the
# solid line is the decoded latent-space interpolation.
axs[2].plot(np.stack([X[:, 0], X_hat[:, 0]]), np.stack([X[:, 1], X_hat[:, 1]]), 'k:')
axs[2].scatter(X[:, 0], X[:, 1], marker_size, point_colors, cmap=my_cmap, alpha=0.5)
axs[2].plot(X_hat_interp[:, 0], X_hat_interp[:, 1], 'k-')
#axs[2].scatter(X_hat[:, 0], X_hat[:, 1], marker_size, point_colors, cmap=my_cmap, alpha=0.5)
axs[2].set(xticks=[], yticks=[], xlabel='$x_1$', ylabel='$x_2$', title='Output');