In [None]:
import time
import torch
import random
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.linalg as la
import torch.functional as F
import matplotlib.pyplot as plt

from tqdm import tqdm
from torch.autograd import Variable
from IPython.display import clear_output

# Seed every RNG the notebook draws from so that Restart & Run All reproduces the
# same figures: NumPy drives the synthetic noise and the accept/reject draws,
# torch drives the model initialisation and the sampled HMC velocities.
SEED = 10
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Ground-truth line y = slope * x + bias around which the noisy samples are drawn.
bias = 4
slope = 3

nr_points = 20

# Gaussian noise settings. NOTE: `variance_noise` is handed to np.random.normal as
# `scale`, which NumPy interprets as the standard deviation, not the variance.
mean_noise = 0.0
variance_noise = 20.0


# Inputs: 0, 1, ..., nr_points - 1 stored as a float32 column vector.
x_values = list(range(nr_points))
x_train = np.array(x_values, dtype=np.float32).reshape(-1, 1)

# Targets: one independent noise draw per point, stacked into a float32 column vector.
y_values = [
    slope * k + bias + np.random.normal(mean_noise, variance_noise, 1)
    for k in range(nr_points)
]
y_train = np.array(y_values, dtype=np.float32).reshape(-1, 1)

In [None]:
# Defining a Linear Regressor in Pytorch
class LinearRegression(torch.nn.Module):
    """A single ``torch.nn.Linear`` layer wrapped as a module."""

    def __init__(self, input_size, output_size):
        """Create the layer mapping ``input_size`` features to ``output_size`` outputs."""
        super().__init__()
        # The only learnable parameters (weight and bias) live in this layer.
        self.linear = torch.nn.Linear(input_size, output_size)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)

In [None]:
input_dim = 1   # number of input features ('x')
output_dim = 1  # number of regression targets ('y')
learning_rate = 0.00001
epochs = 1000

model = LinearRegression(input_dim, output_dim)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# The training data never changes, so convert it to tensors once rather than on
# every epoch. `Variable` is a deprecated no-op wrapper; plain tensors suffice.
# `inputs` and `labels` are also reused by later cells.
inputs = torch.from_numpy(x_train)
labels = torch.from_numpy(y_train)

for epoch in range(epochs):
    # Full-batch gradient descent step on the mean squared error.
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # Redraw the progress line in place instead of flooding the output area.
    clear_output(True)
    print('epoch {}, loss {}'.format(epoch, loss.item()))
    

In [None]:
# Evaluate the fitted line on the training inputs without recording gradients.
with torch.no_grad():
    train_tensor = Variable(torch.from_numpy(x_train))
    predicted = model(train_tensor).data.numpy()

# Overlay the model's predictions on the noisy observations.
plt.clf()
plt.plot(x_train, y_train, 'go', label='True data', alpha=0.5)
plt.plot(x_train, predicted, '--', label='Predictions', alpha=0.5)
plt.legend(loc='best')
plt.show()


### Markov Chain Monte-Carlo using Hamiltonian Dynamics

###### The Metropolis-Hastings algorithm can get stuck in one of the modes of the distribution and fail to explore the remaining modes.
###### One way to circumvent this is to heuristically adjust the step size through the spread of the proposal distribution. However, too large a step size leads to a large number of rejections, while too small a step size makes the exploration too slow.
###### In high-dimensional spaces the exploration behaves almost like a random walk, so it is sub-optimal.

To mitigate these drawbacks, Hamiltonian Monte Carlo (HMC) utilizes the target distribution and the laws of dynamics from classical mechanics to design an adaptive step size for the proposed samples.

The target distribution $p(\omega)$ is then modeled using the Gibbs canonical distribution from statistical mechanics as
$$p(\omega)\propto e^{\frac{-U(\omega)}{T}} $$ where $T$ is the temperature and $U(\omega)$ is the energy of the particle at state $\omega$ (the state is also written $z$ below).

Apart from the potential energy $U(\omega)$, this method introduces an additional auxiliary component, the kinetic energy $K(v)$, which depends on the speed $v$ as an auxiliary variable.

Eventually the total mechanical energy is:
$$E(z,v)=U(\omega)+K(v),s.t: K(v)=\sum_{i}\frac{v_{i}^2}{2}$$

The state distribution of the particles is then dependent on the total energy as:

$$p(z,v)\propto e^{\frac{-E(z,v)}{T}}=e^{\frac{-U(\omega)}{T}}e^{\frac{-K(v)}{T}}\propto p(\omega)p(v)$$

#### The physical dynamics of the target distribution through the Hamiltonian

In order to sample multiple different positions of the samples inside the energy well defined from E(z,v) we utilize these two physics equations:
$$\frac{\partial \omega(t)}{\partial t}=\frac{\partial E(z,v)}{\partial v}=\frac{\partial K(v)}{\partial v}$$
$$m\frac{\partial v(t)}{\partial t}=-\frac{\partial E(z,v)}{\partial \omega}=-\frac{\partial U(\omega)}{\partial \omega} $$

Since the energy of the closed system is preserved $E(z,v)=E_{0}$ it is possible to get different samples inside this target distribution while simulating particles whose statistical trajectory is guided by the two equations above.

Sampling the speed $v$ is quite simple, as it follows a (multivariate) normal distribution:

$$p(v)\propto e^{\frac{-K(v)}{T}}=e^{\frac{-\sum_{i}mv_{i}^{2}}{2T}}=e^{\frac{-mV^{T}V}{2T}}$$

###### In a nutshell

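Start the sample moving with a random speed drawn from the normal distribution, let it follow the Hamiltonian dynamics for a fixed amount of time, and stop it; the position where it stops is the proposed sample.
Continue this procedure until a sufficient number of samples has been accumulated.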

However, these differential equations generally cannot be solved analytically, and their numerical solution does not ensure the exact preservation of the energy $E(z,v)$.

To mitigate this problem, a Metropolis-Hastings accept/reject step is employed to compensate for the difference in energy between the start and the stop position of the particle.

Leapfrog integration offers a numerical integration scheme that is reversible in time.
This reversibility ensures detailed balance.

###### Physical analogy

The trajectory of the particle that roams inside the energy well defined by the target distribution is equivalent to a classical harmonic oscillator without any damping (conservation of energy).
This is governed by a second-order ordinary differential equation (ODE) $z^{''}+z=0$.
To simplify the solution, this is converted into two first-order ODEs by introducing $z^{'}=v$ (so that $v^{'}=-z$).

In our case:

$$U(\omega)=\frac{(y-f_{\omega}(x))^{T}(y-f_{\omega}(x))}{2\sigma^{2}}$$
$$K(v)=\frac{V^{T}V}{2}$$

$$\frac{\partial \omega(t)}{\partial t}=\frac{\partial K(v)}{\partial V}=V$$
$$\frac{\partial v(t)}{\partial t}=-\frac{\partial U(\omega)}{\partial \omega}=\frac{1}{\sigma^{2}}\left(\frac{\partial f_{\omega}(x)}{\partial \omega}\right)^{T}(y-f_{\omega}(x))$$


###### Euler solution to ODE
As a result, the Euler updates for these ODEs are:

$$\frac{\omega(t+\Delta t)-\omega(t)}{\Delta t}=\frac{\partial K(v)}{\partial v} \to \omega(t+\Delta t)=\omega(t)+\Delta t \frac{\partial K(v)}{\partial v}$$
$$\frac{v(t+\Delta t)-v(t)}{\Delta t}=-\frac{\partial U(\omega)}{\partial \omega} \to v(t+\Delta t)=v(t) -\Delta t*\frac{\partial U(\omega)}{\partial \omega} $$

###### Leapfrog solution to ODE

Instead of performing the updates simultaneously, the leapfrog method splits them across the variables.
It makes one half-step for the first variable (the velocity).
It then makes a full step for the second variable (the position) using the updated first variable.
Finally, it takes one more half-step for the first variable using the updated second variable.

Leapfrog integration:

$$v(t+\frac{\Delta t}{2})=v(t) -\frac{\Delta t}{2}\frac{\partial U(\omega)}{\partial \omega}\Big|_{\omega(t)}$$
$$\omega(t+\Delta t)=\omega(t)+\Delta t \frac{\partial K(v)}{\partial v}\Big|_{v(t+\frac{\Delta t}{2})}$$
$$v(t+\Delta t)=v(t+\frac{\Delta t}{2}) -\frac{\Delta t}{2}\frac{\partial U(\omega)}{\partial \omega}\Big|_{\omega(t+\Delta t)}$$


In [None]:
dim = 50
boundary = 10
# Grid of candidate weights (x) and biases (y) on which the density is evaluated.
x = np.linspace(-boundary, boundary, dim)
y = np.linspace(-boundary, boundary, dim)
Z = np.zeros((dim, dim))

# NOTE: this sweep overwrites the parameters of `model`, so the values fitted by
# the training cell are gone afterwards.
# Gradients are not needed here: no_grad avoids building a graph per grid point,
# and .item() stores plain Python floats instead of grad-tracking tensors.
with torch.no_grad():
    for i, w in enumerate(x):
        for j, b in enumerate(y):
            model.linear.weight.data = torch.tensor(w).float().reshape(1, 1)
            model.linear.bias.data = torch.tensor(b).float().reshape(1, 1)
            outputs = model(inputs)
            # Unnormalised density exp(-loss / 200); 200 acts as the temperature.
            Z[i, j] = torch.exp(-criterion(outputs, labels) / 200).item()


X1, X2 = np.meshgrid(x, y)

plt.figure()
# Z is indexed [weight, bias] while meshgrid arrays are [bias, weight], hence Z.T.
plt.contour(X1, X2, Z.T, colors='black');
# Red lines mark the slope and bias used to generate the data.
plt.axhline(y=bias, color='r', linestyle='-')
plt.axvline(x=slope, color='r', linestyle='-')
plt.title('Energy landscape')
plt.xlabel('Weight')
plt.ylabel('Bias')
plt.show()

In [None]:
def dUdw(labels, inputs, w, b, net=None, loss_fn=None):
    """Return the gradient of the loss with respect to weight and bias at (w, b).

    The parameters of the network are overwritten with copies of ``w`` and ``b``,
    the loss is evaluated on ``inputs``/``labels`` and back-propagated.

    Args:
        labels: target tensor passed to the loss function.
        inputs: input tensor passed to the network.
        w: tensor to load as ``net.linear.weight``.
        b: tensor to load as ``net.linear.bias``.
        net: module exposing a ``linear`` layer; defaults to the notebook-level ``model``.
        loss_fn: loss callable; defaults to the notebook-level ``criterion``.

    Returns:
        Tuple ``(grad_w, grad_b)`` of detached copies of the parameter gradients.
    """
    net = model if net is None else net
    loss_fn = criterion if loss_fn is None else loss_fn

    # Load copies: assigning `w`/`b` directly would make the caller's tensors the
    # parameter storage, so a later in-place update of `w` would silently move
    # the model as well.
    net.linear.weight.data = w.detach().clone()
    net.linear.bias.data = b.detach().clone()

    # Clear stale gradients on the module itself; no optimizer is needed for that.
    net.zero_grad()
    loss = loss_fn(net(inputs), labels)
    loss.backward()

    # Return snapshots so a later zero_grad()/backward() cannot change the values
    # the caller is holding.
    return net.linear.weight.grad.detach().clone(), net.linear.bias.grad.detach().clone()
def leapFrog(w_start, b_start, v_w, v_b, DEBUG=True, step_size=None, n_steps=None):
    """Integrate the Hamiltonian dynamics of (weight, bias) with the leapfrog scheme.

    The scheme is: half step for the velocity, then alternating full steps for
    position and velocity, closed by a half step for the velocity. The force is
    the negative gradient returned by ``dUdw``; the kinetic energy is taken with
    unit mass, so the position moves by ``step * velocity``.

    The arguments are never modified; the function works on private copies.
    ``labels``, ``inputs`` (and, when ``DEBUG`` is true, ``model``, ``X1``,
    ``X2``, ``Z``, ``x_train`` and ``y_train``) are read from the notebook globals.

    Args:
        w_start: single-element tensor with the starting weight.
        b_start: single-element tensor with the starting bias.
        v_w: starting velocity for the weight (scalar tensor).
        v_b: starting velocity for the bias (scalar tensor).
        DEBUG: when true, redraw the trajectory and current fit after every step.
        step_size: integration step; defaults to the global ``integration_steps``.
        n_steps: number of position updates; defaults to the global
            ``nr_integration_steps``.

    Returns:
        Tuple ``(w, b, v_w, v_b)`` with the end position and end velocity.
    """
    eps = integration_steps if step_size is None else step_size
    steps = nr_integration_steps if n_steps is None else n_steps

    # Private copies: in-place updates on the arguments would overwrite the
    # caller's current state, which must survive when a proposal is rejected.
    w = w_start.detach().clone()
    b = b_start.detach().clone()
    v_w = torch.as_tensor(v_w).detach().clone()
    v_b = torch.as_tensor(v_b).detach().clone()

    if steps <= 0:
        return w, b, v_w, v_b

    # Histories kept as plain floats; they are only used for the DEBUG plots.
    w_trajectory = [w.item()]
    b_trajectory = [b.item()]
    w_gradient = []
    b_gradient = []
    v_w_speed = []
    v_b_speed = []

    # Gradient of the potential at the starting position.
    d_w, d_b = dUdw(labels, inputs, w, b)
    w_gradient.append(d_w.item())
    b_gradient.append(d_b.item())

    # Opening half step for the velocity.
    v_w = v_w - 0.5 * eps * d_w.squeeze()
    v_b = v_b - 0.5 * eps * d_b.squeeze()
    v_w_speed.append(v_w.item())
    v_b_speed.append(v_b.item())

    for j in range(steps):
        # Full step for the position using the mid-step velocity.
        w = w + eps * v_w
        b = b + eps * v_b
        w_trajectory.append(w.item())
        b_trajectory.append(b.item())

        # Gradient of the potential at the new position.
        d_w, d_b = dUdw(labels, inputs, w, b)
        w_gradient.append(d_w.item())
        b_gradient.append(d_b.item())

        # Full velocity step between position updates (two merged half steps);
        # only the very last update is the closing half step.
        kick = 0.5 * eps if j == steps - 1 else eps
        v_w = v_w - kick * d_w.squeeze()
        v_b = v_b - kick * d_b.squeeze()
        v_w_speed.append(v_w.item())
        v_b_speed.append(v_b.item())

        if DEBUG:
            clear_output(True)
            plt.subplot(211)
            plt.scatter(w_trajectory, b_trajectory)
            plt.contour(X1, X2, Z.T, colors='black');
            # Red arrow: gradient at the current position; blue arrow: velocity.
            plt.quiver(w_trajectory[-1], b_trajectory[-1], w_gradient[-1], b_gradient[-1], color='r')
            plt.quiver(w_trajectory[-1], b_trajectory[-1], v_w_speed[-1], v_b_speed[-1], color='b')
            plt.xlabel('Weight w')
            plt.ylabel('Bias b')

            # dUdw has loaded the current position into the model.
            with torch.no_grad():
                predicted = model(inputs).data.numpy()

            plt.subplot(212)
            plt.plot(x_train, y_train, 'go', label='True data', alpha=0.5)
            plt.plot(x_train, predicted, '--', label='Predictions', alpha=0.5)
            plt.legend(loc='best')
            plt.show()

    return w, b, v_w, v_b

def energyComputation(labels, inputs, w, b, net=None, loss_fn=None):
    """Return the NEGATED loss of the network evaluated at position (w, b).

    The network's weight and bias are overwritten with float32 ``(1, 1)`` copies
    of ``w`` and ``b``. Note the sign: the result is ``-loss``, so a caller that
    needs the potential energy ``U = loss`` has to negate it.

    Args:
        labels: target tensor passed to the loss function.
        inputs: input tensor passed to the network.
        w: single weight value (tensor or number).
        b: single bias value (tensor or number).
        net: module exposing a ``linear`` layer; defaults to the notebook-level ``model``.
        loss_fn: loss callable; defaults to the notebook-level ``criterion``.

    Returns:
        Scalar tensor holding ``-loss_fn(net(inputs), labels)``, detached from autograd.
    """
    net = model if net is None else net
    loss_fn = criterion if loss_fn is None else loss_fn

    # as_tensor + detach + clone copies tensors without the copy-construct warning
    # that torch.tensor(tensor) emits, and still accepts plain numbers.
    net.linear.weight.data = torch.as_tensor(w).detach().clone().float().reshape(1, 1)
    net.linear.bias.data = torch.as_tensor(b).detach().clone().float().reshape(1, 1)

    # Only the value is needed, so skip building an autograd graph.
    with torch.no_grad():
        return -loss_fn(net(inputs), labels)

In [None]:
nr_iterations = 100
integration_steps = 0.01     # leapfrog step size (global read by leapFrog)
nr_integration_steps = 100   # leapfrog steps per proposal (global read by leapFrog)

# Starting position for the particle: the origin, shaped like the model parameters.
w_start = torch.zeros_like(model.linear.weight.data)
b_start = torch.zeros_like(model.linear.bias.data)

inputs = torch.from_numpy(x_train)
labels = torch.from_numpy(y_train)

w_rejected = []
w_accepted = []

b_rejected = []
b_accepted = []

for i in tqdm(range(nr_iterations + 1)):

    # Draw a random velocity for each coordinate.
    v_w_start = torch.normal(mean=torch.tensor(0.), std=torch.tensor(1.))
    v_b_start = torch.normal(mean=torch.tensor(0.), std=torch.tensor(1.))

    # Total energy H = U + K. energyComputation returns the negated loss, so the
    # potential is its negative; the kinetic energy is v^2 / 2 per coordinate.
    energy_start = (-energyComputation(labels, inputs, w_start, b_start)
                    + 0.5 * (v_w_start ** 2 + v_b_start ** 2))

    # Hand leapFrog copies so the current state and the sampled velocities
    # survive the call unchanged.
    w_stop, b_stop, v_w_stop, v_b_stop = leapFrog(
        w_start.clone(), b_start.clone(), v_w_start.clone(), v_b_start.clone(), False)

    energy_stop = (-energyComputation(labels, inputs, w_stop, b_stop)
                   + 0.5 * (v_w_stop ** 2 + v_b_stop ** 2))

    # Acceptance ratio exp(H_start - H_stop). A NaN (diverged trajectory) compares
    # false below and is therefore rejected.
    a = float(torch.exp(energy_start - energy_stop))

    # Metropolis-Hastings accept/reject
    r = np.random.rand()
    if r < a:
        w_accepted.append(w_stop.item())
        b_accepted.append(b_stop.item())
        # The chain moves only when the proposal is accepted.
        w_start, b_start = w_stop, b_stop
    else:
        w_rejected.append(w_stop.item())
        b_rejected.append(b_stop.item())

    clear_output(True)

    # Top and middle panels: accepted and rejected proposals over the landscape.
    for position, heading, w_points, b_points in (
        (311, "Accepted Points \n Iteration ", w_accepted, b_accepted),
        (312, "Rejected Points \n Iteration ", w_rejected, b_rejected),
    ):
        plt.subplot(position)
        plt.contour(X1, X2, Z.T, colors='black');
        # A fixed colour is used, so no colormap is passed.
        plt.scatter(w_points, b_points, c='r')
        plt.title(heading + str(i) + "/" + str(nr_iterations))
        plt.xlabel('Weights')
        plt.ylabel('Bias')
        plt.xlim([0, boundary])
        plt.ylim([-boundary, boundary])

    # Bottom panel: one regression line per accepted sample.
    plt.subplot(313)
    plt.plot(x_train, y_train, 'go', label='True data', alpha=0.5)
    for w, b in zip(w_accepted, b_accepted):
        model.linear.weight.data = torch.tensor(w).float().reshape(1, 1)
        model.linear.bias.data = torch.tensor(b).float().reshape(1, 1)

        with torch.no_grad():
            predicted = model(inputs).data.numpy()

        plt.plot(x_train, predicted, '--', alpha=0.5)
    # Label the most recent line only when at least one sample exists; otherwise
    # `predicted` would be a stale array from an earlier cell.
    if w_accepted:
        plt.plot(x_train, predicted, '--', label='Predictions', alpha=0.5)
    plt.show()
    # Pause so each frame of the animation stays visible.
    time.sleep(3)

# Seatbelts prediction

In [None]:
# Load the Seatbelts table; the path is relative to the notebook's working directory.
data = pd.read_csv('Seatbelts.csv')
# Preview the first rows with the rich DataFrame display instead of printing
# the whole frame as text.
data.head()

In [None]:
# Positional column 6 is used as the regressor and column 1 as the target,
# both reshaped into column vectors.
data_ = data.to_numpy()
x_train = data_[:, 6].reshape((-1, 1))
y_train = data_[:, 1].reshape((-1, 1))

# Plot the selected series themselves, not `x`/`y`, which are the weight/bias
# grids left over from the energy-landscape cell.
plt.plot(y_train, label='y_train (column 1)')
plt.plot(x_train, label='x_train (column 6)')
plt.legend(loc='best')
plt.show()
print(data.shape)

In [None]:
# Defining a Linear Regressor in Pytorch
class LinearRegression(torch.nn.Module):
    """A single ``torch.nn.Linear`` layer wrapped as a module.

    This re-declares the class defined earlier in the notebook with the same body.
    """

    def __init__(self, input_size, output_size):
        """Create the layer mapping ``input_size`` features to ``output_size`` outputs."""
        super().__init__()
        # The only learnable parameters (weight and bias) live in this layer.
        self.linear = torch.nn.Linear(input_size, output_size)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)

In [None]:
input_dim = 1   # number of input features ('x')
output_dim = 1  # number of regression targets ('y')
learning_rate = 0.001
epochs = 10000

model = LinearRegression(input_dim, output_dim)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# The training data never changes, so convert it to float32 tensors once rather
# than on every epoch. `Variable` is a deprecated no-op wrapper; plain tensors
# suffice. `inputs` and `labels` are also reused by later cells.
inputs = torch.from_numpy(x_train).float()
labels = torch.from_numpy(y_train).float()

for epoch in range(epochs):
    # Full-batch gradient descent step on the mean squared error.
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # Redraw the progress line in place instead of flooding the output area.
    clear_output(True)
    print('epoch {}, loss {}'.format(epoch, loss.item()))
    

In [None]:
# Evaluate the fitted line on the training inputs without recording gradients.
with torch.no_grad():
    train_tensor = Variable(torch.from_numpy(x_train).float())
    predicted = model(train_tensor).data.numpy()

# Overlay the model's predictions on the observations.
plt.clf()
plt.plot(x_train, y_train, 'go', label='True data', alpha=0.5)
plt.plot(x_train, predicted, '--', label='Predictions', alpha=0.5)
plt.legend(loc='best')
plt.show()


In [None]:
dim = 50
boundary = 200
# Grid of candidate weights (x) and biases (y) on which the density is evaluated.
x = np.linspace(-8 * boundary, boundary, dim)
y = np.linspace(-0, 2 * boundary, dim)
Z = np.zeros((dim, dim))

# NOTE: this sweep overwrites the parameters of `model`, so the values fitted by
# the training cell are gone afterwards.
# Gradients are not needed here: no_grad avoids building a graph per grid point,
# and .item() stores plain Python floats instead of grad-tracking tensors.
with torch.no_grad():
    for i, w in enumerate(x):
        for j, b in enumerate(y):
            model.linear.weight.data = torch.tensor(w).float().reshape(1, 1)
            model.linear.bias.data = torch.tensor(b).float().reshape(1, 1)
            outputs = model(inputs)
            # Unnormalised density exp(-loss / 1000); 1000 acts as the temperature.
            Z[i, j] = torch.exp(-criterion(outputs, labels) / 1000).item()


X1, X2 = np.meshgrid(x, y)

plt.figure()
# Z is indexed [weight, bias] while meshgrid arrays are [bias, weight], hence Z.T.
plt.contour(X1, X2, Z.T, colors='black');
plt.title('Energy landscape')
plt.xlabel('Weight')
plt.ylabel('Bias')
plt.show()

In [None]:
nr_iterations = 100
integration_steps = 0.01     # leapfrog step size (global read by leapFrog)
nr_integration_steps = 100   # leapfrog steps per proposal (global read by leapFrog)

# Starting position for the particle: the origin, shaped like the model parameters.
w_start = torch.zeros_like(model.linear.weight.data)
b_start = torch.zeros_like(model.linear.bias.data)

inputs = torch.from_numpy(x_train).float()
labels = torch.from_numpy(y_train).float()

w_rejected = []
w_accepted = []

b_rejected = []
b_accepted = []

for i in tqdm(range(nr_iterations + 1)):

    # Draw a random velocity for each coordinate.
    # NOTE(review): std=10 does not match the unit-mass kinetic energy v^2 / 2
    # used below, which corresponds to std=1 - confirm the intended mass.
    v_w_start = torch.normal(mean=torch.tensor(0.), std=torch.tensor(10.))
    v_b_start = torch.normal(mean=torch.tensor(0.), std=torch.tensor(10.))

    # Total energy H = U + K. energyComputation returns the negated loss, so the
    # potential is its negative; the kinetic energy is v^2 / 2 per coordinate.
    energy_start = (-energyComputation(labels, inputs, w_start, b_start)
                    + 0.5 * (v_w_start ** 2 + v_b_start ** 2))

    # Hand leapFrog copies so the current state and the sampled velocities
    # survive the call unchanged.
    w_stop, b_stop, v_w_stop, v_b_stop = leapFrog(
        w_start.clone(), b_start.clone(), v_w_start.clone(), v_b_start.clone(), False)

    energy_stop = (-energyComputation(labels, inputs, w_stop, b_stop)
                   + 0.5 * (v_w_stop ** 2 + v_b_stop ** 2))

    # Acceptance ratio exp(H_start - H_stop). A NaN (diverged trajectory) compares
    # false below and is therefore rejected.
    a = float(torch.exp(energy_start - energy_stop))

    # Metropolis-Hastings accept/reject
    r = np.random.rand()
    if r < a:
        w_accepted.append(w_stop.item())
        b_accepted.append(b_stop.item())
        # The chain moves only when the proposal is accepted.
        w_start, b_start = w_stop, b_stop
    else:
        w_rejected.append(w_stop.item())
        b_rejected.append(b_stop.item())

    clear_output(True)

    # Left and middle panels: accepted and rejected proposals over the landscape.
    for position, heading, w_points, b_points in (
        (131, "Accepted Points \n Iteration ", w_accepted, b_accepted),
        (132, "Rejected Points \n Iteration ", w_rejected, b_rejected),
    ):
        plt.subplot(position)
        plt.contour(X1, X2, Z.T, colors='black');
        # A fixed colour is used, so no colormap is passed.
        plt.scatter(w_points, b_points, s=100, c='r')
        plt.title(heading + str(i) + "/" + str(nr_iterations))
        plt.xlabel('Weights')
        plt.ylabel('Bias')

    # Right panel: one regression line per accepted sample.
    plt.subplot(133)
    plt.plot(x_train, y_train, 'go', label='True data', alpha=0.5)
    for w, b in zip(w_accepted, b_accepted):
        model.linear.weight.data = torch.tensor(w).float().reshape(1, 1)
        model.linear.bias.data = torch.tensor(b).float().reshape(1, 1)

        with torch.no_grad():
            predicted = model(inputs).data.numpy()

        plt.plot(x_train, predicted, '--', alpha=0.5)
    # Label the most recent line only when at least one sample exists; otherwise
    # `predicted` would be a stale array from an earlier cell.
    if w_accepted:
        plt.plot(x_train, predicted, '--', label='Predictions', alpha=0.5)
    plt.show()
    # Pause so each frame of the animation stays visible.
    time.sleep(3)