## Starter notebook for MLP ansatz for Helium

1. Energy - using Hessian,
2. Gradients - using known formula (update manually),
3. Optimization - ADAM.

We start with a non-symmetric ansatz without a Jastrow factor, and add complexity gradually.

In [91]:
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from torch import vmap

In [62]:
class MLP(nn.Module):
    """Fully connected tanh network.

    The layer stack is Linear(input_dim, hidden_dim) + Tanh, followed by
    ``n_hidden_layers - 1`` further Linear(hidden_dim, hidden_dim) + Tanh
    pairs, and a final Linear(hidden_dim, output_size) with no activation.
    """

    def __init__(self, input_dim, n_hidden_layers, hidden_dim, output_size):
        super().__init__()

        # First block lifts the input to the hidden width.
        modules = [nn.Linear(input_dim, hidden_dim), nn.Tanh()]

        # Remaining hidden blocks keep the width fixed.
        for _ in range(n_hidden_layers - 1):
            modules += [nn.Linear(hidden_dim, hidden_dim), nn.Tanh()]

        # Linear read-out; the raw value is returned without an activation.
        modules.append(nn.Linear(hidden_dim, output_size))

        self.network = nn.Sequential(*modules)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return its output."""
        return self.network(x)

In [63]:

# Hyperparameters of the wavefunction network.
input_dim = 6        # two 3-component position vectors, concatenated
n_hidden_layers = 2  # number of Linear + Tanh blocks
hidden_dim = 32      # width of every hidden layer
output_size = 1      # one scalar output per input row

network = MLP(input_dim, n_hidden_layers, hidden_dim, output_size)

In [80]:
def metropolis(N: int, n_runs: int, model: nn.Module, burn_in: int = 500):
    """Sample two-electron configurations from psi**2 with a vectorized Metropolis walk.

    ``n_runs`` independent walkers are advanced in lock-step for ``N`` steps.
    Each walker holds two 3-vectors (r1, r2), initialised uniformly in
    [-L, L)^3 with L = 1.  At every step exactly one of the two vectors is
    displaced by a uniform offset in [-L/2, L/2)^3, and the move is kept
    when ``(psi_trial / psi) ** 2`` exceeds a uniform random number.

    Args:
        N: Total number of Metropolis steps.
        n_runs: Number of walkers advanced in parallel.
        model: Callable mapping an (n_runs, 6) tensor of concatenated
            (r1, r2) positions to one wavefunction value per walker.
        burn_in: Configurations are recorded only for step indices strictly
            greater than this value (default 500, the previous hard-coded
            setting).

    Returns:
        Tensor of shape (n_recorded, n_runs, 6) with the recorded
        configurations in step order, where ``n_recorded`` is the number of
        step indices in ``range(N)`` that are greater than ``burn_in``.

    Raises:
        ValueError: If no step would be recorded.  Previously this surfaced
            only after the whole walk, as an error from ``torch.stack`` on an
            empty list.
    """
    first_recorded = max(burn_in + 1, 0)
    if N <= first_recorded:
        raise ValueError(
            f"N={N} leaves no samples after burn_in={burn_in}; "
            f"need N >= {first_recorded + 1}"
        )

    L = 1  # half-width of the initial box; proposals use half of it
    r1 = torch.rand(n_runs, 3) * 2 * L - L
    r2 = torch.rand(n_runs, 3) * 2 * L - L
    sampled_Xs = []

    # Sampling only needs wavefunction values, so skip autograd bookkeeping
    # for the 2 * N forward passes.
    with torch.no_grad():
        for i in tqdm(range(N)):
            # Per walker: `chose` picks which electron moves, `dummy` is the
            # uniform number used in the acceptance test.
            chose = torch.rand(n_runs).reshape(n_runs, 1)
            dummy = torch.rand(n_runs)

            perturbed_r1 = r1 + 0.5 * (torch.rand(n_runs, 3) * 2 * L - L)
            perturbed_r2 = r2 + 0.5 * (torch.rand(n_runs, 3) * 2 * L - L)

            # Move electron 1 when chose < 0.5, otherwise electron 2.
            r1_trial = torch.where(chose < 0.5, perturbed_r1, r1)
            r2_trial = torch.where(chose >= 0.5, perturbed_r2, r2)

            psi_val = model(torch.cat((r1, r2), dim=1)).reshape(n_runs)
            psi_trial_val = model(
                torch.cat((r1_trial, r2_trial), dim=1)
            ).reshape(n_runs)

            # Metropolis test on the probability ratio |psi_trial / psi|^2.
            psi_ratio = (psi_trial_val / psi_val) ** 2
            accept = (psi_ratio > dummy).reshape(n_runs, 1)

            r1 = torch.where(accept, r1_trial, r1)
            r2 = torch.where(accept, r2_trial, r2)

            if i > burn_in:
                sampled_Xs.append(torch.cat((r1, r2), dim=1))

    return torch.stack(sampled_Xs)

In [97]:
# Start with the simplest one - all the positions

def local_energy(positions, model=None):
    """Return the helium local energy (H psi) / psi for each configuration.

    The kinetic part is ``-0.5 * laplacian(psi) / psi`` with the Laplacian
    taken over all six coordinates via autograd; the potential part is
    ``-2/|r1| - 2/|r2| + 1/|r1 - r2|``.

    Args:
        positions: Tensor of shape [batch_size, 6] laid out as
            [r1x, r1y, r1z, r2x, r2y, r2z].  If it does not already require
            grad, a detached copy that does is used internally (the caller's
            tensor is left untouched), because the derivatives are taken
            with respect to the positions.
        model: Wavefunction callable mapping positions to psi values.
            Defaults to the module-level ``network``.

    Returns:
        Tensor of shape [batch_size] with the local energy per row.  It is
        built with ``create_graph=True`` and so stays differentiable.
    """
    if model is None:
        model = network
    if not positions.requires_grad:
        # autograd.grad raises on inputs that do not require grad.
        positions = positions.detach().requires_grad_(True)

    psi = model(positions).squeeze(-1)

    # Gradient of psi (not log psi) w.r.t. positions.  Summing over the batch
    # is safe because each row of psi depends only on its own row of positions.
    grads = torch.autograd.grad(psi.sum(), positions, create_graph=True)[0]

    # Laplacian: sum of the unmixed second partial derivatives.
    laplacian = 0
    for i in range(positions.shape[1]):
        second = torch.autograd.grad(
            grads[:, i].sum(), positions, create_graph=True
        )[0]
        laplacian = laplacian + second[:, i]

    # Kinetic energy
    kinetic = -0.5 * laplacian / psi

    # Electron-nucleus attraction (Z = 2) and electron-electron repulsion.
    r1 = positions[:, 0:3]
    r2 = positions[:, 3:6]
    r1_norm = r1.norm(dim=1)
    r2_norm = r2.norm(dim=1)
    r12 = (r1 - r2).norm(dim=1)
    potential = -2 / r1_norm - 2 / r2_norm + 1 / r12

    return kinetic + potential


In [81]:
# 50 walkers advanced for 5000 Metropolis steps; recorded steps are stacked
# along the first axis of the result.
sampled_Xs = metropolis(N=5000, n_runs=50, model=network)

100%|██████████| 5000/5000 [00:02<00:00, 1961.71it/s]


In [118]:
# One local-energy vector per walker; sampled_Xs is indexed
# [step, walker, coordinate].  The walker count is read from the samples
# rather than repeated as a literal, and each slice is marked as requiring
# grad because local_energy differentiates psi w.r.t. the positions.
local_es = torch.stack([
    local_energy(walker.detach().requires_grad_(True))
    for walker in sampled_Xs.unbind(dim=1)
])

In [None]:
def psi_gradients(model, x):
    """Return the parameter gradient of the batch-summed model output.

    Args:
        model: Module evaluated on ``x``; its parameters are the variables
            differentiated with respect to.
        x: Batch of inputs passed straight to ``model`` (documented by the
            original author as [batch_size, 6] electron positions).

    Returns:
        1-D tensor concatenating the flattened gradient of every parameter
        in ``model.parameters()`` order.  The outputs are summed over the
        batch before differentiating, so this is a single vector for the
        whole batch, and it is the gradient of psi itself, not of log psi.
    """
    batch_total = model(x).squeeze().sum()

    # create_graph keeps the result differentiable; retain_graph leaves the
    # forward graph alive for a later backward pass.
    per_param = torch.autograd.grad(
        batch_total,
        model.parameters(),
        retain_graph=True,
        create_graph=True,
    )

    flat_pieces = [g.view(-1) for g in per_param]
    return torch.cat(flat_pieces)

def get_gradients(model, x):
    """Unfinished stub: currently only evaluates ``psi_gradients(model, x)``.

    NOTE(review): the result is bound to a local and never used or returned,
    so this function returns None as written.
    TODO: complete or remove.
    """

    grad_psi = psi_gradients(model, x)

    



In [126]:
# Stack the result of `log_psi_gradients` for each of the 50 walkers.
# NOTE(review): `log_psi_gradients` is not defined in the visible cells; the
# gradient helper defined in this notebook is named `psi_gradients`.  Confirm
# which function this cell is meant to call.
grads = torch.stack([log_psi_gradients(network, sampled_Xs[:, i]) for i in range(50)])

In [132]:
# Display the shape of the stacked gradients (first axis: one row per walker).
grads.shape

torch.Size([50, 1313])

$$
\frac{\partial E}{\partial \theta_\alpha} = 
2\, \text{Re} \left\langle 
\frac{\partial \ln \Psi}{\partial \theta_\alpha} 
\left( E_\text{loc} - E[\boldsymbol{\theta}] \right) 
\right\rangle
$$
