In [1]:
%load_ext autoreload
%autoreload 2 
%reload_ext autoreload

from mlens.ensemble import SuperLearner
# always import gbm_algos first !
import xgboost, lightgbm, catboost

import numpy as np
import scipy.io as io
from torch.autograd import grad
import shap
from utils import *

from sklearn.ensemble import RandomForestRegressor

# Let's use fancy optimizers
from optimizers import Lookahead, AdamGC, SGDGC
from onecyclelr import OneCycleLR

import pcgrad

[MLENS] backend: threading


In [2]:
# NOTE(review): machine-specific absolute path -- point this at your local copy
# of burgers_shock.mat before running.
DATA_PATH = "/Users/pongpisit/Desktop/research/pinn/Solving-Differential-Equations-with-Neural-Networks/SymbolicMathematics/data/burgers_shock.mat"
data = io.loadmat(DATA_PATH)

# Column vectors built from the 't' and 'x' entries of the .mat file
t = data['t'].flatten()[:,None]
x = data['x'].flatten()[:,None]
Exact = np.real(data['usol']).T

X, T = np.meshgrid(x,t)

# One row per grid point: column 0 comes from X, column 1 from T;
# u_star holds the matching entries of Exact.
X_star = np.hstack((X.flatten()[:,None], T.flatten()[:,None]))
u_star = Exact.flatten()[:,None]

# Domain bounds
lb = X_star.min(0)
ub = X_star.max(0)

N = 2000
SEED = 0  # fixed so the same training subset is drawn on every Restart & Run All
print(f"Training with {N} samples")
rng = np.random.default_rng(SEED)
idx = rng.choice(X_star.shape[0], N, replace=False)
X_u_train = X_star[idx, :]
u_train = u_star[idx,:]

# Convert to torch.tensor
# Only the inputs need requires_grad (derivatives of the network output are
# taken with respect to them); the targets are constants, so tracking gradients
# for them only adds autograd bookkeeping.
X_u_train = torch.tensor(X_u_train).float().requires_grad_(True)
u_train = torch.tensor(u_train).float()
X_star = torch.tensor(X_star).float().requires_grad_(True)
u_star = torch.tensor(u_star).float()

feature_names=['uf', 'u_x',  'u_xx', 'u_tt', 'u_xt', 'u_tx']

Training with 2000 samples


In [3]:
class Network(nn.Module):
    """Wrap a solver model and expose its output and derivatives as selector data.

    The wrapped ``model`` receives ``torch.cat([x, t], dim=1)`` and its output is
    cached on ``self.uf`` after every forward pass.

    Args:
        model: an ``nn.Module``; every ``nn.Linear`` inside it is re-initialised
            with Xavier-uniform weights and a constant bias of 0.01.
    """
    def __init__(self, model):
        super(Network, self).__init__()
        self.model = model
        self.model.apply(self.xavier_init)
        # For tracking: column order of X_selector returned by get_selector_data
        self.index2features = ('uf', 'u_x',  'u_xx', 'u_tt', 'u_xt', 'u_tx')
        self.uf = None

    def xavier_init(self, m):
        """Initialise ``m`` in place if it is exactly an ``nn.Linear`` layer."""
        if type(m) is nn.Linear:
            torch.nn.init.xavier_uniform_(m.weight)
            # Layers built with bias=False have no bias tensor to fill.
            if m.bias is not None:
                m.bias.data.fill_(0.01)

    def forward(self, x, t):
        """Run the model on ``[x, t]`` (concatenated on dim 1) and cache the result."""
        self.uf = self.model(torch.cat([x, t], dim=1))
        return self.uf

    def get_selector_data(self, x, t):
        """Build the selector's inputs and target from the current network.

        Args:
            x, t: tensors that require grad and can be concatenated on dim 1
                (assumed to be column vectors of equal length -- confirm
                against ``dimension_slicing``).

        Returns:
            ``(X_selector, y_selector)`` where ``X_selector`` concatenates
            ``[uf, u_x, u_xx, u_tt, u_xt, u_tx]`` on dim 1 and ``y_selector``
            is ``u_t``. All derivatives keep their graph (``create_graph=True``).
        """
        uf = self.forward(x, t)

        ### PDE Loss calculation ###
        # first-order derivatives
        u_t = self.gradients(uf, t)[0]
        u_x = self.gradients(uf, x)[0]
        # Homo second-order derivatives
        u_tt = self.gradients(u_t,t)[0]
        u_xx = self.gradients(u_x, x)[0]
        # Hetero second-order derivatives
        u_xt = self.gradients(u_t, x)[0]
        u_tx = self.gradients(u_x, t)[0]

        X_selector = torch.cat([uf, u_x, u_xx, u_tt, u_xt, u_tx], dim=1)
        y_selector = u_t

        return X_selector, y_selector

    def gradients(self, func, x):
        """Return ``torch.autograd.grad`` of ``func`` w.r.t. ``x`` as a tuple.

        ``grad_outputs`` is a tensor of ones created with ``ones_like`` so that
        it always matches the dtype and device of ``func``.
        """
        return grad(func, x, create_graph=True, retain_graph=True, grad_outputs=torch.ones_like(func))

In [4]:
# Does the SeclectorNetwork have to be a neural network?
class SeclectorNetwork(nn.Module):
    """One-hidden-layer tanh regressor with a scalar output.

    Args:
        X_train_dim: number of input features per sample.
    """

    def __init__(self, X_train_dim):
        super(SeclectorNetwork, self).__init__()
        # Nonlinear model, trained with the PDE regularisation.
        hidden_width = 50
        self.nonlinear_model = nn.Sequential(
            nn.Linear(X_train_dim, hidden_width),
            nn.Tanh(),
            nn.Linear(hidden_width, 1),
        )
        self.nonlinear_model.apply(self.xavier_init)

    def xavier_init(self, m):
        """Xavier-uniform weights and a 0.01 bias for exact ``nn.Linear`` layers."""
        if type(m) != nn.Linear:
            return
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)

    def forward(self, inn):
        """Return the model's prediction for the feature batch ``inn``."""
        return self.nonlinear_model(inn)

    def loss(self, X_input, y_input):
        """Mean-squared error between ``forward(X_input)`` and ``y_input``."""
        prediction = self.forward(X_input)
        return F.mse_loss(prediction, y_input, reduction='mean')

In [5]:
# Solver network (architecture comes from simple_solver_model) and a selector
# with a 6-dimensional input.
network = Network(model=simple_solver_model(50))
selector = SeclectorNetwork(X_train_dim=6)
# optimizer = torch.optim.Adam(list(network.parameters()) + list(selector.parameters()), lr=1e-3)
# Epoch budgets for stage I and stage II.
epochs1, epochs2 = 1500, 500

In [6]:
def pcgrad_closure():
    """Optimizer closure that combines the two task gradients via ``pcgrad``.

    Computes the supervised loss (MSE between ``network.uf`` and ``u_train``)
    and the unsupervised selector loss, back-propagates each one separately,
    passes the per-task gradients of ``network.parameters()`` through
    ``pcgrad.pc_grad_update`` and writes the sum of the two returned gradient
    sets back into ``param.grad``.

    Only the network parameters are overwritten. Any other parameter touched by
    ``optimizer1.zero_grad()`` keeps the gradient left by the last backward
    pass, i.e. the one of the unsupervised loss.

    Returns:
        ``sum([sup_loss, unsup_loss])`` as a tensor.
    """
    # get_selector_data runs the forward pass that caches network.uf, so the
    # unsupervised loss has to be evaluated before the supervised one.
    unsup_loss = selector.loss(*network.get_selector_data(*dimension_slicing(X_u_train)))
    sup_loss = F.mse_loss(network.uf, u_train)
    losses = [sup_loss, unsup_loss]
    per_task_grads = []

    for task_loss in losses:
        optimizer1.zero_grad()
        task_loss.backward(retain_graph=True)

        # Snapshot this task's gradient for every network parameter; parameters
        # the loss does not reach get zeros on the parameter's own device/dtype.
        g_task = [
            param.grad.detach().clone() if param.grad is not None else torch.zeros_like(param)
            for param in network.parameters()
        ]
        # appending the gradients from each task
        per_task_grads.append(g_task)

    projected = list(pcgrad.pc_grad_update(per_task_grads))[0]
    for idx, param in enumerate(network.parameters()):
        param.grad = (projected[0][idx] + projected[1][idx])

    return sum(losses)

def semi_sup_closure():
    """Optimizer closure using the plain sum of both losses (no gradient surgery).

    Returns:
        The summed loss tensor, after ``backward`` has been called on it.
    """
    optimizer1.zero_grad()

    # The selector (unsupervised) term goes first: its forward pass is what
    # fills network.uf for the supervised term.
    selector_inputs = network.get_selector_data(*dimension_slicing(X_u_train))
    loss_unsup = selector.loss(*selector_inputs)
    loss_sup = F.mse_loss(network.uf, u_train)

    # No multi-task weighting yet: naive summation of the two terms.
    combined = loss_unsup + loss_sup
    combined.backward(retain_graph=True)
    return combined

def closure():
    """Closure for ``optimizer2``: MSE between the network output and ``u_train``.

    Returns:
        The loss tensor, after ``backward`` has been called on it.
    """
    optimizer2.zero_grad()
    prediction = network(*dimension_slicing(X_u_train))
    data_loss = F.mse_loss(prediction, u_train)
    data_loss.backward(retain_graph=True)
    return data_loss

In [7]:
params = list(network.parameters()) + list(selector.parameters())

### For SGD and Adam ###
learning_rate1, learning_rate2 = 5e-3, 5e-2

### For LBFGS (a good choice already!!!) ###
# learning_rate1, learning_rate2 = 8e-2, 5e-2 # (1e-1, 5e-2) is also OK!

choice = 'SGD'; is_sched = False
if choice == 'LBFGS':
    # The scheduler is never combined with LBFGS.
    is_sched = False
    optimizer1 = torch.optim.LBFGS(params, lr=learning_rate1,
                                   max_iter=80, max_eval=100,
                                   history_size=120, line_search_fn='strong_wolfe')
elif choice == 'Adam':
    optimizer1 = AdamGC(params, lr=learning_rate1, use_gc=True, gc_conv_only=False, gc_loc=False)
elif choice == 'SGD':
    optimizer1 = SGDGC(params, lr=learning_rate1, use_gc=True, nesterov=True, momentum=0.9)
else:
    # Fail loudly instead of silently reusing an optimizer1 left over from an
    # earlier run of this cell (or hitting a NameError further down).
    raise ValueError("Unknown optimizer choice {!r}; expected 'LBFGS', 'Adam' or 'SGD'".format(choice))
if is_sched:
    scheduler = OneCycleLR(optimizer1, num_steps=epochs1, lr_range=(5e-4, 5e-2))

network.train(); selector.train()
# curr_loss starts at a sentinel; F_print is the logging / plateau-check period.
curr_loss = 1000; F_print = 10 if choice == 'LBFGS' else 100

# Stage I
for i in range(epochs1):
    optimizer1.step(pcgrad_closure)
    if is_sched: scheduler.step()
    if (i % F_print) != 0:
        continue
    # Re-evaluating the closure costs a forward pass and two backward passes,
    # so it is only done on epochs whose loss is actually reported.
    l = pcgrad_closure()
    print("Epoch {}: ".format(i), l.item())
    # Stop as soon as two consecutive reported losses are exactly equal.
    if l.item() == curr_loss:
        print("Finishing the first stage")
        break
    curr_loss = l.item()

Epoch 0:  0.42341139912605286
Epoch 100:  0.2419937402009964
Epoch 200:  0.23437415063381195
Epoch 300:  0.21062539517879486
Epoch 400:  0.13327650725841522
Epoch 500:  0.08342467248439789
Epoch 600:  0.05388389527797699
Epoch 700:  0.04634428396821022
Epoch 800:  0.04316284880042076
Epoch 900:  0.04086717590689659
Epoch 1000:  0.039105866104364395
Epoch 1100:  0.03778686746954918
Epoch 1200:  0.0365154854953289
Epoch 1300:  0.03519538417458534
Epoch 1400:  0.03386590629816055


In [8]:
optimizer2 = torch.optim.LBFGS(network.parameters(),
                              lr=learning_rate2, max_iter=100, max_eval=125,
                              history_size=120, line_search_fn='strong_wolfe')

curr_loss = 1000
# Stage II
for i in range(epochs2):
    optimizer2.step(closure)
    if (i % 10) != 0:
        continue
    # Only re-evaluate the loss on epochs where it is reported / checked.
    l = closure()
    # Stop as soon as two consecutive reported losses are exactly equal.
    if l.item() == curr_loss:
        print("Finishing the second stage")
        break
    curr_loss = l.item()
    print("Epoch {}: ".format(i), curr_loss)

print("Testing")
network.eval()
# should be able to reach the order of 1e-6.
# So that I can use this algo instead of the ladder networks
# Compare between the two semi-supervised learning approaches?
# Evaluate without building an autograd graph so the metric is a plain tensor.
with torch.no_grad():
    test_mse = F.mse_loss(network(*dimension_slicing(X_star)), u_star)
test_mse

Epoch 0:  0.015278980135917664
Epoch 10:  3.98205520468764e-05
Epoch 20:  3.1214153750624973e-06
Epoch 30:  1.4038199651622563e-06
Epoch 40:  1.1086716540376074e-06
Epoch 50:  9.834689080889802e-07
Epoch 60:  8.934815696193255e-07
Epoch 70:  8.624804195278557e-07
Epoch 80:  8.188906122086337e-07
Epoch 90:  8.177488552973955e-07
Finishing the second stage
Testing


tensor(4.6153e-06, grad_fn=<MseLossBackward>)

In [9]:
# BEST-2000: 1e-06 (LBFGS)
# torch.save(network.state_dict(), "./saved_path_inverse_burger/nn_nonlinear_semisup_without_physical_reg_trained2000samples.pth")