### Do I need to implement the GPU version for faster computation?

In [1]:
%load_ext autoreload
%autoreload 2 
%reload_ext autoreload

import os

from mlens.ensemble import SuperLearner
# always import gbm_algos first !
import xgboost, lightgbm, catboost

import numpy as np
import scipy.io as io
from torch.autograd import grad
import shap
from utils import *

from sklearn.ensemble import RandomForestRegressor

[MLENS] backend: threading


In [2]:
# Location of the Burgers' equation dataset (.mat file). Set the BURGERS_DATA_PATH
# environment variable to point at a local copy; the fallback is the original
# hard-coded location, so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    "BURGERS_DATA_PATH",
    "/Users/pongpisit/Desktop/research/pinn/Solving-Differential-Equations-with-Neural-Networks/SymbolicMathematics/data/burgers_shock.mat",
)
data = io.loadmat(DATA_PATH)

# Time and space coordinates as column vectors.
t = data['t'].flatten()[:,None]
x = data['x'].flatten()[:,None]
# Real part of the stored solution, transposed.
# NOTE(review): presumably transposed so rows follow t and columns follow x, like the
# meshgrid below -- confirm against the layout of 'usol' in the .mat file.
Exact = np.real(data['usol']).T

X, T = np.meshgrid(x,t)

# One (x, t) pair per row, and the solution value for the same row in u_star.
X_star = np.hstack((X.flatten()[:,None], T.flatten()[:,None]))
u_star = Exact.flatten()[:,None]

# Domain bounds
lb = X_star.min(0)
ub = X_star.max(0)

N = 2000
SEED = 1234  # fixes which rows are drawn, so the training subset is the same on every run
print(f"Training with {N} samples")
idx = np.random.default_rng(SEED).choice(X_star.shape[0], N, replace=False)
X_u_train = X_star[idx, :]
u_train = u_star[idx,:]

# Convert to torch.tensor
# Inputs track gradients because derivatives of the network output are taken w.r.t. them.
X_u_train = torch.tensor(X_u_train).float().requires_grad_(True)
X_star = torch.tensor(X_star).float().requires_grad_(True)
# Targets are plain data: nothing is differentiated with respect to them.
u_train = torch.tensor(u_train).float()
u_star = torch.tensor(u_star).float()

feature_names=['uf', 'u_x',  'u_xx', 'u_tt', 'u_xt', 'u_tx']

Training with 2000 samples


In [3]:
class Network(nn.Module):
    """Wrap a solver model u(x, t) and expose its derivatives as candidate features.

    ``model`` is any module that maps the concatenation ``[x, t]`` (along dim 1) to
    the solution estimate. Its plain ``nn.Linear`` layers are re-initialised with
    Xavier-uniform weights and a constant bias of 0.01 when wrapped.
    """

    def __init__(self, model):
        super(Network, self).__init__()
        self.model = model
        self.model.apply(self.xavier_init)
        # For tracking: column order of the feature matrix built in get_selector_data.
        self.index2features = ('uf', 'u_x',  'u_xx', 'u_tt', 'u_xt', 'u_tx')
        # Most recent forward output; refreshed on every forward() call.
        self.uf = None

    def xavier_init(self, m):
        """Initialise ``m`` in place if it is exactly an ``nn.Linear`` layer."""
        if type(m) is nn.Linear:
            torch.nn.init.xavier_uniform_(m.weight)
            # Layers created with bias=False have no bias tensor to fill.
            if m.bias is not None:
                m.bias.data.fill_(0.01)

    def forward(self, x, t):
        """Evaluate the wrapped model on ``[x, t]`` and cache the result in ``self.uf``."""
        self.uf = self.model(torch.cat([x, t], dim=1))
        return self.uf

    def get_selector_data(self, x, t):
        """Return ``(X_selector, y_selector)`` built from the network's derivatives.

        ``X_selector`` concatenates ``[uf, u_x, u_xx, u_tt, u_xt, u_tx]`` along dim 1
        and ``y_selector`` is ``u_t``. Both ``x`` and ``t`` must require gradients,
        since every derivative is taken with respect to them.
        """
        uf = self.forward(x, t)

        ### PDE Loss calculation ###
        # first-order derivatives
        u_t = self.gradients(uf, t)[0]
        u_x = self.gradients(uf, x)[0]
        # Homo second-order derivatives
        u_tt = self.gradients(u_t,t)[0]
        u_xx = self.gradients(u_x, x)[0]
        # Hetero second-order derivatives
        u_xt = self.gradients(u_t, x)[0]
        u_tx = self.gradients(u_x, t)[0]

        X_selector = torch.cat([uf, u_x, u_xx, u_tt, u_xt, u_tx], dim=1)
        y_selector = u_t

        return X_selector, y_selector

    def gradients(self, func, x):
        """Differentiate ``func`` w.r.t. ``x``, keeping the graph for higher-order derivatives.

        Returns the tuple produced by ``torch.autograd.grad``.
        """
        # ones_like matches func's device and dtype; torch.ones(func.shape) would always
        # be a default-dtype CPU tensor and fail once the model and data live on a GPU.
        return grad(func, x, create_graph=True, retain_graph=True, grad_outputs=torch.ones_like(func))

In [4]:
# Does the SeclectorNetwork have to be a neural network?
class SeclectorNetwork(nn.Module):
    """Small one-hidden-layer regressor mapping the candidate features to an estimate of u_t.

    ``X_train_dim`` is the number of input features; the hidden layer has 50 tanh units
    and the output is a single value per sample.
    """

    def __init__(self, X_train_dim):
        super().__init__()
        # Nonlinear model, Training with noisy features -> chk feature importance
        self.nonlinear_model = nn.Sequential(
            nn.Linear(X_train_dim, 50),
            nn.Tanh(),
            nn.Linear(50, 1),
        )

    def forward(self, inn):
        """Return the approximation of u_t for the feature rows in ``inn``."""
        return self.nonlinear_model(inn)

    def loss(self, X_input, y_input):
        """Mean-squared error between the prediction for ``X_input`` and ``y_input``."""
        prediction = self.forward(X_input)
        return F.mse_loss(prediction, y_input, reduction='mean')

In [5]:
# network = Network(model=simple_solver_model(50))
# selector = SeclectorNetwork(X_train_dim=6)

# optimizer = torch.optim.LBFGS(list(network.parameters()) + list(selector.parameters()), 
#                               lr=5e-2, max_iter=80, max_eval=100, 
#                               history_size=120, line_search_fn='strong_wolfe')

# # optimizer = torch.optim.Adam(list(network.parameters()) + list(selector.parameters()), lr=1e-3)
# epochs = 5000; testing = False

# if testing:
#     # unsupervised_loss
#     unsup_loss = selector.loss(*network.get_selector_data(*dimension_slicing(X_u_train)))
#     sup_loss = F.mse_loss(network.uf, u_train)

#     # No MTL yet, apply the naive summation first to see if it's working?
#     total_loss = unsup_loss + sup_loss
#     print(total_loss)

#     total_loss.backward()

In [6]:
# network.train(); selector.train()
# curr_loss = 1000

# for i in range(epochs):
#     def closure():
#         optimizer.zero_grad()

#         # Total loss calculation process
#         # unsupervised_loss
#         unsup_loss = selector.loss(*network.get_selector_data(*dimension_slicing(X_u_train)))
#         sup_loss = F.mse_loss(network.uf, u_train)

#         # No MTL yet, apply the naive summation first to see if it's working?
#         total_loss = unsup_loss + sup_loss
#         total_loss.backward()
        
#         return total_loss
    
#     optimizer.step(closure)
    
#     l = closure()
#     if l.item() != curr_loss:
#         curr_loss = l.item()
#     else: break; print("Stop training.")
    
#     if (i % 10) == 0:
#         print("Epoch {}: ".format(i), curr_loss)

# print("Testing")
# network.eval()
# F.mse_loss(network(*dimension_slicing(X_star)).detach(), u_star) # Around 3e-3

In [7]:
# X_selector, y_selector = network.get_selector_data(*dimension_slicing(X_u_train))
# e = shap.DeepExplainer(selector, X_selector)
# shap_values = e.shap_values(X_selector)

# import pandas as pd
# df = pd.DataFrame({
#     "mean_abs_shap": np.mean(np.abs(shap_values), axis=0), 
#     "stdev_abs_shap": np.std(np.abs(shap_values), axis=0), 
#     "name": ['uf', 'u_x',  'u_xx', 'u_tt', 'u_xt', 'u_tx']
# })

# print(df.sort_values("mean_abs_shap", ascending=False)[:10])

# shap.summary_plot(shap_values, features=X_selector, feature_names=feature_names)

### Using statistical models to find feature importance.

In [8]:
# Saved feature/target arrays used by the statistical models below.
# NOTE(review): judging by the file names, "derivatives" holds the candidate features,
# "dynamics" the regression target, and the numeric suffix the sample count -- confirm.
X_np, y_np = (
    np.load(npy_path)
    for npy_path in (
        "./saved_path_inverse_burger/data/derivatives-2000.npy",
        "./saved_path_inverse_burger/data/dynamics-2000.npy",
    )
)

# Larger split, used only for evaluation further down.
X_np_test, y_np_test = (
    np.load(npy_path)
    for npy_path in (
        "./saved_path_inverse_burger/data/derivatives-25600.npy",
        "./saved_path_inverse_burger/data/dynamics-25600.npy",
    )
)

In [9]:
# Fix every estimator's random seed so the fitted models, and the feature importances
# derived from them, do not change from one kernel run to the next.
MODEL_SEED = 0

# Meta learner for the stacked ensemble.
forest = RandomForestRegressor(random_state=MODEL_SEED)
# Gradient-boosting base learners.
xg = xgboost.XGBRegressor(reg_alpha=0.1, random_state=MODEL_SEED)
cat = catboost.CatBoostRegressor(iterations=None, depth=4, learning_rate=0.1, verbose=0, l2_leaf_reg=10,
                                 random_seed=MODEL_SEED)
light = lightgbm.LGBMRegressor(n_estimators=200, learning_rate=0.1, reg_lambda=1, random_state=MODEL_SEED)

In [10]:
# Stack the three boosted models as the base layer and let the random forest
# combine their predictions as the meta learner.
base_learners = [xg, light, cat]
ensemble = SuperLearner(
    folds=3,
    scorer=mean_squared_error,
    model_selection=True,
    n_jobs=3,
)
ensemble.add(base_learners)
ensemble.add_meta(forest)
ensemble.fit(X_np, y_np)

SuperLearner(array_check=None, backend=None, folds=3,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=None, shuffle=False,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=3, raise_on_ex...5c97e50>)],
   n_jobs=-1, name='group-1', raise_on_exception=True, transformers=[])],
   verbose=0)],
       model_selection=True, n_jobs=None, raise_on_exception=True,
       random_state=None, sample_size=20,
       scorer=<function mean_squared_error at 0x135c97e50>, shuffle=False,
       verbose=False)

In [11]:
# Wrap the fitted ensemble and report its error on the larger evaluation split.
sklearn_model = SklearnModel(feature_names=feature_names, model=ensemble, X_train=X_np, y_train=y_np)
print('Test MSE:', sklearn_model.test(X_np_test, y_np_test))
print('Training GBM algos...')


def _gbm_feature_importance(gbm):
    """Wrap ``gbm`` in a SklearnModel on the training arrays and return its feature_importance()."""
    wrapper = SklearnModel(model=gbm, X_train=X_np, y_train=y_np, feature_names=feature_names)
    return wrapper.feature_importance()


xg_feature_importance = _gbm_feature_importance(xg)
light_feature_importance = _gbm_feature_importance(light)
cat_feature_importance = _gbm_feature_importance(cat)

Done training
Training MSE: 0.0033935609
Test MSE: 0.028090756
Training GBM algos...
Done training
Training MSE: 0.00017310298
('u_x', 0.43132782464468283)
('u_xx', 0.2725011438783386)
('uf', 0.23068548996767577)
('u_xt', 0.037911415523790744)
('u_tt', 0.027574125985512055)
('u_tx', 0.0)
Done training
Training MSE: 0.015096086314121248
('u_xx', 0.2805)
('u_x', 0.25283333333333335)
('uf', 0.16966666666666666)
('u_xt', 0.1495)
('u_tt', 0.1475)
('u_tx', 0.0)
Done training
Training MSE: 0.0010312966521267757
('u_x', 0.3998005900828352)
('uf', 0.3689910808122835)
('u_xx', 0.12724149982994676)
('u_tt', 0.06889158417508844)
('u_tx', 0.01928175440192941)
('u_xt', 0.015793490697916725)


In [12]:
# Recomputes the same three importance tables as the last three statements of the
# previous cell (identical calls, evaluated in the same xg -> light -> cat order).
(xg_feature_importance,
 light_feature_importance,
 cat_feature_importance) = [
    SklearnModel(model=gbm, X_train=X_np, y_train=y_np, feature_names=feature_names).feature_importance()
    for gbm in (xg, light, cat)
]

Done training
Training MSE: 0.00017310298
('u_x', 0.43132782464468283)
('u_xx', 0.2725011438783386)
('uf', 0.23068548996767577)
('u_xt', 0.037911415523790744)
('u_tt', 0.027574125985512055)
('u_tx', 0.0)
Done training
Training MSE: 0.015096086314121248
('u_xx', 0.2805)
('u_x', 0.25283333333333335)
('uf', 0.16966666666666666)
('u_xt', 0.1495)
('u_tt', 0.1475)
('u_tx', 0.0)
Done training
Training MSE: 0.0010312966521267757
('u_x', 0.3998005900828352)
('uf', 0.3689910808122835)
('u_xx', 0.12724149982994676)
('u_tt', 0.06889158417508844)
('u_tx', 0.01928175440192941)
('u_xt', 0.015793490697916725)


In [13]:
# Per-feature mean of the three importance scores (XGBoost, CatBoost, LightGBM).
avg_feature_importances = {
    name: (xg_feature_importance[name] + cat_feature_importance[name] + light_feature_importance[name]) / 3
    for name in feature_names
}
avg_feature_importances

{'uf': 0.23068548996767577,
 'u_x': 0.43132782464468283,
 'u_xx': 0.2725011438783386,
 'u_tt': 0.027574125985512055,
 'u_xt': 0.037911415523790744,
 'u_tx': 0.0}

### Using a neural network to fit derivatives and dynamics

In [None]:
# NOTE(review): the second argument of to_tensor is presumably a requires_grad flag -- confirm in utils.
X_tensor = to_tensor(X_np, False)
y_tensor = to_tensor(y_np, False).reshape(-1, 1)  # column vector, one target per row
mlp = TorchMLP([6, 50, 50, 50, 50, 50, 1])
optimizer = torch.optim.LBFGS(mlp.parameters(),
                              lr=0.1, max_iter=100, max_eval=125, # 80 and 100 are OK!
                              history_size=120, line_search_fn='strong_wolfe')


def closure():
    """Recompute the MSE loss and its gradients; LBFGS calls this repeatedly within one step."""
    optimizer.zero_grad()
    loss = F.mse_loss(mlp(X_tensor), y_tensor)
    loss.backward()
    return loss


mlp.train()
epochs = 500
for i in range(epochs):
    optimizer.step(closure)
    if i % 100 == 0:
        # Evaluate the post-step loss only when it is actually reported, and without
        # building a graph, instead of running an extra forward/backward every epoch.
        with torch.no_grad():
            print('MSE Loss:', F.mse_loss(mlp(X_tensor), y_tensor).item())

In [None]:
# Mean-squared error of the trained MLP on the training tensors, in eval mode.
mlp.eval()
residual = mlp(X_tensor) - y_tensor
(residual ** 2).mean().item()

In [None]:
# The SHAP's outputs are not what I expected.

mlp.eval()

# The full training tensor is used both as the background set and as the samples to explain.
e = shap.DeepExplainer(mlp, X_tensor)
shap_values = e.shap_values(X_tensor)

import pandas as pd
# Absolute attributions, computed once and summarised per feature (axis 0 is reduced).
abs_shap_values = np.abs(shap_values)
df = pd.DataFrame({
    "mean_abs_shap": abs_shap_values.mean(axis=0),
    "stdev_abs_shap": abs_shap_values.std(axis=0),
    # Reuse the shared list so the table labels cannot drift from the plot labels.
    "name": list(feature_names)
})

print(df.sort_values("mean_abs_shap", ascending=False))
shap.summary_plot(shap_values, features=X_tensor, feature_names=feature_names)

In [None]:
# imports from captum library
from captum.attr import LayerConductance, LayerActivation, LayerIntegratedGradients
from captum.attr import IntegratedGradients, DeepLift, GradientShap, NoiseTunnel, FeatureAblation

In [None]:
# Each Captum explainer is built around the MLP and immediately applied to the
# training tensor. The attribute calls run in the same order as before.
ig = IntegratedGradients(mlp)
ig_attr_test = ig.attribute(X_tensor, n_steps=50)

# Noise tunnel wrapped around the integrated-gradients explainer.
ig_nt = NoiseTunnel(ig)
ig_nt_attr_test = ig_nt.attribute(X_tensor)

dl = DeepLift(mlp)
dl_attr_test = dl.attribute(X_tensor)

# The inputs themselves are passed as the second (baselines) argument.
gs = GradientShap(mlp)
gs_attr_test = gs.attribute(X_tensor, X_tensor)

fa = FeatureAblation(mlp)
fa_attr_test = fa.attribute(X_tensor)

In [None]:
# prepare attributions for visualization
import matplotlib.pyplot as plt


def _summed_and_l1_normalized(attributions):
    """Sum attributions over the sample axis and scale the result to unit L1 norm.

    Returns a ``(summed, normalized)`` pair of NumPy arrays.
    """
    summed = attributions.detach().numpy().sum(0)
    return summed, summed / np.linalg.norm(summed, ord=1)


x_axis_data = np.arange(X_tensor.shape[1])
x_axis_data_labels = [feature_names[feature_index] for feature_index in x_axis_data]

ig_attr_test_sum, ig_attr_test_norm_sum = _summed_and_l1_normalized(ig_attr_test)
ig_nt_attr_test_sum, ig_nt_attr_test_norm_sum = _summed_and_l1_normalized(ig_nt_attr_test)
dl_attr_test_sum, dl_attr_test_norm_sum = _summed_and_l1_normalized(dl_attr_test)
gs_attr_test_sum, gs_attr_test_norm_sum = _summed_and_l1_normalized(gs_attr_test)
fa_attr_test_sum, fa_attr_test_norm_sum = _summed_and_l1_normalized(fa_attr_test)

# NOTE(review): this takes only row 0 of the first layer's weight matrix, i.e. the
# incoming weights of a single hidden unit -- confirm that this is the intended
# "Weights" baseline for a multi-layer model.
lin_weight = mlp.model[0].weight[0].detach().numpy()
y_axis_lin_weight = lin_weight / np.linalg.norm(lin_weight, ord=1)

width = 0.14
legends = ['Int Grads', 'Int Grads w/SmoothGrad','DeepLift', 'GradientSHAP', 'Feature Ablation', 'Weights']
# (bar heights, alpha, colour) for each legend entry, in the same order as `legends`.
bar_series = [
    (ig_attr_test_norm_sum, 0.8, '#eb5e7c'),
    (ig_nt_attr_test_norm_sum, 0.7, '#A90000'),
    (dl_attr_test_norm_sum, 0.6, '#34b8e0'),
    (gs_attr_test_norm_sum, 0.8, '#4260f5'),
    (fa_attr_test_norm_sum, 1.0, '#49ba81'),
    (y_axis_lin_weight, 1.0, 'grey'),
]

# Font settings must be in place before the figure, title and labels are created;
# otherwise they only take effect when the cell is run a second time.
FONT_SIZE = 16
plt.rc('font', size=FONT_SIZE)            # fontsize of the text sizes
plt.rc('axes', titlesize=FONT_SIZE)       # fontsize of the axes title
plt.rc('axes', labelsize=FONT_SIZE)       # fontsize of the x and y labels
plt.rc('legend', fontsize=FONT_SIZE - 4)  # fontsize of the legend

fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Comparing input feature importances across multiple algorithms and learned weights')
ax.set_ylabel('Attributions')

for position, (label, (heights, alpha, color)) in enumerate(zip(legends, bar_series)):
    ax.bar(x_axis_data + position * width, heights, width,
           align='center', alpha=alpha, color=color, label=label)
ax.autoscale_view()

# Put each tick under the middle of its group of bars.
ax.set_xticks(x_axis_data + (len(bar_series) - 1) * width / 2)
ax.set_xticklabels(x_axis_data_labels)

ax.legend(loc=3)
# Lay out the figure once every artist (ticks, legend) has been added.
plt.tight_layout()
plt.show()

In [None]:
# Feature indices ordered by ascending normalized attribution, one line per method
# (integrated gradients, DeepLift, GradientSHAP, feature ablation).
for normalized_attr in (
    ig_attr_test_norm_sum,
    dl_attr_test_norm_sum,
    gs_attr_test_norm_sum,
    fa_attr_test_norm_sum,
):
    print(np.argsort(normalized_attr))