### Objective

In this notebook, we aim to train a surrogate model and use it to propose promising, feasible solutions for the downstream optimization tasks.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys

from sklearn.preprocessing import MinMaxScaler, StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from scipy.stats import norm, qmc
import xgboost as xgb
import sklearn.gaussian_process as gp
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, max_error, brier_score_loss

# Custom Gaussian Process model
# Build the module directory with os.path.join so the path separator is
# correct on every platform, and test the exact path that gets appended so
# re-running this cell does not add duplicate sys.path entries.
GP_path = os.path.abspath(os.path.join('..', '..'))
GP_module_dir = os.path.join(GP_path, 'GaussianProcess')
if GP_module_dir not in sys.path:
    sys.path.append(GP_module_dir)
from GPInterpolator import GPInterpolator

import gpflow
import tensorflow as tf
import tensorflow_probability as tfp






### 1. Load dataset

In [2]:
# The training CSV has no header row, so the column labels are attached here
train_columns = [
    'Q1', 'Q2', 'd', 'b', 'L', 'c', 'L_duct', 'n', 't',
    'xc1', 'yc1', 'xc2', 'yc2', 'Tc', 'Tj', 'w',
]
df = pd.read_csv('./Dataset/TcTj_train.csv', header=None)
df.columns = train_columns
print(f"Training pool: {df.shape[0]}")

Training pool: 9421


In [3]:
# Remove outliers: keep only the rows whose Tj is below 250
is_inlier = df['Tj'] < 250
df = df.loc[is_inlier].reset_index(drop=True)
print(f"Filtered pol: {df.shape[0]}")

Filtered pol: 9386


In [4]:
# Candidate designs: same design variables as the training table, without Q1/Q2
candidate_columns = ['d', 'b', 'L', 'c', 'L_duct', 'n', 't', 'xc1', 'yc1', 'xc2', 'yc2']
df_candidates = pd.read_csv('./Dataset/candidates.csv')
df_candidates.columns = candidate_columns
print(f"PCandidate pool: {df_candidates.shape[0]}")

PCandidate pool: 470010


In [5]:
def create_samples(df, train_num):
    """Build feature/target arrays from `df` and split off a training set.

    Columns are selected by position: the features are every column except
    the last three, and the target is the second-to-last column.

    Args:
    -----
    - df: DataFrame with one sample per row
    - train_num: number of rows to use for training

    Returns:
    --------
    - X_train, X_test, y_train, y_test: NumPy arrays. When `train_num` is
      greater than or equal to `len(df)`, every row goes to the training
      set and both test outputs are None.
    """

    # Create dataset
    X = df.iloc[:, :-3].to_numpy()
    y = df.iloc[:, -2].to_numpy()

    # Train-test split
    if train_num < len(df):
        # Pass the training size as an absolute count. A fractional test_size
        # derived from train_num is rounded up by scikit-learn, so floating
        # point error could leave the training set one sample short.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=int(train_num), random_state=42
        )
    else:
        X_train, y_train = X, y
        X_test, y_test = None, None

    return X_train, X_test, y_train, y_test

In [12]:
# Train-test split: request 640 training samples, the remaining rows are held out
X_train, X_test, y_train, y_test = create_samples(df=df, train_num=640)

### 2. Model training

In [13]:
def evaluate_model(y_true, y_pred):
    """Evaluate regression predictions against the ground truth.

    Both inputs are flattened to 1-D before any arithmetic, so a column
    vector of predictions (shape (n, 1)) paired with a flat target
    (shape (n,)) is compared element-wise instead of being broadcast to an
    (n, n) matrix.

    Args:
    -----
    - y_true: ground-truth values (array-like)
    - y_pred: predicted values (array-like with the same number of elements)

    Returns:
    --------
    - rmse: root-mean-square error
    - max_e: largest absolute error
    - max_percentage: largest relative error, in percent
    - max_percentage_loc: index of the sample with the largest relative error
    - mean_percentage: mean relative error, in percent

    Raises:
    -------
    - ValueError: if the inputs are empty or differ in length
    """

    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.size == 0:
        raise ValueError("y_true and y_pred must not be empty")
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred differ in length: {y_true.size} vs {y_pred.size}"
        )

    abs_error = np.abs(y_true - y_pred)
    rmse = np.sqrt(np.mean(abs_error**2))
    max_e = np.max(abs_error)

    # Relative error is measured against |y_true| so that negative targets
    # do not flip its sign (zero targets still give inf/nan, as before)
    percentage = abs_error / np.abs(y_true)
    max_percentage = np.max(percentage)*100
    max_percentage_loc = np.argmax(percentage)
    mean_percentage = np.mean(percentage)*100

    return rmse, max_e, max_percentage, max_percentage_loc, mean_percentage

#### Sparse Gaussian Process

In [18]:
from scipy.stats import qmc

# Create initial samples: one row of starting length scales per optimizer restart
n_restarts = 5
lb, ub = -3, 3
lhd = qmc.LatinHypercube(d=X_train.shape[1], seed=42).random(n_restarts)

# Map the unit-cube samples to exponents in [lb, ub], then turn each
# theta = 10**exponent into a length scale via sqrt(1 / (2 * theta))
log10_theta = lb + (ub - lb) * lhd
length_scale = np.sqrt(1.0 / (2.0 * np.power(10.0, log10_theta)))

# Fit the min-max scaler on the training inputs and normalise them
scaler = MinMaxScaler()
X_train_norm = scaler.fit(X_train).transform(X_train)

In [19]:
from sklearn.neighbors import NearestNeighbors

def sample_creator(X, sample_num, sampling_scheme='LHS', seed=None):
    """Space-filling selection of inducing points from real samples.

    Quasi-random points are drawn in the unit hypercube, and each one is
    matched to its nearest neighbour among the min-max scaled rows of `X`.
    Several virtual points can map to the same row; duplicates are dropped,
    so fewer than `sample_num` rows may be returned.

    Args:
    -----
    - X: the full training dataset, 2-D array of shape (n_samples, n_features)
    - sample_num: the number of virtual samples to draw
    - sampling_scheme: the sampling scheme, 'LHS' or 'Halton'
    - seed: optional seed forwarded to the SciPy QMC engine for reproducible
      selections; the default None keeps the draw unseeded

    Returns:
    --------
    - X_induce: the selected rows of `X` (unscaled), ordered by row index

    Raises:
    -------
    - ValueError: if `sampling_scheme` is not recognised
    """

    n_features = X.shape[1]

    # Create virtual samples
    if sampling_scheme == 'LHS':
        raw_virtual_samples = qmc.LatinHypercube(d=n_features, seed=seed).random(n=sample_num)
    elif sampling_scheme == 'Halton':
        raw_virtual_samples = qmc.Halton(d=n_features, seed=seed).random(sample_num)
    else:
        raise ValueError(f"Invalid sampling scheme: {sampling_scheme}")

    # Scale the data to the unit hypercube so it is comparable with the virtual samples
    X_scaled = MinMaxScaler().fit_transform(X)

    # Find closest real samples
    sample_finder = NearestNeighbors(n_neighbors=1).fit(X_scaled)
    _, indices = sample_finder.kneighbors(raw_virtual_samples)

    # Drop duplicates (np.unique already returns a flat, sorted index array)
    train_indices = np.unique(indices)

    # Pick the matching rows from the original (unscaled) data
    X_induce = X[train_indices]

    return X_induce

In [20]:
%%time

# Multi-restart training of a sparse GP regression (SGPR) model.
# One model is fitted per row of `length_scale` (the initial kernel
# length scales); every fitted model and its final training loss are kept.
models = []
loss = []
# Number of virtual samples requested from sample_creator. The function
# drops duplicate matches, so the actual number of inducing points can be smaller.
induce_points_num = 200

# Run the optimisation on the CPU
with tf.device("CPU:0"):

    for i, init in enumerate(length_scale):
        print(f"Performing {i+1}-th optimization:")

        # Generate induce points
        # NOTE(review): sample_creator is called without a seed here, so the
        # inducing points presumably differ between restarts and between runs.
        X_induce = sample_creator(X_train, induce_points_num, sampling_scheme='Halton')
        # Use the scaler fitted on X_train so inducing points share the normalised input space
        X_induce_norm = scaler.transform(X_induce)

        # Set up the kernel
        # Kernel variance starts at the variance of the training targets;
        # length scales start from the current restart's initial values.
        kernel = gpflow.kernels.SquaredExponential(variance=np.var(y_train), lengthscales=init)
        model = gpflow.models.SGPR(
            (X_train_norm, y_train.reshape(-1, 1)),
            kernel=kernel,
            inducing_variable=X_induce_norm,
            mean_function=gpflow.functions.Polynomial(0)
        )
        
        # Minimise the model's training loss, capped at 100 optimizer iterations
        opt = gpflow.optimizers.Scipy()
        loss_closure = model.training_loss_closure(compile=True)
        opt.minimize(loss_closure, model.trainable_variables, options=dict(maxiter=100))
    
        # Record the fitted model together with its final training loss
        models.append(model)
        loss.append(loss_closure().numpy())

# Select the restart with the smallest final training loss
# NOTE(review): for SGPR the training loss is presumably the negative
# variational lower bound on the log-marginal likelihood - confirm in the GPflow docs.
best_model_index = np.argmin(loss)
best_model = models[best_model_index]

print(f"Best model loss: {loss[best_model_index]}")

Performing 1-th optimization:
Performing 2-th optimization:
Performing 3-th optimization:
Performing 4-th optimization:
Performing 5-th optimization:
Best model loss: 2457.2956713108283
CPU times: total: 20.5 s
Wall time: 7.93 s


In [21]:
# Predict the held-out samples with the selected model
X_test_norm = scaler.transform(X_test)
f_mean, f_var = best_model.predict_f(X_test_norm, full_cov=False)

# Predicted probability of staying at or below 175, compared (as its
# complement) with the observed exceedance labels through the Brier score
f_std = np.sqrt(f_var)
y_prob = norm.cdf(175, loc=f_mean, scale=f_std)
label = (y_test > 175).astype(int)
brier_score = brier_score_loss(label, 1 - y_prob)

# Point-prediction accuracy on the test set
y_pred = f_mean.numpy().flatten()
rmse, max_e, max_per, _, mean_per = evaluate_model(y_test, y_pred)
print(f"RMSE: {rmse:.4f} / data std: {np.std(y_test):.4f}")
print(f"Max Error: {max_e:.4f}")
print(f"Max Percentage Error: {max_per:.2f}")
print(f"Mean Percentage Error: {mean_per:.2f}")
print(f"Brier score: {brier_score:.5f}")

RMSE: 8.9259 / data std: 29.9466
Max Error: 89.3199
Max Percentage Error: 42.42
Mean Percentage Error: 5.61
Brier score: 0.01320


In [None]:
# Set the default font size
plt.rcParams.update({'font.size': 14})

# Parity plot: test-set predictions against the ground truth
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(y_test, f_mean.numpy().flatten(), 'o')

# Dashed red diagonal marks perfect agreement
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--')
ax.set_xlabel('Ground truth')
ax.set_ylabel('Prediction')

plt.tight_layout()

### 3. Propose solutions

In [None]:
# Compose dataset: prepend constant Q1/Q2 columns to every candidate design
Q1, Q2 = 300, 200
n_candidates = df_candidates.shape[0]
Q1_array = Q1 * np.ones((n_candidates, 1))
Q2_array = Q2 * np.ones((n_candidates, 1))
X_candidates = np.hstack((Q1_array, Q2_array, df_candidates.to_numpy()))

# Normalise with the scaler that was fitted on the training inputs
X_candidates_scaled = scaler.transform(X_candidates)

#### Calculate weights

In [None]:
def evaluate_weight(X):
    # Properties
    density_Al = 2700
    Fan_height = 40e-3
    Fan_Weight = 50.8e-3
    N_fan = np.ceil(X[:, 3] / Fan_height)

    # Weight calculation
    w = density_Al*(X[:, 3]*X[:, 2]*X[:, 4]+X[:, 7]*(X[:, 5]*X[:, 8]*X[:, 4]))+ Fan_Weight*N_fan

    return w

In [None]:
# Weight of every candidate design (one value per row of X_candidates)
w = evaluate_weight(X_candidates)

#### GP proposal

In [None]:
# GP prediction: mean and variance for every candidate, as flat NumPy arrays
f_mean, f_var = (
    moment.numpy().flatten()
    for moment in best_model.predict_f(X_candidates_scaled, full_cov=False)
)

In [None]:
# Utility: predicted probability of staying at or below Tjmax, divided by the weight
Tjmax = 175
f_std = np.sqrt(f_var)
likelihood = norm.cdf(Tjmax, loc=f_mean, scale=f_std)
utility = likelihood / w

In [None]:
# Sort candidates: rebuild the table with Q1/Q2, attach the scores, rank by utility
input_columns = ['Q1', 'Q2', 'd', 'b', 'L', 'c', 'L_duct', 'n', 't', 'xc1', 'yc1', 'xc2', 'yc2']
df_candidates = pd.DataFrame(X_candidates, columns=input_columns)
for score_name, score_values in (('weight', w), ('pred_T', f_mean), ('utility', utility)):
    df_candidates[score_name] = score_values
df_candidates_sorted = (
    df_candidates
    .sort_values(by='utility', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Show the 20 candidates with the highest utility
df_candidates_sorted.head(20)

In [None]:
# Inspect one ranked candidate (0 = highest utility)
trial_index = 0
print(f"Predicted temperature: {df_candidates_sorted.loc[trial_index, 'pred_T']}")
# Columns 2:-3 skip the leading Q1/Q2 and the trailing weight/pred_T/utility,
# leaving only the design variables
df_candidates_sorted.iloc[trial_index, 2:-3].to_numpy()

In [None]:
def check_constraints(X):
    """Check whether a single design satisfies the geometric constraints.

    Entries are read by fixed position, so trailing values after the first
    13 (for example appended weight or prediction columns) are ignored
    instead of being mistaken for module coordinates.

    Args:
    -----
    - X: 1-D sequence laid out as
      [Q1, Q2, d, b, L, c, L_duct, n, t, xc1, yc1, xc2, yc2, ...]

    Returns:
    --------
    - True when every constraint holds, False otherwise
    """

    # Dimension
    c_module, d_module = 61.4e-3, 106e-3

    b, L, n, t = X[3], X[4], X[7], X[8]
    xc1, yc1, xc2, yc2 = X[9], X[10], X[11], X[12]

    # Position: allowed range for each module centre
    Xc_min = c_module / 2
    Xc_max = b - c_module / 2
    Yc_min = d_module / 2
    Yc_max = L - d_module / 2

    con1 = t < b / n - 1e-3     # For t
    # Both module centres must lie strictly inside the allowed range
    con2 = (Xc_min < xc1 < Xc_max) and (Yc_min < yc1 < Yc_max)
    con3 = (Xc_min < xc2 < Xc_max) and (Yc_min < yc2 < Yc_max)
    # The two modules must be separated along at least one axis
    con4 = (abs(xc1 - xc2) > c_module) or (abs(yc1 - yc2) > d_module)
    return bool(con1 and con2 and con3 and con4)

In [None]:
# Strip the three appended columns (weight, pred_T, utility) so that only the
# 13 model inputs reach check_constraints
check_constraints(df_candidates_sorted.iloc[10, :].to_numpy()[:-3])

In [None]:
# Save the proposal: ranked candidates, file named after the Q1/Q2 values
proposal_path = f'{Q1}_{Q2}_proposal.csv'
df_candidates_sorted.to_csv(proposal_path, index=False)