### Objective

In this notebook, we attempt to build surrogate models for predicting the thermal resistance. 

Here, we aim to train a Gaussian process (GP) adaptively so that it approximates the limit state Tjmax = 175.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import qmc, norm
import os
import sys

from sklearn.preprocessing import MinMaxScaler, StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, max_error, brier_score_loss
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors

import gpflow
import tensorflow as tf
import tensorflow_probability as tfp

from two_sources import thermal_distribution_maxT






### 1. Load dataset

In [2]:
# Fixed inputs forwarded unchanged as the second argument of thermal_distribution_maxT
# in the active-learning loop.
# NOTE(review): the meaning and units of the five values are not documented in this notebook —
# confirm against the two_sources module.
Data = (25, 50e-3, 65e-3, 61.4e-3, 106e-3)

In [3]:
# Initial labelled training set; the header is replaced by short feature/response names.
# NOTE(review): this cell reads from './dataset/' while other cells in this notebook read from
# './Dataset/'. On a case-sensitive filesystem only one spelling can exist — confirm the folder name.
train_columns = ['Q1', 'Q2', 'd', 'b', 'L', 'c', 'L_duct', 'n', 't', 'xc1', 'yc1', 'xc2', 'yc2', 'Tc', 'Tj', 'w']
df_train = pd.read_csv('./dataset/train.csv')
df_train.columns = train_columns
print(f"Training dataset: {len(df_train)}")

Training dataset: 189


In [4]:
# Display the training table (rich repr of the cell's last expression).
df_train

Unnamed: 0,Q1,Q2,d,b,L,c,L_duct,n,t,xc1,yc1,xc2,yc2,Tc,Tj,w
0,355.139820,314.512342,0.029385,0.275739,0.154803,0.015783,0.023692,44.0,0.004294,0.233437,0.086118,0.080215,0.094712,129.300482,129.300482,4.988744
1,120.559225,105.816580,0.019954,0.102969,0.314078,0.034592,0.037687,49.0,0.001070,0.065078,0.095670,0.045501,0.220070,54.411424,54.411424,3.433136
2,301.009415,192.523188,0.023695,0.106825,0.413988,0.021935,0.036011,43.0,0.001233,0.066218,0.306664,0.072154,0.088363,109.506004,109.506004,4.281630
3,358.267488,349.143693,0.028807,0.118856,0.323420,0.011139,0.030872,16.0,0.005567,0.070974,0.062465,0.073593,0.205439,254.349741,254.349741,4.008711
4,235.668062,157.249865,0.012980,0.175404,0.430567,0.016867,0.032250,17.0,0.003725,0.134959,0.102105,0.035446,0.251917,103.081174,103.081174,4.142581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,277.605592,80.214421,0.014407,0.166646,0.161272,0.010378,0.031142,29.0,0.002216,0.034801,0.083433,0.122594,0.095787,119.963601,119.963601,1.589780
185,360.339276,52.364956,0.019092,0.188851,0.371149,0.033252,0.035701,22.0,0.002112,0.081036,0.108038,0.043768,0.266257,103.341218,103.341218,5.415417
186,137.954796,96.679536,0.017386,0.180576,0.399115,0.036451,0.049493,25.0,0.003293,0.054960,0.141772,0.121323,0.211342,56.296731,56.296731,6.870861
187,82.656515,62.385292,0.024895,0.147052,0.374643,0.018100,0.022346,24.0,0.004692,0.040947,0.152328,0.099911,0.295579,56.164247,56.164247,5.968222


In [5]:
# Dedicated testing set, relabelled with the same column names as the training table.
# NOTE(review): this cell reads from './Dataset/' while other cells in this notebook read from
# './dataset/'. On a case-sensitive filesystem only one spelling can exist — confirm the folder name.
test_columns = ['Q1', 'Q2', 'd', 'b', 'L', 'c', 'L_duct', 'n', 't', 'xc1', 'yc1', 'xc2', 'yc2', 'Tc', 'Tj', 'w']
df_test = pd.read_csv('./Dataset/test.csv')
df_test.columns = test_columns
print(f"Testing dataset: {len(df_test)}")

Testing dataset: 5655


In [6]:
# Display the testing table (rich repr of the cell's last expression).
df_test

Unnamed: 0,Q1,Q2,d,b,L,c,L_duct,n,t,xc1,yc1,xc2,yc2,Tc,Tj,w
0,356.395266,144.179828,0.027791,0.148169,0.142119,0.028676,0.024699,33.0,0.002149,0.114664,0.080575,0.039811,0.062508,106.200898,106.200898,2.563658
1,368.808646,235.799176,0.029227,0.169697,0.382787,0.014792,0.029531,49.0,0.001814,0.074481,0.257127,0.068794,0.053503,120.544942,120.544942,6.738812
2,109.657702,56.113264,0.012565,0.204452,0.351198,0.022393,0.038982,44.0,0.001429,0.038548,0.169999,0.134746,0.230745,52.631816,52.631816,4.076087
3,290.565204,263.656203,0.008551,0.302306,0.495513,0.033503,0.032348,18.0,0.013610,0.115247,0.084058,0.068212,0.284671,119.094185,119.094185,14.845235
4,323.457603,266.906140,0.008188,0.275833,0.410712,0.034486,0.023580,32.0,0.006791,0.051188,0.149818,0.055778,0.288429,130.464918,130.464918,11.169889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5650,210.270976,101.705695,0.015563,0.149852,0.186680,0.019318,0.022339,17.0,0.004959,0.047727,0.124352,0.110789,0.113483,96.448095,96.448095,2.199603
5651,352.871362,247.671890,0.015205,0.171416,0.219256,0.024036,0.044648,34.0,0.001521,0.125921,0.140918,0.052202,0.159077,116.234587,116.234587,2.532587
5652,224.445837,155.860506,0.018345,0.168901,0.293893,0.026435,0.024752,24.0,0.001282,0.135408,0.091815,0.040881,0.197082,82.777441,82.777441,3.358140
5653,149.993232,91.302898,0.005494,0.303702,0.392907,0.018186,0.026981,12.0,0.007834,0.224231,0.115229,0.073253,0.115265,92.013851,92.013851,3.990053


In [7]:
# Dedicated pool for active learning: unlabelled inputs only (no Tc/Tj/w columns).
# NOTE(review): this cell reads from './dataset/' while other cells in this notebook read from
# './Dataset/'. On a case-sensitive filesystem only one spelling can exist — confirm the folder name.
pool_columns = ['Q1', 'Q2', 'd', 'b', 'L', 'c', 'L_duct', 'n', 't', 'xc1', 'yc1', 'xc2', 'yc2']
df_pool = pd.read_csv('./dataset/AL_train_10000.csv')
df_pool.columns = pool_columns
print(f"Active learning pool: {len(df_pool)}")

Active learning pool: 9388


In [8]:
# Display the active-learning pool (rich repr of the cell's last expression).
df_pool

Unnamed: 0,Q1,Q2,d,b,L,c,L_duct,n,t,xc1,yc1,xc2,yc2
0,354.162049,209.973777,0.008376,0.105262,0.358018,0.024769,0.049020,40.0,0.001542,0.048407,0.056408,0.037934,0.304392
1,310.266548,304.103605,0.016987,0.252752,0.375389,0.025041,0.048750,22.0,0.005418,0.176011,0.056698,0.104327,0.227647
2,143.215680,90.987329,0.024670,0.087385,0.367372,0.011184,0.029552,35.0,0.001247,0.032683,0.289248,0.049411,0.181909
3,164.643507,157.346416,0.026123,0.167066,0.523121,0.024806,0.040541,18.0,0.005343,0.104320,0.229971,0.041316,0.270856
4,223.010215,162.737110,0.013772,0.212688,0.198952,0.022884,0.044573,31.0,0.005747,0.162109,0.140386,0.056991,0.064584
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9383,221.704797,140.655720,0.015763,0.177631,0.298198,0.036769,0.032717,33.0,0.003203,0.091877,0.237605,0.096537,0.054759
9384,348.057919,114.923554,0.011999,0.101835,0.298507,0.032715,0.026921,15.0,0.004623,0.046143,0.196469,0.042999,0.057228
9385,332.899604,220.776222,0.010925,0.265895,0.254384,0.013308,0.049156,33.0,0.003604,0.159820,0.198348,0.159719,0.053509
9386,384.975128,311.977766,0.017512,0.142966,0.168872,0.027084,0.042955,18.0,0.004502,0.043461,0.095763,0.109446,0.059088


In [9]:
def create_samples(df, train_num):
    """Turn a labelled table into feature/target arrays, optionally holding out a test part.

    The features are every column except the last three; the target is the
    second-to-last column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last three columns are not features.
    train_num : int
        Number of rows to keep for training. When it is greater than or equal to
        ``len(df)`` no split is performed.

    Returns
    -------
    X_train, X_test, y_train, y_test
        NumPy arrays; ``X_test`` and ``y_test`` are ``None`` when no split is done.
    """
    # Create dataset
    X = df.iloc[:, :-3].to_numpy()
    y = df.iloc[:, -2].to_numpy()

    # No hold-out requested: everything is training data.
    if train_num >= len(df):
        return X, None, y, None

    # Pass the absolute training size. Deriving a float test fraction
    # (1 - train_num/len(df)) can round to one test row too many and so return
    # train_num - 1 training rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=int(train_num), random_state=42
    )

    return X_train, X_test, y_train, y_test

In [10]:
# Build the arrays: requesting len(df) rows makes create_samples return the
# whole table as the "train" part, so the two None placeholders are discarded.
X_train, _, y_train, _ = create_samples(df_train, train_num=len(df_train))
X_test, _, y_test, _ = create_samples(df_test, train_num=len(df_test))
X_pool = df_pool.to_numpy()

# Normalization: the scaler is fitted on the training inputs only and then
# reused for every other input set.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 2. Training

In [11]:
def evaluate_model(y_true, y_pred):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Reference and predicted values with the same number of elements.
        Both are flattened first, so a column vector ``(N, 1)`` and a flat
        vector ``(N,)`` can be mixed without being broadcast to ``(N, N)``.

    Returns
    -------
    rmse : float
        Root-mean-square error.
    max_e : float
        Largest absolute error.
    max_percentage : float
        Largest relative error, in percent.
    max_percentage_loc : int
        Position (in the flattened arrays) of the largest relative error.
    mean_percentage : float
        Mean relative error, in percent.

    Notes
    -----
    Relative errors are measured against ``|y_true|``; entries where ``y_true``
    is zero yield ``inf``/``nan`` as usual for NumPy division.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    max_e = max_error(y_true, y_pred)

    # Divide by the magnitude so that negative reference values do not produce
    # negative "percentage" errors that would be ignored by max/argmax.
    percentage = np.abs(y_true - y_pred) / np.abs(y_true)
    max_percentage = np.max(percentage) * 100
    max_percentage_loc = np.argmax(percentage)
    mean_percentage = np.mean(percentage) * 100

    return rmse, max_e, max_percentage, max_percentage_loc, mean_percentage

#### GPflow setup

In [12]:
def init_length_scales(dim, n_restarts, initial_guess=None, seed=42):
    """Generate starting length scales for multi-start hyperparameter optimisation.

    Parameters
    ----------
    dim : int
        Number of input dimensions (one length scale per dimension).
    n_restarts : int
        Number of random starting points to draw.
    initial_guess : array-like, optional
        Extra starting point(s) with ``dim`` columns, appended after the random ones.
    seed : int, optional
        Seed of the Latin hypercube sampler. The default reproduces the
        previously hard-coded value, so existing calls return the same points.

    Returns
    -------
    numpy.ndarray
        Array with ``dim`` columns; the random rows lie in ``[1e-2, 1e2)``.
    """
    # Random initial params: Latin hypercube in log10 space, mapped to [-2, 2)
    lb, ub = -2, 2
    lhd = qmc.LatinHypercube(d=dim, seed=seed).random(n_restarts)
    lhd = (ub-lb)*lhd + lb
    length_scales = 10**lhd

    # Informed initial guess
    if initial_guess is not None:
        length_scales = np.vstack((length_scales, initial_guess))

    return length_scales

In [13]:
def fit(X, y, n_restarts=20, init_lengthscales=None, init_variance=None, trainable=True, verbose=True):
    """Fit a GPflow GPR model (squared-exponential kernel, constant mean).

    Parameters
    ----------
    X : numpy.ndarray
        Training inputs, one row per sample.
    y : numpy.ndarray
        Training targets; reshaped to a column vector internally.
    n_restarts : int
        Number of random length-scale starting points for the optimiser.
    init_lengthscales : array-like, optional
        Informed starting length scales. When ``trainable`` is True they are
        added as one more starting point; when it is False they are used as-is.
    init_variance : float, optional
        Initial kernel variance. Defaults to the variance of ``y``.
    trainable : bool
        If False, no optimisation is run and a model built directly from
        ``init_lengthscales`` / the variance is returned. Only these two kernel
        values are set; every other model parameter keeps its GPflow default.
    verbose : bool
        Print a line per optimisation run.

    Returns
    -------
    gpflow.models.GPR
        With ``trainable=True``, the run with the highest finite log marginal
        likelihood. If no run has a finite value, the first run is returned so
        that the caller can detect the failure from its predictions.

    Raises
    ------
    ValueError
        If ``trainable`` is False and ``init_lengthscales`` is missing, or if
        there is no starting point to optimise from.
    """
    targets = y.reshape(-1, 1)

    # Use the targets actually passed in, not whatever the notebook-level
    # y_train happens to hold at call time.
    variance = np.var(y) if init_variance is None else init_variance

    if not trainable:
        if init_lengthscales is None:
            raise ValueError("init_lengthscales must be provided when trainable=False")

        model = gpflow.models.GPR(
            (X, targets),
            kernel=gpflow.kernels.SquaredExponential(variance=variance, lengthscales=init_lengthscales),
            mean_function=gpflow.functions.Polynomial(0),
        )

        return model

    # Generate initial guesses for length scale
    length_scales = init_length_scales(X.shape[1], n_restarts, init_lengthscales)
    if len(length_scales) == 0:
        raise ValueError("No starting points: n_restarts is 0 and no init_lengthscales were given")

    best_model, best_ll = None, -np.inf

    with tf.device("CPU:0"):

        for i, init in enumerate(length_scales):

            if verbose:
                print(f"Performing {i+1}-th optimization:")

            # Set up the model
            kernel = gpflow.kernels.SquaredExponential(variance=variance, lengthscales=init)
            model = gpflow.models.GPR(
                (X, targets),
                kernel=kernel,
                mean_function=gpflow.functions.Polynomial(0),
            )

            opt = gpflow.optimizers.Scipy()
            opt.minimize(model.training_loss, model.trainable_variables, options=dict(maxiter=100))

            log_likelihood = model.log_marginal_likelihood().numpy()

            # Keep only the best run so far. A NaN/inf likelihood never replaces
            # a finite one (np.argmax would have picked the NaN entry), and the
            # strict '>' keeps the earliest run on ties.
            if best_model is None:
                best_model = model
                if np.isfinite(log_likelihood):
                    best_ll = log_likelihood
            elif np.isfinite(log_likelihood) and log_likelihood > best_ll:
                best_model, best_ll = model, log_likelihood

    return best_model

#### Active learning

Select diverse batch

In [14]:
def select_diverse_batch(samples, acq, batch_size=5, pre_filter=False):
    """Pick one high-acquisition sample from each of ``batch_size`` K-means clusters.

    Parameters
    ----------
    samples : numpy.ndarray
        Candidate points, one row per candidate.
    acq : numpy.ndarray
        Acquisition value per candidate; also used as the K-means sample weight.
    batch_size : int
        Number of clusters, and therefore of returned indices.
    pre_filter : float or False
        When truthy, it is used as a quantile level: only candidates whose
        acquisition value is strictly above that quantile are clustered.

    Returns
    -------
    numpy.ndarray
        Indices into the original ``samples`` array, one per cluster.
    """
    original_ids = np.arange(len(samples))

    if pre_filter:
        # Build the mask once and apply it to everything that must stay aligned.
        keep = acq > np.quantile(acq, pre_filter)
        original_ids = original_ids[keep]
        samples = samples[keep]
        acq = acq[keep]

    # Weighted K-means: candidates with a larger acquisition value pull the centroids harder.
    clustering = KMeans(n_clusters=batch_size, n_init=10, random_state=0)
    labels = clustering.fit(samples, sample_weight=acq).labels_

    # Within each cluster, keep the member with the highest acquisition value.
    picks = []
    for label in range(batch_size):
        members = np.flatnonzero(labels == label)
        picks.append(members[np.argmax(acq[members])])

    return original_ids[picks]

Acquisition function

In [15]:
def acquisition(model, candidate, limit_state_value=0, batch_mode=False, batch_size=None):
    """Score candidates with the U learning function and select the next sample(s).

    ``U = |mean - limit_state_value| / std``; a small U means the model is
    unsure on which side of the limit state the candidate lies.

    Parameters
    ----------
    model : object
        Model exposing ``predict_f(X, full_cov=False)`` that returns a
        ``(mean, variance)`` pair whose elements have a ``.numpy()`` method.
    candidate : numpy.ndarray
        Candidate inputs, one row per candidate.
    limit_state_value : float
        Threshold that defines the limit state.
    batch_mode : bool
        If True, select a diverse batch via ``select_diverse_batch``; otherwise
        select the single candidate with the smallest U.
    batch_size : int, optional
        Batch size; required when ``batch_mode`` is True.

    Returns
    -------
    U_values : numpy.ndarray
        U value of every candidate.
    indices : int or numpy.ndarray
        Selected candidate index (single mode) or indices (batch mode).

    Raises
    ------
    ValueError
        If ``batch_mode`` is True and ``batch_size`` is not given.
    """
    # Floor applied to the variance and to U before dividing by them.
    eps = 1e-12

    # Compute prediction variance
    f_mean, f_var = model.predict_f(candidate, full_cov=False)
    f_mean = f_mean.numpy().flatten()
    f_var = f_var.numpy().flatten()

    # Calculate U values. A zero or slightly negative variance would otherwise
    # give inf/NaN and corrupt argmin and the scaler below.
    f_std = np.sqrt(np.maximum(f_var, eps))
    U_values = np.abs(f_mean-limit_state_value)/f_std

    # Sample selection
    if batch_mode:
        if batch_size is None:
            raise ValueError("batch_size must be provided when batch_mode=True")

        # Batch selection mode: weight candidates by 1/U. U is floored so that a
        # candidate lying exactly on the limit state does not produce an infinite
        # weight, which MinMaxScaler rejects.
        inverse_U = 1.0/np.maximum(U_values, eps)
        U_normalied = MinMaxScaler().fit_transform(inverse_U.reshape(-1, 1))
        indices = select_diverse_batch(candidate, U_normalied.flatten(), batch_size=batch_size)

    else:
        # Single point selection mode
        indices = np.argmin(U_values)

    return U_values, indices

In [16]:
def confidence_assessment(model, X_test, y_test, limit_state):
    """Brier score of the model's predicted probability of exceeding ``limit_state``.

    The GP posterior at each test point is treated as a normal distribution;
    the probability of lying above the limit state is compared with the
    observed 0/1 exceedance label.
    """
    posterior_mean, posterior_var = model.predict_f(X_test, full_cov=False)
    posterior_std = np.sqrt(posterior_var)

    # Probability mass at or below the limit state; its complement is the exceedance probability.
    prob_below = norm.cdf(limit_state, loc=posterior_mean, scale=posterior_std)

    # 1 where the observed response exceeds the limit state, 0 otherwise.
    exceeds = np.where(y_test > limit_state, 1, 0)

    return brier_score_loss(exceeds, 1-prob_below)

In [17]:
# Report how many labelled samples are available before active learning starts.
print(f"Initial training dataset size: {len(X_train)}")

Initial training dataset size: 189


In [18]:
n_iter = 100
U_hist, test_brier_scores = [], []
Tjmax = 175
U_target = 2     # stop once the smallest U among the selected samples reaches this value
max_refits = 5   # cold-start refits attempted before falling back to frozen hyperparameters

# Kernel hyperparameters of the previous iteration's model; None until one
# iteration has completed (there is nothing to warm-start from or fall back to).
init_lengthscales, init_variance = None, None

# The scaler is fitted once and X_test never changes, so the scaled test
# inputs only need to be computed a single time.
X_test_scaled = scaler.transform(X_test)


def _predicts_nan(gp_model, X):
    """Return True if the GP posterior mean at X contains any NaN."""
    f_mean, _ = gp_model.predict_f(X, full_cov=False)
    return bool(tf.reduce_any(tf.math.is_nan(f_mean)).numpy())


for i in range(n_iter):
    print(f"Start {i+1}th learning iteration:")

    # 1-GP model training and predicting
    if i == 0:
        model = fit(X_train_scaled, y_train, n_restarts=20, verbose=False)

    else:
        # Warm start: the previous hyperparameters are added as a starting point.
        model = fit(X_train_scaled, y_train, n_restarts=5, init_lengthscales=init_lengthscales,
                    init_variance=init_variance, verbose=False)

    # 2-Check fitting results
    has_nan = _predicts_nan(model, X_test_scaled)
    counter = 0
    while has_nan and counter < max_refits:
        print(f"Bad fitting. Refit the data:")
        counter += 1
        # The random starting points are drawn with a fixed seed, so asking for
        # the same number of restarts again would start from the same points.
        # Increasing the count changes the Latin hypercube sample on each retry.
        model = fit(X_train_scaled, y_train, n_restarts=4+counter, verbose=False)
        has_nan = _predicts_nan(model, X_test_scaled)

    if has_nan:
        # Only fall back when every refit failed; a successful refit is kept.
        if init_lengthscales is None:
            raise RuntimeError("GP fitting produced NaN predictions and there are no "
                               "hyperparameters from a previous iteration to fall back to.")

        print(f"Fallback to parameters from last iteration:")
        model = fit(X_train_scaled, y_train, n_restarts=1, init_lengthscales=init_lengthscales.flatten(),
                    init_variance=init_variance, trainable=False, verbose=False)

        if _predicts_nan(model, X_test_scaled):
            # Stop instead of retrying forever with the same inputs.
            raise RuntimeError("GP predictions are still NaN after falling back to the "
                               "hyperparameters of the previous iteration.")

    print(f"Good fitting. Proceed:")

    # 3-Model assessment
    brier_score = confidence_assessment(model, X_test_scaled, y_test, Tjmax)
    test_brier_scores.append(brier_score)

    # 4-Acquisition
    X_pool_scaled = scaler.transform(X_pool)
    U_values, indices = acquisition(model, X_pool_scaled, limit_state_value=Tjmax, batch_mode=True, batch_size=10)
    target = np.min(U_values[indices])
    U_hist.append(target)
    print(f"Iter {i+1}: test brier score==>{brier_score:.5f}/{0.019}, U==>{target:.4f}/2, index==>{indices}")

    if target >= U_target:
        break

    # 5-Updating
    # Calculating Tj. The simulations run before any array is modified so that
    # an interruption cannot leave X_train and y_train with different lengths.
    print(f"Calculating Tj values.")
    response = []
    for sample_index in indices:
        Tmax, _ = thermal_distribution_maxT(X_pool[sample_index], Data)
        response.append(Tmax)
    print(f"Calculation compleed.")

    # Add the newly labelled samples to the training set
    X_train = np.vstack((X_train, X_pool[indices]))
    X_train_scaled = scaler.transform(X_train)
    y_train = np.append(y_train, np.array(response))

    # Update pool
    X_pool = np.delete(X_pool, obj=indices, axis=0)

    # Update initial guess
    init_lengthscales = model.kernel.lengthscales.numpy().reshape(1, -1)
    init_variance = model.kernel.variance.numpy().flatten()[0]

Start 1th learning iteration:
Good fitting. Proceed:
Iter 1: test brier score==>0.02397/0.019, U==>0.0002/2, index==>[1599  343 8302 7233 2622 6986  799 9203 2703 1135]
Calculating Tj values.
Calculation compleed.
Start 2th learning iteration:
Good fitting. Proceed:
Iter 2: test brier score==>0.02374/0.019, U==>0.0003/2, index==>[3772 7480 8266 1394 3099 4373 3343 8065 8472  619]
Calculating Tj values.
Calculation compleed.
Start 3th learning iteration:
Good fitting. Proceed:
Iter 3: test brier score==>0.02319/0.019, U==>0.0001/2, index==>[3526 7806 3428 1716 8937 2694 6122 8723 5487 2284]
Calculating Tj values.
Calculation compleed.
Start 4th learning iteration:
Good fitting. Proceed:
Iter 4: test brier score==>0.01999/0.019, U==>0.0073/2, index==>[2624 2704 3746 6083 6941 2345  468 2932 3767 2094]
Calculating Tj values.
Calculation compleed.
Start 5th learning iteration:
Good fitting. Proceed:
Iter 5: test brier score==>0.01910/0.019, U==>0.0121/2, index==>[1519  481 4892 6907 2672 4

KeyboardInterrupt: 

In [None]:
# Save model
# Only the parameter dictionary returned by gpflow.utilities.parameter_dict is pickled,
# not the model object itself.
import pickle
with open('AL_model_params.pickle', 'wb') as handle:
    pickle.dump(gpflow.utilities.parameter_dict(model), handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save training data
# These are the unscaled inputs and targets, including the samples added during active learning.
np.save('AL_X_train.npy', X_train)
np.save('AL_y_train.npy', y_train)

# Save history
# One row per completed learning iteration.
df = pd.DataFrame({"U": np.array(U_hist), "brier_scores": np.array(test_brier_scores)})
# NOTE(review): the benchmark stored here (0.01039) differs from the reference value printed
# in the learning loop (0.019) — confirm which one is intended.
df['benchmark'] = 0.01039
df.to_csv("AL_history.csv", index=False)

#### Propose solutions

In [None]:
# Candidate designs: the design columns only, without the Q1/Q2 operating condition.
# NOTE(review): this cell reads from './Dataset/' while other cells in this notebook read from
# './dataset/'. On a case-sensitive filesystem only one spelling can exist — confirm the folder name.
candidate_columns = ['d', 'b', 'L', 'c', 'L_duct', 'n', 't', 'xc1', 'yc1', 'xc2', 'yc2']
df_candidates = pd.read_csv('./Dataset/candidates.csv')
df_candidates.columns = candidate_columns
print(f"PCandidate pool: {len(df_candidates)}")

In [None]:
def evaluate_weight(X):
    """Compute a weight for every row of ``X``.

    ``X`` is a 2-D array in the column layout used by the proposal loop
    (Q1, Q2, d, b, L, c, L_duct, n, t, ...); only columns 2, 3, 4, 5, 7 and 8
    are read. The result is a density times the sum of two volume terms, plus
    a per-fan weight times the number of fans needed to cover column 3.
    NOTE(review): units are not stated in this notebook — presumably SI (m, kg); confirm.
    """
    # Properties
    rho_al = 2700
    fan_height = 40e-3
    fan_weight = 50.8e-3

    d, b, L, c, n, t = (X[:, col] for col in (2, 3, 4, 5, 7, 8))

    # Whole number of fans needed to span b.
    n_fans = np.ceil(b / fan_height)

    # Weight calculation: b*d*L block plus n pieces of size c*t*L.
    block_volume = b*d*L
    pieces_volume = n*(c*t*L)

    return rho_al*(block_volume+pieces_volume) + fan_weight*n_fans

In [None]:
# Operating conditions to evaluate; the proposal loop reads the 'Q1' and 'Q2' columns.
# NOTE(review): this cell reads from './Dataset/' while other cells in this notebook read from
# './dataset/'. On a case-sensitive filesystem only one spelling can exist — confirm the folder name.
Q_df = pd.read_csv('./Dataset/Q_test_locations.csv')

In [None]:
# The candidate designs are the same for every operating condition, so they
# are converted to an array once, outside the loop.
candidate_designs = df_candidates.to_numpy()
n_candidates = candidate_designs.shape[0]
result_columns = ['Q1', 'Q2', 'd', 'b', 'L', 'c', 'L_duct', 'n', 't', 'xc1', 'yc1', 'xc2', 'yc2']
Tjmax = 175

for i, (Q1, Q2) in enumerate(zip(Q_df['Q1'].to_numpy(), Q_df['Q2'].to_numpy())):

    print(f"Handling {i+1}th condition:")

    # Compile feature samples: prepend the constant (Q1, Q2) pair to every design
    Q1_array = np.full((n_candidates, 1), Q1, dtype=float)
    Q2_array = np.full((n_candidates, 1), Q2, dtype=float)
    X_candidates = np.hstack((Q1_array, Q2_array, candidate_designs))
    X_candidates_scaled = scaler.transform(X_candidates)

    # GP prediction
    f_mean, f_var = model.predict_f(X_candidates_scaled, full_cov=False)
    f_mean = f_mean.numpy().flatten()
    f_var = f_var.numpy().flatten()

    # Utility: probability of staying at or below Tjmax, divided by the design weight
    likelihood = norm.cdf(Tjmax, loc=f_mean, scale=np.sqrt(f_var))
    w = evaluate_weight(X_candidates)
    utility = likelihood/w

    # Sort candidates from highest to lowest utility
    df = pd.DataFrame(X_candidates, columns=result_columns).assign(
        weight=w,
        pred_T=f_mean,
        utility=utility,
    )
    df_sorted = df.sort_values(by='utility', ascending=False).reset_index(drop=True)

    # Output results: keep the 20 best designs for this condition
    df_reduced = df_sorted.head(20)
    df_reduced.to_csv(f"Exp_{i+1}.csv", index=False)