In [None]:
import warnings

import numpy as np
import pandas as pd

from itertools import product
from perlin_numpy import generate_perlin_noise_2d
from scipy.interpolate import griddata
from scipy.stats import percentileofscore
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# --- Reproducibility ---------------------------------------------------------
# Seed NumPy's global random generator.
np.random.seed(1)

# --- Per-iteration sample sizes ----------------------------------------------
# m: physical observations drawn per iteration (default 5)
# n: simulated observations drawn per iteration (default 100)
m, n = 5, 100

# --- Active learning schedule ------------------------------------------------
# Observations used for pretraining before the active learning loop starts
pretrain_n = 1
# How many passes of the active learning loop to run (default 10)
iterations = 10

# --- Sampling distribution ---------------------------------------------------
# Minimum percentile to explore each iteration
percentile = 50
# Small corrective constant used when turning scores into percentages
laplace_alpha = 0.01

# --- Polynomial model --------------------------------------------------------
# Degree of the polynomial fitted to the data
degree = 2

In [None]:
# Read the simulated and the physical observation tables from disk
simulated_data = pd.read_csv('simulated_data.csv')
physical_data = pd.read_csv('physical_data.csv')

# Draw `pretrain_n` random rows of simulated data to pretrain on
initial_data = simulated_data.sample(pretrain_n)

# Independent copy of the pretraining rows; more rows are appended to it later
simul_df = initial_data.copy()

# Expand x1/x2 into polynomial terms (no bias column) and fit a linear
# regression on them. This step depends on the kind of data and features in use.
poly_features = PolynomialFeatures(degree=degree, include_bias=False).fit_transform(
    initial_data[["x1", "x2"]]
)
poly_model = LinearRegression().fit(poly_features, initial_data["target"])

# Start the physical training set with `m` randomly drawn physical observations
df = physical_data.sample(m)

# Active learning loop.
# Each iteration:
#   1. fits a fresh Gaussian Process on `n` randomly drawn simulated rows,
#   2. uses it to score every physical candidate row,
#   3. samples `m` physical rows with probability weighted towards high scores,
#   4. appends them to the physical training set (`df`) and to `simul_df`,
#   5. refits the polynomial model on `simul_df`.
for idx in range(1, iterations + 1):
    # Fit a Gaussian Process to the data
    kernel = RBF(length_scale=1)
    model = GaussianProcessRegressor(kernel=kernel, normalize_y=False, random_state=3, alpha=0.001)

    # Randomly sample within our function's bounds (using the simulated data)
    sampled_simulated_data = simulated_data.sample(n)

    # Fit the GP to the simulated data
    model.fit(sampled_simulated_data[["x1", "x2"]], sampled_simulated_data["target"])

    # Score the physical candidates themselves. The weights handed to
    # `physical_data.sample` are matched to its rows, so there must be exactly
    # one prediction per physical row. Predicting on the simulated sample (as
    # before) yields `n` weights that are unrelated to the physical rows and
    # raises a ValueError whenever len(physical_data) != n.
    predictions = model.predict(physical_data[["x1", "x2"]])

    # Construct probability distribution for points to sample from:
    # shift so the minimum is 0, zero out everything at or below the chosen
    # percentile, add a small constant so no row has zero probability, and
    # normalise to sum to 1.
    p = predictions - predictions.min()
    p = np.where(p > np.percentile(p, percentile), p, 0) + laplace_alpha
    p /= p.sum()

    # Sample new physical data points (one weight per physical row)
    new_physical_data = physical_data.sample(m, weights=p)

    # Update the training data
    df = pd.concat([df, new_physical_data], ignore_index=True)

    simul_df = pd.concat([simul_df, new_physical_data], ignore_index=True)

    # Refit the polynomial model with the updated data (Step is dependent on type of data and features working with)
    poly_features = PolynomialFeatures(degree=degree, include_bias=False)
    poly_features = poly_features.fit_transform(simul_df[["x1", "x2"]])
    poly_model = LinearRegression()
    poly_model = poly_model.fit(poly_features, simul_df["target"])


# Final Gaussian Process: fresh RBF-kernel model, trained on every physical
# observation collected in `df`.
kernel = RBF(length_scale=1)
model = GaussianProcessRegressor(
    kernel=kernel,
    normalize_y=False,
    random_state=3,
    alpha=0.001,
).fit(df[["x1", "x2"]], df["target"])

# Predict back on the same rows the model was trained on
final_pred = model.predict(df[["x1", "x2"]])

# Percentile rank of the largest prediction within the collected targets.
# NOTE(review): both the predictions and the reference distribution come from
# `df`, so this may sit near 100 by construction — confirm this is the intended
# evaluation.
percentile_score = percentileofscore(df["target"], final_pred.max())

print("Final Percentile Score:", percentile_score)