# United Kingdom Atomic Energy Authority (UK AEA) example

First, import the required libraries

In [47]:
# Standard imports
from pprint import pprint

# Third-party imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Project imports
import twinlab as tl

Now, define some parameters

In [48]:
# Directories holding the campaign files and the datasets
campaign_dir = "./../campaigns/ukaea/"
datasets_dir = "./../datasets/"

# Input files: grid, training data, and evaluation data
file_grid = f"{campaign_dir}grid.csv"
file_train = f"{datasets_dir}ukaea_small.csv"
file_eval = f"{campaign_dir}test.csv"
# Alternative evaluation files (uncomment one to use it instead)
# file_eval = f"{campaign_dir}eval.csv"
# file_eval = f"{campaign_dir}post.csv"

# Name of the campaign
campaign = "ukaea"

# Server on which to run ("local" is active; "cloud" is the alternative)
server = "local"
# server = "cloud"

Load the necessary data for training and plotting

In [49]:
# Read the CSV files needed for training and for the plots further down.
# The grid file is read with header=None so that its first line is kept as
# data rather than being consumed as column names.
df_grid = pd.read_csv(file_grid, header=None)
df_train, df_eval = (pd.read_csv(csv_path) for csv_path in (file_train, file_eval))

Next, we create the parameter dictionary that `twinLab` needs in order to train a model. At a minimum, the user must provide the `filename` of the dataset on which the model is to be trained (CSV format), together with the columns to be used as the `inputs` and `outputs` of the model once it is trained. By default, `twinLab` will train a Gaussian process (`emulator=gaussian_process`) and use all of the data for training. The latter choice can be overridden by adding, e.g., `train_test_split=100`, so that only the first 100 entries in `filename` are used for training; the remaining examples can then be used for model evaluation.

In [50]:
# One output column heading per row of the grid file: "y0", "y1", ...
n_grid_points = len(df_grid)
y_outputs = ["y" + str(index) for index in range(n_grid_points)]

# Training parameters: dataset file name, input columns and output columns
params = dict(
    filename="ukaea_small.csv",
    inputs=["E1", "E2", "E3", "n1", "n2"],
    outputs=y_outputs,
    decompose_outputs=True,
)

Now the dataset can be uploaded to the selected server (`local` or `cloud`)

In [51]:
# Upload the training dataset to the server selected above
# (`server` is "local" unless the "cloud" line is uncommented)
tl.upload_dataset(file_train, server=server, verbose=True)

Response: ukaea_small.csv uploaded successfully



List the datasets to check that the upload worked

In [52]:
# List the datasets on the server to check that the upload worked,
# then pretty-print the returned value
datasets = tl.list_datasets(server=server, verbose=True)
pprint(datasets)

Response: Found 4 datasets

['basic.csv', 'big.csv', 'leaf_spring.csv', 'ukaea_small.csv']


In [53]:
# Query the uploaded dataset, identified by the file name stored in params
tl.query_dataset(params["filename"], server=server, verbose=True)

Response: Dataset summary for ukaea_small.csv

Summary:
                E1          E2          E3          n1          n2  \
count  100.000000  100.000000  100.000000  100.000000  100.000000   
mean     0.847612    1.089453    1.427127    0.002752    0.000567   
std      0.085740    0.114628    0.186975    0.001192    0.000263   
min      0.701242    0.902821    1.116608    0.000567    0.000103   
25%      0.792304    0.976803    1.277430    0.001769    0.000335   
50%      0.832469    1.105568    1.428571    0.002840    0.000586   
75%      0.931261    1.175770    1.591800    0.003681    0.000782   
max      0.995367    1.299079    1.749649    0.004891    0.000997   

                 y0            y1            y2            y3            y4  \
count  1.000000e+02  1.000000e+02  1.000000e+02  1.000000e+02  1.000000e+02   
mean   6.236236e+16  6.740355e+16  7.284150e+16  7.869952e+16  8.498411e+16   
std    6.743758e+16  7.260721e+16  7.814351e+16  8.407752e+16  9.041432e+16   
min  

  return values.astype(dtype, copy=copy)


Unnamed: 0,E1,E2,E3,n1,n2,y0,y1,y2,y3,y4,...,y614,y615,y616,y617,y618,y619,y620,y621,y622,y623
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100,100,100,100,100,100,100,100,100,100
mean,0.847612,1.089453,1.427127,0.002752,0.000567,6.236236e+16,6.740355e+16,7.28415e+16,7.869952e+16,8.498411e+16,...,154559509200000000,152720200900000000,150905701500000000,149093381000000000,147302843100000000,145556334800000000,143890782400000000,142269782200000000,140780263300000000,139618248200000000
std,0.08574,0.114628,0.186975,0.001192,0.000263,6.743758e+16,7.260721e+16,7.814351e+16,8.407752e+16,9.041432e+16,...,143296892999999984,141620385300000000,139970139700000000,138322064900000000,136711470900000000,135118433600000000,133579389900000000,132074587700000000,130669770100000000,129643602700000000
min,0.701242,0.902821,1.116608,0.000567,0.000103,321642800000000.0,347697600000000.0,379101900000000.0,416044400000000.0,458147600000000.0,...,20517653960000000,20352395640000000,20187137320000000,20021879010000000,19856620690000000,19691362370000000,19526104050000000,19360845740000000,19195587420000000,19030329100000000
25%,0.792304,0.976803,1.27743,0.001769,0.000335,3992330000000000.0,4382066000000000.0,4819222000000000.0,5301193000000000.0,5828180000000000.0,...,53976693380000000,53491626840000000,53048313320000000,52697242970000000,52346172620000008,51995102270000000,51644031920000000,51292961570000000,50941891220000008,50434344400000008
50%,0.832469,1.105568,1.428571,0.00284,0.000586,3.338909e+16,3.634879e+16,3.958498e+16,4.310078e+16,4.690331e+16,...,108612447600000000,107585892000000000,106559336400000000,105532780700000000,104506225100000000,103479669500000000,102453113900000000,101426558300000000,100400002700000000,99373447100000000
75%,0.931261,1.17577,1.5918,0.003681,0.000782,9.866285e+16,1.070075e+17,1.160492e+17,1.258204e+17,1.363362e+17,...,183863187100000000,181324046100000000,178588373500000000,176123662700000032,174426208800000000,172728754900000000,170664049500000000,168127592000000000,166548809000000000,165627102200000000
max,0.995367,1.299079,1.749649,0.004891,0.000997,2.403292e+17,2.580857e+17,2.770946e+17,2.974137e+17,3.190538e+17,...,703960285700000000,694930683700000000,685901081600000000,676871479600000000,668839481500000000,661117583800000000,653395686100000000,645673788300000000,637951890600000000,630229992900000000


Train the model; this step takes around 20 seconds.

In [54]:
# Train the campaign named by `campaign` using the settings in `params`
# TODO: You get an error message here when the request times out
tl.train_campaign(params, campaign, server=server, verbose=True)

In [None]:
# List the campaigns on the server as a sanity check, then pretty-print them
# (presumably the campaign trained above should appear here - TODO confirm)
campaigns = tl.list_campaigns(server=server, verbose=True)
pprint(campaigns)

In [None]:
# Query the campaign; the return value is deliberately discarded into `_`
# (presumably the verbose output is what is of interest here - TODO confirm)
_ = tl.query_campaign(campaign, server=server, verbose=True)

In [None]:
# Sample from the emulator using the evaluation file.
# The two returned objects are unpacked as df_mean and df_std; the plotting
# cells below index both by output column name.
df_mean, df_std = tl.sample_campaign(file_eval, campaign, server=server, verbose=False)

In [None]:
# Styling of the model points
color = "blue"
alpha = 0.8

# Axis labels: every input column, and a selection of three output columns
xs = {
    "E1": r"$E_{1}$",
    "E2": r"$E_{2}$",
    "E3": r"$E_{3}$",
    "n1": r"$n_{1}$",
    "n2": r"$n_{2}$",
}
ys = {f"y{i}": fr"$y_{{{i}}}$" for i in [0, 100, 150]}

# Grid of panels: one row per selected output, one column per input
nrow, ncol = len(ys), len(xs)
plt.subplots(nrow, ncol, figsize=(25, 10))

# Walk the panels row by row; panel numbers start at 1 for plt.subplot
panels = [(y, x) for y in ys for x in xs]
for nplot, (y, x) in enumerate(panels, start=1):
    plt.subplot(nrow, ncol, nplot)
    # Model mean with its standard deviation as the error bar
    plt.errorbar(
        df_eval[x],
        df_mean[y],
        yerr=df_std[y],
        marker=".",
        lw=1,
        ls="None",
        color=color,
        alpha=alpha,
        label="Model",
    )
    # Training data shown faintly underneath for comparison
    plt.plot(df_train[x], df_train[y], ".", color="black", alpha=0.1, label="Training data")
    plt.xlabel(xs[x])
    plt.ylabel(ys[y])
    # A single legend, on the first panel only
    if nplot == 1:
        plt.legend()
plt.show()

In [None]:
# Parameters for plot
error_inflation_factor = 1.0  # Factor to multiply error by for plotting
y_fac = 18  # Factor to divide y by for plotting [log10]
plot_eval = True  # Overlay the evaluation curves (only done for the test file)
alpha_data = 0.75
plot_model_mean = False  # Draw the model mean as a line
plot_model_blur = True  # Draw the model uncertainty as stacked translucent bands
alpha_model = 0.75
n_model_blur = 100
model_color = "blue"
number_of_training_examples = 0
number_of_model_examples = 5

# Quantities that do not change from one example to the next
grid = df_grid.iloc[:, 0]  # First column of the grid file is the x-axis
y_scale = 10**y_fac
# Evaluation curves are only overlaid when the evaluation file is test.csv
show_test_data = plot_eval and (file_eval == campaign_dir + "test.csv")
if plot_model_blur:
    # Computed once here rather than once per example; a dedicated name is
    # used so the `alpha` of the previous plotting cell is not overwritten
    alpha_blur = tl.get_alpha(n_model_blur, alpha_model)
    dys = tl.get_boundaries(n_model_blur)

# Plot results
plt.subplots()
if plot_model_blur and not plot_model_mean:
    # All-NaN band: draws nothing, but gives the blurred model a legend entry
    plt.fill_between(grid, np.nan, np.nan, color=model_color, alpha=alpha_model, lw=0.0, label="Model predictions")
for example in range(number_of_training_examples):  # Training examples
    train = df_train[y_outputs].iloc[example] / y_scale
    label = "Example training data" if example == 0 else None  # Label once only
    plt.plot(grid, train, color="black", alpha=0.5, label=label)
for example in range(number_of_model_examples):  # Model predictions
    mean = df_mean[y_outputs].iloc[example] / y_scale
    err = error_inflation_factor * df_std[y_outputs].iloc[example] / y_scale
    if show_test_data:
        # Named `test` rather than `eval` so the builtin is not shadowed
        test = df_eval[y_outputs].iloc[example] / y_scale
        label = "Test data" if example == 0 else None
        plt.plot(grid, test, color="black", alpha=alpha_data, label=label)
    if plot_model_mean:
        label = "Model predictions" if example == 0 else None
        plt.plot(grid, mean, color=model_color, label=label, alpha=alpha_model)
    if plot_model_blur:
        # One translucent band per boundary value, each scaled by the error
        for dy in dys:
            plt.fill_between(grid, mean - dy * err, mean + dy * err, color=model_color, alpha=alpha_blur, lw=0.0)
plt.xlabel(r'Temperature [K]')
plt.xlim((grid.min(), grid.max()))
plt.ylabel(rf"Desorption rate [$10^{{{y_fac}}}$ $m^{{{-2}}}$ $s^{{{-1}}}$]")
plt.ylim(bottom=0.0)
plt.legend()
plt.show()

In [None]:
# Delete the campaign named by `campaign` from the server to clean up
tl.delete_campaign(campaign, server=server, verbose=True)