In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.experimental import enable_halving_search_cv
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score, KFold, HalvingGridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
####### CONSTANTS ########
# Location of the input CSV file
FILEDIR = "/home/s1835083/Desktop/numu_energy_studies.csv"

# Columns used as regressor inputs
FEATURES_HEADERS = [
    "total_hits2",
    "total_ring_PEs2",
    "recoDWallR2",
    "recoDWallZ2",
    "lambda_max_2",
]

# Column holding the regression target
LABEL_HEADERS = ["trueKE"]

# Seed shared by the train/test split, the K-fold splitter and NumPy
SEED = 42

# Dataloading

In [None]:
# Read the CSV (first row is the header, first column is the file's own index),
# then re-index the rows by the "i" column
dataframe = (
    pd.read_csv(FILEDIR, sep=",", header=0, index_col=0)
    .set_index("i")
)

In [None]:
# Discard every row that contains at least one NaN
dataframe = dataframe.dropna()

# Preview the first ten remaining rows
dataframe.head(10)


In [None]:
# Histogram every input feature and the label on a 2x3 grid of panels
fig, ax = plt.subplots(nrows=2, ncols=3)

# Columns to draw: all input features first, then the regression target
plot_headers = [*FEATURES_HEADERS, *LABEL_HEADERS]

# x-axis captions, paired positionally with plot_headers
xlabels = [
    "n_muons (a.u)",
    "n_hits (a.u)",
    "distance (a.u)",
    "distance (a.u)",
    "distance (a.u)",
    "energy (MeV)",
]

# One step histogram per column, with a logarithmic count axis
for header, axis, xlabel in zip(plot_headers, ax.flatten(), xlabels):
    axis.hist(dataframe[header], bins=50, histtype="step", color="maroon", log=True)
    axis.set_title(header)
    axis.set_ylabel("Counts")
    axis.set_xlabel(xlabel)

plt.tight_layout()


In [None]:
# Hold out 30 % of the rows as the test set; SEED fixes the shuffle
train_data, test_data, train_target, test_target = train_test_split(
    dataframe[FEATURES_HEADERS], dataframe[LABEL_HEADERS], test_size=0.3, random_state=SEED
)

# Report the shape of each of the four resulting frames
print(
    f"Size of train features: {train_data.shape}",
    f"Size of train labels:   {train_target.shape}",
    f"Size of test features:  {test_data.shape}",
    f"Size of test labels:    {test_target.shape}",
    sep="\n",
)

# NN Regressor

In [None]:
def build_regressor(input_features=5, output_features=1, loss="mean_squared_error", optimizer="adam"):
    """
    Build and compile a small fully connected regression network.

    The network has four hidden Dense layers of 32, 16, 8 and 4 units, each
    followed by a LeakyReLU activation, and a final Dense layer with no
    activation that produces the regression output. Every Dense layer uses
    kernel_initializer="normal".

    Parameters
    ----------
    input_features : int
        Number of input columns; passed as input_dim to the first Dense layer.
    output_features : int
        Number of units in the output layer.
    loss : str
        Loss passed to `compile`.
    optimizer : str
        Optimizer passed to `compile`.

    Returns
    -------
    tf.keras.models.Sequential
        The compiled, untrained network.
    """
    hidden_units = (32, 16, 8, 4)

    # Hidden stack: Dense -> LeakyReLU for every width in hidden_units
    layers = []
    for position, units in enumerate(hidden_units):
        # Only the first layer declares the input size
        first_layer_kwargs = {"input_dim": input_features} if position == 0 else {}
        layers.append(tf.keras.layers.Dense(units, kernel_initializer="normal", **first_layer_kwargs))
        layers.append(tf.keras.layers.LeakyReLU())

    # Output layer: no activation
    layers.append(tf.keras.layers.Dense(output_features, kernel_initializer="normal"))

    # Compile and return network
    network = tf.keras.models.Sequential(layers)
    network.compile(loss=loss, optimizer=optimizer)

    return network


In [None]:
# Define our callbacks
callbacks_ = [
    # Stop once the training loss has not improved for 10 consecutive epochs
    EarlyStopping(verbose=False, patience=10, monitor="loss"),
    # Keep only the best model on disk. The monitored quantity is a loss, so
    # "best" means lowest: mode must be "min". With "max" the checkpoint would
    # only be overwritten when the loss got worse.
    ModelCheckpoint("/home/s1835083/Desktop/model.h5", monitor="loss", verbose=0, save_best_only=True, mode="min"),
]

In [None]:
# Define hyperparameters
BATCH_SIZE = 64
EPOCHS = 40

# Make pseudorandom packages deterministic.
# TensorFlow keeps its own random generator (used e.g. for the weight
# initialisation in build_regressor), so seeding NumPy alone is not enough.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Define input pipeline: a single step wrapping the Keras network as a scikit-learn regressor
estimators = [
    ("mlp", KerasRegressor(build_fn=build_regressor, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)),
]
pipeline = Pipeline(estimators)

# Shuffled 10-fold cross-validation on the training set, scored with r2
kfold = KFold(n_splits=10, random_state=SEED, shuffle=True)
results = cross_val_score(pipeline, train_data, train_target, cv=kfold, scoring="r2")

# Mean ± standard deviation of the per-fold scores
print("Result: %.2f %s %.2f" % (results.mean(), u"\u00B1", results.std()))

In [None]:
# Fit the network to the train dataset, evaluating on the test split after each epoch.
# The callbacks_ list (early stopping + checkpointing) has to be handed to fit();
# otherwise those callbacks are never run.
network = build_regressor()
history = network.fit(
    train_data,
    train_target,
    validation_data=(test_data, test_target),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks_,
)

In [None]:
# Plot the training loss per epoch
train_loss = history.history["loss"]

# Number the epochs from 1 and plot against those numbers explicitly, so that
# dropping the first epoch from the curve does not shift the x axis.
epoch_numbers = range(1, len(train_loss) + 1)
plt.plot(epoch_numbers[1:], train_loss[1:], label="Train", color="darkblue")
plt.yscale("log")
plt.legend()
plt.ylabel("MSE loss")
plt.xlabel("Epoch")
plt.title("Loss function (Training)")

In [None]:
# Create a prediction using the trained model
reco_KE = network.predict(test_data)

# Create a plot of reco vs MC truth energy
plt.scatter(test_target, reco_KE, s=0.5, c="maroon", label="Prediction")

# Reference line reco == truth. Two endpoints are enough for a straight line;
# drawing it through every (unsorted) test point retraces the same segment
# many times and the dashed style is lost.
truth_values = test_target.to_numpy().ravel()
truth_range = [truth_values.min(), truth_values.max()]
plt.plot(truth_range, truth_range, ls="--", label="Ground truth")
plt.legend()
plt.xlabel("Truth KE (MeV)")
plt.ylabel("Reco KE (MeV)")

# GBRT Regressor

In [None]:
# Make a quick fit using basic params for the gradient-boosted regression trees.
# random_state is fixed so the fit is reproducible, like the seeded split.
regressor_tree = GradientBoostingRegressor(n_estimators=100, random_state=SEED)
# The estimator expects a 1-D target, hence the ravel()
regressor_tree.fit(train_data, train_target.to_numpy().ravel())

In [None]:
# r2 of the fitted tree ensemble on the training split and on the held-out split
train_r2, test_r2 = (
    regressor_tree.score(split_features, split_target.to_numpy())
    for split_features, split_target in ((train_data, train_target), (test_data, test_target))
)

print(f"Train data r2: {train_r2:.4f}")
print(f"Validation data r2: {test_r2:.4f}")

In [None]:
# Search space for the gradient-boosting hyper-parameters:
# 1 x 10 x 20 x 10 = 2000 candidate combinations
param_grid_ = dict(
    n_estimators=[100],
    learning_rate=np.linspace(1e-2, 4e-2, 10),  # 10 evenly spaced values in [0.01, 0.04]
    max_depth=np.arange(10, 30),                # 10, 11, ..., 29
    min_samples_leaf=np.arange(40, 50),         # 40, 41, ..., 49
)

# Number of parallel jobs handed to the grid search
n_jobs_ = 8

In [None]:
# Successive-halving grid search over param_grid_ for the gradient-boosted trees
np.random.seed(SEED)

# Explicit random_state on the estimator and on the search: the global NumPy
# seed set above is not shared with the n_jobs worker processes.
regressor = GradientBoostingRegressor(random_state=SEED)
classifier = HalvingGridSearchCV(
    estimator=regressor,
    cv=kfold,
    param_grid=param_grid_,
    n_jobs=n_jobs_,
    verbose=1,
    random_state=SEED,
)
# The estimator expects a 1-D target, hence the ravel()
classifier.fit(train_data, train_target.to_numpy().ravel())
print("Best estimator:")
print(classifier.best_estimator_)