# Model testing

https://www.youtube.com/watch?v=BoaHul6TXCE

In [None]:
import time
import timeit
import pandas as pd
import numpy as np
from plotly import graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from datetime import datetime
from IPython.display import HTML
import cufflinks
import numba as nb
from scipy.optimize import least_squares, curve_fit
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.svm import SVC # Support Vector Classifier
from sklearn.neural_network import MLPRegressor
import mlflow
import os
import mlflow.keras
import mlflow.sklearn
from gewapro.cache import cache
from gewapro.preprocessing import get_waveforms, train_test_split_cond, smoothen_waveforms, get_and_smoothen_waveforms, select_from_source
from gewapro.functions import (quadratic_arr,
                               fit_parabolas,
                               df_with_fits,
                               _fit_final_slope,
                               combine_and, combine_or,
                               calc_ab)
from gewapro.plotting.base import _fwhm_energy_df
from gewapro.plotting import (histogram,
                              corr_fig,
                              mlp_reg_fig,
                              plot_transform,
                              energy_histogram,
                              boxplot,
                              plot_predictions,
                              energy_line_plot,
                              rmse_energy_line_plot)
from gewapro.models import regressor_model, train_model, get_model_version_map, ModelInfo
from gewapro.experiment_flow import run_experiment
import mlflow.pyfunc
import xgboost as xgb
import itertools

# Switch cufflinks to offline mode before any `.iplot()` call is made.
cufflinks.go_offline()

# --- File names of the data sets; every file is read from the "data/" folder below ---
# data06_name = "20231110-Na22-d0-12-ML-Tz6-100ns-ecf-06.csv"
# data07_name = "20231110-Na22-d0-12-ML-Tz6-100ns-ecf-07.csv"
# data08_name = "20231110-Na22-d0-12-ML-Tz6-100ns-ecf-08.csv"
# data10_name = "20231110-Na22-d0-12-ML-Tz6-100ns-ecf-10.csv"
# data20_name = "20231110-Na22-d0-12-ML-Tz6-100ns-ecf-20.csv"
# data30_name = "20231110-Na22-d0-12-ML-Tz6-100ns-ecf-30.csv"
# data40_name = "20231110-Na22-d0-12-ML-Tz6-100ns-ecf-40.csv"
# data50_name = "20231110-Na22-d0-12-ML-Tz6-100ns-ecf-50.csv"
# data60_name = "20231110-Na22-d0-12-ML-Tz6-100ns-ecf-60.csv"
data_g1274_name = "20231110-Na22-d0-12-tz6-ML200-g1274.dat"
data_g511_name = "20231110-Na22-d0-12-tz6-ML200-g511.dat"
data_g1274_unfiltered_name = "20231110-Na22-d0-12-tz6-ML200_nofir_noMa_D40-g1274.dat"
data_g511_unfiltered_name = "20231110-Na22-d0-12-tz6-ML200_nofir_noMa_D40-g511.dat"
data_g1274_partfltr_name = "20231110-Na22-d0-12-tz6-ML200_noMa_D10_soft_sc_g1274.dat"
data_g511_partfltr_name = "20231110-Na22-d0-12-tz6-ML200_noMa_D10_soft_sc_g511.dat"
data_g1274_partfltr_name_all = "20231110-Na22-d0-12-tz6-ML200_noMa_D10_soft_sc_g1274_all.dat"
data_g511_partfltr_name_all = "20231110-Na22-d0-12-tz6-ML200_noMa_D10_soft_sc_g511_all.dat"


def name_g1274_d(i):
    """Return the `data_dict` key (file-style name) of the g1274 subset `d{i}`."""
    return f"20231110-Na22-d{i}-tz6-ML200_noMa_D10_soft_sc_g1274.dat"


def name_g511_d(i):
    """Return the `data_dict` key (file-style name) of the g511 subset `d{i}`."""
    return f"20231110-Na22-d{i}-tz6-ML200_noMa_D10_soft_sc_g511.dat"


# --- Load the data sets ---
# data06 = pd.read_csv("data/"+data06_name)
# data07 = pd.read_csv("data/"+data07_name)
# data08 = pd.read_csv("data/"+data08_name)
# data10 = pd.read_csv("data/"+data10_name)
# data20 = pd.read_csv("data/"+data20_name)
# data30 = pd.read_csv("data/"+data30_name)
# data40 = pd.read_csv("data/"+data40_name)
# data50 = pd.read_csv("data/"+data50_name)
# data60 = pd.read_csv("data/"+data60_name)
datag1274 = pd.read_csv("data/"+data_g1274_name)
datag511 = pd.read_csv("data/"+data_g511_name)
datag1274unfiltered = pd.read_csv("data/"+data_g1274_unfiltered_name)
datag511unfiltered = pd.read_csv("data/"+data_g511_unfiltered_name)
datag1274partfltr = pd.read_csv("data/"+data_g1274_partfltr_name)
datag511partfltr = pd.read_csv("data/"+data_g511_partfltr_name)
datag1274partfltr_all = pd.read_csv("data/"+data_g1274_partfltr_name_all)
datag511partfltr_all = pd.read_csv("data/"+data_g511_partfltr_name_all)

# --- Lookup from data set name to loaded data ---
# The first two entries previously mapped each name onto itself (a string);
# they now point at the loaded frames, like every other entry.
data_dict = {
            #  data06_name: data06,
            #  data07_name:data07,
            #  data08_name:data08,
            #  data10_name:data10,
            #  data20_name:data20,
            #  data30_name:data30,
            #  data40_name:data40,
            #  data50_name:data50,
            #  data60_name:data60,
             data_g1274_name:datag1274,
             data_g511_name:datag511,
             data_g1274_unfiltered_name:datag1274unfiltered,
             data_g511_unfiltered_name:datag511unfiltered,
             data_g1274_partfltr_name:datag1274partfltr,
             data_g511_partfltr_name:datag511partfltr,
             data_g1274_partfltr_name_all:datag1274partfltr_all,
             data_g511_partfltr_name_all:datag511partfltr_all,
            } | {
                    # name_g1274_d(i): select_from_source(datag1274partfltr_all, select_channels=[i]) for i in range(0,13)
            } | {
                    # One entry per selection i = 0..12 of the combined g511 data set
                    name_g511_d(i): select_from_source(datag511partfltr_all, select_channels=[i]) for i in range(0,13)
            }

# Toggle for the data previews below (all of them are currently commented out).
display_all = 1
if display_all:
    # display(data_dict[name_g1274_d(1)])
    # display(data_dict[name_g1274_d(2)])
    # display(data_dict[name_g1274_d(0)])
    # display(data_dict[data_g1274_unfiltered_name])
    # display(data_dict[name_g511_d(0)])
    # display(data_dict[data_g511_unfiltered_name])
    pass
    # display(data06) # Ch, E, dT, s0 ... s127, # 18662 rows/waveforms


def x_to_t(x):
    """Map a position `x` to a time `t` using t = 160 - 4*x.

    Works element-wise on anything supporting `*` and `-` (scalars, arrays).
    NOTE(review): presumably `x` is a sample index and `t` is in ns - confirm.
    """
    return 160-(x*4)


def t_to_x(t):
    """Map a time `t` to a position `x` using x = (160 - t) / 4.

    This is the inverse of `x_to_t`.
    """
    return (160-t)/4

# Collect waveform frames for every source, once as returned by get_waveforms
# ("normalized") and once multiplied per column by the energy parsed from the
# column label ("unnormalized").
#
# Both lists end up with 12 frames in this order:
#   0-5  : energy-cut frames  (g1274, g511, g1274 unfiltered, g511 unfiltered, g1274 partfltr, g511 partfltr)
#   6-11 : the same six sources without energy selection (select_energies=())
unnormalized = []
normalized = []

_cut_sources = [
    (datag1274, (4000,5000)), (datag511, (11050,11250)),
    (datag1274unfiltered, (4000,5000)), (datag511unfiltered, (11050,11250)),
    (datag1274partfltr, (4000,5000)), (datag511partfltr, (11050,11250)),
]
_all_sources = [(source, ()) for source, _ in _cut_sources]


def _energy_from_label(label):
    """Parse the number that follows the first "E" in a space-stripped column label."""
    compact = label.replace(" ","")
    return float(compact[compact.find("E")+1:])


for normalize in (True, False):
    for data, range_E in _cut_sources + _all_sources:
        data_in_E_range = get_waveforms(source_data=data, select_energies=range_E, include_energy=False)
        if normalize:
            normalized.append(data_in_E_range)
            continue
        # Scale every column by (label energy * 0.1139), written back in place
        s_labels_E = pd.Series(np.array([_energy_from_label(col) for col in data_in_E_range.columns])) * .1139
        data_in_E_range.loc[:] = s_labels_E.T.values * data_in_E_range
        unnormalized.append(data_in_E_range)


def _waveform_sets(frames, first):
    """Bundle the two cut frames at `first`, `first+1` and their uncut counterparts six places further.

    NOTE(review): the frame at `first` originates from a datag1274* source but is stored
    under "g511" (and vice versa) - confirm this pairing is intended.
    """
    return {"g511": frames[first],
            "g1274": frames[first+1],
            "gBOTH": {"range_cut": pd.concat(frames[first:first+2],axis=1),
                      "range_all": pd.concat(frames[first+6:first+8],axis=1)}}


unnorm_fltr_dict = _waveform_sets(unnormalized, 0)
unnorm_unfltr_dict = _waveform_sets(unnormalized, 2)
unnorm_partfltr_dict = _waveform_sets(unnormalized, 4)
norm_fltr_dict = _waveform_sets(normalized, 0)
norm_unfltr_dict = _waveform_sets(normalized, 2)
norm_partfltr_dict = _waveform_sets(normalized, 4)

# Nested lookup: normalization -> filter kind -> peak -> frame(s); "FIR_MA" has no data yet
raw_dict = {
    "unnormalized": {"filtered": unnorm_fltr_dict,
                     "unfiltered": unnorm_unfltr_dict,
                     "FIR": unnorm_partfltr_dict,
                     "FIR_MA": None},
    "normalized": {"filtered": norm_fltr_dict,
                   "unfiltered": norm_unfltr_dict,
                   "FIR": norm_partfltr_dict,
                   "FIR_MA": None},
}
data_dict["raw"] = raw_dict

In [None]:
# Inspect how many partially filtered g1274 waveforms end (row 199, the
# "final point" of the message below) under 0.85 or between 0.85 and 0.95.
dfplot = get_waveforms(select_energies=(),source_data=datag1274partfltr)
_n_total = len(dfplot.columns)
print("Total waveform count:",_n_total)

_final_point = dfplot.loc[199]
dfplot85_95 = dfplot.loc[:,(_final_point > 0.85) & (_final_point < 0.95)]
dfplot85 = dfplot.loc[:,_final_point < 0.85]
# dfplot.iloc[:,:100].iplot(title="First 100 waveforms")
# display(dfplot85_95)
# dfplot85_95.iloc[:,:100].iplot(title="First 100 waveforms with last value between 0.85 and 0.95")
# display(dfplot85)
# dfplot85.iloc[:,:100].iplot(title="First 100 waveforms with last value below 0.85")
_n_below = len(dfplot85.columns)
_n_between = len(dfplot85_95.columns)
print(f"Final point <0.85 rate: {_n_below/_n_total:.2%} ({_n_below} waveforms), 0.85-0.95 rate: {_n_between/_n_total:.2%} ({_n_between} waveforms)")

# Plot the last 100 columns of the combined unfiltered, unnormalized set
_unfiltered_all = data_dict["raw"]["unnormalized"]["unfiltered"]["gBOTH"]["range_all"]
_unfiltered_all.iloc[:,-100:].iplot(title="Last 100 unfiltered unnormalized waveforms")
# data_dict["raw"]["normalized"]["FIR"]["gBOTH"]["range_cut"].iloc[:,:50].iplot(title="First 50 partially filtered normalized waveforms E(4000-5000)(11050-11250)")

# Plot the first 100 waveforms of the d0 g511 subset within energies (4000, 5000)
_d0_waveforms = get_waveforms(source_data=data_dict[name_g511_d(0)], select_energies=(4000,5000))
_d0_waveforms.iloc[:,:100].iplot(title="First 100 partially filtered normalized waveforms")
# data_dict["raw"]["normalized"]["partfltr"]["gBOTH"]["range_cut"].iloc[:,-50:].iplot(title="Last 50 partially filtered normalized waveforms E(4000-5000)(11050-11250)")
# data_dict["raw"]["unnormalized"]["partfltr"]["gBOTH"]["range_cut"].iloc[:,-50:].iplot(title="Last 50 partially filtered unnormalized waveforms E(4000-5000)(11050-11250)")

In [None]:
# Fetch and print the info of registered model "MLPRegressorModel" version 3026;
# `modelinfo` stays available for later cells.
modelinfo = ModelInfo.from_database("MLPRegressorModel",3026)
print(modelinfo)
# NOTE(review): the model name below looks deliberately bogus - presumably this
# probes how ModelInfo.from_database handles an unknown model; confirm.
_unknown_info = ModelInfo.from_database("MLPRegressorModeleerearwea2wefacd3qdwq",3001)
print(_unknown_info)

In [None]:
# Show the combined g511 data set with an extra "dT - Tref" difference column.
s_diff = (datag511partfltr_all["dT"] - datag511partfltr_all["Tref"]).rename("dT - Tref")
_with_diff = pd.concat([datag511partfltr_all,s_diff],axis=1)
display(_with_diff)

In [None]:
# Setup
pca_components = 22
hidden_layers = [16]
activation = "relu"

# Internal workings of the fitting
data_df = get_waveforms(select_energies=(5000,50000),source_data=datag511partfltr) # all_data
cutoff_val = 0.85
# Keep only the waveforms whose value in row 199 exceeds the cutoff
data_df85: pd.DataFrame = data_df.loc[:,data_df.loc[199] > cutoff_val]
print(f"Discarding {len(data_df.columns)-len(data_df85.columns)} waveforms that end below {cutoff_val} from the data set (initially {len(data_df.columns)} waveforms)")
data = data_df85.values.transpose()  # one row per waveform


def _tref_from_label(label):
    """Parse the number between "Tref" and ",dT" in a space-stripped column label."""
    compact = label.replace(" ","")
    return float(compact[compact.find("Tref")+4:compact.find(",dT")])


def _wave_index_from_label(label):
    """Parse the integer between the first "[" and the first "]" of a column label."""
    return int(label[label.find("[")+1:label.find("]")])


labels_t = np.array([_tref_from_label(col) for col in data_df85.columns])
labels_x = t_to_x(labels_t)
wave_i = np.array([_wave_index_from_label(col) for col in data_df85.columns])

# Transform the data (PCA/TruncatedSVD)
pca_method = "sklearn.decomposition.TruncatedSVD"
# Time-derived seed, so every execution uses a different (but logged) random state
PCA_seed = round((592138171 * (datetime.now().timestamp()*9732103 % 38045729)) % 3244034593)
model = TruncatedSVD(pca_components, random_state=PCA_seed)
data_trans = model.fit_transform(data)
pca_var_ratio = model.explained_variance_ratio_
# print("pca_var_ratio:",pca_var_ratio)

# Create a conditioned train-test split on the data, with data not passing the condition added to the testing set
d_train, d_test, l_train, l_test, l_train_t, l_test_t, wi_train, wi_test = train_test_split_cond(data_trans, labels_x, labels_t, wave_i, test_size=0.2, 
                                                                                                    random_state=42, 
                                                                                                    add_removed_to_test=True)
print("[MLFlow run] Divided data in train, test sets:",l_train.shape, l_test.shape," -> total set of",l_train.shape[0]+l_test.shape[0],"/ available","unknown")

# Creating model
regr = regressor_model("Tensorflow", pca_components,hidden_layers,activation, max_iter=10, name="try_1")#.summary()
bst = regressor_model("XGBoost", pca_components=22, max_depth=50, n_estimators=5, max_leaves=0) # Best hyperparameters for g511unfiltered: pca_components=22, max_depth=50, n_estimators=5

# Best effort: print a model summary when `regr` offers one. Only ordinary
# errors are treated as "no summary available"; KeyboardInterrupt/SystemExit
# are no longer swallowed as they were by the former bare `except:`.
try:
    regr.summary()
except Exception:
    try_ = False
else:
    try_ = True
print("PCA seed:",PCA_seed)
# Notebook output: show the model object itself when no summary could be printed
regr if not try_ else None

In [None]:
# Data sets that the prediction/plot helpers below can refer to by name.
data = {
    data_g511_partfltr_name: datag511partfltr,
    "partfilter_g511_E8000-50000": select_from_source(datag511partfltr,select_energies=(8000,50000)),
    "partfilter_g511_E5000-13000": select_from_source(datag511partfltr,select_energies=(5000,13000)),
    "partfilter_g511_all_E5000-13000": select_from_source(datag511partfltr_all,select_channels=[0,2,5,8,9,10],select_energies=(5000,13000)),
}
# Per-selection subsets of the combined g511 data set (6 is left out)
data.update({name_g511_d(ch): select_from_source(datag511partfltr_all,select_channels=[ch])
             for ch in [0,1,2,3,4,5,7,8,9,10,11]})

# energy_line_plot.clear_cache()
print(energy_line_plot.cache_info())

# Energy correction (a, b) per channel, derived with calc_ab from a pair of values.
# NOTE(review): the pairs are presumably two reference peak positions - verify against calc_ab.
# Channel 4 uses the fixed pair (1, 0) instead.
_calc_ab_inputs = {0: (4477,11197),
                   1: (4623,11538),
                   2: (4212,10512),
                   3: (4672,11662),
                   4: None,
                   5: (1582,3948),
                   7: (4747,11866),
                   8: (4303,10727),
                   9: (4750,11861),
                   10: (4113,10268),
                   11: (4474,11157)}
e_corrections = {channel: (1,0) if pair is None else calc_ab(*pair)
                 for channel, pair in _calc_ab_inputs.items()}

In [None]:
# energy_line_plot.clear_cache()
check_channels = [0,1,2]
# plot_predictions(name_g511_d(0), (2450,2500), 2949, data, PCA_transform_on="partfilter_g511_E8000-50000", verbose=1).show()
# plot_predictions(name_g511_d(0), (6450,6500), 2949, data, PCA_transform_on="partfilter_g511_E8000-50000", verbose=1).show()
# plot_predictions(name_g511_d(0), (8450,8500), 2949, data, PCA_transform_on="partfilter_g511_E8000-50000", verbose=1).show()
# plot_predictions(name_g511_d(0), (6450,6500), 2851, data, PCA_transform_on="partfilter_g511_E5000-13000", verbose=1).show()
# plot_predictions(name_g511_d(0), (8450,8500), 2851, data, PCA_transform_on="partfilter_g511_E5000-13000", verbose=1).show()
# plot_predictions(name_g511_d(0), (9000,9250), 2949, data, PCA_transform_on="partfilter_g511_E8000-50000", verbose=0).show()
# plot_predictions(name_g511_d(2), (9000,9250), 2949, data, PCA_transform_on="partfilter_g511_E8000-50000", verbose=0).show()
# plot_predictions(name_g511_d(3), (9000,9250), 2949, data, PCA_transform_on="partfilter_g511_E8000-50000", verbose=0).show()
# plot_predictions(name_g511_d(7), (9000,9250), 2949, data, PCA_transform_on="partfilter_g511_E8000-50000", verbose=0).show()
# plot_predictions(name_g511_d(11), (9000,9250), 2949, data, PCA_transform_on="partfilter_g511_E8000-50000", verbose=0).show()

# Energy line plot for the d0 g511 subset, model version 2949, with
# start/end/step arguments 2000, 10000, 250.
_eline_options = dict(PCA_fit="partfilter_g511_E8000-50000",
                      hist_limit=100,
                      y_sd="FWHM GoF",
                      verbose=0)
fig_eline = energy_line_plot(name_g511_d(0), 2000, 10000, 250, 2949, data, **_eline_options)
fig_eline.show()
# energy_line_plot(name_g511_d(0), 6000, 6500, 100, 2949, data, PCA_transform_on="partfilter_g511_E8000-50000",hist_limit=100,y_sd="FWHM GoF",verbose=0).show()
# energy_line_plot.cache_info()

In [None]:
from gewapro.util import pandas_string_rep
def combine_plots(fig_eline: go.Figure, fig_ehist: go.Figure, **layout_kwargs):
    """Overlay an energy line figure on an energy histogram using a secondary y-axis.

    The combined figure takes its layout from `fig_ehist`; the traces of
    `fig_eline` are assigned to the secondary y-axis ("y2").

    Args:
        fig_eline: Figure whose traces go on the secondary y-axis. Its traces
            are updated in place (their `yaxis` is set to "y2").
        fig_ehist: Figure providing the base layout and the primary-axis traces.
        **layout_kwargs: Extra layout options applied to the combined figure.
            The optional key `yaxis2_title` (default "FWHM [ns]") sets the
            secondary axis title and is not forwarded to `update_layout`.

    Returns:
        The combined figure.
    """
    yaxis2_title = layout_kwargs.pop("yaxis2_title", "FWHM [ns]")
    # Default upper limit of the secondary axis. It is set once, before the
    # loop: resetting it per trace made a "dT - Tref" trace count only when it
    # happened to be the last one, and left it undefined for an empty figure.
    y2_max = 30
    for trace in fig_eline.data:
        if trace.name == "dT - Tref":
            # Leave some headroom above the highest "dT - Tref" value
            y2_max = round(np.max(trace.y)+0.6)
    fig_comb = make_subplots(specs=[[{"secondary_y": True}]]).update_layout(fig_ehist.layout).update_yaxes(title_text=yaxis2_title, range=[0,y2_max], secondary_y=True)
    fig_eline.update_traces(yaxis="y2")
    fig_comb.add_traces(fig_eline.data+fig_ehist.data)
    return fig_comb.update_layout(**layout_kwargs) if layout_kwargs else fig_comb

def fitted_PCA(model_version: int, waveforms: pd.DataFrame, model_name: str = "MLPRegressorModel") -> PCA:
    """Return the transformer of a registered model version, fitted on `waveforms`.

    The transformer is obtained from the model's `ModelInfo` and fitted on the
    transposed values of `waveforms` (so each column becomes one row).
    A start message and the elapsed fitting time are printed.

    Args:
        model_version: Version of the registered model.
        waveforms: Data frame to fit on.
        model_name: Name of the registered model.

    Returns:
        The fitted transformer.
    """
    t_begin = datetime.now()
    print(f"Fitting data for {model_name} v{model_version} on {pandas_string_rep(waveforms)}")
    info: ModelInfo = ModelInfo.from_database(model_name=model_name,model_version=model_version)
    transformer: PCA = info.get_transformer()
    transformer.fit(waveforms.T.values)
    elapsed = datetime.now() - t_begin
    print("Fitting finished in",elapsed)
    return transformer

# Per model version: what to pass as `PCA_fit` - either the name of a data set
# in `data`, or a transformer that is already fitted (versions 2994 and 2995).
modelv_to_name = {
    2949: "partfilter_g511_E8000-50000",
    2851: "partfilter_g511_E5000-13000",
    2897: "partfilter_g511_E5000-13000",
}
waveforms = get_waveforms(source_data=datag511partfltr_all,
                          select_channels=[0,2,5,8,9,10],
                          select_energies=(5000,13000))
for _version in (2994, 2995):
    modelv_to_name[_version] = fitted_PCA(_version, waveforms)

In [None]:
model_v = 2995 #1: 2949, 2: 2851, 3: 2897

# For every channel: energy line plot overlaid on the energy histogram.
# The two channel groups differ only in the end value given to energy_line_plot.
_channel_groups = (([0,1,2,3,4,5,7], 1300),
                   ([8,9,10,11], 1275))
for _channels, _line_end in _channel_groups:
    for ch in _channels:
        _set_name = name_g511_d(ch)
        fig_eline = energy_line_plot(_set_name, 100, _line_end, 25, model_v, data, PCA_fit=modelv_to_name[model_v], correct_energy=e_corrections[ch],hist_limit=75,verbose=0)
        fig_ehist = energy_histogram(_set_name, data, select_energies=(0,1300), bins=[0,1400,2], correct_energy=e_corrections[ch], xaxis_title="Energy [keV]")
        combine_plots(fig_eline, fig_ehist).show()
#     plot_predictions(name_g511_d(ch), (10250,10350), 2949, data_dict=data, PCA_transform_on="partfilter_g511_E8000-50000").show()
#     plot_predictions(name_g511_d(ch), (10350,10450), 2949, data_dict=data, PCA_transform_on="partfilter_g511_E8000-50000").show()
#     plot_predictions(name_g511_d(ch), (10450,10550), 2949, data_dict=data, PCA_transform_on="partfilter_g511_E8000-50000").show()
#     plot_predictions(name_g511_d(ch), (10550,10650), 2949, data_dict=data, PCA_transform_on="partfilter_g511_E8000-50000").show()

In [None]:
# Prediction figures for six energy windows of one channel, followed by the
# Gaussian fit parameters and a sigma * goodness-of-fit metric per column.
ch = 0
_energy_windows = [(11638,12078), (11500,12000), (11350,12000), (11230,12000), (11224,12000), (11250,13000)]
fig0, fig1, fig2, fig3, fig4, fig5 = [
    plot_predictions(name_g511_d(ch), _window, 2949, data_dict=data, PCA_transform_on="partfilter_g511_E8000-50000",add_df=True)
    for _window in _energy_windows
]
# fig5 is visited fourth, so the printed "Fig {i}" number is the position in this list
for i,fig in enumerate([fig0,fig1,fig2,fig5,fig3,fig4]):
    fig_params = {j:fig._params[col+" Gaussian"] for j,col in enumerate(fig._df.columns)}
    for j,col in enumerate(fig._df.columns):
        _fit = fig_params[j]
        print(f"Fig {i} goodness of fits   (col {j}, {len(fig._df)} observations):\nx0: {_fit['x0']}, sigma: {_fit['sigma']}")
        # +/- 5 sigma window around the fitted centre (only used by the commented-out KS tests)
        min_tb,max_tb = _fit["x0"] - 5*_fit["sigma"], _fit["x0"] + 5*_fit["sigma"]
        # print(".5 :",col,":",ksh := kstest(fig._df.loc[(fig._df[col] > min_tb) & (fig._df[col] < max_tb), col],cdf=norm.cdf,args=(fig_params[i]["x0"],0.5*fig_params[i]["sigma"])))
        # print("1  :",col,":",ks1 := kstest(fig._df.loc[(fig._df[col] > min_tb) & (fig._df[col] < max_tb), col],cdf=norm.cdf,args=(fig_params[i]["x0"],fig_params[i]["sigma"])))
        # print("2  :",col,":",ksd := kstest(fig._df.loc[(fig._df[col] > min_tb) & (fig._df[col] < max_tb), col],cdf=norm.cdf,args=(fig_params[i]["x0"],2*fig_params[i]["sigma"])))
        # print(col,"1/0.5:",ks1.pvalue/ksh.pvalue,"1/2:",ks1.pvalue/ksd.pvalue)
        # print("metric:",col,fig_params[i]["sigma"]*((ks1.pvalue/ksh.pvalue)*(ks1.pvalue/ksd.pvalue))**(-0.25))
        hist_params = histogram(fig._df,bins=[-30,30,0.25])._params[col+" Gaussian"]
        # print(hist_params.keys(),np.sqrt(np.diag(hist_params["Covariance"])),dict(zip(hist_params.keys(), np.sqrt(np.diag(hist_params["Covariance"])))))
        _gof = hist_params["GoodnessOfFitMetric"]
        print("metric:",col,hist_params["sigma"]*_gof,"\n",_gof)
    fig.show()
# histogram_ks(s_E_ranged,bins=[-30,30,0.25]).show()

In [None]:
from scipy import stats
from functools import partial

# Playground: draw samples from a few distributions and run a Kolmogorov-Smirnov
# test of each sample against a normal CDF with the stated (loc, scale).
size = 100
dist_unif = stats.uniform.rvs(size=size,loc=-1,scale=2)
dist_norm = stats.norm.rvs(size=size)
dist_nors = dist_norm+6  # the same normal sample, shifted by 6
dist_nos2 = stats.norm.rvs(size=size, loc=6)
dist_nos3 = stats.norm.rvs(6,2,size=size)
dist_nos4 = stats.norm.rvs(0,5,size=size)
df_stat = pd.DataFrame(data={"uniform": dist_unif,
                             "normal": dist_norm,
                             "normal_shifted": dist_nors,
                             "normal_shifted2": dist_nos2,
                             "normal_shifted3": dist_nos3,
                             "normal_spread": dist_nos4})
display(df_stat)

# (sample, normal-CDF arguments, label); empty arguments mean the standard normal
_ks_cases = [(dist_unif, (), "   for uniform dist"),
             (dist_norm, (), "   for normal dist"),
             (dist_nors, (6,1), "   for normal shifted dist"),
             (dist_nos2, (6,1), "   for normal shifted2 dist"),
             (dist_nos3, (6,2), "   for normal shifted3 dist")]
for _sample, _cdf_args, _label in _ks_cases:
    print(stats.kstest(rvs=_sample, cdf=stats.norm.cdf, args=_cdf_args),_label)
# For the wide normal only the p-value is printed
_ks_spread = stats.kstest(dist_nos4, stats.norm.cdf, (0,5))
print(_ks_spread.pvalue,"   for normal spread dist")
histogram(df_stat,[-10,10,0.25]).show()
df_stat.mean()


In [None]:
from gewapro.plotting.base import _get_ranges
from gewapro.util import correct_energy, _validate_a_b, get_len, invert_start_end_step, correct_start_end_step

# Check that binning energies before and after a linear energy correction
# (a, b) gives matching counts when start/end/step are corrected as well.
s = [123,2342,1231,4482,4232,2300,3203,4292,534,1230,8353]
slabelsE = pd.Series(s,index=range(len(s)))
# display(slabelsE)
a,b = 0.17, 131
start, end, step = [400,8177,101]
a,b = _validate_a_b((a,b))
len_ = get_len(start,end,step)
print("original:",start, end, step,"periods:", get_len(start,end,step))
corrected_start_end_step = correct_start_end_step(start, end, step, a,b)
print("corrected:",corrected_start_end_step,"periods:", get_len(*corrected_start_end_step))
increased_start_end_step = invert_start_end_step(*corrected_start_end_step, a,b)
print("backtracked:",increased_start_end_step,"periods:", get_len(*increased_start_end_step))
slabelsEcorr = correct_energy((a,b),slabelsE)
ranges = _get_ranges(start,end,step)
ranges2 = _get_ranges(*corrected_start_end_step)
print(ranges)
print(ranges2)


def _count_per_range(values, value_ranges):
    """Count, per range, the values v with range[0] <= v < range[1]."""
    return {rang: ((values >= rang[0]) & (values < rang[1])).sum() for rang in value_ranges}


bins = _count_per_range(slabelsE, ranges)
bins2 = _count_per_range(slabelsEcorr, ranges2)
print(bins)
print(bins2)
# Apply the correction and then its inverse (1/a, -b/a)
_corrected_again = correct_energy((a,b),slabelsE)
slabelsE = correct_energy((1/a,-b/a),_corrected_again)
dftestt = pd.DataFrame({"old_bins":bins.values(),"new_bins":bins2.values()})
# dftestt.iplot()

edf = _fwhm_energy_df(name_g511_d(0), 6000, 6500, 100, 2949, data, PCA_transform_on="partfilter_g511_E8000-50000")
display(edf)
# Apply the same linear correction to the second level of the index
_corrected_index = [(entry[0], a*entry[1]+b) for entry in edf.index]
edf.index = pd.MultiIndex.from_tuples(_corrected_index,names=edf.index.names)
display(edf)

In [None]:
# fitting model
# bst = train_model(bst, d_train, l_train)
models_to_show = [] # [bst, regr]
from gewapro.models import predict

# For each model to show: histogram of the initial timing data together with
# the residuals (actual - predicted) on the train set, the test set and both.
for model in models_to_show:
    # Combining labels and creating prediction Series
    s_labels_t = pd.Series(np.append(l_train_t,l_test_t))
    # Shift that moves the most common label value to (roughly) zero
    shift = -round(s_labels_t.mode().iloc[0])
    _sign = '-' if shift < 0 else '+'
    s_labels_t.name = f"Initial data: dT {_sign} {abs(shift)} ns"
    predicted_train = predict(model, d_train)
    print(predicted_train.shape, predicted_train)
    pred_s_train = pd.Series(l_train_t - x_to_t(predicted_train),name="dT_act - dT_pred (train)")
    _predicted_test = predict(model, d_test)
    pred_s_test = pd.Series(l_test_t - x_to_t(_predicted_test),name="dT_act - dT_pred (test)")
    _predicted_both = predict(model, np.append(d_train,d_test, axis=0))
    pred_s = pd.Series(s_labels_t.values - x_to_t(_predicted_both),name="dT_act - dT_pred (both)")
    title = "Arrival time histogram for XGBoost regressor on g511" # g511g1274 unfiltered combined data set (E range 4000-5000, 11050-11250)"
    # Add histogram with predicted vs actual data
    _hist_input = pd.concat([s_labels_t+shift,pred_s_train,pred_s_test,pred_s], axis=1)
    fig_hist = histogram(_hist_input, [-30,30,0.25], xaxis_title="Time (ns)", yaxis_title="Prevalence", title=title, layout_width=1250)
    print(f"[MLFlow run] Created histogram with params: {fig_hist._params}")
    fig_hist.show()

# NOTE(review): `unnormalized_dict` and `all_data_partfltr_name` are not defined
# anywhere in the visible notebook - presumably left over from an older
# session; confirm, since this line raises NameError on a fresh kernel.
data_ = {
    "data_g1274_partfltr": datag1274partfltr,
    "data_g511_partfltr": datag511partfltr,
    "exp_raw": unnormalized_dict[all_data_partfltr_name],
    "511_raw_train": data_df,
}
# display(datag511partfltr)
layout_options = {"yaxis_title": "dT_act-dT_pred (ns)", "xaxis_title": "Energy (arb. units)"} #plot_type="EnergyScatter"
for _model_version in (369, 370):
    plot_predictions("data_g511_partfltr", (4550,14000), _model_version, data_, "XGBoostedTree", PCA_transform_on="511_raw_train", **layout_options).show()
# predict_from_model("data_g1274_partfltr", (4000,5000), 366, data_, "XGBoostedTree", PCA_transform_on="exp_raw").show()
# predict_from_model("data_g511_partfltr", (11050,11250), 366, data_, "XGBoostedTree", PCA_transform_on="exp_raw").show()

In [None]:
# model
# PCA: 22 comp., 1261822223 seed
# regr.save()
# import keras
# keras.saving.save_model(regr, "data/models/reasonable_mixed_model.keras")
# regr = keras.saving.load_model("data/models/reasonable_mixed_model.keras")
import mlflow.xgboost

# Set to False to skip logging a run/model to MLflow
save_model_run = True

from mlflow.models import infer_signature
from gewapro.models import predict
experiment = mlflow.set_experiment("Sklearn NN, Na22 Ch0")
# run_name = "KerasNN[22]_nan%_xxx"
run_name = "XGBoostedTree_nan%_xxx"
model = bst
model_name = "XGBoostedTree"

if save_model_run:
    # Derive the signature example from the model that is actually logged.
    # `predicted_train` used to come from the `models_to_show` loop of an
    # earlier cell, which never runs while that list is empty (NameError) and
    # otherwise holds the predictions of whichever model was looped over last.
    predicted_train = predict(model, d_train)

    with mlflow.start_run(run_name=run_name) as mlflow_run:

        # Log all parameters and metrics
        # NOTE(review): the tag and several parameters below ("Hidden layers",
        # "Activation function", "Solver", "Max epochs") describe a neural
        # network while an XGBoost model is logged - confirm they are intended.
        mlflow.set_experiment_tag("BaseModel","TensorFlow Keras Sequential Model")
        mlflow.log_params({
            "Channels used": "0",
            "Energy range used": "ALL",
            "Energy included for training": False,
            "Applied conditions": "-",
            "Used conditionally removed data in test set": "-",
            "Train - Test set shapes": f'{l_train_t.shape} - {l_test_t.shape}'.replace("(","[").replace(",)","]").replace(",","").replace(")","]"),
            "Waveforms used": f'{l_train_t.shape[0]+l_test_t.shape[0]} / available unknown',
            "Waveform smoothing": "None",
            "Smoothing energy range": "-",
            "PCA components": pca_components,
            "PCA random seed": PCA_seed,
            "PCA explained variance": pca_var_ratio,
            "Hidden layers": str(hidden_layers).replace(","," "), 
            "Activation function": activation,
            "Solver": "rmsprop",
            "Alpha": "-",
            "Max epochs": 10,
        })
        # Log the model
        model_info = mlflow.xgboost.log_model(
            xgb_model=model,
            artifact_path="xgboost_models",
            signature=infer_signature(d_train, predicted_train),
            input_example=np.array([d_train[0]]),
            registered_model_name=model_name,
            metadata={"PCA random seed": PCA_seed,
                      "PCA method": pca_method},
            )
        # model_info = mlflow.sklearn.log_model(
        #     sk_model=regr,
        #     artifact_path="sk_models",
        #     signature=infer_signature(d_train, predicted_train),
        #     input_example=d_train,
        #     registered_model_name="MLPRegressorModel",
        #     metadata={"PCA random seed": PCA_seed}
        # )
# plot_predictions(exp_g511_unfiltered_name, (10350,14000), 213, combined_dict, "XGBoostedTree").show()

In [None]:
# PCAcomp.  BEST / WORST (g511)     BEST / WORST (g1274)
# 20:       2353 / 2355             2373 / 2370
# 21:       2360 / 2357             2374 / 2379
# 22:       2361 / 2365             2383 / 2384
# df_results = pd.DataFrame({"model":[2353, 2360, 2361, 2373, 2374, 2383, 2355, 2357, 2365, 2370, 2379, 2384],
#                            "PCA components":[20, 21, 22]*4,"_g511": [np.nan]*12,"FWHM_g1274": [np.nan]*12,
#                            "FWHM_g511 (450-600)": [np.nan]*12,"FWHM_g1274 (450-600)": [np.nan]*12,
#                            "trained_on":(["g511"]*3+["g1274"]*3)*2,
#                            }).set_index("model")#rename_axis("limbs", axis="columns")
# for model in df_results.index:
#     model_pred_fig_511 = plot_predictions(data_g511_name, (11050,11250), model, data_dict, "MLPRegressorModel", False)
#     model_pred_fig_1274 = plot_predictions(data_g1274_name, (4000,5000), model, data_dict, "MLPRegressorModel", False)
#     model_pred_fig_511_515 = plot_predictions(data_g511_name, (450,600), model, data_dict, "MLPRegressorModel", False)
#     model_pred_fig_1274_515 = plot_predictions(data_g1274_name, (450,600), model, data_dict, "MLPRegressorModel", False)
#     df_results.loc[model, "FWHM_g511"] = model_pred_fig_511._params["dT_act - dT_pred Gaussian"]["sigma"]*2*np.sqrt(2*np.log(2))
#     df_results.loc[model, "FWHM_g1274"] = model_pred_fig_1274._params["dT_act - dT_pred Gaussian"]["sigma"]*2*np.sqrt(2*np.log(2))
#     df_results.loc[model, "FWHM_g511 (450-600)"] = model_pred_fig_511_515._params["dT_act - dT_pred Gaussian"]["sigma"]*2*np.sqrt(2*np.log(2))
#     df_results.loc[model, "FWHM_g1274 (450-600)"] = model_pred_fig_1274_515._params["dT_act - dT_pred Gaussian"]["sigma"]*2*np.sqrt(2*np.log(2))
# display(df_results)

from functools import partial

def lineariser(df: pd.DataFrame, lin_term: float, bias: float = 0):
    """Return `lin_term * column maximum + bias` for every column of `df`.

    Args:
        df: Data frame whose per-column maxima are used.
        lin_term: Factor applied to each column maximum.
        bias: Constant added after scaling.

    Returns:
        Array with one value per column of `df`.
    """
    column_maxima = df.max().values
    scaled = column_maxima * lin_term
    return scaled + bias

def part_lin(lin_term: float, bias: float = 0):
    """Return `lineariser` with `lin_term` and `bias` pre-filled.

    The result only needs the data frame as argument, e.g. for use as
    `custom_func=part_lin(0.01)`.
    """
    preset = {"lin_term": lin_term, "bias": bias}
    return partial(lineariser, **preset)

#  511 * x - 17.8370 + y = 0  -   (0.034906 * 511 = )
# 1274 * x +  8.6423 + y = 0  +     ()
# =============================
#  763 * x + 26.4793     = 0    -> x,y = -0.034704194,35.57084313
import mlflow.pyfunc

# Load version 2398 of the registered "MLPRegressorModel" through MLflow's generic pyfunc loader.
regressor = mlflow.pyfunc.load_model(model_uri="models:/MLPRegressorModel/2398")
# print(isinstance(regressor, mlflow.pyfunc.PyFuncModel), str(regressor.loader_module) == "mlflow.sklearn")

combined_model_v = 2411 # 2388, 2406, 2402, best: 2411 (worth a try?: 2447, 2464)

# Dataset names that are reused below, both as predict_dict keys and as PCA_transform_on targets.
_gboth_partfltr_name = "gBOTH_raw_normalized_partfltr_(4k,5k)(11.05k,11.25k)"
_gboth_filtered_name = "gBOTH_raw_normalized_filtered_(4k,5k)(11.05k,11.25k)"
predict_dict = {
    _gboth_partfltr_name: data_dict["raw"]["normalized"]["partfltr"]["gBOTH"]["range_cut"],
    _gboth_filtered_name: data_dict["raw"]["normalized"]["filtered"]["gBOTH"]["range_cut"],
    "g511_raw_normalized_filtered": data_dict["raw"]["normalized"]["filtered"]["g511"],
    "g1274_raw_normalized_filtered": data_dict["raw"]["normalized"]["filtered"]["g1274"],
}

# Combined model: first on the whole combined set, then on each single-line set
# with the energy selection given next to its name.
plot_predictions(_gboth_filtered_name, "ALL", combined_model_v, predict_dict, "MLPRegressorModel").show()
for _set_name, _energies in (("g1274_raw_normalized_filtered", (4000,5000)),
                             ("g511_raw_normalized_filtered", (11050,11250))):
    plot_predictions(_set_name, _energies, combined_model_v, predict_dict, "MLPRegressorModel",
                     PCA_transform_on=_gboth_filtered_name).show()

print("\n\n\n\n\nnew tests:")
# Model version 2584 on the partially filtered combined set: no energy selection first, then both windows.
plot_predictions(_gboth_partfltr_name, (), 2584, predict_dict, "MLPRegressorModel").show()
for _energies in ((4000,5000), (11050,11250)):
    plot_predictions(_gboth_partfltr_name, _energies, 2584, predict_dict, "MLPRegressorModel",
                     PCA_transform_on=_gboth_partfltr_name).show()

# Alright predictions, but need to be modified for energy:
# predict_from_model(exp_data_name, "ALL", 2402, unnormalized_dict, "MLPRegressorModel", custom_func=part_lin(-0.00809, 9.5)).show()
# predict_from_model(exp_data_name, "ALL", 2406, unnormalized_dict, "MLPRegressorModel", custom_func=part_lin(-0.00864, 8.8455)).show()
# predict_from_model(exp_data_name, "ALL", 2411, unnormalized_dict, "MLPRegressorModel", custom_func=part_lin(-0.00614, 7.5068)).show() # <- 10.03
# predict_from_model(exp_data_name, "ALL", 2388, unnormalized_dict, "MLPRegressorModel", custom_func=part_lin(-0.00059, 0.7411)).show()
# predict_from_model(exp_data_name, "ALL", 2392, unnormalized_dict, "MLPRegressorModel", custom_func=part_lin(0.00320, -0.8021)).show()
# predict_from_model(exp_data_name, "ALL", 2391, unnormalized_dict, "MLPRegressorModel", custom_func=part_lin(0.00546, -2.78)).show()
# predict_from_model(exp_data_name, "ALL", 2395, unnormalized_dict, "MLPRegressorModel", custom_func=part_lin(-0.00392, 4.1411)).show()
# predict_from_model(exp_data_name, "ALL", 2398, unnormalized_dict, "MLPRegressorModel", custom_func=part_lin(-0.00195)).show()
# predict_from_model(exp_data_name, "ALL", 2424, unnormalized_dict, "MLPRegressorModel", custom_func=part_lin(-0.01268, 6)).show()
# predict_from_model(exp_data_name, "ALL", 2430, unnormalized_dict, "MLPRegressorModel", custom_func=part_lin(-0.00536)).show() # <- 11.3153
# predict_from_model(exp_data_name, "ALL", 2440, combined_dict, "MLPRegressorModel", custom_func=part_lin(-0.00580)).show()
# predict_from_model(exp_data_name, "ALL", 2447, combined_dict, "MLPRegressorModel", custom_func=part_lin(-0.00260)).show()
# predict_from_model(exp_data_name, "ALL", 2464, combined_dict, "MLPRegressorModel", custom_func=part_lin(-0.00497)).show()
# predict_from_model(exp_data_name, "ALL", 2398, unnormalized_dict, "MLPRegressorModel", part_lin(0,0)).show()
# predict_from_model(exp_data_name, "ALL", 1, combined_dict, "XGBoostedTree", part_lin(0,0)).show()
# predict_from_model(exp_data_name, "ALL", 2, unnormalized_dict, "TF_NN_22-16-1_relu_0.0001").show()
# predict_from_model(exp_data_name, "ALL", 2, unnormalized_dict, "TF_NN_22-16-1_relu_0.0001", custom_func=part_lin(-0.01646, 19)).show() # <- 11.9124
# regr.summary()

In [None]:
# ESTIMATION OF FWHM AFTER 511, 1274 OVERLAPPING

# @np.vectorize
# def f_surf(x: np.ndarray|float, y: np.ndarray|float, root: float, multiple: float, constant: float = 0):
#     return multiple * ( (x**root+y**root)**(1/root) ) + constant

# list_index = [2406, 2402, 2388, 2411, 2392, 2391, 2395, 2398, 2424, 2430, 2440, 2447, 2464]
# list_511 = [14.62, 18.13, 17.63, 15.01, 17.31, 13.45, 16.39, 15.17, 15.58, 14.62, 13.96, 16.06, 13.83]
# list_1274 = [8.36, 8.21, 8.14, 7.47, 11.84, 11.64, 19.12, 9.11, 36.43, 9.02, 8.48, 7.82, 9.42]
# list_FWHM = [10.83, 11.53, 11.41, 10.03, 13.84, 12.40, 16.90, 11.79, 22.35, 11.32, 10.87, 10.90, 11.31]

# df_cross_model = pd.DataFrame({"511":list_511,"1274":list_1274,"combined":list_FWHM},index=pd.Series(list_index, name="version")).rename_axis("FWHM",axis=1)

# # df_cross_model["pred[0.7,0.35]"] = f_surf(df_cross_model["511"], df_cross_model["1274"], 0.7, 0.35)
# # print(((df_cross_model["pred[0.7,0.35]"] - df_cross_model["combined"])**2).sum())
# # df_cross_model["pred[0.6,0.3]"] = f_surf(df_cross_model["511"], df_cross_model["1274"], 0.6, 0.3)
# # print(((df_cross_model["pred[0.6,0.3]"] - df_cross_model["combined"])**2).sum())
# try_0 = 0.4, 0.15, 1.38
# try_1 = 0.3, 0.09, 0.65
# try_2 = 0.25, 0.055, 1.03
# for try_ in [try_0, try_1, try_2]:
#     df_cross_model[f"pred{try_}"] = f_surf(df_cross_model["511"], df_cross_model["1274"], *try_)
#     print(((df_cross_model[f"pred{try_}"] - df_cross_model["combined"])**2).sum())

# display(df_cross_model)
# xx, yy = np.linspace(10, 20, 21), np.linspace(7, 37, 31)
# x,y = np.meshgrid(xx, yy)
# z_0 = f_surf(x, y, *try_0) #0.4, 0.18, -1
# z_1 = f_surf(x, y, *try_1)
# z_2 = f_surf(x, y, *try_2)
# # zz = np.linspace(z.min(), z.max(), 100)

# fig = go.Figure(data=[go.Surface(z=z_1, x=x, y=y)])
# fig.add_trace(go.Surface(z=z_2, x=x, y=y))
# fig.add_trace(go.Scatter3d(x=df_cross_model["511"],y=df_cross_model["1274"],z=df_cross_model["combined"]))
# # display(df_cross_model[f"pred{try_2}"] - df_cross_model["combined"])

In [None]:
# data_for_regressor.iloc[0, :] = labels_t
# display(data_for_regressor)
# data_to_predict = PCA(64).fit_transform(data_for_regressor.values.transpose())
#  # Combining labels and creating prediction Series
# s_labels_t = pd.Series(labels_t)
# shift = -round(s_labels_t.mean())
# s_labels_t.name = f"Initial data: dT {'-' if shift < 0 else '+'} {abs(shift)} ns"
# predicted_x = regressor.predict(data_to_predict)
# pred_s = pd.Series(labels_t - x_to_t(predicted_x),name="dT_act - dT_pred")

# # Add histogram with predicted vs actual data
# fig_hist = histogram(pd.concat([s_labels_t+shift,pred_s], axis=1), [-30,30,0.25], title="Arrival Time Histogram", xaxis_title="Time (ns)", yaxis_title="Prevalence")
# fig_hist.show()

In [None]:
# Run single experiment
model_type = "sklearn"  # set to "xgboost" to run the boosted-tree configuration instead

# Single-dataset alternative; it is superseded by the assignment right below.
# Other candidates: data511 / datag511unfiltered, data_g511_name.
data_and_name = (datag511partfltr, data_g511_partfltr_name)
# Selection of channels 0,2,5,8,9,10 from the multi-channel source (test: 1,3,7,11)
_train_channels = (0, 2, 5, 8, 9, 10)
data_and_name = (select_from_source(datag511partfltr_all, select_channels=list(_train_channels)),
                 name_g511_d(list(_train_channels)))
# data_temp_dict = {data_and_name[0]:data_and_name[1]}

# Keyword arguments that differ between the two model types.
if model_type == "xgboost":
    _single_exp_kwargs = dict(
        select_channels=[0],
        select_energies=(11050,11250),
        pca_components=None,
        model_type="xgboost",
        max_depth=50,
        n_estimators=3,
        max_leaves=0,
        test_size=0.2,
    )
else:
    _single_exp_kwargs = dict(
        select_channels=[],
        select_energies=(5000,13000),
        pca_components=None,
        model_type="sklearn",
        hidden_layers=[23],
        test_size=0.3,
        dT_correcting=True,
    )

# Both configurations share the data, its name and the uniform test set edges.
result_single_exp = run_experiment(
    data=data_and_name[0],
    data_name=data_and_name[1],
    uniform_test_set=[5000,6000,7000,8000,9000,10000,11000,12000], #,8000,9000,10000,11000]
    **_single_exp_kwargs,
)#._params
result_single_exp.show()

In [None]:
# Add the 5000-13000 energy selection of channels 0,2,5,8,9,10 to `data`;
# it is the dataset named in PCA_transform_on below.
data.update({"partfilter_g511_all_E5000-13000": select_from_source(datag511partfltr_all,select_channels=[0,2,5,8,9,10],select_energies=(5000,13000))})
# display(datag511partfltr_all)
check_ch = 7
# Same channel and energy window for both model versions.
for _model_version in (2994, 2995):
    plot_predictions(name_g511_d(check_ch), (11000,11500), _model_version, data, "MLPRegressorModel",
                     PCA_transform_on="partfilter_g511_all_E5000-13000").show()

### Run experiments...

In [None]:
# Base settings shared by every XGBoost run; the grid below overrides the per-experiment keys.
default_params = {"model_type": "XGBoost",
                  "select_channels": [0,23],
                  # "max_iterations": 2_000,
                  "remove_nan_waveforms": True,
                  "select_energies": (10300,12000), # DEFAULT for th06-th60: (11050,11250)
                  "include_energy": False,
                  "max_leaves": 0,
                  "pca_method": PCA,
                  "uniform_test_set": [5000,6000,7000,8000,9000,10000,11000,12000],
                  "test_size": 0.3}

# data = {"gBOTH_raw_normalized_partfltr_(4k,5k)(11.05k,11.25k)": data_dict["raw"]["normalized"]["FIR"]["gBOTH"]["range_cut"]} #{data_g511_unfiltered_name:datag511unfiltered} # {data_g1274_name: data1274} #{exp_data_unfiltered_name: exp_data_unfiltered}
data = {data_g511_partfltr_name:datag511partfltr}

#             data_g1274_partfltr_name_all:datag1274partfltr_all,
            #  data_g511_partfltr_name_all:datag511partfltr_all,
pca_components_list = [None] #[16,23,100,None]
estimators = [2,4] #[1,2,4]
depth = [40,50]
select_energies = [(8000,50000),(8000,13000)]
# One experiment per grid point, stored as a list:
# [data_name, pca_components, n_estimators, max_depth, select_energies]
exp_list = [list(combo) for combo in itertools.product(data, pca_components_list, estimators, depth, select_energies)]
print(exp_list)
iterations = [4]*len(exp_list)
print(len(exp_list),"experiments,",sum(iterations), f"iterations (={sum(iterations)/len(exp_list)}*{len(pca_components_list)}*{len(estimators)}*{len(depth)}*{len(select_energies)})")
results = {}
# break
# mlflow.set_tracking_uri("http://127.0.0.1:30000")
for exp, iters in zip(exp_list, iterations):
    data_name, pca_comp, n_trees, max_depth, energies = exp
    # Merge into a fresh dict so that default_params itself is never modified by the loop.
    params = default_params | {"pca_components": pca_comp, "n_estimators": n_trees,
                               "max_depth": max_depth, "select_energies": energies}
    result = {}
    for i in range(iters):
        print(f"[MLFlow run] Starting iteration {i+1}/{iters} with params {params}...")
        # NOTE(review): names without "raw" are looked up in data_dict rather than in `data`,
        # although every name comes from data.keys() -- confirm data_dict holds the same entries.
        source = data[data_name] if "raw" in data_name else data_dict[data_name]
        result[str(i)] = run_experiment(source, data_name, **params)._params
    # Keyed by the string form of the experiment list; one entry per iteration ("0", "1", ...).
    results[str(exp)] = result

In [None]:
# Base settings shared by every sklearn MLP run; the grid below adds the per-experiment keys.
default_params = {"model_type": "SKlearn",
                  "select_channels": [0,2,5,8,9,10],
                  "max_iterations": 2_000,
                  "remove_nan_waveforms": True,
                  # "select_energies":(9000,13000), # DEFAULT for th06-th60: (11050,11250)
                  "include_energy": False,
                  "activation": 'relu',
                  "pca_method": PCA,
                  "uniform_test_set": [5000,6000,7000,8000,9000,10000,11000,12000],
                  "test_size": 0.3,
                  "dT_correcting": True}

# data = {"gBOTH_raw_normalized_partfltr_(4k,5k)(11.05k,11.25k)": data_dict["raw"]["normalized"]["partfltr"]["gBOTH"]["range_cut"]} #{data_g511_unfiltered_name:datag511unfiltered} # {data_g1274_name: data1274} #{exp_data_unfiltered_name: exp_data_unfiltered}
# data = {data_g511_partfltr_name:datag511partfltr}
data_run = {name_g511_d([0,2,5,8,9,10]):select_from_source(datag511partfltr_all,select_channels=[0,2,5,8,9,10])}

pca_components_list = [None] #[100,None] #[16,18,20,21,22,23,64]
hidden_layers_list = [[100]] #[5,10,20,50]
select_energies = [(5000,13000)] #[(9000,50000),(8000,50000)] #[(10000,50000),(5000,13000),()]
# One experiment per grid point, stored as a list:
# [data_name, pca_components, hidden_layers, select_energies]
exp_list = [list(combo) for combo in itertools.product(data_run, pca_components_list, hidden_layers_list, select_energies)]
print(exp_list)
iterations = [1]*len(exp_list)
print(len(exp_list),"experiments,",sum(iterations), f"iterations (={sum(iterations)/len(exp_list)}*{len(pca_components_list)}*{len(hidden_layers_list)})")
results = {}
# break
mlflow.set_tracking_uri("http://127.0.0.1:5000") # 5000: local, 30000: external
iteration = 0  # running count over all experiments, used only in the progress message
for exp, iters in zip(exp_list, iterations):
    data_name, pca_comp, layers, energies = exp
    # Merge into a fresh dict so that default_params itself is never modified by the loop.
    params = default_params | {"pca_components": pca_comp, "hidden_layers": layers, "select_energies": energies}
    result = {}
    # if [exp[1]] == exp[2] == 22:
    #     print(f"Got layers {[exp[1]]} equal to {exp[2]}, skipping experiment")
    #     continue
    for i in range(iters):
        iteration += 1
        print(f"[MLFlow run] Starting iteration {i+1}/{iters} ({iteration}/{sum(iterations)}) with params {params}...")
        result[str(i)] = run_experiment(data_run[data_name], data_name, **params)._params
    # Keyed by the string form of the experiment list; one entry per iteration ("0", "1", ...).
    results[str(exp)] = result

In [None]:
# mlflow.set_tracking_uri("http://127.0.0.1:5000")
# {"SKLearnNN, Na22 Ch0":918309536924112984}
# Datasets handed to the prediction plots in this cell, keyed by their names.
data = dict((
    (data_g511_partfltr_name, datag511partfltr),
    (data_g511_partfltr_name_all, datag511partfltr_all),
    # ("partfilter_g511_E8000-50000", select_from_source(datag511partfltr,select_energies=(8000,50000))),
    # ("partfilter_g511_E5000-13000", select_from_source(datag511partfltr,select_energies=(5000,13000))),
    # ("partfilter_g511_E8000-13000", select_from_source(datag511partfltr,select_energies=(8000,13000))),
))
# get_waveforms(source_data=data, select_energies=range_E, include_energy=False)
# Best results:         abs                                 avg
# E5000-6000    13.358  8000-50000, 100-[23] @ 10.485       5000-13000, 100-[16] @ 10.71
# E6000-7000    11.841  9000-13000, 23-[100] @  9.227       9000-13000, 23-[100] @  9.61
# E7000-8000    10.374  9000-50000, 23-[23]  @  8.682       9000-50000, 23-[100] @  8.82
# E8000-9000    10.292  5000-13000, ALL-[100] @ 8.371       5000-13000, ALL-[100] @ 8.49
# E9000-10000    9.439  9000-13000, 23-[100] @  7.689       5000-13000, 100-[16] @  7.92
# E10000-11000  11.282  5000-13000, ALL-[100] @ 7.034       5000-13000, ALL-[100] @ 7.12
# E11000-12000   7.064  5000-13000, 100-[16] @  6.004       5000-13000, 100-[16] @  6.28

# Eleastsquares 2949:  8000-50000 100-[23]  @ 5.599, 2897:  5000-13000 ALL-[100] @ 5.608, 2851:  5000-13000 100-[16] @ 5.605
#                739:  8000-13000 16-4*40   @ 3.444,  748:  8000-13000 16-4*50   @ 3.475,  743:  8000-50000 16-4*50  @ 3.512
# Elinear       2897:  5000-13000 ALL-[100] @ 5.651, 2949:  8000-50000 100-[23]  @ 5.654, 2851:  5000-13000 100-[16] @ 5.662
#                748:  8000-13000 16-4*50   @ 4.367,  743:  8000-50000 16-4*50   @ 4.410,  739:  8000-13000 16-4*40  @ 4.447


# Column -> value pairs handed to boxplot() as ignore_vals.
ignore_values = dict([("Energy range used", "() eV")])
ignore_values1 = dict([("tree depth", None)])
# Initial y-axis choice: the FWHM column of the 1000-wide window starting at `energy`.
# It is replaced further down in this cell by one of the metric functions.
energy = 5000
y = "FWHM E{}-{}".format(energy, energy + 1000)

def square_FWHM_metric(df: pd.DataFrame) -> pd.Series:
    """Sum of squared, reference-normalised uniform-test FWHM values.

    For each of the seven 1000-wide energy windows between 5000 and 12000, the column
    ``metrics.Uniform test FWHM E<lo>-<hi>`` is divided by a hard-coded reference value
    for that window and squared; the seven terms are then added element-wise.

    NOTE(review): the reference values look like the "abs" column of the results table
    in the comments of this cell -- confirm.

    Args:
        df: Frame containing the seven ``metrics.Uniform test FWHM E...`` columns.

    Returns:
        Series with one metric value per row of ``df``.
    """
    reference_fwhm = {
        "5000-6000": 13.3579,
        "6000-7000": 11.841,
        "7000-8000": 10.3735,
        "8000-9000": 10.2916,
        "9000-10000": 9.4387,
        "10000-11000": 11.2823,
        "11000-12000": 7.0638,
    }
    total = 0
    for window, reference in reference_fwhm.items():
        ratio = df[f"metrics.Uniform test FWHM E{window}"] / reference
        total = total + ratio**2
    return total

def linear_FWHM_metric(df: pd.DataFrame) -> pd.Series:
    """Sum of reference-normalised uniform-test FWHM values.

    For each of the seven 1000-wide energy windows between 5000 and 12000, the column
    ``metrics.Uniform test FWHM E<lo>-<hi>`` is divided by a hard-coded reference value
    for that window; the seven ratios are then added element-wise.

    NOTE(review): the reference values look like the "abs" column of the results table
    in the comments of this cell -- confirm.

    Args:
        df: Frame containing the seven ``metrics.Uniform test FWHM E...`` columns.

    Returns:
        Series with one metric value per row of ``df``.
    """
    reference_fwhm = {
        "5000-6000": 13.3579,
        "6000-7000": 11.841,
        "7000-8000": 10.3735,
        "8000-9000": 10.2916,
        "9000-10000": 9.4387,
        "10000-11000": 11.2823,
        "11000-12000": 7.0638,
    }
    total = 0
    for window, reference in reference_fwhm.items():
        total = total + df[f"metrics.Uniform test FWHM E{window}"] / reference
    return total
# Attach the same FWHM attribute (7) to both metric functions.
# NOTE(review): presumably read by boxplot() when y is a callable -- confirm.
for _metric in (linear_FWHM_metric, square_FWHM_metric):
    _metric.FWHM = 7

y = square_FWHM_metric
# df_version_mapper = get_model_version_map([102816600889877627])
# display(df_version_mapper)
# display(get_model_version_map([918309536924112984]))

# One box plot per experiment id, each with its own grouping columns.
boxplot(
    [918309536924112984],
    x="Energy range used",
    y=y,
    color="PCA components",
    ignore_vals=ignore_values,
    facet_row="Hidden layers",
    height=700,
    hover_name="model_version",
).show()
boxplot(
    [102816600889877627],
    x="estimators",
    y=y,
    color="PCA components",
    ignore_vals=ignore_values1,
    facet_row="tree depth",
    facet_col="Energy range used",
    hover_name="model_version",
    height=800,
).show()
boxplot(
    [941575026271596123],
    x="PCA components",
    y=y,
    color="Hidden layers",
    height=700,
    hover_name="model_version",
).show()

# Versions plotted with an explicitly fitted PCA, followed by version 3033 without one.
for _model_version in (3032, 3028, 3000):
    plot_predictions(data_g511_partfltr_name, (), _model_version, data, "MLPRegressorModel",
                     PCA_fit=fitted_PCA(_model_version, waveforms)).show()
plot_predictions(data_g511_partfltr_name, (), 3033, data, "MLPRegressorModel").show()
# plot_predictions(data_g511_partfltr_name, (), 2949, data, "MLPRegressorModel", PCA_transform_on="partfilter_g511_E8000-50000").show()
# plot_predictions(data_g511_partfltr_name, (), 2897, data, "MLPRegressorModel", PCA_transform_on="partfilter_g511_E5000-13000").show()
# plot_predictions(data_g511_partfltr_name, (), 2851, data, "MLPRegressorModel", PCA_transform_on="partfilter_g511_E5000-13000").show()
# plot_predictions(data_g511_partfltr_name, (), 739, data, "XGBoostedTree", PCA_transform_on="partfilter_g511_E8000-13000").show()
# plot_predictions(data_g511_partfltr_name, (), 748, data, "XGBoostedTree", PCA_transform_on="partfilter_g511_E8000-13000").show()
# plot_predictions(data_g511_partfltr_name, (), 743, data, "XGBoostedTree", PCA_transform_on="partfilter_g511_E8000-50000").show()

In [None]:
# Predicting on other detectors
detector = 1
# Copy this detector's dataset from data_dict into `data` under the same name.
data.update({name_g511_d(detector): data_dict[name_g511_d(detector)]})
# predict_from_model(name_g511_d(detector), (), 2949, data, "MLPRegressorModel", PCA_transform_on="partfilter_g511_E8000-50000").show()
# predict_from_model(name_g511_d(detector), (), 2897, data, "MLPRegressorModel", PCA_transform_on="partfilter_g511_E5000-13000").show()
# predict_from_model(name_g511_d(detector), (), 2851, data, "MLPRegressorModel", PCA_transform_on="partfilter_g511_E5000-13000").show()
# predict_from_model(name_g511_d(detector), (), 739, data, "XGBoostedTree", PCA_transform_on="partfilter_g511_E8000-13000").show()
# predict_from_model(name_g511_d(detector), (), 748, data, "XGBoostedTree", PCA_transform_on="partfilter_g511_E8000-13000").show()
# predict_from_model(name_g511_d(detector), (), 743, data, "XGBoostedTree", PCA_transform_on="partfilter_g511_E8000-50000").show()

In [None]:
# Predicting on other detectors
detector = 2
# Copy this detector's dataset from data_dict into `data` under the same name.
data.update({name_g511_d(detector): data_dict[name_g511_d(detector)]})

# (model version, registered model name, dataset name passed as PCA_transform_on)
# NOTE(review): predict_from_model is not among the imports at the top of the notebook,
# while plot_predictions is -- confirm it is defined before running this cell.
_detector_models = (
    (2949, "MLPRegressorModel", "partfilter_g511_E8000-50000"),
    (2897, "MLPRegressorModel", "partfilter_g511_E5000-13000"),
    (2851, "MLPRegressorModel", "partfilter_g511_E5000-13000"),
    (739, "XGBoostedTree", "partfilter_g511_E8000-13000"),
    (748, "XGBoostedTree", "partfilter_g511_E8000-13000"),
    (743, "XGBoostedTree", "partfilter_g511_E8000-50000"),
)
for _version, _model_name, _pca_source in _detector_models:
    predict_from_model(name_g511_d(detector), (), _version, data, _model_name,
                       PCA_transform_on=_pca_source, xaxis_range=[-30,120]).show()

In [None]:
# Predicting on other detectors
detector = 3
# Copy this detector's dataset from data_dict into `data` under the same name.
data.update({name_g511_d(detector): data_dict[name_g511_d(detector)]})

# (model version, registered model name, dataset name passed as PCA_transform_on)
# NOTE(review): predict_from_model is not among the imports at the top of the notebook,
# while plot_predictions is -- confirm it is defined before running this cell.
_detector_models = (
    (2949, "MLPRegressorModel", "partfilter_g511_E8000-50000"),
    (2897, "MLPRegressorModel", "partfilter_g511_E5000-13000"),
    (2851, "MLPRegressorModel", "partfilter_g511_E5000-13000"),
    (739, "XGBoostedTree", "partfilter_g511_E8000-13000"),
    (748, "XGBoostedTree", "partfilter_g511_E8000-13000"),
    (743, "XGBoostedTree", "partfilter_g511_E8000-50000"),
)
for _version, _model_name, _pca_source in _detector_models:
    predict_from_model(name_g511_d(detector), (), _version, data, _model_name,
                       PCA_transform_on=_pca_source, xaxis_range=[-30,120]).show()

In [None]:
# Check on part of set of detector 3
# Same six models as in the other detector cells, restricted to the (5000,50000) energy selection.
# (model version, registered model name, dataset name passed as PCA_transform_on)
# NOTE(review): predict_from_model is not among the imports at the top of the notebook,
# while plot_predictions is -- confirm it is defined before running this cell.
_detector_models = (
    (2949, "MLPRegressorModel", "partfilter_g511_E8000-50000"),
    (2897, "MLPRegressorModel", "partfilter_g511_E5000-13000"),
    (2851, "MLPRegressorModel", "partfilter_g511_E5000-13000"),
    (739, "XGBoostedTree", "partfilter_g511_E8000-13000"),
    (748, "XGBoostedTree", "partfilter_g511_E8000-13000"),
    (743, "XGBoostedTree", "partfilter_g511_E8000-50000"),
)
for _version, _model_name, _pca_source in _detector_models:
    predict_from_model(name_g511_d(3), (5000,50000), _version, data, _model_name,
                       PCA_transform_on=_pca_source, xaxis_range=[-30,120]).show()

### Visualise experiment results

In [None]:
# Reshape the nested `results` dict (experiment string -> iteration string -> params)
# into a DataFrame of "sigma" values, in four steps.
#
# Step 1: per experiment, build a list of single-entry dicts {param_key: Series}, where the
# Series collects entry[param_key]["sigma"] over iterations "0".."len-1". Keys are taken from
# iteration "0"; keys containing "data" are skipped.
exp_sigma_lists = lambda dic: {k:[{d_k:pd.Series([d[str(i)][d_k]["sigma"] for i in range(len(d))])} for d_k in d["0"].keys() if not "data" in d_k] for k,d in dic.items()}
# Step 2: merge the first two single-entry dicts of each experiment and rename their keys to
# "test" (key contains "test") or "train" (anything else). Only ls[0] and ls[1] are used.
exp_results = lambda dic: {k:{"test" if "test" in d_k else "train": ls_d for d_k,ls_d in (ls[0]|ls[1]).items()} for k,ls in dic.items()}
# Step 3: regroup by the bracketed substring cut from each experiment string (from its last "["
# to its first "]"); the inner key is the slice between the first ", " and two characters
# before the next "[".
# NOTE(review): the slicing assumes experiment strings containing a nested list, such as the
# hidden-layers entry of the sklearn experiments -- confirm for other experiment layouts.
exp_res = lambda dic: {key: {k[k.find(", ")+2: k.find("[",k.find(", "))-2]:v for k,v in dic.items() if key in k} for key in {ky[ky.rfind("["):ky.find("]")+1] for ky in dic.keys()}}
# Step 4: flatten into a list of Series named "<group>_<inner key>_<test|train>".
series_ls = lambda dic: [pd.Series(s, name=n) for n,s in {k+"_"+i+"_"+j: v[i][j] for k,v in dic.items() for j in ["test","train"] for i in v.keys()}.items()]
# One row per named Series; columns are the iteration positions.
df_results = pd.DataFrame(series_ls(exp_res(exp_results(exp_sigma_lists(results)))))
# df_results.index = pd.Index(["["+i[i.rfind("'")+3:i.rfind("]")+1]+i[i.rfind("_"):] for i in df_results.index])
display(df_results)
print(df_results.index)
# df_results.to_csv("data/results/NN_TruncSVC_testing_gBOTHpartfltr_4000-11250.csv")
# Factor 2*sqrt(2*ln 2); stored CSV values are multiplied by it below to fill the "FWHM" columns.
mult = 2*np.sqrt(2*np.log(2))
# break
# Stored XGBoost results: one box plot per CSV file.
# Each spec is (csv path, px.box arguments specific to the plot, layout arguments specific to the plot).
_xgb_plot_specs = (
    ("data/results/XGBoost_g511unfiltered_10350-14000.csv",
     {"x": "PCA components"},
     {"title": "Box plots of XGBoost hyperparameter optimization (g511 & g1274 unfiltered)"}),
    ("data/results/XGBoost2_g511unfiltered_10350-14000.csv",
     {"x": "Max tree depth", "facet_row": "PCA components"},
     {"title": "Box plots of XGBoost hyperparameter optimization (g511unfiltered, 10350<E<14000)", "height": 750}),
)
for _csv_path, _box_args, _layout_args in _xgb_plot_specs:
    df = pd.read_csv(_csv_path).set_index("experiment")
    # The experiment label is cut at its first/last comma and first "]" into the three
    # hyper-parameter fields; every non-NaN value of the row becomes one record, scaled by `mult`.
    _records = []
    for _label, _row in df.iterrows():
        _first_comma, _last_comma = _label.find(","), _label.rfind(",")
        _fields = (_label[1:_first_comma],
                   _label[_first_comma+1:_last_comma],
                   _label[_last_comma+1:_label.find("]")],
                   "test" if "test" in _label else "train")
        _records.extend(_fields + (_val*mult,) for _val in _row.values if not pd.isna(_val))
    df_new = pd.DataFrame(data=_records,
                          columns=["PCA components","# estimators","Max tree depth","set","FWHM"])
    _fig = px.box(df_new, y="FWHM", color="# estimators", facet_col="set",
                  hover_data=["set","# estimators"], points="all", **_box_args)
    _fig.update_traces(boxmean=True).update_layout(**_layout_args).show()

def _pca_result_records(frame):
    """Turn a stored PCA-testing result frame into flat records.

    The index label is split at its first and last underscore into two fields; every
    non-NaN value of the row yields one record ``(field1, field2, set, value*mult)``,
    where ``set`` is "test" if the label contains "test" and "train" otherwise.
    """
    records = []
    for label, row in frame.iterrows():
        first_us, last_us = label.find("_"), label.rfind("_")
        fields = (label[:first_us], label[first_us+1:last_us], "test" if "test" in label else "train")
        records.extend(fields + (value*mult,) for value in row.values if not pd.isna(value))
    return records


# Test-set box plots that share one layout: (csv path, figure title).
_pca_plot_specs = (
    ("data/results/PCA_testing_g1274_4000-5000.csv", "Box plots of PCA analysis (g1274, 4000<E<5000)"),
    ("data/results/PCA_testing_g511_10350-14000.csv", "Box plots of PCA analysis (g511, 10350<E<14000)"),
    ("data/results/PCA_testing_g511_11050-11250.csv", "Box plots of PCA analysis (g511, 11050<E<11250)"),
    ("data/results/PCA_testing_g511unfiltered_11050-11250.csv", "Box plots of PCA analysis (g511 unfiltered, 11050<E<11250)"),
    ("data/results/PCA_testing_2-128_th60.csv", "Box plots of PCA analysis (th60)"),
)
for _csv_path, _title in _pca_plot_specs:
    df = pd.read_csv(_csv_path).set_index("experiment")
    df_new = pd.DataFrame(data=_pca_result_records(df),
                          columns=["Hidden layers","PCA components","set","FWHM"])
    _fig = px.box(df_new[df_new["set"] == "test"], x="PCA components", y="FWHM", color="Hidden layers",
                  hover_data=["set"], points="all")
    _fig.update_traces(boxmean=True).update_layout(title=_title).show()

# Combined g511 & g1274 results: the value column is named "sigma" here (axis still titled
# "FWHM"), and both the test and the train subset are plotted.
df = pd.read_csv("data/results/PCA_testing_gBOTH_4000-11250.csv").set_index("experiment")
df_new = pd.DataFrame(data=_pca_result_records(df),
                      columns=["Hidden layers","PCA components","set","sigma"])
for _subset in ("test", "train"):
    _fig = px.box(df_new[df_new["set"] == _subset], x="PCA components", y="sigma", color="Hidden layers",
                  hover_data=["set"], points="all")
    _fig.update_traces(boxmean=True).update_layout(
        title=f"Box plots of PCA analysis g511 & g1274 COMBINED ({_subset})", yaxis_title="FWHM").show()

In [None]:
# Split the waveforms in data_df into train and test sets, using a condition on the
# mean of each column's last 10 rows.

# Boolean per column: mean of the last 10 rows is below 1.011.
# Both names hold the same expression; the parentheses in the second are redundant.
condition_final_max1 = data_df[-10:].mean() < 1.011
condition_final_max2 = (data_df[-10:].mean() < 1.011)
# display(data_df)
# Shape of the columns that satisfy the condition, once via .loc and once via the transposed array.
display(data_df.loc[:,condition_final_max2].shape)
print(data_df.values.transpose()[condition_final_max1].shape)
# Parse a float from every column name: spaces are removed, then the text between the first "]"
# and the first "dT" is converted.
# NOTE(review): assumes column names shaped like "[<index>] <value> dT..." -- confirm.
labels_t = np.array([float([s[s.find("]")+1:s.find("dT")] for s in [col.replace(" ","")]][0]) for col in data_df.columns])
labels_x = t_to_x(labels_t)
# Integer found between "[" and "]" in every column name.
wave_i = np.array([int(col[col.find("[")+1:col.find("]")]) for col in data_df.columns])
# wave_i_map
# Chained assignment: the six returned items are kept together as a tuple (train_test_list)
# and also unpacked into separate names.
# NOTE(review): `condition` and `add_removed_to_test` are handled inside train_test_split_cond;
# presumably waveforms failing the condition are moved to the test set -- verify in gewapro.preprocessing.
train_test_list = d_train, d_test, wave_i_train, wave_i_test, l_t_train, l_t_test = train_test_split_cond(data_df.values.transpose(), wave_i, labels_t, test_size=0.5, random_state=42, condition=condition_final_max1, add_removed_to_test=True)
# Training data transposed back so that each column is labelled with its parsed wave index.
df_train=pd.DataFrame(data=d_train.T, columns = wave_i_train)
display(df_train)

print(data_df.shape)
# Shapes of all six split items, printed without the surrounding list brackets.
print(str([item.shape for item in train_test_list]).strip("[]"))


In [None]:
waveform = [18527] #4066, 13873, 15418, 5073, 6122, 15841, 9151, 18527
wave_df, imap = get_waveforms(waveform, source_data=data06, get_indices_map=True)


def col_for_str(df, col):
    """Return the first column name of ``df`` that contains the substring ``col``."""
    return [c for c in df.columns if col in c][0]


# Fit the selected waveform and show the table of fit results.
df_new, df_res = df_with_fits(wave_df, 0.2, return_results=True, exponent_fit=True, y_min=0.7)
display(df_res)

# The result row is indexed by the string form of the waveform list.
_fit_row = df_res.loc[str(waveform)]
x0, xT, x_cutoff = (_fit_row[_column] for _column in ("x0", "xT", "x_cutoff"))

_plot_title = f"Plot of waveform {waveform} with fitted parabola (start-end: {x0:.2f}-{x_cutoff:.0f}, xT @ {xT:.3f})"
fig2 = df_new.iplot(title=_plot_title, asFigure=1)
# Vertical marker at xT.
fig2.add_vline(x=xT, line_color="orange").show()

# Legend for the columns of df_res, rendered as HTML lines.
_column_legend = {
    "a": "parabola constant",
    "x0": "peak of parabola",
    "x_cutoff": "end of parabola",
    "var": "parabola and line fit variance",
    "x0_cutoff_var": "variance from x0 to x_cutoff (parabola range)",
    "xT": "x position of dT",
    "yT_fit": "y position of dT according to fitted parabola",
    "final10_slope": "Slope of the final 10 data points",
    "b": "Final exponential fit base parameter",
    "c": "x value where exp. fit is zero",
    "exp_start": "Start of the exponential fit",
    "exp_var": "Variance of the exp. fit",
}
_legend_lines = [f"<b>{k}</b>: {v}" for k, v in _column_legend.items()]
display(HTML("<br>".join(_legend_lines)))

In [None]:
# Load all waveforms, obtain the fit results (freshly computed or from the cached parquet file)
# and refit the waveforms whose fit variance is too high.
start = datetime.now()
new_max_col = 20000
df = get_waveforms(0, len(data06), source_data=data06) #len(data0)

# True: read previously stored fit results instead of refitting every waveform.
RESULTS_FROM_CACHE = True

print(datetime.now() - start, "elapsed while getting waveforms")
if RESULTS_FROM_CACHE is False:
    df_new, df_results = df_with_fits(df, 0.2, return_results=True, force_max_col=new_max_col, exponent_fit=True, y_min=0.7)
    print(datetime.now() - start, "elapsed while getting waveforms & fitting parabolas")
else:
    df_results = pd.read_parquet("results_df_with_fits_exp.parquet")
    print(datetime.now() - start, "elapsed while getting waveforms fit parabolas results")

start = time.perf_counter()
# Index labels of the low-accuracy fits, computed once instead of once per column of df.
low_acc_labels = list(df_results[df_results["x0_cutoff_var"] > 0.00002].index)
# Keep every waveform column whose name contains at least one of those labels.
low_acc_columns = [col for col in df.columns if any(label in col for label in low_acc_labels)]
recalc_low_acc_results = fit_parabolas(df[low_acc_columns], 0.1, force_max_col=new_max_col)
end = time.perf_counter()
print(f"Elapsed for recalculating inaccurate ones (after compilation) = {end - start:.3f}s")

In [None]:
# df_results.to_parquet("results_df_with_fits_exp_big_set.parquet")

In [None]:
# Subsets of the fit results: rows with x0_cutoff_var above 2e-5, and rows whose
# x_cutoff lies below xT.
low_acc_results = df_results.loc[df_results["x0_cutoff_var"] > 0.00002]
past_cutoff_results = df_results.loc[df_results["x_cutoff"] < df_results["xT"]]
# slope_results = df_results[df_results["final10_slope"] > 0.013]
notable_results = pd.concat([low_acc_results, past_cutoff_results], axis=0)
# Replace the low accuracy results with the new 'recalc_low_acc_results' values
updated_results = pd.concat(
    [df_results.loc[df_results["x0_cutoff_var"] <= 0.00002], recalc_low_acc_results],
    axis=0,
)

# display(df_results["[190]":"[191]"])
# display(recalc_low_acc_results)
# display(low_acc_results)
# display(parabola_results)
# df_new[[col for col in df_new.columns if sum([i in col for i in notable_results.index]+["[190]" in col]) >= 1]].iplot()
kwargs = dict(yaxis_range=[-23, 35], xaxis_range=[0, .0014])


def _linear(x, m, b):
    """Straight line m*x + b, used as the model for curve_fit."""
    return m*x + b


# Fit (xT - x0) as a linear function of a, with bounds on slope and intercept.
fig0 = corr_fig(df_results, [], (None, 2e-5), **kwargs)
popt0, pcov0 = curve_fit(
    _linear,
    df_results["a"],
    df_results["xT"]-df_results["x0"],
    bounds=([-10000, 1], [10, 20]),
)
print("df_results:", popt0, pcov0, "\nx0,y0 - x1,y1:", (0,popt0[1]), (0.001, popt0[0]/1000+popt0[1]))

# The fitted line drawn from x = 0 to x = 0.001; defined once and reused by both figures.
fit_line_shape = {
    'type': 'line',
    'yref': 'y',
    'xref': 'x',
    'y0': popt0[1],
    'y1': popt0[0]/1000+popt0[1],
    'x0': 0,
    'x1': 0.001,
}
fig0.update_layout(shapes=[fit_line_shape])
# fig0.show()

(
    corr_fig(past_cutoff_results, **kwargs)
    .update_layout(title="Past cutoff results", shapes=[fit_line_shape])
    .show()
)
# corr_fig(slope_results, **kwargs).update_layout(title="Final slope results",shapes = [{'type': 'line', 'yref': 'y', 'xref': 'x', 'y0': popt0[1], 'y1': popt0[0]/1000+popt0[1], 'x0': 0, 'x1': 0.001}]).show()
# corr_fig(low_acc_results, **kwargs).update_layout(shapes = [{'type': 'line', 'yref': 'y', 'xref': 'x', 'y0': popt0[1], 'y1': popt0[0]/1000+popt0[1], 'x0': 0, 'x1': 0.001}]).show()

# fig2 = corr_fig(updated_results, [], (None, 2e-5), **kwargs)
# popt1, pcov1 = curve_fit(lambda x, m, b: m*x + b, updated_results["a"], updated_results["xT"]-updated_results["x0"], bounds=([-10000, 1], [10, 20]))
# print("updated_results:", popt1, pcov1, "\nx0,y0 - x1,y1:", (0,popt1[1]), (0.001, popt1[0]/1000+popt1[1]))
# fig2.update_layout(shapes = [{'type': 'line', 'yref': 'y', 'xref': 'x', 'y0': popt1[1], 'y1': popt1[0]/1000+popt1[1], 'x0': 0, 'x1': 0.001}])
# fig2.show() #  "line_dash":"dash"


In [None]:
# Add the "exp_x_99%" column: log base b of 100, offset by c (b and c are the
# exponential-fit columns of df_results).
log_b_of_100 = np.log(100) / np.log(df_results["b"])
df_results["exp_x_99%"] = log_b_of_100 + df_results["c"]
display(df_results)
# print([] or None or {} or 0.0 or [None])

In [None]:
# Example to see training data structure
used_data, data_name = data4, data4_name
data_df = get_waveforms(0, len(used_data), source_data=used_data, get_indices_map=False)#[10:]
display(data_df.head())
# Per-waveform mean of the row-to-row differences over the last 10 rows.
data_df_last_10 = data_df[-10:].diff().mean().values #_fit_final_slope(data_df.values, data_df.columns).flatten()
print(len(data_df_last_10),"long array",data_df_last_10)#,"\n", _fit_final_slope(data_df.values, data_df.columns).flatten())
# One row per waveform (data_df holds one column per waveform).
data = data_df.values.transpose()

# Column names carry the waveform index in brackets followed by the dT value:
# the number between "]" and "dT" (spaces ignored) is the label, the number
# between "[" and "]" is the waveform index.
compact_columns = [col.replace(" ", "") for col in data_df.columns]
labels_t = np.array([float(s[s.find("]")+1:s.find("dT")]) for s in compact_columns])
labels_x = t_to_x(labels_t)
wave_i = np.array([int(col[col.find("[")+1:col.find("]")]) for col in data_df.columns])

cutoff_xT = -10
cutoff_i_slope = 0.01

# Mean row-to-row difference over the first 11 rows (10 differences), computed
# once and reused for both bounds of the initial-slope condition.
initial_slope = data_df[:11].diff().mean()

condition_skimmed = labels_x > cutoff_xT
condition_i_slope = (0 <= initial_slope) & (initial_slope <= cutoff_i_slope)
condition_f40 = data_df[-40:].mean() < 1.125
print("Skimmed condition length:",sum(condition_skimmed))
print(f"Initial slope first 10 values <={cutoff_i_slope} avg. condition length:",sum(condition_i_slope))

# Each condition is a boolean mask of the waveforms to KEEP.
available_conditions = {
    f"xT > {cutoff_xT}": condition_skimmed,
    f"0 <= Initial slope <= {cutoff_i_slope}": condition_i_slope,
    "Final values < 1.125": condition_f40}
# Positions (in available_conditions order) of the conditions that are applied.
used_conditions = [1,2]

condition_names = list(available_conditions)
condition_masks = list(available_conditions.values())
applied_conditions = combine_and(*[condition_masks[i] for i in used_conditions])
applied_conditions_names = [condition_names[i] for i in used_conditions]

data_conditioned = data[applied_conditions] if used_conditions else data
labels_conditioned = labels_x[applied_conditions] if used_conditions else labels_x
print("Applied conditions:",applied_conditions_names)
print("Applied conditions length:", len(data_conditioned) )

# Waveform index -> rounded label for the waveforms each condition removes.
removed_dict = {k:round(v,2) for (k,v) in zip(wave_i[~condition_skimmed], labels_x[~condition_skimmed])}
removed_dict1 = {k:round(v,2) for (k,v) in zip(wave_i[~condition_i_slope], labels_x[~condition_i_slope])}
removed_dict2 = {k:v for k,v in removed_dict.items() if v > 10}
removed_dict3 = {k:round(v,2) for (k,v) in zip(wave_i[~condition_f40], labels_x[~condition_f40])}
# if removed_dict2:
#     data_df[[col for col in data_df.columns if int(col[col.find("[")+1:col.find("]")]) in removed_dict2.keys()][:100]].iplot(title=f"Removed waveforms (xT <= {cutoff_xT})")
# elif removed_dict:
#     data_df[[col for col in data_df.columns if int(col[col.find("[")+1:col.find("]")]) in removed_dict.keys()][:100]].iplot(title=f"Removed waveforms (xT <= {cutoff_xT})")

# Plot at most 100 removed waveforms per condition. The titles state the criterion
# that the REMOVED waveforms satisfy, i.e. the complement of the keep-condition.
data_df[[col for col, i in zip(data_df.columns, wave_i) if i in removed_dict1][:100]].iplot(title=f"Removed waveforms (initial slope < 0 or > {cutoff_i_slope})")
data_df[[col for col, i in zip(data_df.columns, wave_i) if i in removed_dict3][:100]].iplot(title="Removed waveforms (Final 40 >= 1.125)")

In [None]:
experiment_name = "Sklearn NN, Na22 th.ALL Ch0"
# run_name = f"NN{str(regr.hidden_layer_sizes).replace(',',' ')}_{(l_train.shape[0]+l_test.shape[0])/len(data_df.columns):.2%}_{datetime.now().strftime('%Y%m%d_%H%M%S')[2:]}"
# print(run_name)

def save_experiment(exp_name, run_name, data_df, data_name, train_set, test_set, train_labels_t, test_labels_t, applied_conditions_names, fitted_regressor, pca_components):
    """Log a fitted regressor, its metrics, figures and datasets as one MLflow run.

    A histogram of the residuals (actual dT minus predicted dT) is built and
    shown for the train and test sets, after which parameters, metrics, the
    histogram, the loss curve, both datasets and the model itself are logged.

    Args:
        exp_name: Name of the MLflow experiment to activate.
        run_name: Name given to the MLflow run.
        data_df: Source DataFrame; only its number of columns is used, as the
            number of available waveforms.
        data_name: File name of the data set; joined onto ``./data`` for the
            dataset source path and used as prefix of the dataset names.
        train_set: Model inputs of the training set.
        test_set: Model inputs of the test set.
        train_labels_t: Training labels as passed to ``t_to_x``.
        test_labels_t: Test labels as passed to ``t_to_x``.
        applied_conditions_names: Names of the conditions applied to the data;
            an empty or missing value is logged as ``"None"``.
        fitted_regressor: Fitted regressor exposing the attributes read below
            (``hidden_layer_sizes``, ``loss_curve_``, ``best_loss_``, ...).
        pca_components: Number of PCA components, logged as a parameter.
    """
    mlflow.set_experiment(exp_name)
    regr = fitted_regressor

    source_path = os.path.join(os.path.abspath("./data"),data_name)
    dataset_train: NumpyDataset = from_numpy(train_set, source=source_path, name=data_name+" train", targets=t_to_x(train_labels_t))
    dataset_test: NumpyDataset = from_numpy(test_set, source=source_path, name=data_name+" test", targets=t_to_x(test_labels_t))

    # Add histogram with predicted vs actual data
    s_labels_t = pd.Series(np.append(train_labels_t,test_labels_t))
    # Shift the initial labels by their rounded mean for the histogram.
    shift = -round(s_labels_t.mean())
    s_labels_t.name = f"Initial data: dT {'-' if shift < 0 else '+'} {abs(shift)} ns"
    pred_s_train = pd.Series(train_labels_t - x_to_t(regr.predict(train_set)),name="dT_act - dT_pred (train)")
    pred_s_test = pd.Series(test_labels_t - x_to_t(regr.predict(test_set)),name="dT_act - dT_pred (test)")
    fig, params = histogram(pd.concat([s_labels_t+shift,pred_s_train,pred_s_test], axis=1), [-30,30,0.25], True, title="Arrival Time Histogram", xaxis_title="Time (ns)", yaxis_title="Frequency")
    # FWHM = 2*sqrt(2*ln 2) * sigma; params[name][2] is presumably the fitted
    # standard deviation of that series -- TODO confirm against `histogram`.
    sigma_to_fwhm = 2*np.sqrt(2*np.log(2))
    fwhm_train = sigma_to_fwhm * params[pred_s_train.name][2]
    fwhm_test = sigma_to_fwhm * params[pred_s_test.name][2]
    fig.show()

    # Position of the best loss in the loss curve; looked up once and reused
    # for the loss-curve marker and the logged metric.
    best_loss_epoch = regr.loss_curve_.index(regr.best_loss_)

    # str([]) is the truthy string "[]", so the fallback has to test the value
    # itself rather than its string form.
    if applied_conditions_names:
        conditions_param = str(applied_conditions_names).replace(",","  ")
    else:
        conditions_param = "None"

    with mlflow.start_run(run_name=run_name) as mlflow_run:
        mlflow.set_experiment_tag("BaseModel","SKLearn Neural Network MLPRegressor")
        mlflow.log_params({
            "PCA components": pca_components,
            "Hidden layers": str(regr.hidden_layer_sizes).replace(","," "), 
            "Activation function":regr.activation,
            "Solver": regr.solver,
            "Alpha": regr.alpha,
            "Max epochs": regr.max_iter,
            "Waveforms used": f'{train_labels_t.shape[0]+test_labels_t.shape[0]} / available {len(data_df.columns)}',
            "Data train - data test shapes": f'{train_labels_t.shape} - {test_labels_t.shape}'.replace("(","/").replace(")","/").replace(","," "),
            "Applied conditions": conditions_param
        })
        mlflow.log_figure(fig, "PredictionHistogram.html")
        fig = go.Figure(go.Scatter(y=regr.loss_curve_,name="Loss Curve"))
        fig.add_trace(go.Scatter(x=[best_loss_epoch],y=[regr.best_loss_],name="Loss minimum"))
        fig.update_layout(title='Loss curve plot',
                        xaxis_title="Epoch",
                        yaxis_title="Loss")
        mlflow.log_figure(fig, "LossCurve.html")
        mlflow.log_metrics({
            "FWHM Train": fwhm_train,
            "FWHM Test": fwhm_test,
            "Loss final": regr.loss_,
            "Loss min.": regr.best_loss_, #f"{regr.best_loss_:.2f} - {str(regr.loss_curve_.index(regr.best_loss_))}"
            "Loss min. epoch": best_loss_epoch,
            "Validation score R2": regr.score(test_set, t_to_x(test_labels_t)),
            "Iterations/epochs": regr.n_iter_,
            "t": regr.t_,
            "Train mean":pred_s_train.mean(),
            "Train RMS": pred_s_train.std(),
            "Test mean": pred_s_test.mean(),
            "Test RMS": pred_s_test.std()
        })
        mlflow.log_input(dataset_train, context="training")
        mlflow.log_input(dataset_test, context="testing")
        # mlflow.log_artifact("model.png","model_plot")
        mlflow.sklearn.log_model(regr, "MLPRegressorModel")
        # mlflow.keras.log_model(regr, "MLPRegressorModel")
        print("MLFlow run ID:", mlflow_run.info.run_id)

In [None]:
# Fourth (name, data) pair of data_dict, for single-dataset runs.
# NOTE(review): the loop below iterates data_dict.items() and does not use this.
iterate_through = [list(data_dict.items())[3]] # data_dict.items()

# Candidate hidden-layer layouts, kept for reference; they do not change per data set,
# so they are defined once here. The loop below currently trains only [[16]].
single_layers = [[3], [4], [8], [12], [16], [32], [64]] # 7 * 20s = ~2 mins (All 7 of them take 14 mins)
dual_layers = [[3,3], [4,4], [8,8], [4,8], [8,4], [4,16], [8,16], [16,4], [16,8], [8,32], [32,8], [16,16], [16,16], [16,64], [64,16]] # 15 * 2 mins = ~30 mins
triple_layers = []

for data_name, used_data in data_dict.items():
    for hidden_layers in [[16]]:
        print("Running experiment:",data_name,hidden_layers,"...")
        data_df = get_waveforms(0, len(used_data), source_data=used_data, get_indices_map=False)#[10:]
        # display(data_df.head())
        # data_df_last_10 = data_df[-10:].diff().mean().values #_fit_final_slope(data_df.values, data_df.columns).flatten()
        # print(len(data_df_last_10),"long array",data_df_last_10)#,"\n", _fit_final_slope(data_df.values, data_df.columns).flatten())
        # One row per waveform; labels and waveform indices are parsed from the column names.
        data = data_df.values.transpose()
        labels_t = np.array([float([s[s.find("]")+1:s.find("dT")] for s in [col.replace(" ","")]][0]) for col in data_df.columns])
        labels_x = t_to_x(labels_t)
        wave_i = np.array([int(col[col.find("[")+1:col.find("]")]) for col in data_df.columns])

        # Reduce every waveform to n_components PCA components (fitted on the full data set).
        n_components = 64
        model = PCA(n_components)
        data_trans = model.fit_transform(data)
        pca_var_ratio = model.explained_variance_ratio_

        # past_cutoff_series = df_results["x_cutoff"] - df_results["xT"]
        # par_fit_var_series = df_results["x0_cutoff_var"]
        # total_var = df_results["var"]
        # exp_99_series = df_results["exp_x_99%"]
        # b_series = df_results["b"]
        # c_series = df_results["c"]
        # colors = {"before<br>cutoff":past_cutoff_series.values}|{s.name:s.values for s in [par_fit_var_series,total_var,exp_99_series,b_series,c_series]}|{"slope<br>last 10":data_df_last_10}
        # past_cutoff = 3

        # conditions_sk2 = (labels_x > cutoff_xT) & (past_cutoff_series > past_cutoff)

        # No `conditions` argument is passed, so no selection conditions are applied to this split.
        d_train, d_test, l_train, l_test, l_train_t, l_test_t, wi_train, wi_test = train_test_split_cond(data_trans, labels_x, labels_t, wave_i, test_size=0.5, 
                                                                                                        random_state=42)

        # d_train2, d_test2, l_train2, l_test2, l_train_t2, l_test_t2, wi_train2, wi_test2 = train_test_split_cond(data_trans, labels_x, labels_t, wave_i, test_size=0.5, 
        #                                                                                                           random_state=42, conditions=conditions_sk2)
        print("train, test shapes:",l_train.shape, l_test.shape, " -> total data set of", l_train.shape[0]+l_test.shape[0], "out of possible",len(data_df.columns))
        plot_transform(data_trans, n_components, 1)

        regr = MLPRegressor(hidden_layer_sizes = hidden_layers,
                            activation = "relu",
                            solver = "adam",
                            alpha = 1e-4,
                            max_iter = 10000)


        # regr2 = MLPRegressor(hidden_layer_sizes = [16],
        #                     activation = "relu",
        #                     solver = "adam",
        #                     alpha = 1e-4,
        #                     max_iter = 1000)

        # training model
        regr.fit(d_train, l_train)

        # training model
        # regr2.fit(d_train2, l_train2)

        # Histogram of the shifted initial labels and of the train/test residuals.
        s_labels_t = pd.Series(np.append(l_train_t,l_test_t))
        shift = -round(s_labels_t.mean())
        s_labels_t.name = f"Initial data: dT {'-' if shift < 0 else '+'} {abs(shift)} ns"

        pred_s_train = pd.Series(l_train_t - x_to_t(regr.predict(d_train)),name="dT_act - dT_pred (train)")
        pred_s_test = pd.Series(l_test_t - x_to_t(regr.predict(d_test)),name="dT_act - dT_pred (test)")

        fig, params = histogram(pd.concat([s_labels_t+shift,pred_s_train,pred_s_test], axis=1), [-30,30,0.25], True, title="Arrival Time Histogram", xaxis_title="Time (ns)", yaxis_title="Prevalence")
        fwhm_train = 2*np.sqrt(2*np.log(2)) * params[pred_s_train.name][2]
        fwhm_test = 2*np.sqrt(2*np.log(2)) * params[pred_s_test.name][2]
        fig.show()

        # Derive a short signature from the tail of the file name (extension dropped)
        # to pick the experiment name.
        name_signature = data_name[-15:][data_name[-16:].find("-"):-4]
        experiment_name = f"Sklearn NN, Na22 {'th.'+name_signature[-2:] if 'ecf' in name_signature else name_signature} Ch0"
        # Single quotes inside the replacement field keep this f-string valid on
        # Python versions older than 3.12 as well.
        run_name = f"NN{str(regr.hidden_layer_sizes).replace(',',' ')}_{(l_train.shape[0]+l_test.shape[0])/len(data_df.columns):.2%}_{datetime.now().strftime('%Y%m%d_%H%M%S')[2:]}"

        print("Saving experiment:",experiment_name,", with run:",run_name)
        # An empty list is logged for the applied conditions because this split used
        # none; the global `applied_conditions_names` belongs to a different cell and
        # data set and would be recorded here as if it had been applied.
        save_experiment(exp_name=experiment_name,
                        run_name=run_name,
                        data_df=data_df,
                        data_name=data_name,
                        train_set=d_train,
                        test_set=d_test,
                        train_labels_t=l_train_t,
                        test_labels_t=l_test_t,
                        applied_conditions_names=[],
                        fitted_regressor=regr,
                        pca_components=n_components
                    )

In [None]:
# Toggle for displaying the two prediction figures built at the end of this cell.
show_graphs = False

# past_cutoff_results = df_results[df_results["x_cutoff"] < df_results["xT"]]
# display(df_results[df_results["xT"] > cutoff_xT])
# display(past_cutoff_series_skimmed) # Positive is good, negative is bad
print(f"20 t is {t_to_x(20)} x, 60 t is {t_to_x(60)} x, -20 t is {t_to_x(-20)} x, 160 t is {t_to_x(160)} x, 186 t is {t_to_x(186)} x")
print("Dataset sizes:")
n_skimmed = len(data_trans[condition_skimmed])
print(f"1: {n_skimmed}/{len(data0)} ({n_skimmed/len(data0):.1%})")
# print(f"2: {len(data_trans[conditions_sk2])}/{len(data0)} ({len(data_trans[conditions_sk2])/len(data0):.1%})")

# `colors` maps a colour-bar name to an array of values; the fifth entry is used here.
print("color order:", dict(enumerate(colors)))
color_name, color_array = list(colors.items())[4] #par_fit_var_series_skimmed.values #last_10_skimmed
# NOTE(review): the first returned split is bound to `color_test` yet colours the
# training figure below (and vice versa) -- presumably the split returns train
# first and the two names are swapped; confirm against train_test_split_cond.
color_test, color_train = train_test_split_cond(
    color_array,
    test_size=0.5,
    random_state=42,
    conditions=condition_skimmed,
)
# color_test2, color_train2 = train_test_split_cond(color_array, test_size=0.5, random_state=42, conditions=conditions_sk2)
c_min_max_var = (0.000002, 0.000015)
# Keyword arguments shared by both figures; "c_min_max": (0.0001, 0.015) is deliberately left out.
add_kwargs = {
    "colorbar_name": color_name,
    "xaxis_title": "Predicted dT",
    "yaxis_title": "Measured dT",
}

fig_train = mlp_reg_fig(d_train, l_train_t, regr, wi_train, regr_wrapper=x_to_t, color=color_test, **add_kwargs)
# print(labels_test,labels_removed)
# new_data_test = np.append(data_test,data_removed,axis=0)
# new_labels_test = np.append(labels_test,labels_removed)
fig_test = mlp_reg_fig(d_test, l_test_t, regr, wi_test, regr_wrapper=x_to_t, color=color_train, **add_kwargs)
# fig_train2 = mlp_reg_fig(d_train2, l_train_t2, regr2, wi_train2, regr_wrapper=x_to_t, color=color_test2, **add_kwargs)
# fig_test2 = mlp_reg_fig(d_test2, l_test_t2, regr2, wi_test2, regr_wrapper=x_to_t, color=color_train2, **add_kwargs)
# [fig.update_layout(title="Predicted and actual dT") for fig in [fig_train, fig_test, fig_train2, fig_test2]]
# [fig.update_layout(title=f"Predicted and actual dT (removed past cutoff >{past_cutoff})") for fig in [fig_train2, fig_test2]]

# display(df_results.loc[[f"[{i}]" for i in wi_train[np.abs(l_train - regr.predict(d_train)) < 4]]].mean())
# df_results.loc[[f"[{i}]" for i in wi_train[np.abs(l_train - regr.predict(d_train)) < 4]]].drop(columns=["xT","yT_fit"]).hist()
# display(df_results.loc[[f"[{i}]" for i in wi_train[np.abs(l_train - regr.predict(d_train)) > 4]]].mean())
# df_results.loc[[f"[{i}]" for i in wi_train[np.abs(l_train - regr.predict(d_train)) > 4]]].drop(columns=["xT","yT_fit"]).hist()

if show_graphs:
    fig_train.show()
    fig_test.show()
# fig_train2.show() if show_graphs else None
# fig_test2.show() if show_graphs else print("(No graphs shown)")

---