### **This Notebook Performs a Parameter Search over the Spectral Cleaning and Model Parameters.**

Let's test different spectral cleaning parameters by building a model pipeline.

Import Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, GroupKFold, cross_val_score
from Spectra_Preparation_Functions import *
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import optuna

Read the spectral data

In [2]:
# Load the spectral table. The objective function further down reads this
# module-level `df` on every trial, so it must stay unmodified.
df = pd.read_csv("../../data/400-1800_with_raw_scaled_surface_pagerank.csv")

##### The **PageRank** value is calculated from the Gaussian kernel similarity between the scaled raw spectra within each surface.

This aims to remove unrepresentative spectra within each surface, which we think are caused by background elements.

In [3]:
# Display the loaded frame (pandas truncates the middle rows in the output).
df

Unnamed: 0.1,Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,PageRank
0,0,201210-1-00,293,400.22778,1.317299,201210-1,Normal,0.610024
1,1,201210-1-00,294,400.91116,1.378171,201210-1,Normal,0.610024
2,2,201210-1-00,295,401.59454,1.339779,201210-1,Normal,0.610024
3,3,201210-1-00,296,402.27789,1.255608,201210-1,Normal,0.610024
4,4,201210-1-00,297,402.96127,1.271944,201210-1,Normal,0.610024
...,...,...,...,...,...,...,...,...
6239200,6239200,210526-3-09,2337,1797.03870,-3.024542,210526-3,Hyperglycemia,1.201164
6239201,6239201,210526-3-09,2338,1797.72200,-2.937646,210526-3,Hyperglycemia,1.201164
6239202,6239202,210526-3-09,2339,1798.40550,-2.936448,210526-3,Hyperglycemia,1.201164
6239203,6239203,210526-3-09,2340,1799.08890,-2.889072,210526-3,Hyperglycemia,1.201164


In [4]:
# Reshape the loaded table with the project helper and preview it. Per the
# output below, the result is indexed by SpecID with one column per wavenumber
# plus the SurID and Status columns.
wavelength_df = prepare_wavelength_df(df, 'Absorbance')
wavelength_df.head()

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1.317299,1.378171,1.339779,1.255608,1.271944,1.301058,1.180881,1.368641,1.377741,1.234954,...,-2.388922,-2.365806,-2.290679,-2.392248,-2.387677,-2.495946,-2.491425,-2.526652,201210-1,Normal
201210-1-01,1.389005,1.360042,1.310209,1.308695,1.364277,1.440269,1.467115,1.366586,1.398295,1.370034,...,-2.314542,-2.434954,-2.454473,-2.496137,-2.502537,-2.496454,-2.450533,-2.393202,201210-1,Normal
201210-1-02,0.149822,-0.042052,-0.063172,0.014264,0.060441,0.023797,-0.065294,-0.131884,-0.081726,-0.106133,...,-1.050153,-0.975693,-0.933466,-1.031882,-1.050193,-0.941021,-1.061801,-1.079324,201210-1,Normal
201210-1-03,-0.206069,-0.259638,-0.287933,-0.245513,-0.169761,-0.140915,-0.17075,-0.122889,-0.189791,-0.293241,...,-1.868633,-1.757534,-1.83393,-1.803582,-1.816808,-1.77716,-1.700475,-1.838277,201210-1,Normal
201210-1-04,0.636598,0.616986,0.57921,0.553693,0.473996,0.406249,0.48181,0.413221,0.373721,0.214184,...,-1.95049,-2.547479,-2.437749,-2.42746,-2.373519,-2.190664,-2.36725,-1.971427,201210-1,Normal


>#### **Use Optuna to Find the Best Cleaning and Model Parameters**

In [5]:
# Re-display the raw frame that `objective` below reads from.
df

Unnamed: 0.1,Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,PageRank
0,0,201210-1-00,293,400.22778,1.317299,201210-1,Normal,0.610024
1,1,201210-1-00,294,400.91116,1.378171,201210-1,Normal,0.610024
2,2,201210-1-00,295,401.59454,1.339779,201210-1,Normal,0.610024
3,3,201210-1-00,296,402.27789,1.255608,201210-1,Normal,0.610024
4,4,201210-1-00,297,402.96127,1.271944,201210-1,Normal,0.610024
...,...,...,...,...,...,...,...,...
6239200,6239200,210526-3-09,2337,1797.03870,-3.024542,210526-3,Hyperglycemia,1.201164
6239201,6239201,210526-3-09,2338,1797.72200,-2.937646,210526-3,Hyperglycemia,1.201164
6239202,6239202,210526-3-09,2339,1798.40550,-2.936448,210526-3,Hyperglycemia,1.201164
6239203,6239203,210526-3-09,2340,1799.08890,-2.889072,210526-3,Hyperglycemia,1.201164


In [6]:
def objective(trial, n_splits=10):
    """Optuna objective: score one cleaning + classifier configuration.

    Samples the spectral-cleaning options (PageRank cutoff, despiking,
    baseline correction, Savitzky-Golay smoothing, scaling) and a classifier
    with its hyperparameters, applies the cleaning to a filtered copy of the
    module-level ``df``, and returns the mean grouped cross-validation
    accuracy.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample every hyperparameter.
    n_splits : int, default 10
        Number of ``GroupKFold`` folds. ``SurID`` is used as the group label.

    Returns
    -------
    float
        Mean test accuracy across the folds.

    Raises
    ------
    optuna.TrialPruned
        If the sampled PageRank cutoff leaves fewer than ``n_splits`` distinct
        ``SurID`` values, in which case grouped cross-validation is impossible.
    """
    # Values for each hyperparameter. Every cleaning parameter is sampled on
    # every trial, even when the step it belongs to is switched off, so the
    # recorded parameter set is the same for all trials.
    pagerank_cutoff = trial.suggest_float('pagerank_cutoff', 0, 1.5)
    despike = trial.suggest_categorical('despike', [True, False])
    despike_ma = trial.suggest_int('despike_ma', 5, 100)
    despike_threshold = trial.suggest_float('despike_threshold', 3, 10, step=0.25)
    baseline_correct = trial.suggest_categorical('baseline_correct', [True, False])
    lam = trial.suggest_categorical('lam', [10**2, 10**3, 10**4, 10**5, 10**6, 10**7, 10**8, 10**9, 10**10, 10**11])
    p = trial.suggest_float('p', 0.001, 0.1, step=0.001)
    smoothing = trial.suggest_categorical('smoothing', [True, False])
    window_size = trial.suggest_int('window_size', 5, 201)
    poly_order = trial.suggest_int('poly_order', 1, 4)
    scaling = trial.suggest_categorical('scaling', [False, 'normal', 'vector', 'svn'])

    # Filter first, then copy: only the surviving rows are duplicated, instead
    # of copying the whole frame in every (parallel) trial and discarding part
    # of it. The explicit copy keeps the column assignments below off `df`.
    df_copy = df[df['PageRank'] > pagerank_cutoff].copy()

    # GroupKFold raises ValueError when there are fewer groups than folds, and
    # an uncaught exception aborts the whole study.optimize() call. Prune the
    # trial instead, before doing any expensive preprocessing.
    n_surfaces = df_copy['SurID'].nunique()
    if n_surfaces < n_splits:
        raise optuna.TrialPruned(
            f"pagerank_cutoff={pagerank_cutoff:.3f} leaves {n_surfaces} surfaces; "
            f"GroupKFold needs at least {n_splits}."
        )

    def per_spectrum(func):
        # Apply `func` to the Absorbance values of each SpecID group. The
        # groupby is rebuilt on every call so it sees the latest Absorbance.
        return df_copy.groupby('SpecID')['Absorbance'].transform(func)

    # Apply preprocessing based on suggested parameters
    if despike:
        df_copy['Absorbance'] = per_spectrum(
            lambda x: despike_group(x, ma=despike_ma, threshold=despike_threshold)
        )

    if baseline_correct:
        df_copy['Baseline'] = per_spectrum(lambda x: asls_baseline_correction(x, lam=lam, p=p))
        df_copy['Absorbance'] = df_copy['Absorbance'] - df_copy['Baseline']

    if smoothing:
        df_copy['Absorbance'] = per_spectrum(
            lambda x: savgol_filter(x, window_size, poly_order, deriv=0)
        )

    # `scaling` is either False (skip) or the name of the scaler to use; any
    # truthy value other than 'normal'/'vector' falls through to svn_normalise.
    if scaling == 'normal':
        df_copy['Absorbance'] = per_spectrum(lambda x: normalise(x))
    elif scaling == 'vector':
        df_copy['Absorbance'] = per_spectrum(lambda x: vector_normalise(x))
    elif scaling:
        df_copy['Absorbance'] = per_spectrum(lambda x: svn_normalise(x))

    # Prepare data for ML
    wavelength_df = prepare_wavelength_df(df_copy, 'Absorbance')
    X = wavelength_df.drop(['Status', 'SurID'], axis=1)
    y = wavelength_df['Status']
    groups = wavelength_df['SurID']

    # Suggest classifier type; each classifier has its own prefixed
    # hyperparameters, sampled only when that classifier is chosen.
    classifier_name = trial.suggest_categorical("classifier", ["ExtraTrees", "RandomForest", "SVC"])

    if classifier_name == "ExtraTrees":
        classifier = ExtraTreesClassifier(
            n_estimators=trial.suggest_int("et_n_estimators", 10, 100),
            max_depth=trial.suggest_int("et_max_depth", 2, 32, log=True),
            criterion=trial.suggest_categorical("et_criterion", ["gini", "entropy", "log_loss"]),
            random_state=1234,
        )

    elif classifier_name == "RandomForest":
        classifier = RandomForestClassifier(
            n_estimators=trial.suggest_int("rf_n_estimators", 10, 100),
            max_depth=trial.suggest_int("rf_max_depth", 2, 32, log=True),
            criterion=trial.suggest_categorical("rf_criterion", ["gini", "entropy", "log_loss"]),
            random_state=1234,
        )

    else:  # SVC
        classifier = SVC(
            C=trial.suggest_float("svc_c", 1e-10, 1e10, log=True),
            kernel=trial.suggest_categorical("svc_kernel", ["linear", "poly", "rbf", "sigmoid"]),
            gamma=trial.suggest_categorical("svc_gamma", ["scale", "auto"]),
            random_state=1234,
        )

    # Grouping by SurID keeps all spectra of one surface in the same fold.
    cv = GroupKFold(n_splits=n_splits)

    # Perform cross-validation
    scores = cross_validate(classifier, X, y, groups=groups, cv=cv, scoring='accuracy')

    # Return the average accuracy across all folds
    return np.mean(scores['test_score'])


# For a seeded study, use:
# sampler = optuna.samplers.TPESampler(seed=10)
# study = optuna.create_study(direction="maximize", sampler=sampler)
# NOTE(review): a seeded sampler is presumably still not reproducible when
# trials run in parallel (n_jobs != 1) - confirm against the Optuna FAQ.

# Maximise the mean cross-validated accuracy returned by `objective`.
study = optuna.create_study(direction="maximize")
# n_jobs=-1 runs trials in parallel; each trial works on its own copy of the
# spectra, so memory use grows with the number of concurrent trials.
study.optimize(objective, n_trials=100, n_jobs=-1)

print(study.best_trial)

[I 2024-03-30 01:01:27,542] A new study created in memory with name: no-name-3f90b199-ca82-4f37-afee-242462f3e5ad
[I 2024-03-30 01:01:50,903] Trial 4 finished with value: 0.5175798766578739 and parameters: {'pagerank_cutoff': 1.3244362638387175, 'despike': False, 'despike_ma': 15, 'despike_threshold': 9.5, 'baseline_correct': False, 'lam': 100000000, 'p': 0.027000000000000003, 'smoothing': False, 'window_size': 85, 'poly_order': 4, 'scaling': 'svn', 'classifier': 'SVC', 'svc_c': 2585.0643795363726, 'svc_kernel': 'sigmoid', 'svc_gamma': 'scale'}. Best is trial 4 with value: 0.5175798766578739.
[I 2024-03-30 01:02:04,954] Trial 6 finished with value: 0.3756360537978364 and parameters: {'pagerank_cutoff': 1.206103391448901, 'despike': True, 'despike_ma': 53, 'despike_threshold': 7.0, 'baseline_correct': False, 'lam': 1000000, 'p': 0.041, 'smoothing': False, 'window_size': 62, 'poly_order': 3, 'scaling': 'normal', 'classifier': 'SVC', 'svc_c': 3055.306512691334, 'svc_kernel': 'sigmoid', 's

In [None]:
# Export one row per trial (number, objective value, sampled parameters and
# final state) so the search can be inspected without re-running it.
results_df = study.trials_dataframe(attrs=("number", "value", "params", "state"))
results_df.to_csv("../../data/studies/all_models_outliers_removed.csv")

In [None]:
import joblib

# Persist the whole study object so it can be reloaded and continued later.
joblib.dump(study, "../../data/studies/all_models_outliers_removed.pkl")

# To reload the study saved above (joblib files are pickles - only load files
# you created yourself):
# loaded_study = joblib.load("../../data/studies/all_models_outliers_removed.pkl")
# print("Best trial until now:")
# print(" Value: ", loaded_study.best_trial.value)
# print(" Params: ")
# for key, value in loaded_study.best_trial.params.items():
#     print(f"    {key}: {value}")

['../../data/studies/et_raw_df_outliers_dropped.pkl']

In [None]:
# Show the best trial of the current study.
# NOTE(review): the recorded output under this cell lacks the
# 'pagerank_cutoff' and 'classifier' parameters, so it appears to come from an
# earlier study - re-run the cell to refresh it.
print(study.best_trial)

FrozenTrial(number=250, state=1, values=[0.6766445139105718], datetime_start=datetime.datetime(2024, 3, 29, 4, 29, 13, 130297), datetime_complete=datetime.datetime(2024, 3, 29, 4, 35, 57, 210749), params={'despike': True, 'despike_ma': 17, 'despike_threshold': 5.5, 'lam': 1000000000, 'p': 0.004, 'window_size': 52, 'poly_order': 4, 'scaling': 'vector'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'despike': CategoricalDistribution(choices=(True, False)), 'despike_ma': IntDistribution(high=100, log=False, low=5, step=1), 'despike_threshold': FloatDistribution(high=10.0, log=False, low=3.0, step=0.25), 'lam': CategoricalDistribution(choices=(100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000)), 'p': FloatDistribution(high=0.1, log=False, low=0.001, step=0.001), 'window_size': IntDistribution(high=201, log=False, low=5, step=1), 'poly_order': IntDistribution(high=4, log=False, low=1, step=1), 'scaling': CategoricalDistribution(choices=(False, 'nor

In [None]:
from optuna.visualization import plot_optimization_history, plot_slice

# Objective value of every trial in the study, in trial order.
plot_optimization_history(study)

In [None]:
# Objective value against each sampled parameter, one panel per parameter.
plot_slice(study)

In [None]:
# Pairwise parameter contour plots of the objective value.
optuna.visualization.plot_contour(study)

In [None]:
# Estimated importance of each hyperparameter for the objective value.
optuna.visualization.plot_param_importances(study)

Continue the study.

In [None]:
# Run 400 more trials on the same in-memory study. The CSV and pickle written
# by the earlier cells do not include these trials unless those cells are
# re-run afterwards.
study.optimize(objective, n_trials=400, n_jobs=-1)

print(study.best_trial)