### **This Notebook Performs a Parameters Search on Every Possible Cleaning and Model Parameter.**

Lets test different spectral cleaning parameters by building a model pipeline.

Import Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_validate, GroupKFold, cross_val_score
from Spectra_Preparation_Functions import *
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import optuna
from joblib import Parallel, delayed
import csv

Read the spectral data

In [4]:
#df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")
df = pd.read_csv("../../data/raw_df_outliers_removed.csv")
#df = pd.read_csv("../../data/400-1800_spike_removed.csv")

In [5]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,293,400.22778,1765.6628,201210-1,Normal
1,201210-1-00,294,400.91116,1774.7809,201210-1,Normal
2,201210-1-00,295,401.59454,1769.0302,201210-1,Normal
3,201210-1-00,296,402.27789,1756.4220,201210-1,Normal
4,201210-1-00,297,402.96127,1758.8690,201210-1,Normal
...,...,...,...,...,...,...
4638931,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia
4638932,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia
4638933,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia
4638934,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia


In [6]:
wavelength_df = prepare_wavelength_df(df, 'Absorbance')
wavelength_df.head()

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1765.6628,1774.7809,1769.0302,1756.422,1758.869,1763.23,1745.2285,1773.3534,1774.7166,1753.3281,...,1210.4993,1213.9619,1225.2153,1210.001,1210.6858,1194.4679,1195.1451,1189.8683,201210-1,Normal
201210-1-01,1966.993,1962.4237,1954.5616,1954.3228,1963.0917,1975.0807,1979.3162,1963.4561,1968.4587,1964.0,...,1382.6973,1363.7004,1360.621,1354.0477,1353.0381,1353.9978,1361.2426,1370.2874,201210-1,Normal
201210-1-09,2492.6379,2491.0544,2486.6711,2493.686,2502.9224,2505.2312,2504.2173,2493.1714,2473.5293,2476.697,...,1665.1053,1653.6478,1648.4294,1634.5895,1660.3303,1655.6666,1675.4294,1633.775,201210-1,Normal
201210-1-10,2241.6375,2248.8101,2220.3132,2224.4375,2238.4429,2227.1865,2218.7612,2234.8635,2203.5964,2198.8882,...,1442.1283,1425.1864,1420.7164,1420.3989,1434.137,1430.6307,1426.0441,1439.8866,201210-1,Normal
201210-1-11,2159.1975,2174.082,2158.0576,2169.1277,2191.8386,2194.5652,2193.6184,2207.6462,2176.9893,2166.2527,...,1429.1047,1409.2841,1435.8605,1433.5804,1430.17,1437.9093,1427.6486,1432.1414,201210-1,Normal


>#### **Use Optuna to Find the Best Cleaning Parameter**

In [7]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,293,400.22778,1765.6628,201210-1,Normal
1,201210-1-00,294,400.91116,1774.7809,201210-1,Normal
2,201210-1-00,295,401.59454,1769.0302,201210-1,Normal
3,201210-1-00,296,402.27789,1756.4220,201210-1,Normal
4,201210-1-00,297,402.96127,1758.8690,201210-1,Normal
...,...,...,...,...,...,...
4638931,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia
4638932,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia
4638933,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia
4638934,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia


In [8]:
def objective(trial):

    # Values for each hyperparameter
    despike = trial.suggest_categorical('despike', [True, False])
    despike_ma = trial.suggest_int('despike_ma', 5, 100)
    despike_threshold = trial.suggest_float('despike_threshold', 3, 10, step=0.25)
    lam = trial.suggest_categorical('lam', [10**2, 10**3, 10**4, 10**5, 10**6, 10**7, 10**8, 10**9])
    p = trial.suggest_float('p', 0.001, 0.1, step=0.001)
    window_size = trial.suggest_int('window_size', 5, 201)
    poly_order = trial.suggest_int('poly_order', 1, 4)
    scaling = trial.suggest_categorical('scaling', [False, 'normal', 'vector', 'svn'])

    df_copy = df.copy()

    # Apply preprocessing based on suggested parameters
    if despike:
        df_copy['Absorbance'] = df_copy.groupby('SpecID')['Absorbance'].transform(lambda x: despike_group(x, ma=despike_ma, threshold = despike_threshold))
    df_copy['Baseline'] = df_copy.groupby('SpecID')['Absorbance'].transform(lambda x: asls_baseline_correction(x, lam=lam, p=p))
    df_copy['Baseline_Corrected_Absorbance'] = df_copy['Absorbance'] - df_copy['Baseline']
    df_copy['Smooth_Baseline_Corrected'] = df_copy.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(lambda x: savgol_filter(x, window_size, poly_order, deriv=0))
    if scaling:
        if scaling == 'normal':
            df_copy['Smooth_Baseline_Corrected'] = df_copy.groupby('SpecID')['Smooth_Baseline_Corrected'].transform(lambda x: normalise(x))
        elif scaling == 'vector':
            df_copy['Smooth_Baseline_Corrected'] = df_copy.groupby('SpecID')['Smooth_Baseline_Corrected'].transform(lambda x: vector_normalise(x))
        else:
            df_copy['Smooth_Baseline_Corrected'] = df_copy.groupby('SpecID')['Smooth_Baseline_Corrected'].transform(lambda x: svn_normalise(x))

    # Prepare data for ML
    wavelength_df = prepare_wavelength_df(df_copy, 'Smooth_Baseline_Corrected')
    X = wavelength_df.drop(['Status', 'SurID'], axis=1)
    y = wavelength_df['Status']
    groups = wavelength_df['SurID']

    # Classifier and cross-validation setup
    et = ExtraTreesClassifier(random_state=1234)
    cv = GroupKFold(n_splits=10)

    # Perform cross-validation
    scores = cross_validate(et, X, y, groups=groups, cv=cv, scoring='accuracy')
    #scores = cross_val_score(et, X, y, groups=groups, cv=cv, scoring='accuracy')

    # Return the average accuracy across all folds
    return np.mean(scores['test_score'])


# sampler = TPESampler(seed=10)  # Make the sampler behave in a deterministic way.
# study = optuna.create_study(direction='maximise', sampler=sampler)

study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=100, n_jobs=-1)
study.optimize(objective, n_trials=300, n_jobs=-1)

print(study.best_trial)

[I 2024-03-29 02:29:33,998] A new study created in memory with name: no-name-8c1ca61b-55a2-476a-9a1e-29dace3c2ef4
[I 2024-03-29 02:33:40,354] Trial 11 finished with value: 0.5841260830973171 and parameters: {'despike': False, 'despike_ma': 54, 'despike_threshold': 3.25, 'lam': 100000, 'p': 0.077, 'window_size': 50, 'poly_order': 1, 'scaling': False}. Best is trial 11 with value: 0.5841260830973171.
[I 2024-03-29 02:33:40,929] Trial 9 finished with value: 0.571456640165988 and parameters: {'despike': False, 'despike_ma': 54, 'despike_threshold': 5.0, 'lam': 10000000, 'p': 0.035, 'window_size': 123, 'poly_order': 1, 'scaling': False}. Best is trial 11 with value: 0.5841260830973171.
[I 2024-03-29 02:33:47,570] Trial 2 finished with value: 0.5153159258553774 and parameters: {'despike': False, 'despike_ma': 10, 'despike_threshold': 4.25, 'lam': 100, 'p': 0.091, 'window_size': 76, 'poly_order': 2, 'scaling': False}. Best is trial 11 with value: 0.5841260830973171.
[I 2024-03-29 02:33:50,944

FrozenTrial(number=250, state=1, values=[0.6766445139105718], datetime_start=datetime.datetime(2024, 3, 29, 4, 29, 13, 130297), datetime_complete=datetime.datetime(2024, 3, 29, 4, 35, 57, 210749), params={'despike': True, 'despike_ma': 17, 'despike_threshold': 5.5, 'lam': 1000000000, 'p': 0.004, 'window_size': 52, 'poly_order': 4, 'scaling': 'vector'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'despike': CategoricalDistribution(choices=(True, False)), 'despike_ma': IntDistribution(high=100, log=False, low=5, step=1), 'despike_threshold': FloatDistribution(high=10.0, log=False, low=3.0, step=0.25), 'lam': CategoricalDistribution(choices=(100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000)), 'p': FloatDistribution(high=0.1, log=False, low=0.001, step=0.001), 'window_size': IntDistribution(high=201, log=False, low=5, step=1), 'poly_order': IntDistribution(high=4, log=False, low=1, step=1), 'scaling': CategoricalDistribution(choices=(False, 'nor

In [9]:
results_df = study.trials_dataframe(attrs=("number", "value", "params", "state"))
results_df.to_csv("../../data/studies/et_raw_df_outliers_dropped.csv")

In [10]:
import joblib

joblib.dump(study, "../../data/studies/et_raw_df_outliers_dropped.pkl")

# loaded_study = joblib.load("../../data/studies/cleaning_study.pkl")
# print("Best trial until now:")
# print(" Value: ", loaded_study.best_trial.value)
# print(" Params: ")
# for key, value in loaded_study.best_trial.params.items():
#     print(f"    {key}: {value}")

['../../data/studies/et_raw_df_outliers_dropped.pkl']

In [11]:
print(study.best_trial)

FrozenTrial(number=250, state=1, values=[0.6766445139105718], datetime_start=datetime.datetime(2024, 3, 29, 4, 29, 13, 130297), datetime_complete=datetime.datetime(2024, 3, 29, 4, 35, 57, 210749), params={'despike': True, 'despike_ma': 17, 'despike_threshold': 5.5, 'lam': 1000000000, 'p': 0.004, 'window_size': 52, 'poly_order': 4, 'scaling': 'vector'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'despike': CategoricalDistribution(choices=(True, False)), 'despike_ma': IntDistribution(high=100, log=False, low=5, step=1), 'despike_threshold': FloatDistribution(high=10.0, log=False, low=3.0, step=0.25), 'lam': CategoricalDistribution(choices=(100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000)), 'p': FloatDistribution(high=0.1, log=False, low=0.001, step=0.001), 'window_size': IntDistribution(high=201, log=False, low=5, step=1), 'poly_order': IntDistribution(high=4, log=False, low=1, step=1), 'scaling': CategoricalDistribution(choices=(False, 'nor

In [12]:
from optuna.visualization import plot_optimization_history, plot_slice

plot_optimization_history(study)

In [13]:
plot_slice(study)

In [14]:
optuna.visualization.plot_contour(study)

In [15]:
optuna.visualization.plot_param_importances(study)

Continue the study.

In [16]:
# study.optimize(objective, n_trials=200, n_jobs=-1)

# print(study.best_trial)