### **This Notebook Performs a Parameters Search on Every Possible Cleaning and Model Parameter.**

Lets test different spectral cleaning parameters by building a model pipeline.

Import Libraries

In [5]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_validate, GroupKFold, cross_val_score
from Spectra_Preparation_Functions import *
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import optuna
from joblib import Parallel, delayed
import csv

Read the spectral data

In [2]:
#df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")
df = pd.read_csv("../../data/exosomes.raw_spectrum_400-1800.csv")
#df = pd.read_csv("../../data/400-1800_spike_removed.csv")

In [3]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,293,400.22778,1765.6628,201210-1,Normal
1,201210-1-00,294,400.91116,1774.7809,201210-1,Normal
2,201210-1-00,295,401.59454,1769.0302,201210-1,Normal
3,201210-1-00,296,402.27789,1756.4220,201210-1,Normal
4,201210-1-00,297,402.96127,1758.8690,201210-1,Normal
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia
6239201,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia
6239202,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia
6239203,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia


In [4]:
wavelength_df = prepare_wavelength_df(df, 'Absorbance')
wavelength_df.head()

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1765.6628,1774.7809,1769.0302,1756.422,1758.869,1763.23,1745.2285,1773.3534,1774.7166,1753.3281,...,1210.4993,1213.9619,1225.2153,1210.001,1210.6858,1194.4679,1195.1451,1189.8683,201210-1,Normal
201210-1-01,1966.993,1962.4237,1954.5616,1954.3228,1963.0917,1975.0807,1979.3162,1963.4561,1968.4587,1964.0,...,1382.6973,1363.7004,1360.621,1354.0477,1353.0381,1353.9978,1361.2426,1370.2874,201210-1,Normal
201210-1-02,2182.6694,2149.6565,2146.0227,2159.3459,2167.291,2160.9861,2145.6575,2134.2004,2142.8303,2138.6309,...,1976.207,1989.0183,1996.2838,1979.3507,1976.2002,1994.9839,1974.203,1971.188,201210-1,Normal
201210-1-03,2445.0837,2430.4973,2422.7927,2434.3433,2454.97,2462.8245,2454.7007,2467.7329,2449.5161,2421.3474,...,1992.3817,2022.6331,2001.8311,2010.0946,2006.4933,2017.2891,2038.1699,2000.6475,201210-1,Normal
201210-1-04,2250.4536,2248.6235,2245.0984,2242.7173,2235.2803,2228.9585,2236.0095,2229.6091,2225.9231,2211.0359,...,2009.0385,1953.3303,1963.5698,1964.5299,1969.5634,1986.6266,1970.1484,2007.0848,201210-1,Normal


>#### **Use Optuna to Find the Best Cleaning Parameter**

In [5]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,293,400.22778,1765.6628,201210-1,Normal
1,201210-1-00,294,400.91116,1774.7809,201210-1,Normal
2,201210-1-00,295,401.59454,1769.0302,201210-1,Normal
3,201210-1-00,296,402.27789,1756.4220,201210-1,Normal
4,201210-1-00,297,402.96127,1758.8690,201210-1,Normal
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia
6239201,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia
6239202,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia
6239203,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia


In [6]:
def objective(trial):

    # Values for each hyperparameter
    despike = trial.suggest_categorical('despike', [True, False])
    despike_ma = trial.suggest_int('despike_ma', 5, 100)
    despike_threshold = trial.suggest_float('despike_threshold', 3, 10, step=0.25)
    lam = trial.suggest_categorical('lam', [10**2, 10**3, 10**4, 10**5, 10**6, 10**7, 10**8, 10**9])
    p = trial.suggest_float('p', 0.001, 0.1, step=0.001)
    window_size = trial.suggest_int('window_size', 5, 201)
    poly_order = trial.suggest_int('poly_order', 1, 4)
    scaling = trial.suggest_categorical('scaling', [False, 'normal', 'vector', 'svn'])

    df_copy = df.copy()

    # Apply preprocessing based on suggested parameters
    if despike:
        df_copy['Absorbance'] = df_copy.groupby('SpecID')['Absorbance'].transform(lambda x: despike_group(x, ma=despike_ma, threshold = despike_threshold))
    df_copy['Baseline'] = df_copy.groupby('SpecID')['Absorbance'].transform(lambda x: asls_baseline_correction(x, lam=lam, p=p))
    df_copy['Baseline_Corrected_Absorbance'] = df_copy['Absorbance'] - df_copy['Baseline']
    df_copy['Smooth_Baseline_Corrected'] = df_copy.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(lambda x: savgol_filter(x, window_size, poly_order, deriv=0))
    if scaling:
        if scaling == 'normal':
            df_copy['Smooth_Baseline_Corrected'] = df_copy.groupby('SpecID')['Smooth_Baseline_Corrected'].transform(lambda x: normalise(x))
        elif scaling == 'vector':
            df_copy['Smooth_Baseline_Corrected'] = df_copy.groupby('SpecID')['Smooth_Baseline_Corrected'].transform(lambda x: vector_normalise(x))
        else:
            df_copy['Smooth_Baseline_Corrected'] = df_copy.groupby('SpecID')['Smooth_Baseline_Corrected'].transform(lambda x: svn_normalise(x))

    # Prepare data for ML
    wavelength_df = prepare_wavelength_df(df_copy, 'Smooth_Baseline_Corrected')
    X = wavelength_df.drop(['Status', 'SurID'], axis=1)
    y = wavelength_df['Status']
    groups = wavelength_df['SurID']

    # Classifier and cross-validation setup
    et = ExtraTreesClassifier(random_state=1234)
    cv = GroupKFold(n_splits=10)

    # Perform cross-validation
    scores = cross_validate(et, X, y, groups=groups, cv=cv, scoring='accuracy')
    #scores = cross_val_score(et, X, y, groups=groups, cv=cv, scoring='accuracy')

    # Return the average accuracy across all folds
    return np.mean(scores['test_score'])


# sampler = TPESampler(seed=10)  # Make the sampler behave in a deterministic way.
# study = optuna.create_study(direction='maximise', sampler=sampler)

study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=100, n_jobs=-1)
study.optimize(objective, n_trials=200, n_jobs=-1)

print(study.best_trial)

[I 2024-03-27 20:53:02,152] A new study created in memory with name: no-name-29574763-d1c4-42c6-ab81-4cc261f83b1e
[I 2024-03-27 21:00:44,390] Trial 5 finished with value: 0.5603809523809524 and parameters: {'despike': False, 'despike_ma': 28, 'lam': 1000000, 'p': 0.08599441828273476, 'window_size': 93, 'poly_order': 4, 'scaling': 'vector'}. Best is trial 5 with value: 0.5603809523809524.
[I 2024-03-27 21:00:49,017] Trial 11 finished with value: 0.5373809523809524 and parameters: {'despike': False, 'despike_ma': 19, 'lam': 1000000, 'p': 0.0851412547376365, 'window_size': 109, 'poly_order': 2, 'scaling': 'svn'}. Best is trial 5 with value: 0.5603809523809524.
[I 2024-03-27 21:00:50,637] Trial 10 finished with value: 0.5438412698412699 and parameters: {'despike': False, 'despike_ma': 78, 'lam': 1000000, 'p': 0.049849439005656, 'window_size': 148, 'poly_order': 1, 'scaling': 'svn'}. Best is trial 5 with value: 0.5603809523809524.
[I 2024-03-27 21:00:51,036] Trial 9 finished with value: 0.4

FrozenTrial(number=74, state=1, values=[0.611920634920635], datetime_start=datetime.datetime(2024, 3, 27, 21, 37, 9, 752882), datetime_complete=datetime.datetime(2024, 3, 27, 21, 47, 14, 243888), params={'despike': True, 'despike_ma': 78, 'lam': 1000000000, 'p': 0.06898929794655184, 'window_size': 50, 'poly_order': 3, 'scaling': 'svn'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'despike': CategoricalDistribution(choices=(True, False)), 'despike_ma': IntDistribution(high=100, log=False, low=5, step=1), 'lam': CategoricalDistribution(choices=(100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000)), 'p': FloatDistribution(high=0.1, log=False, low=0.001, step=None), 'window_size': IntDistribution(high=201, log=False, low=5, step=1), 'poly_order': CategoricalDistribution(choices=(1, 2, 3, 4)), 'scaling': CategoricalDistribution(choices=(False, 'normal', 'vector', 'svn'))}, trial_id=74, value=None)


In [None]:
results_df = study.trials_dataframe(attrs=("number", "value", "params", "state"))
results_df.to_csv("../../data/et_optuna.csv")

In [7]:
import joblib

study = optuna.create_study(direction="maximize")

joblib.dump(study, "../../data/studies/cleaning_study2.pkl")

# loaded_study = joblib.load("../../data/studies/cleaning_study.pkl")
# print("Best trial until now:")
# print(" Value: ", loaded_study.best_trial.value)
# print(" Params: ")
# for key, value in loaded_study.best_trial.params.items():
#     print(f"    {key}: {value}")

[I 2024-03-27 22:02:22,635] A new study created in memory with name: no-name-e984eeb6-360a-419a-bd8b-00a190f8e916


In [13]:
print(study.best_trial)

ValueError: No trials are completed yet.

In [None]:
from optuna.visualization import plot_optimization_history, plot_slice

plot_optimization_history(study)

In [None]:
plot_slice(study)

In [None]:
optuna.visualization.plot_contour(study)

In [None]:
optuna.visualization.plot_param_importances(study)