### **This Notebook Performs a Parameters Search on Every Possible Extra Trees Parameter.**

Lets test different spectral cleaning parameters by building a model pipeline.

Import Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_validate, GroupKFold, cross_val_score
from Spectra_Preparation_Functions import *
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import optuna
from joblib import Parallel, delayed
import csv

Read the spectral data

In [2]:
#df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")
df = pd.read_csv("../../data/current_clean_spectrum.csv")
#df = pd.read_csv("../../data/400-1800_spike_removed.csv")

In [3]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
0,201210-1-00,293,400.22778,201210-1,Normal,41.863303
1,201210-1-00,294,400.91116,201210-1,Normal,41.803843
2,201210-1-00,295,401.59454,201210-1,Normal,41.741884
3,201210-1-00,296,402.27789,201210-1,Normal,41.677722
4,201210-1-00,297,402.96127,201210-1,Normal,41.611654
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,210526-3,Hyperglycemia,12.378163
6239201,210526-3-09,2338,1797.72200,210526-3,Hyperglycemia,13.269937
6239202,210526-3-09,2339,1798.40550,210526-3,Hyperglycemia,14.199285
6239203,210526-3-09,2340,1799.08890,210526-3,Hyperglycemia,15.166531


In [4]:
wavelength_df = prepare_wavelength_df(df, 'Absorbance')
wavelength_df.head()

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,41.863303,41.803843,41.741884,41.677722,41.611654,41.543974,41.47498,41.404968,41.334234,41.263073,...,6.280946,5.549559,4.745724,3.866578,2.909255,1.870891,0.748623,-0.460415,201210-1,Normal
201210-1-01,46.314608,47.323684,48.299209,49.241395,50.150457,51.026608,51.870063,52.681035,53.459738,54.206386,...,6.769011,7.280928,7.861246,8.512936,9.238972,10.042323,10.925962,11.89286,201210-1,Normal
201210-1-02,118.159018,114.68624,111.563911,108.777452,106.312282,104.153823,102.287493,100.698715,99.372907,98.295491,...,-4.633601,-4.557349,-4.439365,-4.278894,-4.07518,-3.82747,-3.53501,-3.197043,201210-1,Normal
201210-1-03,175.466997,174.846086,174.18802,173.498226,172.782129,172.045155,171.292728,170.530275,169.763222,168.996993,...,-10.801936,-10.349539,-9.864191,-9.347124,-8.799567,-8.222752,-7.617909,-6.986269,201210-1,Normal
201210-1-04,111.814973,106.629998,101.86738,97.512673,93.55143,89.969205,86.751551,83.884023,81.352173,79.141556,...,-11.689508,-11.752441,-11.789205,-11.799583,-11.783357,-11.74031,-11.670224,-11.572882,201210-1,Normal


>#### **Use Optuna to Find the Best Cleaning Parameter**

In [5]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
0,201210-1-00,293,400.22778,201210-1,Normal,41.863303
1,201210-1-00,294,400.91116,201210-1,Normal,41.803843
2,201210-1-00,295,401.59454,201210-1,Normal,41.741884
3,201210-1-00,296,402.27789,201210-1,Normal,41.677722
4,201210-1-00,297,402.96127,201210-1,Normal,41.611654
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,210526-3,Hyperglycemia,12.378163
6239201,210526-3-09,2338,1797.72200,210526-3,Hyperglycemia,13.269937
6239202,210526-3-09,2339,1798.40550,210526-3,Hyperglycemia,14.199285
6239203,210526-3-09,2340,1799.08890,210526-3,Hyperglycemia,15.166531


In [6]:
wavelength_df = prepare_wavelength_df(df, 'Absorbance')

In [7]:
def objective(trial):

    # Prepare data for ML
    wavelength_copy = wavelength_df.copy()
    X = wavelength_copy.drop(['Status', 'SurID'], axis=1)
    y = wavelength_copy['Status']
    groups = wavelength_copy['SurID']

    # Classifier and cross-validation setup
    cv = GroupKFold(n_splits=10)

    n_estimators = trial.suggest_int('n_estimators', 100, 2000)
    max_depth = trial.suggest_int('max_depth', 2, 100, log=True)
    min_samples_split = trial.suggest_float('min_samples_split', 0.01, 1.0)
    min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.01, 0.5)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2', None])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    max_samples = trial.suggest_float('max_samples', 0.01, 1.0) if bootstrap else None

    et = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        max_samples=max_samples,
        random_state=1234
    )
    # Perform cross-validation
    #scores = cross_validate(et, X, y, groups=groups, cv=cv, scoring='accuracy')
    scores = cross_val_score(et, X, y, groups=groups, cv=cv, scoring='accuracy')

    # Return the average accuracy across all folds
    return np.mean(scores)

# sampler = TPESampler(seed=10)  # Make the sampler behave in a deterministic way.
# study = optuna.create_study(direction='maximise', sampler=sampler)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=-1)

print(study.best_trial)

[I 2024-03-27 22:10:19,685] A new study created in memory with name: no-name-9453ee72-3873-4458-8c7d-50c487bf1d0b
[I 2024-03-27 22:10:31,069] Trial 6 finished with value: 0.29001587301587295 and parameters: {'n_estimators': 834, 'max_depth': 5, 'min_samples_split': 0.682855882620252, 'min_samples_leaf': 0.36343901079388374, 'max_features': 'auto', 'bootstrap': False}. Best is trial 6 with value: 0.29001587301587295.
[I 2024-03-27 22:10:32,025] Trial 8 finished with value: 0.3122380952380952 and parameters: {'n_estimators': 1865, 'max_depth': 2, 'min_samples_split': 0.5044065365886914, 'min_samples_leaf': 0.08209059060314773, 'max_features': 'log2', 'bootstrap': True, 'max_samples': 0.8183008416743885}. Best is trial 8 with value: 0.3122380952380952.
[I 2024-03-27 22:10:33,212] Trial 7 finished with value: 0.40628571428571425 and parameters: {'n_estimators': 173, 'max_depth': 55, 'min_samples_split': 0.8954794807824602, 'min_samples_leaf': 0.1925715442077548, 'max_features': 'auto', 'bo

FrozenTrial(number=30, state=1, values=[0.5726031746031746], datetime_start=datetime.datetime(2024, 3, 27, 22, 11, 6, 515607), datetime_complete=datetime.datetime(2024, 3, 27, 22, 12, 30, 594861), params={'n_estimators': 491, 'max_depth': 23, 'min_samples_split': 0.012438925339381929, 'min_samples_leaf': 0.023531336806942943, 'max_features': None, 'bootstrap': True, 'max_samples': 0.6195819889388039}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=2000, log=False, low=100, step=1), 'max_depth': IntDistribution(high=100, log=True, low=2, step=1), 'min_samples_split': FloatDistribution(high=1.0, log=False, low=0.01, step=None), 'min_samples_leaf': FloatDistribution(high=0.5, log=False, low=0.01, step=None), 'max_features': CategoricalDistribution(choices=('auto', 'sqrt', 'log2', None)), 'bootstrap': CategoricalDistribution(choices=(True, False)), 'max_samples': FloatDistribution(high=1.0, log=False, low=0.01, step=None)}, trial

Save the study

In [8]:
import joblib

joblib.dump(study, "../../data/studies/et_study.pkl")

# loaded_study = joblib.load("../../data/studies/cleaning_study.pkl")
# print("Best trial until now:")
# print(" Value: ", loaded_study.best_trial.value)
# print(" Params: ")
# for key, value in loaded_study.best_trial.params.items():
#     print(f"    {key}: {value}")

['../../data/studies/et_study.pkl']

Continue the study.

In [17]:
study.optimize(objective, n_trials=25, n_jobs=-1)

print(study.best_trial)

[I 2024-03-27 23:37:01,601] Trial 104 finished with value: 0.5389999999999999 and parameters: {'n_estimators': 1488, 'max_depth': 63, 'min_samples_split': 0.18321850794774946, 'min_samples_leaf': 0.010764867091873073, 'max_features': None, 'bootstrap': False}. Best is trial 30 with value: 0.5726031746031746.
[I 2024-03-27 23:41:20,995] Trial 110 finished with value: 0.5662380952380952 and parameters: {'n_estimators': 1355, 'max_depth': 57, 'min_samples_split': 0.12047139009075215, 'min_samples_leaf': 0.015994116608584817, 'max_features': None, 'bootstrap': False}. Best is trial 30 with value: 0.5726031746031746.
[I 2024-03-27 23:45:21,351] Trial 111 finished with value: 0.5244444444444445 and parameters: {'n_estimators': 1319, 'max_depth': 4, 'min_samples_split': 0.12657376317613242, 'min_samples_leaf': 0.018738285563519776, 'max_features': None, 'bootstrap': False}. Best is trial 30 with value: 0.5726031746031746.
[I 2024-03-27 23:45:44,925] Trial 101 finished with value: 0.5662380952

In [None]:
from optuna.visualization import plot_optimization_history, plot_slice

plot_optimization_history(study)

In [None]:
plot_slice(study)

In [None]:
optuna.visualization.plot_contour(study)

In [None]:
optuna.visualization.plot_param_importances(study)