### **This Notebook Performs a Parameters Search on Every Possible Cleaning and Model Parameter.**

Lets test different spectral cleaning parameters by building a model pipeline.

Import Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, GroupKFold, cross_val_score
from Spectra_Preparation_Functions import *
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import optuna

Read the spectral data

In [2]:
#df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")
# df = pd.read_csv("../../data/raw_df_outliers_removed.csv")
df = pd.read_csv("../../data/400-1800_with_raw_scaled_surface_pagerank.csv")

In [3]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,PageRank
0,201210-1-00,293,400.22778,1765.6628,201210-1,Normal,0.610024
1,201210-1-00,294,400.91116,1774.7809,201210-1,Normal,0.610024
2,201210-1-00,295,401.59454,1769.0302,201210-1,Normal,0.610024
3,201210-1-00,296,402.27789,1756.4220,201210-1,Normal,0.610024
4,201210-1-00,297,402.96127,1758.8690,201210-1,Normal,0.610024
...,...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia,1.201164
6239201,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia,1.201164
6239202,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia,1.201164
6239203,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia,1.201164


In [4]:
pagerank_cutoff = 0.824056910854271
despike_ma = 92
despike_threshold = 3.5
lam = 10 ** 11
p = 0.1

In [5]:
clean_df = df[df['PageRank'] > pagerank_cutoff]
clean_df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,PageRank
4098,201210-1-02,293,400.22778,2182.6694,201210-1,Normal,0.961147
4099,201210-1-02,294,400.91116,2149.6565,201210-1,Normal,0.961147
4100,201210-1-02,295,401.59454,2146.0227,201210-1,Normal,0.961147
4101,201210-1-02,296,402.27789,2159.3459,201210-1,Normal,0.961147
4102,201210-1-02,297,402.96127,2167.2910,201210-1,Normal,0.961147
...,...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia,1.201164
6239201,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia,1.201164
6239202,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia,1.201164
6239203,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia,1.201164


In [6]:
clean_df['SpecID'].nunique()

1939

In [7]:
clean_df['Despiked_Absorbance'] = clean_df.groupby('SpecID')['Absorbance'].transform(lambda x: despike_group(x, ma=despike_ma, threshold=despike_threshold))
clean_df['Baseline'] = clean_df.groupby('SpecID')['Despiked_Absorbance'].transform(lambda x: asls_baseline_correction(x, lam=lam, p=p))
clean_df['Baseline_Corrected_Absorbance'] = clean_df['Despiked_Absorbance'] - clean_df['Baseline']
clean_df['Scaled_Absorbance'] = clean_df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(lambda x: svn_normalise(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['Despiked_Absorbance'] = clean_df.groupby('SpecID')['Absorbance'].transform(lambda x: despike_group(x, ma=despike_ma, threshold=despike_threshold))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['Baseline'] = clean_df.groupby('SpecID')['Despiked_Absorbance'].transform(lambda x: asls_baseline_correction(x, lam=lam, p=p))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.

In [8]:
clean_df.head()

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,PageRank,Despiked_Absorbance,Baseline,Baseline_Corrected_Absorbance,Scaled_Absorbance
4098,201210-1-02,293,400.22778,2182.6694,201210-1,Normal,0.961147,2182.6694,2047.983637,134.685763,0.387154
4099,201210-1-02,294,400.91116,2149.6565,201210-1,Normal,0.961147,2149.6565,2048.003637,101.652863,0.143959
4100,201210-1-02,295,401.59454,2146.0227,201210-1,Normal,0.961147,2146.0227,2048.023636,97.999064,0.117058
4101,201210-1-02,296,402.27789,2159.3459,201210-1,Normal,0.961147,2159.3459,2048.043636,111.302264,0.215
4102,201210-1-02,297,402.96127,2167.291,201210-1,Normal,0.961147,2167.291,2048.063635,119.227365,0.273346


In [9]:
wavelength_df = prepare_wavelength_df(clean_df, 'Scaled_Absorbance')
wavelength_df.head()

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-02,0.387154,0.143959,0.117058,0.215,0.273346,0.226781,0.113781,0.029284,0.092672,0.061608,...,-1.318869,-1.224532,-1.171024,-1.295672,-1.318849,-1.180542,-1.333518,-1.355698,201210-1,Normal
201210-1-03,-0.322233,-0.377119,-0.406082,-0.362514,-0.284758,-0.255112,-0.285655,-0.236506,-0.305067,-0.411116,...,-1.65062,-1.536285,-1.614261,-1.582751,-1.595934,-1.554886,-1.475848,-1.616807,201210-1,Normal
201210-1-07,1.590973,1.729138,1.678946,1.716794,1.719294,1.650259,1.792391,1.972,2.141552,1.896857,...,-1.010676,-1.051011,-1.020404,-1.013684,-0.74418,-0.718906,-0.897704,-0.844931,201210-1,Normal
201210-1-09,1.840965,1.828747,1.789689,1.859896,1.951399,1.976493,1.969735,1.866808,1.681478,1.714805,...,-0.481731,-0.588972,-0.636404,-0.766483,-0.517136,-0.55925,-0.367209,-0.763921,201210-1,Normal
201210-1-10,1.147777,1.255007,0.853108,0.916829,1.121588,0.965771,0.850365,1.085054,0.643614,0.581263,...,-0.778024,-1.015345,-1.074649,-1.074681,-0.87409,-0.919638,-0.980606,-0.778525,201210-1,Normal


>#### **Use Optuna to Find the Best Model Parameters**

In [10]:
def objective(trial):

    # Prepare data for ML
    wavelength_copy = wavelength_df.copy()
    X = wavelength_copy.drop(['Status', 'SurID'], axis=1)
    y = wavelength_copy['Status']
    groups = wavelength_copy['SurID']

    # Suggest classifier type
    classifier_name = trial.suggest_categorical("classifier", ["ExtraTrees", "RandomForest", "SVC"])

    if classifier_name == "ExtraTrees":
        criterion = trial.suggest_categorical("et_criterion", ["gini", "entropy"])
        n_estimators = trial.suggest_int("et_n_estimators", 50, 300)
        max_depth_option = trial.suggest_categorical("et_max_depth_option", [None, "Specify"])
        max_depth = trial.suggest_int("et_max_depth", 2, 32, log=True) if max_depth_option == "Specify" else None
        min_samples_split = trial.suggest_int("et_min_samples_split", 2, 10)
        classifier = ExtraTreesClassifier(random_state=1234, criterion=criterion, n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split)

    elif classifier_name == "RandomForest":
        criterion = trial.suggest_categorical("rf_criterion", ["gini", "entropy"])
        n_estimators = trial.suggest_int("rf_n_estimators", 50, 300)
        max_depth_option = trial.suggest_categorical("rf_max_depth_option", [None, "Specify"])
        max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True) if max_depth_option == "Specify" else None
        min_samples_split = trial.suggest_int("rf_min_samples_split", 2, 10)
        classifier = RandomForestClassifier(random_state=1234, criterion=criterion, n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split)

    else:
        C = trial.suggest_float("svc_c", 1e-10, 1e10, log=True)
        kernel = trial.suggest_categorical("svc_kernel", ["linear", "poly", "rbf", "sigmoid"])
        gamma = trial.suggest_categorical("svc_gamma", ["scale", "auto"])
        classifier = SVC(random_state=1234, C=C, kernel=kernel, gamma=gamma)
        
    cv = GroupKFold(n_splits=10)

    # Perform cross-validation
    scores = cross_validate(classifier, X, y, groups=groups, cv=cv, scoring='accuracy', n_jobs=-1)

    # Return the average accuracy across all folds
    return np.mean(scores['test_score'])

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=500, n_jobs=-1)


[I 2024-04-15 21:44:10,599] A new study created in memory with name: no-name-90c42519-5095-4d70-8cb2-aadb21c8bd05
[I 2024-04-15 21:44:36,008] Trial 6 finished with value: 0.2509372630830823 and parameters: {'classifier': 'SVC', 'svc_c': 1.7115072442684504e-06, 'svc_kernel': 'sigmoid', 'svc_gamma': 'auto'}. Best is trial 6 with value: 0.2509372630830823.
[I 2024-04-15 21:44:44,970] Trial 0 finished with value: 0.6714174983792134 and parameters: {'classifier': 'ExtraTrees', 'et_criterion': 'entropy', 'et_n_estimators': 254, 'et_max_depth_option': None, 'et_min_samples_split': 9}. Best is trial 0 with value: 0.6714174983792134.
[I 2024-04-15 21:44:57,309] Trial 1 finished with value: 0.2509372630830823 and parameters: {'classifier': 'SVC', 'svc_c': 3.252827754454528e-10, 'svc_kernel': 'poly', 'svc_gamma': 'scale'}. Best is trial 0 with value: 0.6714174983792134.
[I 2024-04-15 21:45:01,096] Trial 5 finished with value: 0.6592850568564608 and parameters: {'classifier': 'ExtraTrees', 'et_cri

In [11]:
results_df = study.trials_dataframe(attrs=("number", "value", "params", "state"))
results_df.to_csv("../../data/studies/model_tuning.csv")

In [12]:
import joblib

joblib.dump(study, "../../data/studies/model_tuning.pkl")

# loaded_study = joblib.load("../../data/studies/cleaning_study.pkl")
# print("Best trial until now:")
# print(" Value: ", loaded_study.best_trial.value)
# print(" Params: ")
# for key, value in loaded_study.best_trial.params.items():
#     print(f"    {key}: {value}")

['../../data/studies/model_tuning.pkl']

In [13]:
print(study.best_trial)

FrozenTrial(number=384, state=1, values=[0.7019578823076598], datetime_start=datetime.datetime(2024, 4, 15, 23, 29, 19, 154297), datetime_complete=datetime.datetime(2024, 4, 15, 23, 31, 38, 812391), params={'classifier': 'RandomForest', 'rf_criterion': 'entropy', 'rf_n_estimators': 189, 'rf_max_depth_option': None, 'rf_min_samples_split': 3}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'classifier': CategoricalDistribution(choices=('ExtraTrees', 'RandomForest', 'SVC')), 'rf_criterion': CategoricalDistribution(choices=('gini', 'entropy')), 'rf_n_estimators': IntDistribution(high=300, log=False, low=50, step=1), 'rf_max_depth_option': CategoricalDistribution(choices=(None, 'Specify')), 'rf_min_samples_split': IntDistribution(high=10, log=False, low=2, step=1)}, trial_id=384, value=None)


In [14]:
from optuna.visualization import plot_optimization_history, plot_slice

plot_optimization_history(study)

In [15]:
plot_slice(study)

In [16]:
optuna.visualization.plot_param_importances(study)