### **This Notebook Performs a Parameter Search over All Cleaning and Model Parameters.**

Let's test different spectral cleaning parameters by building a model pipeline.

Import Libraries

In [None]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, GroupKFold
from Spectra_Preparation_Functions import *
import optuna

Read the spectral data

In [None]:
# Alternative input files (kept for reference, currently disabled):
# df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")
# df = pd.read_csv("../../data/raw_df_outliers_removed.csv")
# df = pd.read_csv("../../data/exosomes.raw_spectrum_400-1800.csv")
# Dataset in use; later cells read its 'PageRank', 'SpecID' and 'Absorbance' columns.
df = pd.read_csv("../../data/400-1800_with_raw_scaled_surface_pagerank.csv")


In [None]:
# Display the loaded DataFrame as the cell output.
df

In [None]:
# Build the model-ready table from the 'Absorbance' column using the project helper
# (presumably from Spectra_Preparation_Functions — confirm), then preview its first rows.
wavelength_df = prepare_wavelength_df(df, 'Absorbance')
wavelength_df.head()

> #### **Use Optuna to Find the Best Cleaning and Model Parameters**

In [None]:
# Display the input DataFrame again before running the search.
df

In [None]:
def _clean_spectra(trial, spectra):
    """Filter and preprocess long-format spectra according to the trial's choices.

    The five top-level choices (PageRank cutoff and the four on/off switches)
    are sampled first; the parameters of each enabled step are sampled only
    when that step runs. Parameter names and sampling order are part of the
    study's record and must not be changed.

    Args:
        trial: Optuna trial used to sample the cleaning parameters.
        spectra: DataFrame with at least 'PageRank', 'SpecID' and 'Absorbance'
            columns. It is not modified.

    Returns:
        A new DataFrame holding the rows whose 'PageRank' exceeds the sampled
        cutoff, with 'Absorbance' rewritten by every enabled step (and a
        'Baseline' column added when baseline correction is enabled).
    """
    # Decide which cleaning steps to implement
    pagerank_cutoff = trial.suggest_float('pagerank_cutoff', 0, 1.5)
    despike = trial.suggest_categorical('despike', [True, False])
    baseline_correct = trial.suggest_categorical('baseline_correct', [True, False])
    smoothing = trial.suggest_categorical('smoothing', [True, False])
    scaling = trial.suggest_categorical('scaling', [True, False])

    # Filter less central spectra in each surface. Filtering before copying
    # avoids duplicating rows that are about to be dropped and yields an
    # independent frame that is safe to assign into.
    cleaned = spectra[spectra['PageRank'] > pagerank_cutoff].copy()

    def per_spectrum(func):
        # Apply func to the current 'Absorbance' values of each spectrum.
        return cleaned.groupby('SpecID')['Absorbance'].transform(func)

    if despike:
        despike_ma = trial.suggest_int('despike_ma', 5, 200)
        despike_threshold = trial.suggest_float('despike_threshold', 3, 10, step=0.1)
        cleaned['Absorbance'] = per_spectrum(
            lambda x: despike_group(x, ma=despike_ma, threshold=despike_threshold)
        )

    if baseline_correct:
        lam = trial.suggest_categorical('lam', [10**2, 10**3, 10**4, 10**5, 10**6, 10**7, 10**8, 10**9, 10**10, 10**11])
        p = trial.suggest_float('p', 0.001, 0.1, step=0.001)
        cleaned['Baseline'] = per_spectrum(lambda x: asls_baseline_correction(x, lam=lam, p=p))
        cleaned['Absorbance'] = cleaned['Absorbance'] - cleaned['Baseline']

    if smoothing:
        window_size = trial.suggest_int('window_size', 6, 251)
        poly_order = trial.suggest_int('poly_order', 1, 5)
        cleaned['Absorbance'] = per_spectrum(
            lambda x: savgol_filter(x, window_size, poly_order, deriv=0)
        )

    if scaling:
        scaling_type = trial.suggest_categorical('scaling_type', ['normal', 'vector', 'snv'])
        if scaling_type == 'normal':
            cleaned['Absorbance'] = per_spectrum(lambda x: normalise(x))
        elif scaling_type == 'vector':
            cleaned['Absorbance'] = per_spectrum(lambda x: vector_normalise(x))
        else:
            cleaned['Absorbance'] = per_spectrum(lambda x: snv_normalise(x))

    return cleaned


def _suggest_forest(trial, prefix, estimator_cls):
    """Sample hyper-parameters for a tree ensemble and return the estimator.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        prefix: Parameter-name prefix ('et' or 'rf') so the two ensembles keep
            separate parameter names in the study.
        estimator_cls: ExtraTreesClassifier or RandomForestClassifier.

    Returns:
        An unfitted estimator_cls instance with random_state=1234.
    """
    criterion = trial.suggest_categorical(f"{prefix}_criterion", ["gini", "entropy"])
    n_estimators = trial.suggest_int(f"{prefix}_n_estimators", 10, 100)
    max_depth_option = trial.suggest_categorical(f"{prefix}_max_depth_option", [None, "Specify"])
    # A depth is only sampled when the trial chose to limit it.
    if max_depth_option == "Specify":
        max_depth = trial.suggest_int(f"{prefix}_max_depth", 2, 32, log=True)
    else:
        max_depth = None
    class_weight = trial.suggest_categorical(f"{prefix}_class_weight_option", [None, "balanced"])
    return estimator_cls(
        random_state=1234,
        criterion=criterion,
        n_estimators=n_estimators,
        max_depth=max_depth,
        class_weight=class_weight,
    )


def _suggest_classifier(trial):
    """Sample the classifier family and its hyper-parameters; return the estimator."""
    classifier_name = trial.suggest_categorical("classifier", ["ExtraTrees", "RandomForest", "SVC"])

    if classifier_name == "ExtraTrees":
        return _suggest_forest(trial, "et", ExtraTreesClassifier)

    if classifier_name == "RandomForest":
        return _suggest_forest(trial, "rf", RandomForestClassifier)

    return SVC(
        C=trial.suggest_float("svc_c", 1e-10, 1e10, log=True),
        kernel=trial.suggest_categorical("svc_kernel", ["linear", "poly", "rbf", "sigmoid"]),
        gamma=trial.suggest_categorical("svc_gamma", ["scale", "auto"]),
        # The original was missing the comma after this argument (SyntaxError).
        class_weight=trial.suggest_categorical("svc_class_weight_option", [None, "balanced"]),
        random_state=1234,
    )


def objective(trial):
    """Optuna objective: clean the spectra, fit a classifier, return CV accuracy.

    Reads the module-level DataFrame ``df``, applies the cleaning steps sampled
    for this trial, builds the feature table with ``prepare_wavelength_df`` and
    evaluates the sampled classifier with 10-fold GroupKFold cross-validation
    grouped by 'SurID'.

    Args:
        trial: The Optuna trial supplying every cleaning and model parameter.

    Returns:
        The mean test accuracy across the cross-validation folds.

    Raises:
        optuna.TrialPruned: If the PageRank filter leaves fewer distinct
            'SurID' groups than folds, so GroupKFold cannot split the data.
    """
    n_splits = 10

    df_copy = _clean_spectra(trial, df)

    # Prepare data for ML
    wavelength_df = prepare_wavelength_df(df_copy, 'Absorbance')
    X = wavelength_df.drop(['Status', 'SurID'], axis=1)
    y = wavelength_df['Status']
    groups = wavelength_df['SurID']

    # Suggest classifier type (sampled before the guard below so that the
    # trial's recorded parameters are complete even when it is pruned).
    classifier = _suggest_classifier(trial)

    # GroupKFold needs at least n_splits distinct groups; an aggressive cutoff
    # can remove too many surfaces. Prune the trial rather than let the
    # resulting ValueError abort the whole study.
    if groups.nunique() < n_splits:
        raise optuna.TrialPruned(
            f"Only {groups.nunique()} groups left after filtering; need {n_splits}."
        )

    cv = GroupKFold(n_splits=n_splits)

    # Perform cross-validation
    scores = cross_validate(classifier, X, y, groups=groups, cv=cv, scoring='accuracy', n_jobs=-1)

    # Return the average accuracy across all folds
    return np.mean(scores['test_score'])


# Create a maximising study driven by a seeded TPE sampler and run 500 trials.
# NOTE(review): the trials run in parallel (n_jobs=-1), so the order in which
# results reach the sampler can vary between runs; the Optuna FAQ states that
# reproducible results require sequential execution (n_jobs=1) in addition to
# a fixed seed — confirm whether exact reproducibility matters here.
# NOTE(review): objective() also calls cross_validate with n_jobs=-1, so both
# levels request all cores — check for CPU oversubscription.
sampler = optuna.samplers.TPESampler(seed=1234)  # Make the sampler behave in a deterministic way.
study = optuna.create_study(direction='maximize', sampler=sampler)

# Earlier configurations (unseeded study / 100 trials), kept for reference:
#study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=100, n_jobs=-1)
study.optimize(objective, n_trials=500, n_jobs=-1)

# Show the best trial found so far.
print(study.best_trial)

In [None]:
# Export one row per trial (number, objective value, sampled parameters, state) to CSV.
results_df = study.trials_dataframe(attrs=("number", "value", "params", "state"))
results_df.to_csv("../../data/studies/class_weights_acc_score_optuna_cleaning_and_model_filtered_parameters.csv")

In [None]:
import joblib

# Persist the whole study object so it can be reloaded and continued later.
joblib.dump(study, "../../data/studies/class_weights_acc_score_optuna_cleaning_and_model_filtered_parameters.pkl")

# Example of reloading a saved study and printing its best trial (disabled).
# NOTE(review): the path below differs from the file written above — update it before use.
# loaded_study = joblib.load("../../data/studies/cleaning_study.pkl")
# print("Best trial until now:")
# print(" Value: ", loaded_study.best_trial.value)
# print(" Params: ")
# for key, value in loaded_study.best_trial.params.items():
#     print(f"    {key}: {value}")

In [None]:
# Print the best trial of the study (its value and sampled parameters).
print(study.best_trial)

In [None]:
from optuna.visualization import plot_optimization_history, plot_slice

# Plot the objective value of each trial over the course of the study.
plot_optimization_history(study)

In [None]:
# Slice plot of the study's parameters against the objective value.
plot_slice(study)

In [None]:
# Contour plot of the study's parameter relationships.
optuna.visualization.plot_contour(study)

In [None]:
# Plot the estimated importance of each parameter for the objective value.
optuna.visualization.plot_param_importances(study)

Continue the study.

In [None]:
# study.optimize(objective, n_trials=100, n_jobs=-1)

# print(study.best_trial)