#### **This notebook reshapes the spectra into a wide, one-row-per-spectrum table and evaluates classifiers with 10-fold cross-validation.**

Import Libraries

In [10]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold, KFold
import seaborn as sns
from Spectra_Preparation_Functions import *
import matplotlib.pyplot as plt

Read the spectral data

In [11]:
# Load the long-format spectra table (one absorbance reading per row).
df = pd.read_csv("../../data/kfold_parameters.csv")


In [12]:
# Preview the loaded table; pandas abbreviates long frames in the rendered output.
df

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
0,201210-1-00,293,400.22778,201210-1,Normal,0.024722
1,201210-1-00,294,400.91116,201210-1,Normal,0.024731
2,201210-1-00,295,401.59454,201210-1,Normal,0.024737
3,201210-1-00,296,402.27789,201210-1,Normal,0.024741
4,201210-1-00,297,402.96127,201210-1,Normal,0.024743
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,210526-3,Hyperglycemia,0.016583
6239201,210526-3-09,2338,1797.72200,210526-3,Hyperglycemia,0.016581
6239202,210526-3-09,2339,1798.40550,210526-3,Hyperglycemia,0.016580
6239203,210526-3-09,2340,1799.08890,210526-3,Hyperglycemia,0.016580


#### **Machine Learning**

In [15]:
def evaluate_model(df, model, status_col='Status', n_splits=10, random_state=1234):
    """Cross-validate ``model`` on a wide spectra table and print summary metrics.

    Every column of ``df`` except ``status_col`` is used as a feature, and
    ``status_col`` supplies the class labels. Folds come from a shuffled
    ``KFold``: rows are assigned to folds individually, so the split is
    neither stratified by class nor grouped by surface.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample; must contain ``status_col``.
    model : estimator
        Estimator handed to ``sklearn.model_selection.cross_validate``.
    status_col : str, default 'Status'
        Name of the label column (same default as ``prepare_wavelength_df``).
    n_splits : int, default 10
        Number of cross-validation folds.
    random_state : int, default 1234
        Seed for the fold shuffling, so repeated runs use the same folds.

    Returns
    -------
    None
        The mean and standard deviation across folds of accuracy and the
        macro-averaged precision, recall and F1 are printed, not returned.
    """
    # Separate the features from the class labels.
    features = df.drop(columns=[status_col])
    labels = df[status_col]

    # Plain shuffled K-fold (no grouping, no stratification).
    cv = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    # sklearn scorer name -> label used in the printed report (order is kept).
    metric_labels = {
        'accuracy': 'Accuracy',
        'precision_macro': 'Precision',
        'recall_macro': 'Recall',
        'f1_macro': 'F1-Score',
    }

    # Cross-validate on all metrics at once, using every available core.
    scores = cross_validate(model, features, labels, cv=cv, scoring=list(metric_labels), n_jobs=-1)

    # Report "mean +/- std" over the folds for each metric.
    model_name = model.__class__.__name__
    for metric, label in metric_labels.items():
        fold_scores = scores[f'test_{metric}']
        print(f"{model_name} Cross-Validation {label}: {np.mean(fold_scores):.4f} +/- {np.std(fold_scores):.4f}")

In [16]:
def prepare_wavelength_df(df, absorbance_col, status_col='Status'):
    """Reshape long-format spectra into one row per spectrum.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table with 'SpecID', 'WaveNumber', ``absorbance_col`` and
        ``status_col`` columns.
    absorbance_col : str
        Column whose values fill the wide table.
    status_col : str, default 'Status'
        Label column carried over to the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'SpecID', with one column per distinct 'WaveNumber'
        followed by ``status_col``.
    """
    # Spread the readings out: SpecID down the rows, WaveNumber across the columns.
    wide = df.pivot(index='SpecID', columns='WaveNumber', values=absorbance_col)
    wide = wide.reset_index()
    wide.columns.name = None

    # One (SpecID, status) pair per distinct combination in the long table.
    labels = df.loc[:, ['SpecID', status_col]].drop_duplicates()

    # Attach the label to each spectrum, then key the result by SpecID.
    return wide.merge(labels, on='SpecID').set_index('SpecID')

In [17]:
# Build the wide table: one row per SpecID, one column per WaveNumber, plus Status.
wavelength_df = prepare_wavelength_df(df, 'Absorbance')

In [18]:
# Candidate classifiers, all seeded identically so reruns are repeatable.
et, rf, svc = (
    ExtraTreesClassifier(random_state=1234),
    RandomForestClassifier(random_state=1234),
    SVC(random_state=1234),
)

In [19]:
# Cross-validate the extra-trees classifier on the wide spectra table.
evaluate_model(wavelength_df, et)

ExtraTreesClassifier Cross-Validation Accuracy: 0.9150 +/- 0.0167
ExtraTreesClassifier Cross-Validation Precision: 0.9147 +/- 0.0159
ExtraTreesClassifier Cross-Validation Recall: 0.9156 +/- 0.0158
ExtraTreesClassifier Cross-Validation F1-Score: 0.9142 +/- 0.0159


In [20]:
# Cross-validate the random-forest classifier on the wide spectra table.
evaluate_model(wavelength_df, rf)

RandomForestClassifier Cross-Validation Accuracy: 0.8815 +/- 0.0195
RandomForestClassifier Cross-Validation Precision: 0.8813 +/- 0.0187
RandomForestClassifier Cross-Validation Recall: 0.8827 +/- 0.0199
RandomForestClassifier Cross-Validation F1-Score: 0.8805 +/- 0.0198


In [21]:
# Cross-validate the support-vector classifier on the wide spectra table.
evaluate_model(wavelength_df, svc)

SVC Cross-Validation Accuracy: 0.7721 +/- 0.0154
SVC Cross-Validation Precision: 0.7729 +/- 0.0147
SVC Cross-Validation Recall: 0.7724 +/- 0.0151
SVC Cross-Validation F1-Score: 0.7711 +/- 0.0150
