#### **This notebook evaluates classifiers on the full spectra and on peak-derived features, using cross-validation grouped by surface.**

Import Libraries

In [None]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold, cross_validate, GroupKFold
import seaborn as sns
from Spectra_Preparation_Functions import *
import matplotlib.pyplot as plt
from scipy.signal import find_peaks
from scipy.signal import peak_widths, peak_prominences


Read the spectral data

In [None]:
# Load the spectral data; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("../../data/groupkfold_parameters.csv")


In [None]:
# Display the loaded frame.
df

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
0,201210-1-00,293,400.22778,201210-1,Normal,31.645788
1,201210-1-00,294,400.91116,201210-1,Normal,31.890799
2,201210-1-00,295,401.59454,201210-1,Normal,32.060592
3,201210-1-00,296,402.27789,201210-1,Normal,32.161890
4,201210-1-00,297,402.96127,201210-1,Normal,32.201418
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,210526-3,Hyperglycemia,-17.773341
6239201,210526-3-09,2338,1797.72200,210526-3,Hyperglycemia,-16.954783
6239202,210526-3-09,2339,1798.40550,210526-3,Hyperglycemia,-16.058237
6239203,210526-3-09,2340,1799.08890,210526-3,Hyperglycemia,-15.079158


In [None]:
def evaluate_model(df, model, n_splits=10):
    """Cross-validate ``model`` on ``df`` with folds grouped by surface and print the scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Status'`` column (used as the target) and a
        ``'SurID'`` column (used as the grouping key). Every other column is
        passed to the model as a feature.
    model : estimator
        Passed unchanged to ``cross_validate``.
    n_splits : int, default 10
        Number of ``GroupKFold`` folds.

    Returns
    -------
    None
        The mean and standard deviation of each test metric are printed.

    Raises
    ------
    KeyError
        If ``df`` lacks the ``'Status'`` or ``'SurID'`` column.
    """
    # Fail early with an actionable message: feature frames built without
    # 'SurID' cannot be split by surface.
    missing = [col for col in ('Status', 'SurID') if col not in df.columns]
    if missing:
        raise KeyError(
            f"evaluate_model requires column(s) {missing} in df; "
            "'SurID' defines the cross-validation groups and 'Status' is the target"
        )

    # Set the Surfaces as groups
    groups = df['SurID']
    X = df.drop(['Status', 'SurID'], axis=1)
    y = df['Status']

    # Using GroupKFold so that all rows sharing a SurID land in the same fold
    cv = GroupKFold(n_splits=n_splits)

    # Cross Validate
    scores = cross_validate(
        model, X, y,
        groups=groups,
        cv=cv,
        scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
        n_jobs=-1,
    )

    # Displaying the results: one line per metric, mean +/- std over the folds
    model_name = model.__class__.__name__
    reported = (
        ('Accuracy', 'test_accuracy'),
        ('Precision', 'test_precision_macro'),
        ('Recall', 'test_recall_macro'),
        ('F1-Score', 'test_f1_macro'),
    )
    for label, key in reported:
        print(f"{model_name} Cross-Validation {label}: {np.mean(scores[key]):.4f} +/- {np.std(scores[key]):.4f}")

In [None]:
def prepare_wavelength_df(df, absorbance_col, status_col='Status'):
    """Reshape the long spectral table into one row per spectrum.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with ``'SpecID'``, ``'WaveNumber'``, ``'SurID'``,
        ``status_col`` and ``absorbance_col`` columns.
    absorbance_col : str
        Column whose values fill the wide table.
    status_col : str, default 'Status'
        Label column carried over to the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'SpecID'``; one column per distinct ``'WaveNumber'``
        followed by ``'SurID'`` and ``status_col``.
    """
    # Wide layout: rows are spectra, columns are wavenumbers
    spectra_wide = df.pivot(index='SpecID', columns='WaveNumber', values=absorbance_col)
    spectra_wide = spectra_wide.reset_index()
    spectra_wide.columns.name = None

    # One label row per spectrum; SurID is kept so GroupKFold CV can group by it
    labels = df[['SpecID', 'SurID', status_col]].drop_duplicates()

    # Attach the labels, then index the result by SpecID
    merged = spectra_wide.merge(labels, on='SpecID')
    return merged.set_index('SpecID')

In [None]:
# Wide table: one row per SpecID, one column per WaveNumber, plus SurID and Status.
wavelength_df = prepare_wavelength_df(df, 'Absorbance')

In [None]:
# Classifiers compared in the sections below; each uses the same fixed random_state.
et = ExtraTreesClassifier(random_state=1234)
rf = RandomForestClassifier(random_state=1234)
svc = SVC(random_state=1234)

##### **Full Spectra**

In [None]:
# Full spectra, ExtraTreesClassifier.
evaluate_model(wavelength_df, et)

ExtraTreesClassifier Cross-Validation Accuracy: 0.9366 +/- 0.0088
ExtraTreesClassifier Cross-Validation Precision: 0.9363 +/- 0.0091
ExtraTreesClassifier Cross-Validation Recall: 0.9360 +/- 0.0084
ExtraTreesClassifier Cross-Validation F1-Score: 0.9357 +/- 0.0087


In [None]:
# Full spectra, RandomForestClassifier.
evaluate_model(wavelength_df, rf)

RandomForestClassifier Cross-Validation Accuracy: 0.9077 +/- 0.0173
RandomForestClassifier Cross-Validation Precision: 0.9076 +/- 0.0174
RandomForestClassifier Cross-Validation Recall: 0.9085 +/- 0.0171
RandomForestClassifier Cross-Validation F1-Score: 0.9068 +/- 0.0176


In [None]:
# Full spectra, SVC.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so no
# SVC score exists for the full spectra — presumably the run was too slow; confirm.
evaluate_model(wavelength_df, svc)

KeyboardInterrupt: 

##### **Peak Stats**

In [None]:
# Per-spectrum labels. SurID is kept because evaluate_model uses it as the
# GroupKFold grouping key and raises KeyError when it is absent.
statuses = df[['SpecID', 'SurID', 'Status']].drop_duplicates()

peaks = []        # index labels (in df) of every detected peak
widths = []       # width of each peak, in the same order as `peaks`
prominences = []  # prominence of each peak, in the same order as `peaks`

# Find the index, width and prominence of each peak, one spectrum at a time
for _, group in df.groupby('SpecID'):
    absorbance = group['Absorbance'].to_numpy()

    # Alternative call with explicit filters, kept for reference:
    #peak_index, _ = find_peaks(x=absorbance, distance=152, prominence=42, width=6)
    peak_index, _ = find_peaks(x=absorbance)

    # Calculate the widths of each peak (measured at half of the peak's prominence)
    widths.extend(peak_widths(absorbance, peaks=peak_index, rel_height=0.5)[0])

    # Calculate prominence of each peak
    prominences.extend(peak_prominences(absorbance, peaks=peak_index)[0])

    # peak_index is positional within the spectrum; translate it to df index labels
    peaks.extend(group.index[peak_index])

# `peaks` holds index labels, so select with .loc (not .iloc); .copy() gives an
# independent frame so adding columns does not trigger SettingWithCopyWarning.
peaks_df = df.loc[peaks].copy()
peaks_df['PeakWidths'] = widths
peaks_df['PeakProminences'] = prominences

# Summary statistics per spectrum. Named aggregation ties every output column
# name to its source column and function, so no positional renaming is needed.
peak_stats = peaks_df.groupby('SpecID').agg(
    PeakAbsorbance_mean=('Absorbance', 'mean'),
    PeakAbsorbance_std=('Absorbance', 'std'),
    PeakAbsorbance_count=('Absorbance', 'count'),
    PeakAbsorbance_max=('Absorbance', 'max'),
    PeakAbsorbance_min=('Absorbance', 'min'),
    PeakWidths_mean=('PeakWidths', 'mean'),
    PeakWidths_std=('PeakWidths', 'std'),
    PeakWidths_max=('PeakWidths', 'max'),
    PeakWidths_min=('PeakWidths', 'min'),
    PeakProminences_mean=('PeakProminences', 'mean'),
    PeakProminences_std=('PeakProminences', 'std'),
    PeakProminences_max=('PeakProminences', 'max'),
    PeakProminences_min=('PeakProminences', 'min'),
).reset_index()

# Left-merge onto the labels so spectra without any detected peak are kept;
# their missing statistics are filled with 0.
peak_stats = pd.merge(statuses, peak_stats, on='SpecID', how='left')
peak_stats = peak_stats.set_index('SpecID')
peak_stats = peak_stats.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakWidths'] = widths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakProminences'] = prominences


In [None]:
# Display the per-spectrum peak summary table.
peak_stats

Unnamed: 0_level_0,Status,PeakAbsorbance_mean,PeakAbsorbance_std,PeakAbsorbance_count,PeakAbsorbance_max,PeakAbsorbance_min,PeakWidths_mean,PeakWidths_std,PeakWidths_max,PeakWidths_min,PeakProminences_mean,PeakProminences_std,PeakProminences_max,PeakProminences_min
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
201210-1-00,Normal,33.590344,34.594272,251,159.095461,-26.402516,5.051673,16.769826,200.216791,0.502550,4.483874,18.800052,169.775455,0.002746
201210-1-01,Normal,36.490851,44.288630,216,206.147390,-32.590469,5.117660,14.065389,147.630935,0.503152,6.393594,26.533805,223.590382,0.006419
201210-1-02,Normal,63.466734,129.274420,182,1464.499730,-79.027032,4.775708,9.892271,55.950546,0.503520,18.547456,123.894318,1499.356120,0.010434
201210-1-03,Normal,100.391970,182.055250,151,998.434555,-113.170559,5.440494,11.403335,63.376129,0.501245,29.869794,146.267824,1060.096242,0.004750
201210-1-04,Normal,64.777083,71.300565,215,306.678766,-55.324916,5.716222,18.453179,197.787362,0.501506,7.107753,31.103420,363.817836,0.001538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-05,Hyperglycemia,50.016569,71.629391,194,498.333123,-38.440178,5.438161,12.062597,94.253027,0.502982,11.711844,47.037205,532.216946,0.006200
210526-3-06,Hyperglycemia,49.560968,76.842996,182,478.158989,-36.622331,6.135346,13.163073,85.192044,0.504635,11.925983,46.051860,513.398962,0.007081
210526-3-07,Hyperglycemia,46.525756,53.048528,195,390.362523,-33.953010,5.901795,14.919547,124.484695,0.501605,9.734300,37.255653,424.682817,0.003518
210526-3-08,Hyperglycemia,45.688366,46.582246,217,299.422840,-30.178801,5.428957,12.891786,100.819036,0.503390,7.787151,29.079851,327.396790,0.008119


In [None]:
# Peak summary statistics, ExtraTreesClassifier.
# evaluate_model reads df['SurID'] for the CV groups, so peak_stats must carry that column.
evaluate_model(peak_stats, et)

ExtraTreesClassifier Cross-Validation Accuracy: 0.6355 +/- 0.0190
ExtraTreesClassifier Cross-Validation Precision: 0.6335 +/- 0.0186
ExtraTreesClassifier Cross-Validation Recall: 0.6319 +/- 0.0179
ExtraTreesClassifier Cross-Validation F1-Score: 0.6312 +/- 0.0179


In [None]:
# Peak summary statistics, RandomForestClassifier.
evaluate_model(peak_stats, rf)

RandomForestClassifier Cross-Validation Accuracy: 0.6299 +/- 0.0204
RandomForestClassifier Cross-Validation Precision: 0.6276 +/- 0.0209
RandomForestClassifier Cross-Validation Recall: 0.6261 +/- 0.0211
RandomForestClassifier Cross-Validation F1-Score: 0.6251 +/- 0.0207


In [None]:
# Peak summary statistics, SVC.
evaluate_model(peak_stats, svc)

SVC Cross-Validation Accuracy: 0.4299 +/- 0.0321
SVC Cross-Validation Precision: 0.5551 +/- 0.0486
SVC Cross-Validation Recall: 0.4127 +/- 0.0222
SVC Cross-Validation F1-Score: 0.3565 +/- 0.0262


##### **Peak Bins**

In [None]:
# Define a function to calculate the bin for a given wavenumber with a specified bin size
def calculate_bin_interval(wavenumber, bin_size, bin_origin=200):
    """Return the label ``"<start>-<end>"`` of the bin that contains ``wavenumber``.

    Bins are half-open intervals ``[start, start + bin_size)`` laid out from
    ``bin_origin`` in steps of ``bin_size``.

    Parameters
    ----------
    wavenumber : float
        Value to place in a bin.
    bin_size : int or float
        Width of every bin.
    bin_origin : int or float, default 200
        Edge from which the bins are laid out.

    Returns
    -------
    str
        ``f"{bin_start}-{bin_end}"`` for the enclosing bin.
    """
    import math  # local import so this cell does not depend on the import cell

    # floor (not int truncation) so values below bin_origin fall in the bin
    # beneath them instead of being pulled up toward the origin.
    bin_start = math.floor((wavenumber - bin_origin) / bin_size) * bin_size + bin_origin
    bin_end = bin_start + bin_size
    return f"{bin_start}-{bin_end}"
# Set the bin size
bin_size = 25

# Add a "Bin" column to the peaks. assign() builds a new frame, so no
# SettingWithCopyWarning is raised and re-running this cell gives the same result.
peaks_df = peaks_df.assign(
    Bin=peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))
)

# Set the bins as columns with the peak absorbances, widths and prominences as the values.
# If no peaks appear in a bin the value is set to 0.
# If multiple peaks appear their properties are aggregated (mean).

# Pivot table with 'Absorbance', 'PeakWidths', and 'PeakProminences' as values
peak_bins = peaks_df.pivot_table(
    index='SpecID',
    columns='Bin',
    values=['Absorbance', 'PeakWidths', 'PeakProminences'],
    aggfunc='mean',
)
peak_bins.columns = [f"{col[0]}_{col[1]}" for col in peak_bins.columns]  # Combine column names
peak_bins = peak_bins.reset_index()

# Merge with the labels. SurID is included because evaluate_model uses it as
# the GroupKFold grouping key and raises KeyError when it is absent.
statuses = peaks_df[['SpecID', 'SurID', 'Status']].drop_duplicates()
peak_bins = pd.merge(peak_bins, statuses, on='SpecID')

# Set 'SpecID' as the index and fill NaN values (bins without peaks) with 0
peak_bins = peak_bins.set_index('SpecID').fillna(0)
peak_bins.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))


Unnamed: 0_level_0,Absorbance_1000-1025,Absorbance_1025-1050,Absorbance_1050-1075,Absorbance_1075-1100,Absorbance_1100-1125,Absorbance_1125-1150,Absorbance_1150-1175,Absorbance_1175-1200,Absorbance_1200-1225,Absorbance_1225-1250,...,PeakWidths_775-800,PeakWidths_800-825,PeakWidths_825-850,PeakWidths_850-875,PeakWidths_875-900,PeakWidths_900-925,PeakWidths_925-950,PeakWidths_950-975,PeakWidths_975-1000,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,79.348488,51.220241,56.270595,43.622056,9.791624,23.050053,11.951954,-2.060928,-4.324703,28.507881,...,1.079688,5.60529,1.425452,9.092851,4.989642,29.439388,1.346161,1.716824,1.339044,Normal
201210-1-01,77.237541,33.905318,49.270501,31.5868,3.304689,21.400157,14.579625,9.078719,-10.360994,0.805333,...,0.911928,1.412806,0.554703,2.452688,0.921724,70.524064,19.42184,0.81203,12.108906,Normal
201210-1-02,132.816715,187.44222,69.677144,56.429799,12.626494,67.955977,29.686681,-3.111961,5.960533,25.575162,...,1.582275,1.030469,1.66381,1.639609,0.0,29.350815,0.0,1.796942,3.345436,Normal
201210-1-03,371.600578,212.368633,210.376974,129.626363,125.077805,0.0,625.666908,0.0,37.979164,77.983322,...,0.0,1.473284,1.069089,0.0,0.0,63.376129,0.0,60.42336,1.246936,Normal
201210-1-04,82.989091,33.333771,35.816497,10.215812,6.639818,37.646776,56.228362,87.822719,41.492328,36.29409,...,1.971683,2.140093,2.14675,16.566306,2.824986,5.762471,3.919114,1.00186,1.682968,Normal


In [None]:
# Binned peak features, ExtraTreesClassifier.
# evaluate_model reads df['SurID'] for the CV groups, so peak_bins must carry that column.
evaluate_model(peak_bins, et)

ExtraTreesClassifier Cross-Validation Accuracy: 0.8407 +/- 0.0151
ExtraTreesClassifier Cross-Validation Precision: 0.8408 +/- 0.0153
ExtraTreesClassifier Cross-Validation Recall: 0.8406 +/- 0.0148
ExtraTreesClassifier Cross-Validation F1-Score: 0.8390 +/- 0.0155


In [None]:
# Binned peak features, RandomForestClassifier.
evaluate_model(peak_bins, rf)

RandomForestClassifier Cross-Validation Accuracy: 0.8306 +/- 0.0244
RandomForestClassifier Cross-Validation Precision: 0.8298 +/- 0.0244
RandomForestClassifier Cross-Validation Recall: 0.8314 +/- 0.0252
RandomForestClassifier Cross-Validation F1-Score: 0.8290 +/- 0.0246


In [None]:
# Binned peak features, SVC.
evaluate_model(peak_bins, svc)

SVC Cross-Validation Accuracy: 0.5297 +/- 0.0337
SVC Cross-Validation Precision: 0.6073 +/- 0.0246
SVC Cross-Validation Recall: 0.5183 +/- 0.0255
SVC Cross-Validation F1-Score: 0.4976 +/- 0.0355
