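#### **This notebook cleans the spectra with the functions from `Cleaning_and_Evaluation` and evaluates classifiers on the full spectra, per-spectrum peak statistics, and binned peak features.**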

Import Libraries

In [1]:
import sys
sys.path.append('..')  # Adds the parent directory to the path so Python can find the `Cleaning_and_Evaluation` package

import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, KFold, cross_validate
import seaborn as sns
import matplotlib.pyplot as plt
from Cleaning_and_Evaluation import *

Read the spectral data

In [2]:
# Load the raw spectra as a long table: one row per (SpecID, WaveNumber) reading,
# with the Absorbance value plus the SurID and Status of the spectrum.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests the
# CSV was written with its index; confirm whether downstream helpers rely on it
# before dropping it (e.g. with index_col=0).
df = pd.read_csv("../../data/exosomes.raw_spectrum_400-1800.csv")

In [3]:
# Preview the raw table (pandas truncates the display to the first and last rows).
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,293,400.22778,1765.6628,201210-1,Normal
1,201210-1-00,294,400.91116,1774.7809,201210-1,Normal
2,201210-1-00,295,401.59454,1769.0302,201210-1,Normal
3,201210-1-00,296,402.27789,1756.4220,201210-1,Normal
4,201210-1-00,297,402.96127,1758.8690,201210-1,Normal
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia
6239201,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia
6239202,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia
6239203,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia


> **Clean the Spectra With the Chosen Parameters**

In [4]:
# Pre-processing configuration forwarded as keyword arguments to spectra_cleaning.
cleaning_params = dict(
    # Which stages to run, and how to scale the result.
    despike=True,
    baseline_correct=True,
    smoothing=True,
    scaling='vector',
    # Despiking settings.
    despike_ma=10,
    despike_threshold=7,
    # NOTE(review): lam / p look like baseline-correction settings and
    # window_size / poly_order like smoothing settings -- confirm against
    # spectra_cleaning in Cleaning_and_Evaluation.
    lam=10**7,
    p=0.01,
    window_size=21,
    poly_order=3,
)

# The return value is not captured: the Absorbance values shown for `df` after
# this call differ from the raw ones, so the helper evidently updates `df` itself.
# NOTE(review): that would make this cell non-idempotent (a second run would
# clean already-cleaned data) -- confirm, and re-load `df` before re-running.
spectra_cleaning(df, **cleaning_params)

df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,293,400.22778,0.018836,201210-1,Normal
1,201210-1-00,294,400.91116,0.017018,201210-1,Normal
2,201210-1-00,295,401.59454,0.015626,201210-1,Normal
3,201210-1-00,296,402.27789,0.014620,201210-1,Normal
4,201210-1-00,297,402.96127,0.013957,201210-1,Normal
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,0.014480,210526-3,Hyperglycemia
6239201,210526-3-09,2338,1797.72200,0.014751,210526-3,Hyperglycemia
6239202,210526-3-09,2339,1798.40550,0.015144,210526-3,Hyperglycemia
6239203,210526-3-09,2340,1799.08890,0.015686,210526-3,Hyperglycemia


In [5]:
# Reshape the cleaned long table into a wide one: per the displayed result, one
# row per SpecID, one column per WaveNumber holding the 'Absorbance' value, plus
# the SurID and Status columns.
wavelength_df = prepare_wavelength_df(df, 'Absorbance')
wavelength_df

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,0.018836,0.017018,0.015626,0.014620,0.013957,0.013595,0.013493,0.013607,0.013897,0.014320,...,0.011467,0.011340,0.010818,0.009837,0.008333,0.006242,0.003501,0.000044,201210-1,Normal
201210-1-01,0.006818,0.007851,0.008672,0.009304,0.009770,0.010093,0.010296,0.010402,0.010434,0.010414,...,0.007621,0.007526,0.007602,0.007896,0.008451,0.009313,0.010528,0.012140,201210-1,Normal
201210-1-02,0.008443,0.007880,0.007373,0.006922,0.006527,0.006189,0.005908,0.005685,0.005520,0.005414,...,0.001663,0.001841,0.001976,0.002050,0.002046,0.001945,0.001729,0.001380,201210-1,Normal
201210-1-03,0.001885,0.002656,0.003235,0.003642,0.003896,0.004015,0.004018,0.003923,0.003750,0.003518,...,0.000473,0.000574,0.000736,0.000963,0.001261,0.001637,0.002095,0.002640,201210-1,Normal
201210-1-04,0.017577,0.016497,0.015381,0.014247,0.013111,0.011991,0.010903,0.009864,0.008892,0.008002,...,0.000251,0.000057,0.000170,0.000665,0.001615,0.003092,0.005169,0.007921,201210-1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,0.009250,0.009758,0.010232,0.010663,0.011043,0.011363,0.011615,0.011789,0.011876,0.011868,...,0.014275,0.015312,0.016013,0.016280,0.016015,0.015123,0.013506,0.011066,210526-3,Hyperglycemia
210526-3-46,0.012931,0.011140,0.009881,0.009090,0.008702,0.008654,0.008879,0.009314,0.009895,0.010557,...,0.011069,0.011422,0.011892,0.012498,0.013261,0.014200,0.015335,0.016686,210526-3,Hyperglycemia
210526-3-47,0.005788,0.010199,0.013364,0.015420,0.016506,0.016760,0.016317,0.015316,0.013895,0.012191,...,0.017112,0.018450,0.019477,0.020098,0.020221,0.019753,0.018601,0.016672,210526-3,Hyperglycemia
210526-3-48,0.007008,0.010383,0.012751,0.014232,0.014948,0.015020,0.014567,0.013712,0.012574,0.011275,...,0.014601,0.015251,0.016156,0.017352,0.018871,0.020748,0.023016,0.025709,210526-3,Hyperglycemia


In [6]:
# Every classifier shares one fixed seed so repeated runs are comparable.
RANDOM_STATE = 1234

# The three models compared on each feature set below.
et = ExtraTreesClassifier(random_state=RANDOM_STATE)
rf = RandomForestClassifier(random_state=RANDOM_STATE)
svc = SVC(random_state=RANDOM_STATE)

##### **Full Spectra**

In [7]:
# Cross-validate ExtraTrees on the full-spectrum table; the helper prints
# accuracy, precision, recall and F1 as "mean +/- spread".
# NOTE(review): groupkfold=False presumably means folds are not grouped by SurID,
# so spectra from the same surface could land in both train and test folds and
# inflate the scores -- confirm in Cleaning_and_Evaluation.
evaluate_model(wavelength_df, et, groupkfold=False)

ExtraTreesClassifier Cross-Validation Accuracy: 0.8706 +/- 0.0214
ExtraTreesClassifier Cross-Validation Precision: 0.8717 +/- 0.0213
ExtraTreesClassifier Cross-Validation Recall: 0.8716 +/- 0.0204
ExtraTreesClassifier Cross-Validation F1-Score: 0.8700 +/- 0.0215


In [8]:
# Cross-validate RandomForest on the full-spectrum table (same settings as ExtraTrees).
evaluate_model(wavelength_df, rf, groupkfold=False)

RandomForestClassifier Cross-Validation Accuracy: 0.8460 +/- 0.0206
RandomForestClassifier Cross-Validation Precision: 0.8454 +/- 0.0223
RandomForestClassifier Cross-Validation Recall: 0.8462 +/- 0.0213
RandomForestClassifier Cross-Validation F1-Score: 0.8442 +/- 0.0213


In [9]:
# Cross-validate the SVC on the full-spectrum table.
# NOTE(review): the saved notebook has no recorded output for this cell -- re-run
# it to obtain the SVC scores.
evaluate_model(wavelength_df, svc, groupkfold=False)

##### **Peak Stats**

In [None]:
# Build per-spectrum peak features.
# Step 1: for every spectrum, find its peaks and record, for each peak, its row
# label in `df`, its width and its prominence (the three lists stay aligned).
peaks = []
widths = []
prominences = []
statuses = df[['SpecID', 'Status']].drop_duplicates()

for _, group in df.groupby('SpecID'):
    absorbance = group['Absorbance'].to_numpy()

    # No height/distance/prominence constraints are passed, so every local
    # maximum of the spectrum is reported as a peak.
    peak_index, _ = find_peaks(absorbance)

    # Width of each peak, measured at rel_height=0.5 of its prominence.
    widths.extend(peak_widths(absorbance, peaks=peak_index, rel_height=0.5)[0])

    # Prominence of each peak.
    prominences.extend(peak_prominences(absorbance, peaks=peak_index)[0])

    # peak_index is positional within this group; translate it to the
    # corresponding index labels of the full dataframe.
    peaks.extend(group.index[peak_index])

# `peaks` holds index *labels*, so the rows must be selected with .loc; .iloc
# only gives the same rows while `df` has a default 0..n-1 index. The explicit
# .copy() makes peaks_df an independent frame, so adding columns below no longer
# triggers SettingWithCopyWarning.
peaks_df = df.loc[peaks].copy()
peaks_df['PeakWidths'] = widths
peaks_df['PeakProminences'] = prominences

# Step 2: summarise the peaks of each spectrum. Named aggregation produces the
# final column names directly, so they cannot drift out of step with the
# aggregations they describe.
peak_stats = (
    peaks_df.groupby('SpecID')
    .agg(
        PeakAbsorbance_mean=('Absorbance', 'mean'),
        PeakAbsorbance_std=('Absorbance', 'std'),
        PeakAbsorbance_count=('Absorbance', 'count'),
        PeakAbsorbance_max=('Absorbance', 'max'),
        PeakAbsorbance_min=('Absorbance', 'min'),
        PeakWidths_mean=('PeakWidths', 'mean'),
        PeakWidths_std=('PeakWidths', 'std'),
        PeakWidths_max=('PeakWidths', 'max'),
        PeakWidths_min=('PeakWidths', 'min'),
        PeakProminences_mean=('PeakProminences', 'mean'),
        PeakProminences_std=('PeakProminences', 'std'),
        PeakProminences_max=('PeakProminences', 'max'),
        PeakProminences_min=('PeakProminences', 'min'),
    )
    .reset_index()
)

# Step 3: attach the class label. The left merge keeps spectra that produced no
# peaks; their statistics (and any undefined std from a single peak) become 0.
peak_stats = (
    pd.merge(statuses, peak_stats, on='SpecID', how='left')
    .set_index('SpecID')
    .fillna(0)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakWidths'] = widths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakProminences'] = prominences


In [None]:
# Preview the per-spectrum peak summary (one row per SpecID, Status plus 13 statistics).
peak_stats

Unnamed: 0_level_0,Status,PeakAbsorbance_mean,PeakAbsorbance_std,PeakAbsorbance_count,PeakAbsorbance_max,PeakAbsorbance_min,PeakWidths_mean,PeakWidths_std,PeakWidths_max,PeakWidths_min,PeakProminences_mean,PeakProminences_std,PeakProminences_max,PeakProminences_min
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
201210-1-00,Normal,33.590344,34.594272,251,159.095461,-26.402516,5.051673,16.769826,200.216791,0.502550,4.483874,18.800052,169.775455,0.002746
201210-1-01,Normal,36.490851,44.288630,216,206.147390,-32.590469,5.117660,14.065389,147.630935,0.503152,6.393594,26.533805,223.590382,0.006419
201210-1-02,Normal,63.466734,129.274420,182,1464.499730,-79.027032,4.775708,9.892271,55.950546,0.503520,18.547456,123.894318,1499.356120,0.010434
201210-1-03,Normal,100.391970,182.055250,151,998.434555,-113.170559,5.440494,11.403335,63.376129,0.501245,29.869794,146.267824,1060.096242,0.004750
201210-1-04,Normal,64.777083,71.300565,215,306.678766,-55.324916,5.716222,18.453179,197.787362,0.501506,7.107753,31.103420,363.817836,0.001538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-05,Hyperglycemia,50.016569,71.629391,194,498.333123,-38.440178,5.438161,12.062597,94.253027,0.502982,11.711844,47.037205,532.216946,0.006200
210526-3-06,Hyperglycemia,49.560968,76.842996,182,478.158989,-36.622331,6.135346,13.163073,85.192044,0.504635,11.925983,46.051860,513.398962,0.007081
210526-3-07,Hyperglycemia,46.525756,53.048528,195,390.362523,-33.953010,5.901795,14.919547,124.484695,0.501605,9.734300,37.255653,424.682817,0.003518
210526-3-08,Hyperglycemia,45.688366,46.582246,217,299.422840,-30.178801,5.428957,12.891786,100.819036,0.503390,7.787151,29.079851,327.396790,0.008119


In [None]:
# Cross-validate ExtraTrees on the per-spectrum peak summary statistics.
evaluate_model(peak_stats, et, groupkfold=False)

ExtraTreesClassifier Cross-Validation Accuracy: 0.6355 +/- 0.0190
ExtraTreesClassifier Cross-Validation Precision: 0.6335 +/- 0.0186
ExtraTreesClassifier Cross-Validation Recall: 0.6319 +/- 0.0179
ExtraTreesClassifier Cross-Validation F1-Score: 0.6312 +/- 0.0179


In [None]:
# Cross-validate RandomForest on the per-spectrum peak summary statistics.
evaluate_model(peak_stats, rf, groupkfold=False)

RandomForestClassifier Cross-Validation Accuracy: 0.6299 +/- 0.0204
RandomForestClassifier Cross-Validation Precision: 0.6276 +/- 0.0209
RandomForestClassifier Cross-Validation Recall: 0.6261 +/- 0.0211
RandomForestClassifier Cross-Validation F1-Score: 0.6251 +/- 0.0207


In [None]:
# Cross-validate the SVC on the per-spectrum peak summary statistics.
# NOTE(review): the peak_stats columns are on very different scales (counts,
# widths, absorbances); unless evaluate_model standardises features, that may
# explain the SVC scoring far below the tree models -- confirm.
evaluate_model(peak_stats, svc, groupkfold=False)

SVC Cross-Validation Accuracy: 0.4299 +/- 0.0321
SVC Cross-Validation Precision: 0.5551 +/- 0.0486
SVC Cross-Validation Recall: 0.4127 +/- 0.0222
SVC Cross-Validation F1-Score: 0.3565 +/- 0.0262


##### **Peak Bins**

In [None]:
# Define a function to calculate the bin for a given wavenumber with a specified bin size
def calculate_bin_interval(wavenumber, bin_size, origin=200):
    """Return the label of the fixed-width bin that contains ``wavenumber``.

    Bins are half-open intervals ``[start, start + bin_size)`` laid out from
    ``origin`` in steps of ``bin_size``.

    Args:
        wavenumber: Position to assign to a bin.
        bin_size: Width of every bin; must be positive.
        origin: Edge from which the bin grid is laid out. Defaults to 200,
            the value that was previously hard-coded.

    Returns:
        The bin label formatted as ``"<start>-<end>"``, e.g. ``"400-425"``.

    Raises:
        ValueError: If ``bin_size`` is not positive.
    """
    if bin_size <= 0:
        raise ValueError(f"bin_size must be positive, got {bin_size}")

    # Floor rather than int(): int() truncates toward zero, which would place a
    # wavenumber below `origin` in the bin above the one that contains it.
    bin_index = int(np.floor((wavenumber - origin) / bin_size))
    bin_start = bin_index * bin_size + origin
    bin_end = bin_start + bin_size
    return f"{bin_start}-{bin_end}"
# Width of each wavenumber bin used to group the peaks.
bin_size = 25

# Label every peak with the bin its wavenumber falls into. .assign returns a new
# frame instead of writing a column into peaks_df (which was itself selected
# from `df`), so no SettingWithCopyWarning is raised.
peaks_df = peaks_df.assign(
    Bin=peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))
)

# Set the bins as columns with the peak absorbances, widths and prominences as the values.
# If multiple peaks of a spectrum fall in the same bin, their properties are averaged.
peak_bins = peaks_df.pivot_table(
    index='SpecID',
    columns='Bin',
    values=['Absorbance', 'PeakWidths', 'PeakProminences'],
    aggfunc='mean',
)

# Flatten the (feature, bin) column pairs into names such as "Absorbance_400-425".
peak_bins.columns = [f"{feature}_{bin_label}" for feature, bin_label in peak_bins.columns]

# Attach the class label, index by SpecID, and set bins without any peak to 0.
statuses = peaks_df[['SpecID', 'Status']].drop_duplicates()
peak_bins = (
    pd.merge(peak_bins.reset_index(), statuses, on='SpecID')
    .set_index('SpecID')
    .fillna(0)
)
peak_bins.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))


Unnamed: 0_level_0,Absorbance_1000-1025,Absorbance_1025-1050,Absorbance_1050-1075,Absorbance_1075-1100,Absorbance_1100-1125,Absorbance_1125-1150,Absorbance_1150-1175,Absorbance_1175-1200,Absorbance_1200-1225,Absorbance_1225-1250,...,PeakWidths_775-800,PeakWidths_800-825,PeakWidths_825-850,PeakWidths_850-875,PeakWidths_875-900,PeakWidths_900-925,PeakWidths_925-950,PeakWidths_950-975,PeakWidths_975-1000,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,79.348488,51.220241,56.270595,43.622056,9.791624,23.050053,11.951954,-2.060928,-4.324703,28.507881,...,1.079688,5.60529,1.425452,9.092851,4.989642,29.439388,1.346161,1.716824,1.339044,Normal
201210-1-01,77.237541,33.905318,49.270501,31.5868,3.304689,21.400157,14.579625,9.078719,-10.360994,0.805333,...,0.911928,1.412806,0.554703,2.452688,0.921724,70.524064,19.42184,0.81203,12.108906,Normal
201210-1-02,132.816715,187.44222,69.677144,56.429799,12.626494,67.955977,29.686681,-3.111961,5.960533,25.575162,...,1.582275,1.030469,1.66381,1.639609,0.0,29.350815,0.0,1.796942,3.345436,Normal
201210-1-03,371.600578,212.368633,210.376974,129.626363,125.077805,0.0,625.666908,0.0,37.979164,77.983322,...,0.0,1.473284,1.069089,0.0,0.0,63.376129,0.0,60.42336,1.246936,Normal
201210-1-04,82.989091,33.333771,35.816497,10.215812,6.639818,37.646776,56.228362,87.822719,41.492328,36.29409,...,1.971683,2.140093,2.14675,16.566306,2.824986,5.762471,3.919114,1.00186,1.682968,Normal


In [None]:
# Cross-validate ExtraTrees on the binned peak features.
evaluate_model(peak_bins, et, groupkfold=False)

ExtraTreesClassifier Cross-Validation Accuracy: 0.8407 +/- 0.0151
ExtraTreesClassifier Cross-Validation Precision: 0.8408 +/- 0.0153
ExtraTreesClassifier Cross-Validation Recall: 0.8406 +/- 0.0148
ExtraTreesClassifier Cross-Validation F1-Score: 0.8390 +/- 0.0155


In [None]:
# Cross-validate RandomForest on the binned peak features.
evaluate_model(peak_bins, rf, groupkfold=False)

RandomForestClassifier Cross-Validation Accuracy: 0.8306 +/- 0.0244
RandomForestClassifier Cross-Validation Precision: 0.8298 +/- 0.0244
RandomForestClassifier Cross-Validation Recall: 0.8314 +/- 0.0252
RandomForestClassifier Cross-Validation F1-Score: 0.8290 +/- 0.0246


In [None]:
# Cross-validate the SVC on the binned peak features.
evaluate_model(peak_bins, svc, groupkfold=False)

SVC Cross-Validation Accuracy: 0.5297 +/- 0.0337
SVC Cross-Validation Precision: 0.6073 +/- 0.0246
SVC Cross-Validation Recall: 0.5183 +/- 0.0255
SVC Cross-Validation F1-Score: 0.4976 +/- 0.0355
