#### **Here are the Optimum KFold Results from 400-1800 cm-1**

These parameters are used to compare the full wavelength with the peak featuresets.

Import Libraries

In [1]:
import sys
sys.path.append('..')  # Adds the parent directory to the path so Python can find the `Cleaning_and_Evaluation` package
from Cleaning_and_Evaluation import *
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.signal import find_peaks, peak_prominences, peak_widths

Read the spectral data

In [2]:
df = pd.read_csv("../data/exosomes.raw_spectrum_400-1800.csv")

In [3]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,293,400.22778,1765.6628,201210-1,Normal
1,201210-1-00,294,400.91116,1774.7809,201210-1,Normal
2,201210-1-00,295,401.59454,1769.0302,201210-1,Normal
3,201210-1-00,296,402.27789,1756.4220,201210-1,Normal
4,201210-1-00,297,402.96127,1758.8690,201210-1,Normal
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia
6239201,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia
6239202,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia
6239203,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia


> ##### **Clean the Spectra With the Chosen Parameters**

In [4]:
cleaning_params = {
    'despike': False,
    'baseline_correct': True,
    'smoothing': True,
    'scaling': False,
    'despike_ma': 10,
    'despike_threshold': 7,
    'lam': 10**9,
    'p': 0.05,
    'window_size': 35,
    'poly_order': 3
}
spectra_cleaning(df, **cleaning_params)

df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,293,400.22778,31.645788,201210-1,Normal
1,201210-1-00,294,400.91116,31.890799,201210-1,Normal
2,201210-1-00,295,401.59454,32.060592,201210-1,Normal
3,201210-1-00,296,402.27789,32.161890,201210-1,Normal
4,201210-1-00,297,402.96127,32.201418,201210-1,Normal
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,-17.773341,210526-3,Hyperglycemia
6239201,210526-3-09,2338,1797.72200,-16.954783,210526-3,Hyperglycemia
6239202,210526-3-09,2339,1798.40550,-16.058237,210526-3,Hyperglycemia
6239203,210526-3-09,2340,1799.08890,-15.079158,210526-3,Hyperglycemia


In [5]:
wavelength_df = prepare_wavelength_df(df, 'Absorbance')
wavelength_df

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,31.645788,31.890799,32.060592,32.161890,32.201418,32.185899,32.122059,32.016620,31.876308,31.707846,...,-10.777774,-11.616896,-12.643726,-13.874421,-15.325136,-17.012028,-18.951251,-21.158961,201210-1,Normal
201210-1-01,42.196054,42.922684,43.593684,44.214403,44.790193,45.326403,45.828383,46.301484,46.751056,47.182449,...,-13.220270,-12.939614,-12.661874,-12.389675,-12.125640,-11.872393,-11.632560,-11.408763,201210-1,Normal
201210-1-02,138.865467,133.819502,129.408292,125.595054,122.343009,119.615374,117.375368,115.586210,114.211117,113.213310,...,-34.538639,-34.402384,-34.255415,-34.103739,-33.953362,-33.810290,-33.680531,-33.570091,201210-1,Normal
201210-1-03,232.772008,232.126129,231.411228,230.634073,229.801431,228.920069,227.996754,227.038255,226.051337,225.042769,...,-61.351792,-60.916246,-60.267221,-59.384911,-58.249507,-56.841200,-55.140182,-53.126644,201210-1,Normal
201210-1-04,145.574231,137.246285,129.875220,123.408835,117.794931,112.981307,108.915764,105.546101,102.820118,100.685616,...,-59.759990,-59.519263,-59.174454,-58.724981,-58.170266,-57.509728,-56.742787,-55.868862,201210-1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,0.170339,-0.261030,-0.613879,-0.894331,-1.108508,-1.262533,-1.362528,-1.414617,-1.424920,-1.399561,...,-16.124202,-15.626106,-15.172246,-14.769544,-14.424923,-14.145303,-13.937608,-13.808758,210526-3,Hyperglycemia
210526-3-46,-8.350328,-6.780715,-5.535203,-4.589109,-3.917748,-3.496437,-3.300490,-3.305224,-3.485955,-3.817999,...,-18.517344,-17.926895,-17.264189,-16.528669,-15.719778,-14.836957,-13.879649,-12.847295,210526-3,Hyperglycemia
210526-3-47,5.365557,4.697456,4.102545,3.575986,3.112945,2.708585,2.358069,2.056561,1.799226,1.581226,...,-16.283957,-15.325432,-14.335147,-13.318813,-12.282137,-11.230830,-10.170601,-9.107159,210526-3,Hyperglycemia
210526-3-48,-3.551670,-4.188896,-4.736459,-5.196958,-5.572996,-5.867172,-6.082087,-6.220343,-6.284541,-6.277280,...,-10.047690,-9.290136,-8.501070,-7.680554,-6.828650,-5.945421,-5.030930,-4.085240,210526-3,Hyperglycemia


In [6]:
et = ExtraTreesClassifier(random_state=1234)
rf = RandomForestClassifier(random_state=1234)
svc = SVC(random_state=1234)

---

> ##### **Full Spectra**

In [7]:
evaluate_model(wavelength_df, et, groupkfold=False)

ExtraTreesClassifier Cross-Validation Accuracy: 0.9366 +/- 0.0088
ExtraTreesClassifier Cross-Validation Precision: 0.9363 +/- 0.0091
ExtraTreesClassifier Cross-Validation Recall: 0.9360 +/- 0.0084
ExtraTreesClassifier Cross-Validation F1-Score: 0.9357 +/- 0.0087


In [8]:
evaluate_model(wavelength_df, rf, groupkfold=False)

RandomForestClassifier Cross-Validation Accuracy: 0.9077 +/- 0.0173
RandomForestClassifier Cross-Validation Precision: 0.9076 +/- 0.0174
RandomForestClassifier Cross-Validation Recall: 0.9085 +/- 0.0171
RandomForestClassifier Cross-Validation F1-Score: 0.9068 +/- 0.0176


In [9]:
evaluate_model(wavelength_df, svc, groupkfold=False)

SVC Cross-Validation Accuracy: 0.5547 +/- 0.0295
SVC Cross-Validation Precision: 0.6596 +/- 0.0288
SVC Cross-Validation Recall: 0.5444 +/- 0.0214
SVC Cross-Validation F1-Score: 0.5292 +/- 0.0287


---

> ##### **Peak Stats**

In [10]:
peaks = []
widths = []
prominences = []
statuses = df[['SpecID', 'Status', 'SurID']].drop_duplicates()

# Find the index and width of each peak
for _, group in df.groupby('SpecID'):

    #peak_index, _ = find_peaks(x=group['Absorbance'], distance=152, prominence=42, width=6)
    peak_index, _ = find_peaks(x=group['Absorbance'])


    # Calculate the widths of each peak
    widths += list(peak_widths(group['Absorbance'], peaks=peak_index, rel_height=0.5)[0])

    # Calculate prominence of each peak
    prominences += list(peak_prominences(group['Absorbance'], peaks=peak_index)[0])

    # Find the index of the peak within the full dataframe
    peaks += list(group.iloc[peak_index].index.values)

peaks_df = df.iloc[peaks]
peaks_df['PeakWidths'] = widths
peaks_df['PeakProminences'] = prominences
peaks_df

# Create a new DataFrame for the summary statistics
peak_stats = peaks_df.groupby('SpecID').agg({
    'Absorbance': ['mean', 'std', 'count', 'max', 'min'],
    'PeakWidths': ['mean', 'std', 'max', 'min'],
    'PeakProminences': ['mean', 'std', 'max', 'min']
}).reset_index()

# Flatten the multi-level columns and customize the names
peak_stats.columns = ['SpecID',
                      'PeakAbsorbance_mean', 'PeakAbsorbance_std', 'PeakAbsorbance_count', 'PeakAbsorbance_max', 'PeakAbsorbance_min',
                      'PeakWidths_mean', 'PeakWidths_std', 'PeakWidths_max', 'PeakWidths_min',
                      'PeakProminences_mean', 'PeakProminences_std', 'PeakProminences_max', 'PeakProminences_min']

peak_stats.shape
peak_stats = pd.merge(statuses, peak_stats, on='SpecID', how='left')
peak_stats = peak_stats.set_index('SpecID')
peak_stats = peak_stats.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakWidths'] = widths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakProminences'] = prominences


In [11]:
peak_stats

Unnamed: 0_level_0,Status,SurID,PeakAbsorbance_mean,PeakAbsorbance_std,PeakAbsorbance_count,PeakAbsorbance_max,PeakAbsorbance_min,PeakWidths_mean,PeakWidths_std,PeakWidths_max,PeakWidths_min,PeakProminences_mean,PeakProminences_std,PeakProminences_max,PeakProminences_min
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
201210-1-00,Normal,201210-1,33.590344,34.594272,251,159.095461,-26.402516,5.051673,16.769826,200.216791,0.502550,4.483874,18.800052,169.775455,0.002746
201210-1-01,Normal,201210-1,36.490851,44.288630,216,206.147390,-32.590469,5.117660,14.065389,147.630935,0.503152,6.393594,26.533805,223.590382,0.006419
201210-1-02,Normal,201210-1,63.466734,129.274420,182,1464.499730,-79.027032,4.775708,9.892271,55.950546,0.503520,18.547456,123.894318,1499.356120,0.010434
201210-1-03,Normal,201210-1,100.391970,182.055250,151,998.434555,-113.170559,5.440494,11.403335,63.376129,0.501245,29.869794,146.267824,1060.096242,0.004750
201210-1-04,Normal,201210-1,64.777083,71.300565,215,306.678766,-55.324916,5.716222,18.453179,197.787362,0.501506,7.107753,31.103420,363.817836,0.001538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-05,Hyperglycemia,210526-3,50.016569,71.629391,194,498.333123,-38.440178,5.438161,12.062597,94.253027,0.502982,11.711844,47.037205,532.216946,0.006200
210526-3-06,Hyperglycemia,210526-3,49.560968,76.842996,182,478.158989,-36.622331,6.135346,13.163073,85.192044,0.504635,11.925983,46.051860,513.398962,0.007081
210526-3-07,Hyperglycemia,210526-3,46.525756,53.048528,195,390.362523,-33.953010,5.901795,14.919547,124.484695,0.501605,9.734300,37.255653,424.682817,0.003518
210526-3-08,Hyperglycemia,210526-3,45.688366,46.582246,217,299.422840,-30.178801,5.428957,12.891786,100.819036,0.503390,7.787151,29.079851,327.396790,0.008119


In [12]:
evaluate_model(peak_stats, et, groupkfold=False)

ExtraTreesClassifier Cross-Validation Accuracy: 0.6355 +/- 0.0190
ExtraTreesClassifier Cross-Validation Precision: 0.6335 +/- 0.0186
ExtraTreesClassifier Cross-Validation Recall: 0.6319 +/- 0.0179
ExtraTreesClassifier Cross-Validation F1-Score: 0.6312 +/- 0.0179


In [13]:
evaluate_model(peak_stats, rf, groupkfold=False)

RandomForestClassifier Cross-Validation Accuracy: 0.6299 +/- 0.0204
RandomForestClassifier Cross-Validation Precision: 0.6276 +/- 0.0209
RandomForestClassifier Cross-Validation Recall: 0.6261 +/- 0.0211
RandomForestClassifier Cross-Validation F1-Score: 0.6251 +/- 0.0207


In [14]:
evaluate_model(peak_stats, svc, groupkfold=False)

SVC Cross-Validation Accuracy: 0.4299 +/- 0.0321
SVC Cross-Validation Precision: 0.5551 +/- 0.0486
SVC Cross-Validation Recall: 0.4127 +/- 0.0222
SVC Cross-Validation F1-Score: 0.3565 +/- 0.0262


---

> ##### **Peak Bins**

In [15]:
# Define a function to calculate the bin for a given wavenumber with a specified bin size
def calculate_bin_interval(wavenumber, bin_size):
    bin_start = int((wavenumber - 200) / bin_size) * bin_size + 200
    bin_end = bin_start + bin_size
    return f"{bin_start}-{bin_end}"
# Set the bin size
bin_size = 25

# Add a "Bin" column to the DataFrame
peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))
peaks_df.head()

# Set the bins as columns with the peak absrobances, widths and prominences as the values.
# If no peaks appear in a bin the value is set to 0.
# If multiple peaks appear their properties are aggregated.

# Pivot table with 'Absorbance', 'PeakWidths', and 'PeakProminences' as values
peak_bins = peaks_df.pivot_table(index='SpecID', columns='Bin', values=['Absorbance', 'PeakWidths', 'PeakProminences'], aggfunc='mean')
peak_bins.columns = [f"{col[0]}_{col[1]}" for col in peak_bins.columns]  # Combine column names
peak_bins.reset_index(inplace=True)

# Merge with 'Status' information
statuses = peaks_df[['SpecID', 'Status', 'SurID']].drop_duplicates()
peak_bins = pd.merge(peak_bins, statuses, on='SpecID')

# Set 'SpecID' as the index
peak_bins.set_index('SpecID', inplace=True)

# Fill NaN values with 0
peak_bins.fillna(0, inplace=True)
peak_bins.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))


Unnamed: 0_level_0,Absorbance_1000-1025,Absorbance_1025-1050,Absorbance_1050-1075,Absorbance_1075-1100,Absorbance_1100-1125,Absorbance_1125-1150,Absorbance_1150-1175,Absorbance_1175-1200,Absorbance_1200-1225,Absorbance_1225-1250,...,PeakWidths_800-825,PeakWidths_825-850,PeakWidths_850-875,PeakWidths_875-900,PeakWidths_900-925,PeakWidths_925-950,PeakWidths_950-975,PeakWidths_975-1000,Status,SurID
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,79.348488,51.220241,56.270595,43.622056,9.791624,23.050053,11.951954,-2.060928,-4.324703,28.507881,...,5.60529,1.425452,9.092851,4.989642,29.439388,1.346161,1.716824,1.339044,Normal,201210-1
201210-1-01,77.237541,33.905318,49.270501,31.5868,3.304689,21.400157,14.579625,9.078719,-10.360994,0.805333,...,1.412806,0.554703,2.452688,0.921724,70.524064,19.42184,0.81203,12.108906,Normal,201210-1
201210-1-02,132.816715,187.44222,69.677144,56.429799,12.626494,67.955977,29.686681,-3.111961,5.960533,25.575162,...,1.030469,1.66381,1.639609,0.0,29.350815,0.0,1.796942,3.345436,Normal,201210-1
201210-1-03,371.600578,212.368633,210.376974,129.626363,125.077805,0.0,625.666908,0.0,37.979164,77.983322,...,1.473284,1.069089,0.0,0.0,63.376129,0.0,60.42336,1.246936,Normal,201210-1
201210-1-04,82.989091,33.333771,35.816497,10.215812,6.639818,37.646776,56.228362,87.822719,41.492328,36.29409,...,2.140093,2.14675,16.566306,2.824986,5.762471,3.919114,1.00186,1.682968,Normal,201210-1


In [16]:
evaluate_model(peak_bins, et, groupkfold=False)

ExtraTreesClassifier Cross-Validation Accuracy: 0.8407 +/- 0.0151
ExtraTreesClassifier Cross-Validation Precision: 0.8408 +/- 0.0153
ExtraTreesClassifier Cross-Validation Recall: 0.8406 +/- 0.0148
ExtraTreesClassifier Cross-Validation F1-Score: 0.8390 +/- 0.0155


In [17]:
evaluate_model(peak_bins, rf, groupkfold=False)

RandomForestClassifier Cross-Validation Accuracy: 0.8306 +/- 0.0244
RandomForestClassifier Cross-Validation Precision: 0.8298 +/- 0.0244
RandomForestClassifier Cross-Validation Recall: 0.8314 +/- 0.0252
RandomForestClassifier Cross-Validation F1-Score: 0.8290 +/- 0.0246


In [18]:
evaluate_model(peak_bins, svc, groupkfold=False)

SVC Cross-Validation Accuracy: 0.5297 +/- 0.0337
SVC Cross-Validation Precision: 0.6073 +/- 0.0246
SVC Cross-Validation Recall: 0.5183 +/- 0.0255
SVC Cross-Validation F1-Score: 0.4976 +/- 0.0355
