#### **This notebook evaluates the optimum GroupKFold results for the 400–1800 cm$^{-1}$ range**

The chosen cleaning parameters are used to compare models trained on the full spectrum with models trained on the peak-derived feature sets.

Import Libraries

In [1]:
import sys
sys.path.append('..')  # Adds the parent directory to the path so Python can find the `Cleaning_and_Evaluation` package
from Cleaning_and_Evaluation import *
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from scipy.signal import find_peaks, peak_prominences, peak_widths
from sklearn.model_selection import cross_validate, GroupKFold

Read the spectral data

In [2]:
# Load the raw spectra in long format: one row per (SpecID, WaveNumber) reading,
# together with the SurID and Status labels of the spectrum it belongs to.
df = pd.read_csv("../data/exosomes.raw_spectrum_400-1800.csv")

In [3]:
# Preview the raw (uncleaned) readings
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,293,400.22778,1765.6628,201210-1,Normal
1,201210-1-00,294,400.91116,1774.7809,201210-1,Normal
2,201210-1-00,295,401.59454,1769.0302,201210-1,Normal
3,201210-1-00,296,402.27789,1756.4220,201210-1,Normal
4,201210-1-00,297,402.96127,1758.8690,201210-1,Normal
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia
6239201,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia
6239202,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia
6239203,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia


> ##### **Clean the Spectra With the Chosen Parameters**

In [4]:
# Preprocessing configuration, forwarded to spectra_cleaning as keyword arguments.
cleaning_params = dict(
    # Which stages to run
    despike=True,
    baseline_correct=True,
    smoothing=True,
    scaling=False,
    # Despiking settings
    despike_ma=20,
    despike_threshold=7,
    # NOTE(review): lam / p look like baseline-correction settings — confirm against spectra_cleaning
    lam=10**8,
    p=0.01,
    # NOTE(review): window_size / poly_order look like smoothing-filter settings — confirm
    window_size=51,
    poly_order=3,
)

# The return value is not used: the frame displayed below already carries the
# transformed Absorbance values, i.e. `df` is updated by the call itself.
spectra_cleaning(df, **cleaning_params)

df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,293,400.22778,41.863303,201210-1,Normal
1,201210-1-00,294,400.91116,41.803843,201210-1,Normal
2,201210-1-00,295,401.59454,41.741884,201210-1,Normal
3,201210-1-00,296,402.27789,41.677722,201210-1,Normal
4,201210-1-00,297,402.96127,41.611654,201210-1,Normal
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,12.378163,210526-3,Hyperglycemia
6239201,210526-3-09,2338,1797.72200,13.269937,210526-3,Hyperglycemia
6239202,210526-3-09,2339,1798.40550,14.199285,210526-3,Hyperglycemia
6239203,210526-3-09,2340,1799.08890,15.166531,210526-3,Hyperglycemia


In [5]:
# Reshape the cleaned readings to wide format: one row per SpecID, one column per
# wavenumber holding the 'Absorbance' value, with SurID and Status kept as the last columns.
wavelength_df = prepare_wavelength_df(df, 'Absorbance')
wavelength_df

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,41.863303,41.803843,41.741884,41.677722,41.611654,41.543974,41.474980,41.404968,41.334234,41.263073,...,6.280946,5.549559,4.745724,3.866578,2.909255,1.870891,0.748623,-0.460415,201210-1,Normal
201210-1-01,46.314608,47.323684,48.299209,49.241395,50.150457,51.026608,51.870063,52.681035,53.459738,54.206386,...,6.769011,7.280928,7.861246,8.512936,9.238972,10.042323,10.925962,11.892860,201210-1,Normal
201210-1-02,118.159018,114.686240,111.563911,108.777452,106.312282,104.153823,102.287493,100.698715,99.372907,98.295491,...,-4.633601,-4.557349,-4.439365,-4.278894,-4.075180,-3.827470,-3.535010,-3.197043,201210-1,Normal
201210-1-03,175.466997,174.846086,174.188020,173.498226,172.782129,172.045155,171.292728,170.530275,169.763222,168.996993,...,-10.801936,-10.349539,-9.864191,-9.347124,-8.799567,-8.222752,-7.617909,-6.986269,201210-1,Normal
201210-1-04,111.814973,106.629998,101.867380,97.512673,93.551430,89.969205,86.751551,83.884023,81.352173,79.141556,...,-11.689508,-11.752441,-11.789205,-11.799583,-11.783357,-11.740310,-11.670224,-11.572882,201210-1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,11.003178,12.008451,12.879986,13.624553,14.248922,14.759862,15.164145,15.468539,15.679816,15.804744,...,6.671003,7.197072,7.711103,8.211164,8.695324,9.161650,9.608210,10.033072,210526-3,Hyperglycemia
210526-3-46,14.239653,13.596345,12.981340,12.396227,11.842591,11.322020,10.836101,10.386421,9.974566,9.602123,...,2.124074,2.718437,3.405898,4.190762,5.077335,6.069920,7.172824,8.390352,210526-3,Hyperglycemia
210526-3-47,17.165901,17.349777,17.457499,17.494696,17.466999,17.380039,17.239446,17.050852,16.819887,16.552181,...,6.895403,7.740965,8.633346,9.572405,10.558000,11.589990,12.668233,13.792588,210526-3,Hyperglycemia
210526-3-48,15.865434,14.551958,13.412815,12.441007,11.629535,10.971400,10.459605,10.087150,9.847038,9.732269,...,8.137265,9.444800,10.919724,12.570428,14.405304,16.432743,18.661137,21.098876,210526-3,Hyperglycemia


In [6]:
# A single shared seed keeps all three classifiers reproducible across runs.
RANDOM_STATE = 1234

# Default hyper-parameters throughout; only the seed is fixed.
et, rf, svc = (
    model_cls(random_state=RANDOM_STATE)
    for model_cls in (ExtraTreesClassifier, RandomForestClassifier, SVC)
)

---

> ##### **Full Spectra**

In [7]:
# Full-spectrum features, ExtraTrees: prints mean +/- std of accuracy, precision, recall and F1 over the CV folds
evaluate_model(wavelength_df, et)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.6119 +/- 0.1114
ExtraTreesClassifier Cross-Validation Precision: 0.6103 +/- 0.1103
ExtraTreesClassifier Cross-Validation Recall: 0.6215 +/- 0.1349
ExtraTreesClassifier Cross-Validation F1-Score: 0.5949 +/- 0.1254


In [8]:
# Full-spectrum features, RandomForest: same cross-validated metrics as above
evaluate_model(wavelength_df, rf)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RandomForestClassifier Cross-Validation Accuracy: 0.6122 +/- 0.0993
RandomForestClassifier Cross-Validation Precision: 0.6152 +/- 0.0926
RandomForestClassifier Cross-Validation Recall: 0.6216 +/- 0.1245
RandomForestClassifier Cross-Validation F1-Score: 0.5948 +/- 0.1121


In [9]:
# Full-spectrum features, SVC: same cross-validated metrics as above
evaluate_model(wavelength_df, svc)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SVC Cross-Validation Accuracy: 0.4272 +/- 0.1406
SVC Cross-Validation Precision: 0.4800 +/- 0.1651
SVC Cross-Validation Recall: 0.4137 +/- 0.1754
SVC Cross-Validation F1-Score: 0.3656 +/- 0.1404


---

> ##### **Peak Stats**

In [10]:
# Per-peak properties collected over every spectrum. The three lists are filled in
# lock-step, so entry i of each list describes the same peak.
peaks = []
widths = []
prominences = []

# One (SpecID, Status, SurID) row per spectrum, used to re-attach the labels afterwards
statuses = df[['SpecID', 'Status', 'SurID']].drop_duplicates()

# Find the index, width and prominence of each peak, one spectrum at a time
for _, group in df.groupby('SpecID'):
    absorbance = group['Absorbance'].to_numpy()

    # No constraints are applied, so every local maximum counts as a peak.
    # A constrained alternative tried previously: distance=152, prominence=42, width=6
    peak_index, _ = find_peaks(x=absorbance)

    # Calculate the prominence of each peak. The full result (prominences plus base
    # positions) is kept so peak_widths can reuse it instead of recomputing it.
    prominence_data = peak_prominences(absorbance, peaks=peak_index)
    prominences += list(prominence_data[0])

    # Calculate the width of each peak at half of its prominence
    widths += list(
        peak_widths(absorbance, peaks=peak_index, rel_height=0.5, prominence_data=prominence_data)[0]
    )

    # Translate positions within this spectrum into row labels of the full dataframe
    peaks += list(group.index[peak_index])

# `peaks` holds index *labels*, so the rows are selected with .loc rather than .iloc.
# .copy() yields an independent frame, so adding columns below does not raise
# SettingWithCopyWarning.
peaks_df = df.loc[peaks].copy()
peaks_df['PeakWidths'] = widths
peaks_df['PeakProminences'] = prominences

# Create a new DataFrame for the summary statistics (one row per spectrum)
peak_stats = peaks_df.groupby('SpecID').agg({
    'Absorbance': ['mean', 'std', 'count', 'max', 'min'],
    'PeakWidths': ['mean', 'std', 'max', 'min'],
    'PeakProminences': ['mean', 'std', 'max', 'min']
}).reset_index()

# Flatten the multi-level columns and customize the names.
# The order must match the aggregation dictionary above.
peak_stats.columns = ['SpecID',
                      'PeakAbsorbance_mean', 'PeakAbsorbance_std', 'PeakAbsorbance_count', 'PeakAbsorbance_max', 'PeakAbsorbance_min',
                      'PeakWidths_mean', 'PeakWidths_std', 'PeakWidths_max', 'PeakWidths_min',
                      'PeakProminences_mean', 'PeakProminences_std', 'PeakProminences_max', 'PeakProminences_min']

# Left-merge from `statuses` so spectra without any detected peak are kept;
# their missing statistics (and single-peak std values) are filled with 0.
peak_stats = pd.merge(statuses, peak_stats, on='SpecID', how='left')
peak_stats = peak_stats.set_index('SpecID')
peak_stats = peak_stats.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakWidths'] = widths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakProminences'] = prominences


In [11]:
# Preview the per-spectrum peak summary features
peak_stats

Unnamed: 0_level_0,Status,SurID,PeakAbsorbance_mean,PeakAbsorbance_std,PeakAbsorbance_count,PeakAbsorbance_max,PeakAbsorbance_min,PeakWidths_mean,PeakWidths_std,PeakWidths_max,PeakWidths_min,PeakProminences_mean,PeakProminences_std,PeakProminences_max,PeakProminences_min
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
201210-1-00,Normal,201210-1,42.724182,31.686298,205,173.397843,-7.558160,5.754848,18.698699,194.114304,0.502925,4.502105,19.616381,169.566634,0.004136
201210-1-01,Normal,201210-1,45.522946,42.483632,183,188.679018,-10.658033,6.588338,19.723760,151.983867,0.503475,6.408729,26.918589,187.781984,0.003636
201210-1-02,Normal,201210-1,86.632065,130.929315,141,1336.381183,-25.851462,6.232133,12.990837,67.855151,0.503902,20.954275,126.097991,1341.050057,0.001224
201210-1-03,Normal,201210-1,122.589284,191.431125,114,987.344326,-51.984432,7.055155,15.043040,71.923318,0.501391,37.095520,160.515805,1000.595932,0.003340
201210-1-04,Normal,201210-1,67.658148,57.916299,189,306.170893,-17.099481,6.257609,20.057264,183.353131,0.501293,6.715678,30.324634,317.970476,0.003077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-05,Hyperglycemia,210526-3,59.340378,63.513923,154,453.251197,-2.236862,7.349668,16.882553,119.909577,0.502152,11.593617,46.123829,457.165115,0.001815
210526-3-06,Hyperglycemia,210526-3,56.953202,56.688423,190,485.468080,-3.875471,6.270059,15.710435,112.777156,0.511045,9.078047,42.079487,486.563979,0.013106
210526-3-07,Hyperglycemia,210526-3,51.472942,48.092297,148,349.507366,1.776651,7.720291,18.521906,131.018560,0.502461,9.690046,35.546181,349.983911,0.009252
210526-3-08,Hyperglycemia,210526-3,51.855065,37.769853,205,266.294753,-2.041752,5.787417,14.797579,104.877015,0.505332,6.186991,25.262593,268.029526,0.003392


In [12]:
# Peak summary statistics, ExtraTrees: prints mean +/- std of accuracy, precision, recall and F1 over the CV folds
evaluate_model(peak_stats, et)

ExtraTreesClassifier Cross-Validation Accuracy: 0.4056 +/- 0.0722
ExtraTreesClassifier Cross-Validation Precision: 0.3969 +/- 0.0666
ExtraTreesClassifier Cross-Validation Recall: 0.3959 +/- 0.0912
ExtraTreesClassifier Cross-Validation F1-Score: 0.3786 +/- 0.0780


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
# Peak summary statistics, RandomForest: same cross-validated metrics as above
evaluate_model(peak_stats, rf)

RandomForestClassifier Cross-Validation Accuracy: 0.4168 +/- 0.0751
RandomForestClassifier Cross-Validation Precision: 0.4077 +/- 0.0711
RandomForestClassifier Cross-Validation Recall: 0.4052 +/- 0.0937
RandomForestClassifier Cross-Validation F1-Score: 0.3892 +/- 0.0819


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# Peak summary statistics, SVC: same cross-validated metrics as above
evaluate_model(peak_stats, svc)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SVC Cross-Validation Accuracy: 0.3712 +/- 0.1054
SVC Cross-Validation Precision: 0.3259 +/- 0.1458
SVC Cross-Validation Recall: 0.3549 +/- 0.1399
SVC Cross-Validation F1-Score: 0.2903 +/- 0.0940


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


---

> ##### **Peak Bins**

In [15]:
# Define a function to calculate the bin for a given wavenumber with a specified bin size
def calculate_bin_interval(wavenumber, bin_size, bin_origin=200):
    """Return the label of the fixed-width bin that contains ``wavenumber``.

    Bins are half-open intervals ``[start, start + bin_size)`` laid out from
    ``bin_origin`` in both directions.

    Parameters
    ----------
    wavenumber : float
        The value to place into a bin.
    bin_size : int or float
        Width of every bin; must be positive.
    bin_origin : int or float, default 200
        Lower edge from which the bin grid is laid out.

    Returns
    -------
    str
        ``"<start>-<end>"``, e.g. ``"400-425"``.

    Raises
    ------
    ValueError
        If ``bin_size`` is not positive.
    """
    if bin_size <= 0:
        raise ValueError("bin_size must be positive")

    # Floor rather than truncate: int() rounds toward zero, which would put values
    # below the origin into the bin above them.
    bin_number = int(np.floor((wavenumber - bin_origin) / bin_size))
    bin_start = bin_number * bin_size + bin_origin
    bin_end = bin_start + bin_size
    return f"{bin_start}-{bin_end}"
# Set the bin size
bin_size = 25

# Add a "Bin" column to the peaks. `assign` builds a new frame instead of writing into
# the existing one, which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
peaks_df = peaks_df.assign(
    Bin=peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))
)

# Set the bins as columns with the peak absorbances, widths and prominences as the values.
# If no peaks appear in a bin the value is set to 0.
# If multiple peaks appear their properties are aggregated.

# Pivot table with 'Absorbance', 'PeakWidths', and 'PeakProminences' as values
peak_bins = peaks_df.pivot_table(index='SpecID', columns='Bin', values=['Absorbance', 'PeakWidths', 'PeakProminences'], aggfunc='mean')
peak_bins.columns = [f"{col[0]}_{col[1]}" for col in peak_bins.columns]  # Combine column names

# Take the labels from the full dataset rather than from the detected peaks, so that a
# spectrum without any peak is kept (as an all-zero row) instead of silently dropped.
statuses = df[['SpecID', 'Status', 'SurID']].drop_duplicates()

# Re-align to every SpecID in sorted order (the order pivot_table already produces);
# rows added for peak-less spectra are NaN here and become 0 below.
all_spec_ids = pd.Index(sorted(statuses['SpecID'].unique()), name='SpecID')
peak_bins = peak_bins.reindex(all_spec_ids).reset_index()

# Merge with 'Status' information
peak_bins = pd.merge(peak_bins, statuses, on='SpecID')

# Set 'SpecID' as the index
peak_bins = peak_bins.set_index('SpecID')

# Fill NaN values with 0
peak_bins = peak_bins.fillna(0)
peak_bins.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))


Unnamed: 0_level_0,Absorbance_1000-1025,Absorbance_1025-1050,Absorbance_1050-1075,Absorbance_1075-1100,Absorbance_1100-1125,Absorbance_1125-1150,Absorbance_1150-1175,Absorbance_1175-1200,Absorbance_1200-1225,Absorbance_1225-1250,...,PeakWidths_800-825,PeakWidths_825-850,PeakWidths_850-875,PeakWidths_875-900,PeakWidths_900-925,PeakWidths_925-950,PeakWidths_950-975,PeakWidths_975-1000,Status,SurID
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,89.002395,67.967842,66.465735,54.529033,20.073329,34.71664,21.469727,4.954262,6.181741,45.558273,...,6.069429,0.0,8.584305,31.642064,0.0,2.131577,0.670059,0.0,Normal,201210-1
201210-1-01,99.991163,50.110842,65.076935,47.419326,18.229328,36.275023,28.967117,21.372376,-2.406472,8.481125,...,1.155223,0.0,3.797847,0.826808,143.050278,17.80087,0.556733,0.0,Normal,201210-1
201210-1-02,176.303245,197.680777,81.129202,73.20924,28.286036,81.935506,15.763898,7.15048,17.239728,25.321297,...,3.840262,0.0,1.694357,0.0,35.97677,0.0,4.762265,1.079848,Normal,201210-1
201210-1-03,403.941153,0.0,231.782303,0.0,138.570161,0.0,637.018793,0.0,41.350087,82.255296,...,0.524564,1.296502,0.0,0.0,70.641586,0.0,29.168888,1.046836,Normal,201210-1
201210-1-04,98.087538,47.702445,46.46261,16.671599,15.590154,37.450891,52.928517,76.141247,27.652156,21.135276,...,0.504633,0.935608,16.933157,1.907858,2.417041,1.261868,0.0,1.286921,Normal,201210-1


In [16]:
# Binned peak features, ExtraTrees: prints mean +/- std of accuracy, precision, recall and F1 over the CV folds
evaluate_model(peak_bins, et)

ExtraTreesClassifier Cross-Validation Accuracy: 0.5498 +/- 0.1029
ExtraTreesClassifier Cross-Validation Precision: 0.5584 +/- 0.0954
ExtraTreesClassifier Cross-Validation Recall: 0.5609 +/- 0.1288
ExtraTreesClassifier Cross-Validation F1-Score: 0.5250 +/- 0.1136


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# Binned peak features, RandomForest: same cross-validated metrics as above
evaluate_model(peak_bins, rf)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RandomForestClassifier Cross-Validation Accuracy: 0.5519 +/- 0.1081
RandomForestClassifier Cross-Validation Precision: 0.5577 +/- 0.0993
RandomForestClassifier Cross-Validation Recall: 0.5670 +/- 0.1298
RandomForestClassifier Cross-Validation F1-Score: 0.5314 +/- 0.1171


In [18]:
# Binned peak features, SVC: same cross-validated metrics as above
evaluate_model(peak_bins, svc)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SVC Cross-Validation Accuracy: 0.4475 +/- 0.1452
SVC Cross-Validation Precision: 0.5055 +/- 0.1218
SVC Cross-Validation Recall: 0.4429 +/- 0.1789
SVC Cross-Validation F1-Score: 0.3871 +/- 0.1390
