#### **This Notebook Limits the dataset to the Spectral Regions of Importance outlined in the Thesis**

The thesis by John O'Sullivan can be found [here](https://doras.dcu.ie/27239/1/ethesis_O%27Sullivan_John_2022.pdf), with the ranges outlined on pages 129 to 136.

In [1]:
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold, cross_val_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from Cleaning_and_Evaluation import *

In [2]:
# Load the spectra in long format: the preview of `df` shows one row per
# (SpecID, WaveNumber) pair with its Absorbance, SurID and Status.
# Alternative input files kept for reference (swap the path below to use one):
#   ../data/exosomes.raw_spectrum_1.csv
#   ../data/exosomes.raw_spectrum_380-1800.csv
#   ../data/exosomes.raw_spectrum_400-1800.csv
df = pd.read_csv("../data/current_clean_spectrum.csv")

In [3]:
# Display the loaded long-format frame (the notebook renders a truncated preview).
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,0,200.00000,2.443895,201210-1,Normal
1,201210-1-00,1,200.68336,2.340851,201210-1,Normal
2,201210-1-00,2,201.36674,2.246252,201210-1,Normal
3,201210-1-00,3,202.05011,2.159808,201210-1,Normal
4,201210-1-00,4,202.73349,2.081228,201210-1,Normal
...,...,...,...,...,...,...
8023570,210526-3-09,2630,1997.26650,-0.306283,210526-3,Hyperglycemia
8023571,210526-3-09,2631,1997.94980,-0.285098,210526-3,Hyperglycemia
8023572,210526-3-09,2632,1998.63330,-0.261904,210526-3,Hyperglycemia
8023573,210526-3-09,2633,1999.31670,-0.236634,210526-3,Hyperglycemia


In [4]:
# Reshape the long-format frame with prepare_wavelength_df (a project helper,
# presumably provided by the star import from Cleaning_and_Evaluation).
# Per the preview, the result has one row per SpecID, one column per
# WaveNumber holding the Absorbance value, plus SurID and Status.
wavelength_df = prepare_wavelength_df(df, 'Absorbance')
wavelength_df.head()

Unnamed: 0_level_0,200.0,200.68336,201.36674,202.05011,202.73349,203.41685,204.10023,204.7836,205.46696,206.15034,...,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,2.443895,2.340851,2.246252,2.159808,2.081228,2.010222,1.9465,1.889771,1.839746,1.796132,...,-0.219738,-0.218255,-0.216545,-0.214591,-0.212375,-0.209878,-0.207084,-0.203975,201210-1,Normal
201210-1-01,2.987729,2.87714,2.7716,2.671053,2.575446,2.484723,2.39883,2.317713,2.241317,2.169588,...,0.139463,0.158756,0.17829,0.19804,0.217982,0.238089,0.258338,0.278702,201210-1,Normal
201210-1-02,1.36759,1.338989,1.309418,1.278995,1.247839,1.21607,1.183807,1.15117,1.118277,1.085248,...,0.993536,1.009665,1.021793,1.029665,1.033029,1.031629,1.025214,1.013529,201210-1,Normal
201210-1-03,4.494705,4.659871,4.799524,4.914824,5.006931,5.077004,5.126202,5.155685,5.166612,5.160143,...,-0.483049,-0.484923,-0.48723,-0.489997,-0.493249,-0.497011,-0.501307,-0.506165,201210-1,Normal
201210-1-04,1.399465,1.357917,1.3185,1.281168,1.24588,1.212592,1.18126,1.151841,1.124292,1.098568,...,-0.376922,-0.365798,-0.353721,-0.340652,-0.326552,-0.311383,-0.295106,-0.277681,201210-1,Normal


Show the performance of the current approach.

In [5]:
# Baseline: a fixed-seed ExtraTrees classifier scored on the full spectrum.
# `et` is reused by every later evaluation cell in this notebook.
# evaluate_model is a project helper; its printed output reports
# cross-validation accuracy, precision, recall and F1 (mean +/- spread).
et = ExtraTreesClassifier(random_state=1234)
evaluate_model(wavelength_df, et)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.6191 +/- 0.1216
ExtraTreesClassifier Cross-Validation Precision: 0.6317 +/- 0.1340
ExtraTreesClassifier Cross-Validation Recall: 0.6358 +/- 0.1626
ExtraTreesClassifier Cross-Validation F1-Score: 0.5968 +/- 0.1424


#### **Filtering the spectra to only look at the regions from the thesis**

Each of these regions identifies specific exosome biomarkers.

> ##### **Normal Peaks**

In [6]:
# Wavenumber ranges from the thesis, as (start, end) pairs.
# The filter cells treat both bounds as inclusive.
# Notes:
#   * Some ranges are nested inside others, e.g. (453, 454) within (407, 495).
#   * Pairs with equal bounds, e.g. (519, 519), only match a sample whose
#     WaveNumber equals that value exactly -- TODO confirm this is intended.
ranges = [
    (407, 495),
    (453, 454),
    (516, 537),
    (519, 519),
    (603, 738),
    (620, 624),
    (821, 839),
    (886, 887),
    (972, 972),
    (1000, 1003),
    (1030, 1036),
    (1060, 1078),
    (1132, 1152),
    (1170, 1170),
    (1188, 1196),
    (1212, 1294),
    (1427, 1446),
    (1448, 1450),
    (1520, 1533),
    (1542, 1594),
    (1611, 1653),
    (1668, 1668),
]

In [7]:
# Keep only the rows whose WaveNumber lies inside at least one (start, end)
# range, bounds inclusive. The mask is built with one vectorised `between`
# per range rather than a Python lambda invoked once per row, which is very
# slow on a frame of this size; the selected rows are the same.
in_any_range = np.zeros(len(df), dtype=bool)
for start, end in ranges:
    in_any_range |= df['WaveNumber'].between(start, end).to_numpy()
main_peak_areas = df[in_any_range]
main_peak_areas

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
303,201210-1-00,303,407.06149,-0.707765,201210-1,Normal
304,201210-1-00,304,407.74487,-0.691733,201210-1,Normal
305,201210-1-00,305,408.42825,-0.683411,201210-1,Normal
306,201210-1-00,306,409.11160,-0.670134,201210-1,Normal
307,201210-1-00,307,409.79498,-0.648311,201210-1,Normal
...,...,...,...,...,...,...
8023062,210526-3-09,2122,1650.11390,-0.692690,210526-3,Hyperglycemia
8023063,210526-3-09,2123,1650.79720,-0.677543,210526-3,Hyperglycemia
8023064,210526-3-09,2124,1651.48060,-0.658122,210526-3,Hyperglycemia
8023065,210526-3-09,2125,1652.16410,-0.640630,210526-3,Hyperglycemia


In [8]:
# Reshape the range-filtered rows with the project helper, report the resulting
# shape, and score the shared `et` classifier on this reduced feature set.
wavelength_df = prepare_wavelength_df(main_peak_areas, 'Absorbance')
print(wavelength_df.shape)
evaluate_model(wavelength_df, et)

(3045, 779)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.5140 +/- 0.0996
ExtraTreesClassifier Cross-Validation Precision: 0.5333 +/- 0.0969
ExtraTreesClassifier Cross-Validation Recall: 0.5314 +/- 0.1286
ExtraTreesClassifier Cross-Validation F1-Score: 0.4953 +/- 0.1062


In [9]:
# Half-width of the window kept around each range midpoint. The sampled
# WaveNumbers shown above are spaced about 0.6834 apart, so this is roughly
# half a step and normally selects a single sample per midpoint.
# NOTE(review): it is marginally below half a step, so a midpoint can in
# principle fall in a gap and select nothing -- confirm this is acceptable.
# Setting half_window = 1 gives the wider +/- 1 alternative.
half_window = 0.34165

midpoints = [(start + end) / 2 for start, end in ranges]

# Vectorised mask: one `between` per midpoint instead of a per-row lambda.
near_midpoint = np.zeros(len(df), dtype=bool)
for midpoint in midpoints:
    near_midpoint |= df['WaveNumber'].between(midpoint - half_window, midpoint + half_window).to_numpy()
central_areas = df[near_midpoint]

# Reshape with the project helper, report the shape, and score `et`.
wavelength_df = prepare_wavelength_df(central_areas, 'Absorbance')
print(wavelength_df.shape)
evaluate_model(wavelength_df, et)

(3045, 24)
ExtraTreesClassifier Cross-Validation Accuracy: 0.5377 +/- 0.1017
ExtraTreesClassifier Cross-Validation Precision: 0.5500 +/- 0.1002
ExtraTreesClassifier Cross-Validation Recall: 0.5556 +/- 0.1349
ExtraTreesClassifier Cross-Validation F1-Score: 0.5164 +/- 0.1122


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


> ##### **Hyperglycemic Peaks**

In [10]:
# Hyperglycemia wavenumber ranges as (start, end) pairs; the filter cells
# treat both bounds as inclusive. Several ranges overlap or are nested,
# e.g. (510, 550) within (402, 562) and (1067, 1068) within (1062, 1077).
ranges = [
    (402, 562),
    (510, 550),
    (618, 624),
    (634, 795),
    (808, 812),
    (844, 861),
    (1001, 1006),
    (1067, 1068),
    (1062, 1077),
    (1129, 1132),
    (1110, 1159),
    (1150, 1157),
    (1163, 1172),
    (1231, 1282),
    (1250, 1259),
    (1357, 1364),
    (1429, 1480),
    (1484, 1492),
    (1528, 1533),
    (1544, 1594),
    (1582, 1602),
    (1625, 1643),
]

In [11]:
# Keep only the rows whose WaveNumber lies inside at least one (start, end)
# range, bounds inclusive. One vectorised `between` per range replaces the
# per-row Python lambda; the selected rows are the same.
in_any_range = np.zeros(len(df), dtype=bool)
for start, end in ranges:
    in_any_range |= df['WaveNumber'].between(start, end).to_numpy()
main_peak_areas = df[in_any_range]

In [12]:
# Display the range-filtered rows (the notebook renders a truncated preview).
main_peak_areas

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
296,201210-1-00,296,402.27789,-0.706814,201210-1,Normal
297,201210-1-00,297,402.96127,-0.716041,201210-1,Normal
298,201210-1-00,298,403.64465,-0.717182,201210-1,Normal
299,201210-1-00,299,404.32803,-0.723550,201210-1,Normal
300,201210-1-00,300,405.01138,-0.711185,201210-1,Normal
...,...,...,...,...,...,...
8023047,210526-3-09,2107,1639.86330,-0.702838,210526-3,Hyperglycemia
8023048,210526-3-09,2108,1640.54680,-0.710691,210526-3,Hyperglycemia
8023049,210526-3-09,2109,1641.23010,-0.706832,210526-3,Hyperglycemia
8023050,210526-3-09,2110,1641.91350,-0.713591,210526-3,Hyperglycemia


In [13]:
# Reshape the range-filtered rows with the project helper, report the resulting
# shape, and score the shared `et` classifier on this reduced feature set.
wavelength_df = prepare_wavelength_df(main_peak_areas, 'Absorbance')
# A bare `wavelength_df.shape` in the middle of a cell is never displayed,
# so print it explicitly, as the sibling evaluation cells do.
print(wavelength_df.shape)
evaluate_model(wavelength_df, et)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.5319 +/- 0.1131
ExtraTreesClassifier Cross-Validation Precision: 0.5485 +/- 0.1055
ExtraTreesClassifier Cross-Validation Recall: 0.5515 +/- 0.1474
ExtraTreesClassifier Cross-Validation F1-Score: 0.5141 +/- 0.1217


In [14]:
# Half-width of the window kept around each range midpoint. The sampled
# WaveNumbers shown above are spaced about 0.6834 apart, so this is roughly
# half a step and normally selects a single sample per midpoint.
# NOTE(review): it is marginally below half a step, so a midpoint can in
# principle fall in a gap and select nothing -- confirm this is acceptable.
# Setting half_window = 1 gives the wider +/- 1 alternative.
half_window = 0.34165

midpoints = [(start + end) / 2 for start, end in ranges]

# Vectorised mask: one `between` per midpoint instead of a per-row lambda.
near_midpoint = np.zeros(len(df), dtype=bool)
for midpoint in midpoints:
    near_midpoint |= df['WaveNumber'].between(midpoint - half_window, midpoint + half_window).to_numpy()
central_areas = df[near_midpoint]

# Reshape with the project helper, report the shape, and score `et`.
wavelength_df = prepare_wavelength_df(central_areas, 'Absorbance')
print(wavelength_df.shape)
evaluate_model(wavelength_df, et)

(3045, 24)
ExtraTreesClassifier Cross-Validation Accuracy: 0.4899 +/- 0.1169
ExtraTreesClassifier Cross-Validation Precision: 0.4963 +/- 0.1089
ExtraTreesClassifier Cross-Validation Recall: 0.5186 +/- 0.1532
ExtraTreesClassifier Cross-Validation F1-Score: 0.4736 +/- 0.1251


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


> ##### **Hypoglycemic Peaks**

In [15]:
# Hypoglycemia wavenumber ranges as (start, end) pairs; the filter cells
# treat both bounds as inclusive. Several ranges overlap or are nested,
# e.g. (447, 453) within (406, 599). (1438, 1438) has equal bounds and so
# only matches a sample whose WaveNumber equals it exactly -- TODO confirm
# this is intended.
ranges = [
    (406, 599),
    (447, 453),
    (508, 546),
    (604, 777),
    (803, 808),
    (816, 818),
    (841, 845),
    (916, 917),
    (918, 920),
    (909, 947),
    (1001, 1004),
    (1032, 1034),
    (1075, 1078),
    (1114, 1200),
    (1162, 1166),
    (1202, 1338),
    (1250, 1253),
    (1287, 1292),
    (1436, 1442),
    (1438, 1438),
    (1540, 1595),
    (1623, 1639),
]

In [16]:
# Keep only the rows whose WaveNumber lies inside at least one (start, end)
# range, bounds inclusive. One vectorised `between` per range replaces the
# per-row Python lambda; the selected rows are the same.
in_any_range = np.zeros(len(df), dtype=bool)
for start, end in ranges:
    in_any_range |= df['WaveNumber'].between(start, end).to_numpy()
main_peak_areas = df[in_any_range]

# Reshape with the project helper, report the shape, and score `et`.
# (The bare `main_peak_areas` statement that sat here displayed nothing
# because it was not the last expression of the cell, so it was removed.)
wavelength_df = prepare_wavelength_df(main_peak_areas, 'Absorbance')
print(wavelength_df.shape)
evaluate_model(wavelength_df, et)

(3045, 1058)
ExtraTreesClassifier Cross-Validation Accuracy: 0.5147 +/- 0.1063
ExtraTreesClassifier Cross-Validation Precision: 0.5330 +/- 0.1009
ExtraTreesClassifier Cross-Validation Recall: 0.5316 +/- 0.1363
ExtraTreesClassifier Cross-Validation F1-Score: 0.4942 +/- 0.1132


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# Half-width of the window kept around each range midpoint. The sampled
# WaveNumbers shown above are spaced about 0.6834 apart, so this is roughly
# half a step and normally selects a single sample per midpoint.
# NOTE(review): it is marginally below half a step, so a midpoint can in
# principle fall in a gap and select nothing -- confirm this is acceptable.
# Setting half_window = 1 gives the wider +/- 1 alternative.
half_window = 0.34165

midpoints = [(start + end) / 2 for start, end in ranges]

# Vectorised mask: one `between` per midpoint instead of a per-row lambda.
near_midpoint = np.zeros(len(df), dtype=bool)
for midpoint in midpoints:
    near_midpoint |= df['WaveNumber'].between(midpoint - half_window, midpoint + half_window).to_numpy()
central_areas = df[near_midpoint]

# Reshape with the project helper, report the shape, and score `et`.
wavelength_df = prepare_wavelength_df(central_areas, 'Absorbance')
print(wavelength_df.shape)
evaluate_model(wavelength_df, et)

(3045, 24)
ExtraTreesClassifier Cross-Validation Accuracy: 0.5210 +/- 0.1120
ExtraTreesClassifier Cross-Validation Precision: 0.5234 +/- 0.1099
ExtraTreesClassifier Cross-Validation Recall: 0.5346 +/- 0.1393
ExtraTreesClassifier Cross-Validation F1-Score: 0.5014 +/- 0.1223


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


> ##### **All Important Ranges**

In [18]:
# Wavenumber ranges from the thesis, as (start, end) pairs with both bounds
# treated as inclusive by the filter cells. The combined list is the Normal,
# Hyperglycemia and Hypoglycemia lists used earlier, in that order.
normal_ranges = [
    (407, 495), (453, 454), (516, 537), (519, 519), (603, 738),
    (620, 624), (821, 839), (886, 887), (972, 972), (1000, 1003),
    (1030, 1036), (1060, 1078), (1132, 1152), (1170, 1170),
    (1188, 1196), (1212, 1294), (1427, 1446), (1448, 1450),
    (1520, 1533), (1542, 1594), (1611, 1653), (1668, 1668),
]

hyperglycemia_ranges = [
    (402, 562), (510, 550), (618, 624), (634, 795), (808, 812),
    (844, 861), (1001, 1006), (1067, 1068), (1062, 1077), (1129, 1132),
    (1110, 1159), (1150, 1157), (1163, 1172), (1231, 1282), (1250, 1259),
    (1357, 1364), (1429, 1480), (1484, 1492), (1528, 1533), (1544, 1594),
    (1582, 1602), (1625, 1643),
]

hypoglycemia_ranges = [
    (406, 599), (447, 453), (508, 546), (604, 777), (803, 808),
    (816, 818), (841, 845), (916, 917), (918, 920), (909, 947),
    (1001, 1004), (1032, 1034), (1075, 1078), (1114, 1200), (1162, 1166),
    (1202, 1338), (1250, 1253), (1287, 1292), (1436, 1442), (1438, 1438),
    (1540, 1595), (1623, 1639),
]

# Duplicates across the three groups are kept, matching the original list.
ranges = normal_ranges + hyperglycemia_ranges + hypoglycemia_ranges

In [19]:
# Keep only the rows whose WaveNumber lies inside at least one (start, end)
# range, bounds inclusive. One vectorised `between` per range replaces the
# per-row Python lambda; the selected rows are the same.
in_any_range = np.zeros(len(df), dtype=bool)
for start, end in ranges:
    in_any_range |= df['WaveNumber'].between(start, end).to_numpy()
main_peak_areas = df[in_any_range]

# Reshape with the project helper, report the shape, and score `et`.
# (The bare `main_peak_areas` statement that sat here displayed nothing
# because it was not the last expression of the cell, so it was removed.)
wavelength_df = prepare_wavelength_df(main_peak_areas, 'Absorbance')
print(wavelength_df.shape)
evaluate_model(wavelength_df, et)

(3045, 1349)
ExtraTreesClassifier Cross-Validation Accuracy: 0.5448 +/- 0.1077
ExtraTreesClassifier Cross-Validation Precision: 0.5559 +/- 0.1064
ExtraTreesClassifier Cross-Validation Recall: 0.5631 +/- 0.1441
ExtraTreesClassifier Cross-Validation F1-Score: 0.5262 +/- 0.1188


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Extract the Wavenumbers of significance for later use**

In [28]:
# Collect the distinct WaveNumber values that survived the range filter.
thesis_wavenumbers = main_peak_areas['WaveNumber'].drop_duplicates()
# Write to a file inside ../data. The previous target "../data" is the data
# directory itself (the input CSV is read from inside it), so to_csv could not
# create a file there. The row index is dropped because it only carries the
# original row positions.
# NOTE(review): the file name is a placeholder -- align it with whatever the
# downstream notebook expects to read.
thesis_wavenumbers.to_csv("../data/thesis_wavenumbers.csv", index=False)

In [30]:
# Half-width of the window kept around each range midpoint. The sampled
# WaveNumbers shown above are spaced about 0.6834 apart, so this is roughly
# half a step and normally selects a single sample per midpoint.
# NOTE(review): it is marginally below half a step, so a midpoint can in
# principle fall in a gap and select nothing -- confirm this is acceptable.
# Setting half_window = 1 gives the wider +/- 1 alternative.
half_window = 0.34165

midpoints = [(start + end) / 2 for start, end in ranges]

# Vectorised mask: one `between` per midpoint instead of a per-row lambda.
near_midpoint = np.zeros(len(df), dtype=bool)
for midpoint in midpoints:
    near_midpoint |= df['WaveNumber'].between(midpoint - half_window, midpoint + half_window).to_numpy()
central_areas = df[near_midpoint]

# Reshape with the project helper, report the shape, and score `et`.
wavelength_df = prepare_wavelength_df(central_areas, 'Absorbance')
print(wavelength_df.shape)
evaluate_model(wavelength_df, et)

(3045, 66)
ExtraTreesClassifier Cross-Validation Accuracy: 0.5339 +/- 0.0981
ExtraTreesClassifier Cross-Validation Precision: 0.5431 +/- 0.1035
ExtraTreesClassifier Cross-Validation Recall: 0.5540 +/- 0.1358
ExtraTreesClassifier Cross-Validation F1-Score: 0.5126 +/- 0.1112


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
