#### Machine Learning

In [148]:
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold, cross_val_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [149]:
# Other input files (currently disabled):
#   ../../data/exosomes.raw_spectrum_1.csv
#   ../../data/exosomes.raw_spectrum_380-1800.csv
#   ../../data/exosomes.raw_spectrum_400-1800.csv
spectra_path = "../../data/current_clean_spectrum.csv"
df = pd.read_csv(spectra_path)

In [150]:
# Preview the loaded long-format table (one row per SpecID / WaveNumber reading).
df

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
0,201210-1-00,293,400.22778,201210-1,Normal,-0.145034
1,201210-1-00,294,400.91116,201210-1,Normal,-0.145878
2,201210-1-00,295,401.59454,201210-1,Normal,-0.146846
3,201210-1-00,296,402.27789,201210-1,Normal,-0.147930
4,201210-1-00,297,402.96127,201210-1,Normal,-0.149124
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,210526-3,Hyperglycemia,-1.512976
6239201,210526-3-09,2338,1797.72200,210526-3,Hyperglycemia,-1.502732
6239202,210526-3-09,2339,1798.40550,210526-3,Hyperglycemia,-1.493562
6239203,210526-3-09,2340,1799.08890,210526-3,Hyperglycemia,-1.485648


In [151]:
def prepare_wavelength_df(df, absorbance_col, status_col='Status'):
    """Reshape long-format spectra into one row per spectrum.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table with 'SpecID', 'WaveNumber', 'SurID', `status_col` and
        `absorbance_col` columns.
    absorbance_col : str
        Column whose values fill the wavenumber columns of the result.
    status_col : str, default 'Status'
        Column holding the class label of each spectrum.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'SpecID'; one column per distinct WaveNumber followed by
        'SurID' (kept so callers can group folds by surface) and `status_col`.
    """
    # Wide layout: spectra as rows, wavenumbers as columns.
    spectra_wide = (
        df.pivot(index='SpecID', columns='WaveNumber', values=absorbance_col)
        .reset_index()
    )
    spectra_wide.columns.name = None

    # One (SpecID, SurID, status) record per spectrum to attach to the wide table.
    spectrum_labels = df[['SpecID', 'SurID', status_col]].drop_duplicates()

    labelled = pd.merge(spectra_wide, spectrum_labels, on='SpecID')
    return labelled.set_index('SpecID')

In [152]:
def evaluate_extra_trees(df, n_splits=10, random_state=1234):
    """Print the grouped cross-validation accuracy of an Extra Trees classifier.

    Folds are built with GroupKFold using the 'SurID' column as the group
    label, so the split is made per surface rather than per spectrum.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table containing a 'Status' column (target), a 'SurID' column
        (grouping key) and the feature columns (every other column).
    n_splits : int, default 10
        Number of folds passed to GroupKFold.
    random_state : int, default 1234
        Seed passed to ExtraTreesClassifier.

    Returns
    -------
    None
        The mean and standard deviation of the per-fold accuracy are printed.
    """
    # Surfaces act as the CV groups; they and the target are excluded from the features.
    groups = df['SurID']
    features = df.drop(['Status', 'SurID'], axis=1)
    target = df['Status']

    et = ExtraTreesClassifier(random_state=random_state)
    cv = GroupKFold(n_splits=n_splits)

    # `groups` is forwarded to the splitter; n_jobs=-1 evaluates folds in parallel.
    scores = cross_val_score(
        et, features, target, cv=cv, scoring='accuracy', groups=groups, n_jobs=-1
    )

    print(f'{et.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

In [153]:
# Reshape the long table to wide form: one row per SpecID, one column per
# wavenumber, plus the SurID and Status columns.
wavelength_df = prepare_wavelength_df(df, 'Absorbance')
wavelength_df.head()

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,-0.145034,-0.145878,-0.146846,-0.14793,-0.149124,-0.150419,-0.151809,-0.153285,-0.154842,-0.156472,...,-1.345058,-1.374955,-1.411756,-1.456164,-1.508905,-1.570731,-1.642422,-1.724783,201210-1,Normal
201210-1-01,-0.113925,-0.095859,-0.078315,-0.061298,-0.044809,-0.028851,-0.013427,0.001461,0.015811,0.029622,...,-1.276142,-1.270276,-1.264736,-1.259661,-1.255195,-1.251489,-1.2487,-1.246995,201210-1,Normal
201210-1-02,0.319805,0.294851,0.272852,0.25361,0.236933,0.222637,0.21054,0.20047,0.192256,0.185737,...,-0.786004,-0.783888,-0.781223,-0.778007,-0.774241,-0.769923,-0.765056,-0.759642,201210-1,Normal
201210-1-03,0.320669,0.319732,0.317912,0.315311,0.312032,0.308171,0.303821,0.299071,0.294007,0.28871,...,-1.087505,-1.084745,-1.081706,-1.078409,-1.074875,-1.071124,-1.067182,-1.063074,201210-1,Normal
201210-1-04,1.09033,1.001816,0.922056,0.850546,0.786796,0.730331,0.680686,0.637415,0.600081,0.568264,...,-1.801045,-1.805067,-1.80963,-1.81487,-1.820931,-1.82796,-1.836115,-1.845557,201210-1,Normal


Show the performance of the current approach.

In [154]:
# Baseline: grouped cross-validation using every wavenumber column as a feature.
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.6161 +/- 0.1013


Fit an Extra Trees Classifier on the whole dataset.

#### **Filtering the spectra to only look at the regions from the thesis**

Normal Peaks

In [155]:
# (start, end) wavenumber windows for the normal peaks, taken from the thesis.
# Both bounds are treated as inclusive when the spectra are filtered.
ranges = [
    (407, 495), (453, 454), (516, 537), (519, 519),
    (603, 738), (620, 624), (821, 839), (886, 887),
    (972, 972), (1000, 1003), (1030, 1036), (1060, 1078),
    (1132, 1152), (1170, 1170), (1188, 1196), (1212, 1294),
    (1427, 1446), (1448, 1450), (1520, 1533), (1542, 1594),
    (1611, 1653), (1668, 1668),
]

In [156]:
# Keep only the readings whose wavenumber lies inside at least one (start, end)
# window, bounds inclusive. The mask is built with array comparisons instead of
# calling a Python lambda once per row.
wavenumbers = df['WaveNumber'].to_numpy()
in_any_range = np.zeros(len(wavenumbers), dtype=bool)
for start, end in ranges:
    in_any_range |= (wavenumbers >= start) & (wavenumbers <= end)

main_peak_areas = df[in_any_range]
main_peak_areas

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
10,201210-1-00,303,407.06149,201210-1,Normal,-0.158168
11,201210-1-00,304,407.74487,201210-1,Normal,-0.159926
12,201210-1-00,305,408.42825,201210-1,Normal,-0.161737
13,201210-1-00,306,409.11160,201210-1,Normal,-0.163596
14,201210-1-00,307,409.79498,201210-1,Normal,-0.165498
...,...,...,...,...,...,...
6238985,210526-3-09,2122,1650.11390,210526-3,Hyperglycemia,-0.431695
6238986,210526-3-09,2123,1650.79720,210526-3,Hyperglycemia,-0.433430
6238987,210526-3-09,2124,1651.48060,210526-3,Hyperglycemia,-0.451049
6238988,210526-3-09,2125,1652.16410,210526-3,Hyperglycemia,-0.472429


In [157]:
# Re-pivot using only the readings inside the selected ranges, then report the
# shape of the wide table (rows = spectra; columns = wavenumbers + SurID + Status)
# and the grouped cross-validation accuracy.
wavelength_df = prepare_wavelength_df(main_peak_areas, 'Absorbance')
print(wavelength_df.shape)
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.5358 +/- 0.0993


In [158]:
# Keep only the readings within +/- half_window of the midpoint of each range.
# 0.34165 is about half the ~0.683 spacing seen between consecutive WaveNumber values.
half_window = 0.34165
# half_window = 1  # wider alternative
midpoints = [(start + end) / 2 for start, end in ranges]

# Vectorised mask instead of a per-row Python lambda.
wavenumbers = df['WaveNumber'].to_numpy()
near_midpoint = np.zeros(len(wavenumbers), dtype=bool)
for midpoint in midpoints:
    near_midpoint |= (wavenumbers >= midpoint - half_window) & (wavenumbers <= midpoint + half_window)
central_areas = df[near_midpoint]

wavelength_df = prepare_wavelength_df(central_areas, 'Absorbance')
print(wavelength_df.shape)
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.5614 +/- 0.0828


Hyperglycemic peaks

In [159]:
# (start, end) wavenumber windows for the hyperglycemic peaks.
# Both bounds are treated as inclusive when the spectra are filtered.
ranges = [
    (402, 562), (510, 550), (618, 624), (634, 795),
    (808, 812), (844, 861), (1001, 1006), (1067, 1068),
    (1062, 1077), (1129, 1132), (1110, 1159), (1150, 1157),
    (1163, 1172), (1231, 1282), (1250, 1259), (1357, 1364),
    (1429, 1480), (1484, 1492), (1528, 1533), (1544, 1594),
    (1582, 1602), (1625, 1643),
]

In [160]:
# Keep only the readings whose wavenumber lies inside at least one (start, end)
# window, bounds inclusive. The mask is built with array comparisons instead of
# calling a Python lambda once per row.
wavenumbers = df['WaveNumber'].to_numpy()
in_any_range = np.zeros(len(wavenumbers), dtype=bool)
for start, end in ranges:
    in_any_range |= (wavenumbers >= start) & (wavenumbers <= end)

main_peak_areas = df[in_any_range]

In [161]:
# Display the readings retained by the range filter.
main_peak_areas

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
3,201210-1-00,296,402.27789,201210-1,Normal,-0.147930
4,201210-1-00,297,402.96127,201210-1,Normal,-0.149124
5,201210-1-00,298,403.64465,201210-1,Normal,-0.150419
6,201210-1-00,299,404.32803,201210-1,Normal,-0.151809
7,201210-1-00,300,405.01138,201210-1,Normal,-0.153285
...,...,...,...,...,...,...
6238970,210526-3-09,2107,1639.86330,210526-3,Hyperglycemia,-0.348889
6238971,210526-3-09,2108,1640.54680,210526-3,Hyperglycemia,-0.359890
6238972,210526-3-09,2109,1641.23010,210526-3,Hyperglycemia,-0.365852
6238973,210526-3-09,2110,1641.91350,210526-3,Hyperglycemia,-0.344367


In [162]:
# Re-pivot using only the readings inside the selected ranges and evaluate.
wavelength_df = prepare_wavelength_df(main_peak_areas, 'Absorbance')
# Printed explicitly: a bare `.shape` expression that is not the last line of a cell shows nothing.
print(wavelength_df.shape)
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.5790 +/- 0.0973


In [163]:
# Keep only the readings within +/- half_window of the midpoint of each range.
# 0.34165 is about half the ~0.683 spacing seen between consecutive WaveNumber values.
half_window = 0.34165
# half_window = 1  # wider alternative
midpoints = [(start + end) / 2 for start, end in ranges]

# Vectorised mask instead of a per-row Python lambda.
wavenumbers = df['WaveNumber'].to_numpy()
near_midpoint = np.zeros(len(wavenumbers), dtype=bool)
for midpoint in midpoints:
    near_midpoint |= (wavenumbers >= midpoint - half_window) & (wavenumbers <= midpoint + half_window)
central_areas = df[near_midpoint]

wavelength_df = prepare_wavelength_df(central_areas, 'Absorbance')
print(wavelength_df.shape)
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.5301 +/- 0.1076


Hypoglycemic peaks

In [164]:
# (start, end) wavenumber windows for the hypoglycemic peaks.
# Both bounds are treated as inclusive when the spectra are filtered.
ranges = [
    (406, 599), (447, 453), (508, 546), (604, 777),
    (803, 808), (816, 818), (841, 845), (916, 917),
    (918, 920), (909, 947), (1001, 1004), (1032, 1034),
    (1075, 1078), (1114, 1200), (1162, 1166), (1202, 1338),
    (1250, 1253), (1287, 1292), (1436, 1442), (1438, 1438),
    (1540, 1595), (1623, 1639),
]

In [165]:
# Keep only the readings whose wavenumber lies inside at least one (start, end)
# window, bounds inclusive. The mask is built with array comparisons instead of
# calling a Python lambda once per row.
wavenumbers = df['WaveNumber'].to_numpy()
in_any_range = np.zeros(len(wavenumbers), dtype=bool)
for start, end in ranges:
    in_any_range |= (wavenumbers >= start) & (wavenumbers <= end)
main_peak_areas = df[in_any_range]

wavelength_df = prepare_wavelength_df(main_peak_areas, 'Absorbance')
print(wavelength_df.shape)
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.5516 +/- 0.0949


In [166]:
# Keep only the readings within +/- half_window of the midpoint of each range.
# 0.34165 is about half the ~0.683 spacing seen between consecutive WaveNumber values.
half_window = 0.34165
# half_window = 1  # wider alternative
midpoints = [(start + end) / 2 for start, end in ranges]

# Vectorised mask instead of a per-row Python lambda.
wavenumbers = df['WaveNumber'].to_numpy()
near_midpoint = np.zeros(len(wavenumbers), dtype=bool)
for midpoint in midpoints:
    near_midpoint |= (wavenumbers >= midpoint - half_window) & (wavenumbers <= midpoint + half_window)
central_areas = df[near_midpoint]

wavelength_df = prepare_wavelength_df(central_areas, 'Absorbance')
print(wavelength_df.shape)
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.5686 +/- 0.1023


All important ranges (normal, hyperglycemic and hypoglycemic peaks combined)

In [167]:
# Wavenumber ranges from the thesis, grouped by the class whose peaks they describe.
normal_ranges = [
    (407, 495), (453, 454), (516, 537), (519, 519),
    (603, 738), (620, 624), (821, 839), (886, 887),
    (972, 972), (1000, 1003), (1030, 1036), (1060, 1078),
    (1132, 1152), (1170, 1170), (1188, 1196), (1212, 1294),
    (1427, 1446), (1448, 1450), (1520, 1533), (1542, 1594),
    (1611, 1653), (1668, 1668),
]
hyperglycemic_ranges = [
    (402, 562), (510, 550), (618, 624), (634, 795),
    (808, 812), (844, 861), (1001, 1006), (1067, 1068),
    (1062, 1077), (1129, 1132), (1110, 1159), (1150, 1157),
    (1163, 1172), (1231, 1282), (1250, 1259), (1357, 1364),
    (1429, 1480), (1484, 1492), (1528, 1533), (1544, 1594),
    (1582, 1602), (1625, 1643),
]
hypoglycemic_ranges = [
    (406, 599), (447, 453), (508, 546), (604, 777),
    (803, 808), (816, 818), (841, 845), (916, 917),
    (918, 920), (909, 947), (1001, 1004), (1032, 1034),
    (1075, 1078), (1114, 1200), (1162, 1166), (1202, 1338),
    (1250, 1253), (1287, 1292), (1436, 1442), (1438, 1438),
    (1540, 1595), (1623, 1639),
]

# All windows together: normal first, then hyperglycemic, then hypoglycemic.
# Overlapping or repeated windows are kept as they are.
ranges = normal_ranges + hyperglycemic_ranges + hypoglycemic_ranges

In [168]:
# Keep only the readings whose wavenumber lies inside at least one (start, end)
# window, bounds inclusive. The mask is built with array comparisons instead of
# calling a Python lambda once per row.
wavenumbers = df['WaveNumber'].to_numpy()
in_any_range = np.zeros(len(wavenumbers), dtype=bool)
for start, end in ranges:
    in_any_range |= (wavenumbers >= start) & (wavenumbers <= end)
main_peak_areas = df[in_any_range]

wavelength_df = prepare_wavelength_df(main_peak_areas, 'Absorbance')
print(wavelength_df.shape)
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.5749 +/- 0.1025


In [169]:
# Keep only the readings within +/- half_window of the midpoint of each range.
# 0.34165 is about half the ~0.683 spacing seen between consecutive WaveNumber values.
half_window = 0.34165
# half_window = 1  # wider alternative
midpoints = [(start + end) / 2 for start, end in ranges]

# Vectorised mask instead of a per-row Python lambda.
wavenumbers = df['WaveNumber'].to_numpy()
near_midpoint = np.zeros(len(wavenumbers), dtype=bool)
for midpoint in midpoints:
    near_midpoint |= (wavenumbers >= midpoint - half_window) & (wavenumbers <= midpoint + half_window)
central_areas = df[near_midpoint]

wavelength_df = prepare_wavelength_df(central_areas, 'Absorbance')
print(wavelength_df.shape)
evaluate_extra_trees(wavelength_df)

(3045, 66)
ExtraTreesClassifier Cross-Validation Accuracy: 0.5902 +/- 0.1012
