Import Libraries

In [3]:
import math

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


Read the spectral data

In [4]:
# Load the raw spectra as a long-format table of absorbance readings.
raw_spectrum_csv = "../../data/exosomes.raw_spectrum_1.csv"
spectra_df = pd.read_csv(raw_spectrum_csv)

#### First we will look at the full wavenumber range.

Create a column for each wavenumber.

In [5]:
# Reshape long -> wide: one row per SpecID, one column per WaveNumber, cells hold Absorbance.
wide_spectra = spectra_df.pivot(index='SpecID', columns='WaveNumber', values='Absorbance')
# Clear the 'WaveNumber' label on the column axis and turn SpecID back into a regular column.
df = wide_spectra.rename_axis(None, axis=1).reset_index()

Add the statuses back.

In [6]:
# Distinct (SpecID, Status) pairs from the long-format table.
statuses = spectra_df[['SpecID', 'Status']].drop_duplicates()
# Attach the Status label to the wide table, then key the table by SpecID.
df = df.merge(statuses, on='SpecID').set_index('SpecID')

In [7]:
# Preview the first rows of the wide table (one column per wavenumber, plus Status).
df.head()

Unnamed: 0_level_0,200.0,200.68336,201.36674,202.05011,202.73349,203.41685,204.10023,204.7836,205.46696,206.15034,...,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,2709.3699,2697.1318,2696.0413,2678.5925,2670.8928,2652.5435,2646.3245,2690.324,2620.3228,2609.0132,...,1100.5006,1088.7416,1092.1083,1104.9304,1084.1281,1076.9363,1089.0814,1092.8083,1086.699,Normal
201210-1-01,2979.3169,2985.707,2970.1677,2947.095,2941.8743,2942.4648,2939.9595,2938.4509,2930.9204,2915.7979,...,1246.2748,1270.4456,1272.1703,1271.8768,1270.0718,1283.9667,1286.9803,1276.4037,1268.0922,Normal
201210-1-02,3702.5627,3592.4902,3640.8423,3593.415,3583.656,3583.479,3554.3279,3507.1514,3494.4998,3490.437,...,2028.6669,2046.851,2094.8308,2067.8396,2043.0687,2063.5925,2086.6956,2064.7766,2064.2126,Normal
201210-1-03,8129.5938,8222.3184,8370.2803,8534.415,8684.1543,8805.7393,8964.5283,9220.3066,9257.7461,9399.7734,...,1682.3824,1694.845,1710.276,1714.6768,1746.4635,1705.4204,1703.1569,1705.2943,1687.048,Normal
201210-1-04,3468.5203,3463.0237,3475.2666,3468.5999,3451.7124,3439.6379,3439.1538,3444.8345,3446.624,3438.2632,...,1725.4711,1722.2786,1757.0481,1745.6029,1728.0017,1750.2548,1747.0122,1756.1727,1747.9722,Normal


##### 1. Training a Random Forest and an Extra Trees Classifier on the whole spectrum.

In [8]:
def calculate_metrics(y_test, y_pred):
    """Print accuracy, a per-class report and the confusion matrix for a set of predictions.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels, aligned with ``y_test``.

    Returns:
        None. All results are written to standard output.
    """
    # Fraction of predictions that match the true label.
    acc = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {acc}\n")

    # Precision, recall, F1-score and support for each class.
    per_class_report = classification_report(y_test, y_pred)
    print("\nClassification Report:", per_class_report, sep="\n")

    # Rows are true classes, columns are predicted classes.
    matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:", matrix, sep="\n")

In [9]:
# Target is the Status label; every remaining column (one per wavenumber) is a feature.
y = df['Status']
X = df.drop(columns='Status')

# Hold out 20% of the spectra for testing; the fixed seed keeps the split repeatable.
# NOTE(review): rows are split per spectrum, so spectra that share a SurID can end up
# on both sides of the split - confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234, test_size=0.2)

In [10]:
# Random Forest with default hyper-parameters and a fixed seed, trained on the training split.
rf = RandomForestClassifier(random_state=1234)
rf.fit(X=X_train, y=y_train)

In [11]:
# Extra Trees with default hyper-parameters and a fixed seed, trained on the training split.
et = ExtraTreesClassifier(random_state=1234)
et.fit(X=X_train, y=y_train)

In [12]:
# Predict the held-out spectra with the Random Forest.
y_pred = rf.predict(X=X_test)

# Print accuracy, the per-class report and the confusion matrix.
calculate_metrics(y_test=y_test, y_pred=y_pred)

Overall Accuracy: 0.8587848932676518


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.84      0.84      0.84       203
 Hypoglycemia       0.88      0.83      0.86       200
       Normal       0.86      0.90      0.88       206

     accuracy                           0.86       609
    macro avg       0.86      0.86      0.86       609
 weighted avg       0.86      0.86      0.86       609


Confusion Matrix:
[[171  16  16]
 [ 18 167  15]
 [ 14   7 185]]


In [13]:
# Predict the held-out spectra with the Extra Trees model.
y_pred = et.predict(X=X_test)

# Print accuracy, the per-class report and the confusion matrix.
calculate_metrics(y_test=y_test, y_pred=y_pred)

Overall Accuracy: 0.8850574712643678


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.86      0.88      0.87       203
 Hypoglycemia       0.90      0.88      0.89       200
       Normal       0.89      0.90      0.89       206

     accuracy                           0.89       609
    macro avg       0.89      0.88      0.89       609
 weighted avg       0.89      0.89      0.89       609


Confusion Matrix:
[[178  12  13]
 [ 14 176  10]
 [ 14   7 185]]


#### 2. Creating a Peak Featureset

In [206]:
from scipy.signal import find_peaks

Find the peaks in each spectrum, and add their properties to the featureset.

In [218]:
# Row labels (in `df`) of every local maximum found in any spectrum.
peaks = []
df = spectra_df.copy()

for _, group in df.groupby('SpecID'):
    # find_peaks compares neighbouring samples, so the readings must be in
    # wavenumber order; sort explicitly instead of relying on the CSV row order.
    spectrum = group.sort_values('WaveNumber')
    # No filters are passed, so every local maximum is reported. A stricter
    # setting tried earlier was distance=152, prominence=42, width=6.
    peak_index, _ = find_peaks(spectrum['Absorbance'].to_numpy())
    # find_peaks returns positions within the spectrum; map them back to row labels.
    peaks.extend(spectrum.index[peak_index])

In [219]:
# `peaks` holds row labels of `df`, so select by label (.loc) rather than by position.
# Take an explicit copy so that columns can later be added to peaks_df without
# pandas raising a SettingWithCopyWarning.
peaks_df = df.loc[peaks].copy()

In [220]:
# Display the rows of the long-format table that were detected as peaks.
peaks_df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
7,201210-1-00,7,204.78360,2690.32400,201210-1,Normal
10,201210-1-00,10,206.83371,2639.71040,201210-1,Normal
13,201210-1-00,13,208.88382,2642.42430,201210-1,Normal
16,201210-1-00,16,210.93394,2614.35740,201210-1,Normal
21,201210-1-00,21,214.35080,2618.04910,201210-1,Normal
...,...,...,...,...,...,...
8010384,210526-3-49,2619,1989.74940,849.27698,210526-3,Hyperglycemia
8010388,210526-3-49,2623,1992.48290,855.50647,210526-3,Hyperglycemia
8010390,210526-3-49,2625,1993.84960,845.08130,210526-3,Hyperglycemia
8010393,210526-3-49,2628,1995.89980,828.59833,210526-3,Hyperglycemia


Assign Peaks to bins

In [229]:
def calculate_bin(wavenumber, bin_size, origin=200):
    """Return the index of the fixed-width bin that contains a wavenumber.

    Bin ``k`` covers the half-open interval
    ``[origin + k * bin_size, origin + (k + 1) * bin_size)``.

    Args:
        wavenumber: Position to classify.
        bin_size: Width of each bin, in the same units as ``wavenumber``.
        origin: Lower edge of bin 0. Defaults to 200, the value that was
            previously hard-coded.

    Returns:
        int: The bin index. It is negative for wavenumbers below ``origin``.
    """
    # Floor instead of int(): int() truncates toward zero, which would put
    # values just below the origin into bin 0 instead of bin -1.
    return math.floor((wavenumber - origin) / bin_size)

# Width of each bin, in the same units as WaveNumber.
bin_size = 50

# Label every peak with its bin. The column is built with assign() and peaks_df
# is rebound, rather than written into in place: peaks_df was selected out of
# df, and in-place column assignment on such a selection triggers pandas'
# SettingWithCopyWarning.
peaks_df = peaks_df.assign(Bin=peaks_df['WaveNumber'].apply(calculate_bin, bin_size=bin_size))

In [230]:
# Display the peak rows together with their new Bin column.
peaks_df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,Bin
7,201210-1-00,7,204.78360,2690.32400,201210-1,Normal,0
10,201210-1-00,10,206.83371,2639.71040,201210-1,Normal,0
13,201210-1-00,13,208.88382,2642.42430,201210-1,Normal,0
16,201210-1-00,16,210.93394,2614.35740,201210-1,Normal,0
21,201210-1-00,21,214.35080,2618.04910,201210-1,Normal,0
...,...,...,...,...,...,...,...
8010384,210526-3-49,2619,1989.74940,849.27698,210526-3,Hyperglycemia,35
8010388,210526-3-49,2623,1992.48290,855.50647,210526-3,Hyperglycemia,35
8010390,210526-3-49,2625,1993.84960,845.08130,210526-3,Hyperglycemia,35
8010393,210526-3-49,2628,1995.89980,828.59833,210526-3,Hyperglycemia,35


In [231]:
# Mean peak absorbance per spectrum (rows) and wavenumber bin (columns).
bin_means = peaks_df.pivot_table(index='SpecID', columns='Bin', values='Absorbance', aggfunc='mean')
bin_means.columns.name = None
# A bin in which a spectrum has no peak comes out of the pivot as NaN. Encode it
# as numeric 0.0: the previous fill value, False, is a bool and can turn the
# float feature columns into object dtype on recent pandas versions.
bin_means = bin_means.fillna(0.0)

# Attach each spectrum's Status and key the feature table by SpecID.
statuses = peaks_df[['SpecID', 'Status']].drop_duplicates()
df = pd.merge(bin_means, statuses, on='SpecID')
df = df.set_index('SpecID')

In [232]:
# Display the binned peak feature table (one column per bin, plus Status).
df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,2574.135406,2421.086014,1974.836958,1845.387347,1770.661155,1748.011470,1713.241968,1722.717050,1773.007550,1747.710024,...,1450.231224,1422.708017,1329.914705,1275.099004,1238.437282,1205.166136,1160.908465,1124.180826,1107.232791,Normal
201210-1-01,2819.946236,2638.460510,2218.419324,2030.368839,1974.911795,1973.966140,1902.003996,1887.311579,1967.013695,1923.662576,...,1679.466500,1621.497227,1490.836636,1430.667554,1389.341896,1349.451925,1309.551552,1276.013691,1277.062452,Normal
201210-1-02,3376.538425,3081.004440,2537.460494,2231.832723,2155.020827,2164.966671,2056.388021,2066.288835,2245.517167,2084.510447,...,2249.952842,2277.119087,2127.713648,2058.509276,2012.239482,1979.275836,1928.108467,1899.640291,1974.707175,Normal
201210-1-03,8728.744789,6748.955075,3903.5478,2645.215520,2453.236979,2470.758360,2335.921290,2412.252160,2437.837127,2275.182650,...,2517.077614,2449.353741,2249.315017,2161.676648,2066.190788,1969.458377,1886.817962,1818.989556,1752.161272,Normal
201210-1-04,3309.063990,3163.688756,2591.467267,2322.879282,2234.438235,2242.790032,2157.865168,2161.435300,2203.236274,2147.168257,...,2382.543922,2406.85407,2178.090924,2102.501576,2027.520281,1947.726315,1885.175179,1834.318324,1771.409441,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,1976.128772,1977.310861,1793.242,1617.047611,1598.621210,1622.913119,1609.938486,1609.110168,1621.911652,1637.606560,...,1438.931790,1364.686692,1308.099050,1253.777191,1190.178875,1137.701900,1083.609189,1021.477714,975.837671,Hyperglycemia
210526-3-46,1917.047074,1919.819076,1733.928226,1568.919778,1549.480705,1574.049679,1559.951065,1564.522039,1573.496463,1584.898916,...,1398.013896,1325.420009,1269.307048,1213.737884,1159.103652,1115.039104,1056.079690,994.492753,951.129536,Hyperglycemia
210526-3-47,1871.424047,1876.269417,1677.434211,1548.911552,1509.398605,1533.204547,1520.197335,1515.399814,1529.075726,1538.370800,...,1352.593490,1283.706675,1229.857762,1180.845156,1127.339660,1079.624304,1022.954848,965.303415,928.426808,Hyperglycemia
210526-3-48,1820.290870,1816.779389,1657.134994,1495.956347,1477.663065,1497.028868,1484.784470,1481.738276,1494.673357,1508.482505,...,1314.662117,1245.238483,1192.755646,1143.876217,1096.682790,1049.844885,993.549606,937.166398,895.820732,Hyperglycemia


In [233]:
# Target is the Status label; every remaining column (one per bin) is a feature.
y = df['Status']
X = df.drop(columns='Status')

# Hold out 20% of the spectra for testing; the fixed seed keeps the split repeatable.
# NOTE(review): rows are split per spectrum, so spectra that share a SurID can end up
# on both sides of the split - confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234, test_size=0.2)

In [234]:
# Random Forest with default hyper-parameters and a fixed seed (fit() returns the estimator).
rf = RandomForestClassifier(random_state=1234).fit(X_train, y_train)

# Extra Trees with default hyper-parameters and a fixed seed, trained on the same split.
et = ExtraTreesClassifier(random_state=1234)
et.fit(X=X_train, y=y_train)

In [235]:
# Predict the held-out spectra with the Random Forest.
y_pred = rf.predict(X=X_test)

# Print accuracy, the per-class report and the confusion matrix.
calculate_metrics(y_test=y_test, y_pred=y_pred)

Overall Accuracy: 0.8292282430213465


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.80      0.83      0.81       203
 Hypoglycemia       0.87      0.80      0.83       200
       Normal       0.82      0.86      0.84       206

     accuracy                           0.83       609
    macro avg       0.83      0.83      0.83       609
 weighted avg       0.83      0.83      0.83       609


Confusion Matrix:
[[168  15  20]
 [ 23 159  18]
 [ 19   9 178]]


In [236]:
# Predict the held-out spectra with the Extra Trees model.
y_pred = et.predict(X=X_test)

# Print accuracy, the per-class report and the confusion matrix.
calculate_metrics(y_test=y_test, y_pred=y_pred)

Overall Accuracy: 0.8357963875205254


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.82      0.82      0.82       203
 Hypoglycemia       0.84      0.83      0.84       200
       Normal       0.84      0.85      0.85       206

     accuracy                           0.84       609
    macro avg       0.84      0.84      0.84       609
 weighted avg       0.84      0.84      0.84       609


Confusion Matrix:
[[166  20  17]
 [ 17 167  16]
 [ 19  11 176]]
