Import Libraries

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from scipy.signal import find_peaks
from scipy.signal import peak_widths
from scipy.signal import peak_prominences
from scipy.integrate import simps

Read the spectral data

In [2]:
spectra_df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")

#### First we will look at the full wavelength.

Create a field for each wavelength.

In [3]:
wavelength_df = spectra_df.pivot(index='SpecID', columns='WaveNumber', values='Absorbance').reset_index()
wavelength_df.columns.name = None

Add the statuses back.

In [4]:
statuses = spectra_df[['SpecID', 'Status']].drop_duplicates()
wavelength_df = pd.merge(wavelength_df, statuses, on='SpecID')
wavelength_df = wavelength_df.set_index('SpecID')

In [5]:
wavelength_df.head()

Unnamed: 0_level_0,200.0,200.68336,201.36674,202.05011,202.73349,203.41685,204.10023,204.7836,205.46696,206.15034,...,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,2709.3699,2697.1318,2696.0413,2678.5925,2670.8928,2652.5435,2646.3245,2690.324,2620.3228,2609.0132,...,1100.5006,1088.7416,1092.1083,1104.9304,1084.1281,1076.9363,1089.0814,1092.8083,1086.699,Normal
201210-1-01,2979.3169,2985.707,2970.1677,2947.095,2941.8743,2942.4648,2939.9595,2938.4509,2930.9204,2915.7979,...,1246.2748,1270.4456,1272.1703,1271.8768,1270.0718,1283.9667,1286.9803,1276.4037,1268.0922,Normal
201210-1-02,3702.5627,3592.4902,3640.8423,3593.415,3583.656,3583.479,3554.3279,3507.1514,3494.4998,3490.437,...,2028.6669,2046.851,2094.8308,2067.8396,2043.0687,2063.5925,2086.6956,2064.7766,2064.2126,Normal
201210-1-03,8129.5938,8222.3184,8370.2803,8534.415,8684.1543,8805.7393,8964.5283,9220.3066,9257.7461,9399.7734,...,1682.3824,1694.845,1710.276,1714.6768,1746.4635,1705.4204,1703.1569,1705.2943,1687.048,Normal
201210-1-04,3468.5203,3463.0237,3475.2666,3468.5999,3451.7124,3439.6379,3439.1538,3444.8345,3446.624,3438.2632,...,1725.4711,1722.2786,1757.0481,1745.6029,1728.0017,1750.2548,1747.0122,1756.1727,1747.9722,Normal


##### 1. Training a Random Forest and Extra Trees Classifer on the whole spectrum.

In [6]:
def calculate_metrics(y_test, y_pred):

    # Calculate overall accuracy
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_accuracy}\n")

    # Calculate precision, recall, and F1-score for each class
    report = classification_report(y_test, y_pred)
    print("\nClassification Report:")
    print(report)

    # Show the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(conf_matrix)

In [7]:
def get_feature_importances(model, X):

    # Get feature importances
    feature_importances = model.feature_importances_

    # Creating a DataFrame to display feature importances
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

    # Sorting the DataFrame by importance in descending order
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    # Show the top 10 most important features
    top_10_features = feature_importance_df.head(10)

    return top_10_features

In [8]:
# Splitting the dataframe into features (X) and target variable (y)
X = wavelength_df.drop(['Status'], axis=1)
y = wavelength_df['Status']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [9]:
# Initialising and fitting the Random Forest classifier
rf = RandomForestClassifier(random_state=1234)
rf.fit(X_train, y_train)

In [10]:
# Initialising and fitting the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)
et.fit(X_train, y_train)

In [11]:
# Making predictions on the test set
y_pred = rf.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8587848932676518


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.84      0.84      0.84       203
 Hypoglycemia       0.88      0.83      0.86       200
       Normal       0.86      0.90      0.88       206

     accuracy                           0.86       609
    macro avg       0.86      0.86      0.86       609
 weighted avg       0.86      0.86      0.86       609


Confusion Matrix:
[[171  16  16]
 [ 18 167  15]
 [ 14   7 185]]


In [12]:
# Making predictions on the test set
y_pred = et.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8850574712643678


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.86      0.88      0.87       203
 Hypoglycemia       0.90      0.88      0.89       200
       Normal       0.89      0.90      0.89       206

     accuracy                           0.89       609
    macro avg       0.89      0.88      0.89       609
 weighted avg       0.89      0.89      0.89       609


Confusion Matrix:
[[178  12  13]
 [ 14 176  10]
 [ 14   7 185]]


Show feature importances for the two models.

In [13]:
get_feature_importances(rf, X)

Unnamed: 0,Feature,Importance
2578,1961.7312,0.00385
2624,1993.1663,0.003585
2537,1933.713,0.003413
2588,1968.5649,0.003077
2622,1991.7996,0.002976
2615,1987.016,0.002895
2589,1969.2483,0.002645
2632,1998.6333,0.002511
2557,1947.3804,0.002456
2558,1948.0637,0.002421


In [14]:
get_feature_importances(et, X)

Unnamed: 0,Feature,Importance
2625,1993.8496,0.002194
2629,1996.5831,0.001845
2415,1850.3417,0.001841
153,304.55582,0.001767
2261,1745.1025,0.00161
2581,1963.7814,0.001605
116,279.27106,0.001565
2602,1978.1321,0.001542
2618,1989.066,0.001535
2617,1988.3827,0.001478


#### 2. Using Peak Statistics

This uses statistical properties of the peaks in each sample to be used as additional features.

In [15]:
peaks = []
widths = []
prominences = []
areas = []

df = spectra_df.copy()

# Find the index and width of each peak
for _, group in df.groupby('SpecID'):

    peak_index, _ = find_peaks(x=group['Absorbance'], distance=152, prominence=42, width=6)
    # peak_index, _ = find_peaks(x=group['Absorbance'])
    #peak_index, _ = find_peaks(x=group['Absorbance'], prominence=75)

    # Calculate the widths of each peak
    widths += list(peak_widths(group['Absorbance'], peaks=peak_index, rel_height=0.5)[0])

    # Calculate prominence of each peak
    prominences += list(peak_prominences(group['Absorbance'], peaks=peak_index)[0])

    # Find the index of the peak within the full dataframe
    peaks += list(group.iloc[peak_index].index.values)

peaks_df = df.iloc[peaks]

In [16]:
print(len(peaks))
print(len(widths))
print(len(prominences))
peaks_df['PeakWidths'] = widths
peaks_df['PeakProminences'] = prominences

23126
23126
23126


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakWidths'] = widths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakProminences'] = prominences


In [17]:
peaks_df.head()

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,PeakWidths,PeakProminences
727,201210-1-00,727,696.81091,1851.9185,201210-1,Normal,33.525178,193.5749
1026,201210-1-00,1026,901.13898,1746.4041,201210-1,Normal,107.305604,157.983
1820,201210-1-00,1820,1443.7357,1537.9485,201210-1,Normal,13.417171,72.912
3038,201210-1-01,403,475.39862,1998.4773,201210-1,Normal,11.709582,72.348
3252,201210-1-01,617,621.64008,2034.2784,201210-1,Normal,40.674278,195.8289


Standard Deviation of the Absorbances appears to have a positive affect on accuracy.

In [18]:
# Create a new DataFrame for the summary statistics
peak_stats = peaks_df.groupby('SpecID').agg({
    'Absorbance': ['mean', 'std', 'count', 'max', 'min'],
    'PeakWidths': ['mean', 'std', 'max', 'min'],
    'PeakProminences': ['mean', 'std', 'max', 'min']
}).reset_index()

# Flatten the multi-level columns and customize the names
peak_stats.columns = ['SpecID',
                      'PeakAbsorbance_mean', 'PeakAbsorbance_std', 'PeakAbsorbance_count', 'PeakAbsorbance_max', 'PeakAbsorbance_min',
                      'PeakWidths_mean', 'PeakWidths_std', 'PeakWidths_max', 'PeakWidths_min',
                      'PeakProminences_mean', 'PeakProminences_std', 'PeakProminences_max', 'PeakProminences_min']

In [19]:
peak_stats = pd.merge(peak_stats, statuses, on='SpecID')
peak_stats = peak_stats.set_index('SpecID')
peak_stats = peak_stats.fillna(False)

In [20]:
peak_stats.head()

Unnamed: 0_level_0,PeakAbsorbance_mean,PeakAbsorbance_std,PeakAbsorbance_count,PeakAbsorbance_max,PeakAbsorbance_min,PeakWidths_mean,PeakWidths_std,PeakWidths_max,PeakWidths_min,PeakProminences_mean,PeakProminences_std,PeakProminences_max,PeakProminences_min,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
201210-1-00,1712.090367,159.772853,3,1851.9185,1537.9485,51.415985,49.435009,107.305604,13.417171,141.489967,61.999185,193.5749,72.912,Normal
201210-1-01,1856.2663,139.542334,7,2034.2784,1733.3473,23.416188,17.568953,54.616659,6.661455,131.706486,60.739176,219.855,72.348,Normal
201210-1-02,2604.89485,587.435004,6,3696.4109,2195.7212,21.815435,12.237078,36.809791,7.003516,553.217533,657.769939,1762.2488,89.9251,Normal
201210-1-03,3963.258862,2606.396597,8,10350.545,2536.3599,32.978583,22.929553,74.637545,9.668775,865.953875,713.101538,2220.9512,93.4571,Normal
201210-1-04,2301.025225,89.624553,8,2458.5142,2167.6958,45.247401,70.521171,217.370659,8.83833,165.412862,119.945983,446.3222,69.562,Normal


In [21]:
# Splitting the dataframe into features (X) and target variable (y)
X = peak_stats.drop(['Status'], axis=1)
y = peak_stats['Status']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [22]:
# Initialising and fitting the Random Forest classifier
rf = RandomForestClassifier(random_state=1234)
rf.fit(X_train, y_train)

# Initialising and fitting the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)
et.fit(X_train, y_train)

In [23]:
# Making predictions on the test set
y_pred = rf.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.80623973727422


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.80      0.76      0.78       195
 Hypoglycemia       0.78      0.84      0.81       196
       Normal       0.83      0.81      0.82       218

     accuracy                           0.81       609
    macro avg       0.81      0.81      0.81       609
 weighted avg       0.81      0.81      0.81       609


Confusion Matrix:
[[149  26  20]
 [ 16 165  15]
 [ 21  20 177]]


In [24]:
# Making predictions on the test set
y_pred = et.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8045977011494253


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.78      0.78      0.78       195
 Hypoglycemia       0.80      0.81      0.80       196
       Normal       0.83      0.82      0.82       218

     accuracy                           0.80       609
    macro avg       0.80      0.80      0.80       609
 weighted avg       0.80      0.80      0.80       609


Confusion Matrix:
[[153  25  17]
 [ 18 158  20]
 [ 25  14 179]]


In [25]:
get_feature_importances(rf, X)

Unnamed: 0,Feature,Importance
7,PeakWidths_max,0.113616
0,PeakAbsorbance_mean,0.107635
4,PeakAbsorbance_min,0.104208
3,PeakAbsorbance_max,0.103457
1,PeakAbsorbance_std,0.093586
11,PeakProminences_max,0.083349
10,PeakProminences_std,0.077934
6,PeakWidths_std,0.077173
9,PeakProminences_mean,0.068041
5,PeakWidths_mean,0.063796


In [26]:
get_feature_importances(et, X)

Unnamed: 0,Feature,Importance
7,PeakWidths_max,0.102492
4,PeakAbsorbance_min,0.097766
0,PeakAbsorbance_mean,0.095996
3,PeakAbsorbance_max,0.09001
1,PeakAbsorbance_std,0.088978
6,PeakWidths_std,0.083003
11,PeakProminences_max,0.078489
10,PeakProminences_std,0.077455
9,PeakProminences_mean,0.06789
5,PeakWidths_mean,0.063825


Combine these peak statistics with the full wavelength

In [27]:
stats_and_spectrum = peak_stats.merge(wavelength_df.drop(columns='Status'), on='SpecID')

In [28]:
stats_and_spectrum.head()

Unnamed: 0_level_0,PeakAbsorbance_mean,PeakAbsorbance_std,PeakAbsorbance_count,PeakAbsorbance_max,PeakAbsorbance_min,PeakWidths_mean,PeakWidths_std,PeakWidths_max,PeakWidths_min,PeakProminences_mean,...,1993.8496,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1712.090367,159.772853,3,1851.9185,1537.9485,51.415985,49.435009,107.305604,13.417171,141.489967,...,1060.3231,1100.5006,1088.7416,1092.1083,1104.9304,1084.1281,1076.9363,1089.0814,1092.8083,1086.699
201210-1-01,1856.2663,139.542334,7,2034.2784,1733.3473,23.416188,17.568953,54.616659,6.661455,131.706486,...,1253.5012,1246.2748,1270.4456,1272.1703,1271.8768,1270.0718,1283.9667,1286.9803,1276.4037,1268.0922
201210-1-02,2604.89485,587.435004,6,3696.4109,2195.7212,21.815435,12.237078,36.809791,7.003516,553.217533,...,2066.4561,2028.6669,2046.851,2094.8308,2067.8396,2043.0687,2063.5925,2086.6956,2064.7766,2064.2126
201210-1-03,3963.258862,2606.396597,8,10350.545,2536.3599,32.978583,22.929553,74.637545,9.668775,865.953875,...,1718.3978,1682.3824,1694.845,1710.276,1714.6768,1746.4635,1705.4204,1703.1569,1705.2943,1687.048
201210-1-04,2301.025225,89.624553,8,2458.5142,2167.6958,45.247401,70.521171,217.370659,8.83833,165.412862,...,1697.4792,1725.4711,1722.2786,1757.0481,1745.6029,1728.0017,1750.2548,1747.0122,1756.1727,1747.9722


In [29]:
# Splitting the dataframe into features (X) and target variable (y)
X = stats_and_spectrum.drop(['Status'], axis=1)
X.columns = X.columns.astype(str)
y = stats_and_spectrum['Status']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [30]:
# Initialising and fitting the Random Forest classifier
rf = RandomForestClassifier(random_state=1234)
rf.fit(X_train, y_train)

# Initialising and fitting the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)
et.fit(X_train, y_train)

In [31]:
# Making predictions on the test set
y_pred = rf.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8686371100164204


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.82      0.89      0.85       195
 Hypoglycemia       0.91      0.83      0.86       196
       Normal       0.88      0.89      0.89       218

     accuracy                           0.87       609
    macro avg       0.87      0.87      0.87       609
 weighted avg       0.87      0.87      0.87       609


Confusion Matrix:
[[173   9  13]
 [ 21 162  13]
 [ 16   8 194]]


In [32]:
# Making predictions on the test set
y_pred = et.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8899835796387521


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.85      0.90      0.88       195
 Hypoglycemia       0.94      0.87      0.90       196
       Normal       0.88      0.89      0.89       218

     accuracy                           0.89       609
    macro avg       0.89      0.89      0.89       609
 weighted avg       0.89      0.89      0.89       609


Confusion Matrix:
[[176   5  14]
 [ 13 171  12]
 [ 17   6 195]]


In [33]:
get_feature_importances(rf, X)

Unnamed: 0,Feature,Importance
11,PeakProminences_max,0.005449
7,PeakWidths_max,0.005194
10,PeakProminences_std,0.005127
2620,1981.549,0.004429
2639,1994.5331,0.003895
5,PeakWidths_mean,0.003296
1,PeakAbsorbance_std,0.003053
6,PeakWidths_std,0.003028
2609,1974.0319,0.002881
2615,1978.1321,0.002634


In [34]:
get_feature_importances(et, X)

Unnamed: 0,Feature,Importance
7,PeakWidths_max,0.003966
6,PeakWidths_std,0.00215
2466,1876.3098,0.002016
2601,1968.5649,0.001981
2549,1933.0297,0.001874
2575,1950.7972,0.001817
2594,1963.7814,0.001783
2569,1946.697,0.001723
2419,1844.1913,0.001716
136,284.05466,0.001715


#### 3. Creating a uniform Peak Featureset

This aims to create a featureset using peaks within wavenumber intervals.

First get the peak properties

In [35]:
peaks = []
widths = []
prominences = []
areas = []

df = spectra_df.copy()

# Find the index and width of each peak
for _, group in df.groupby('SpecID'):

    #peak_index, _ = find_peaks(x=group['Absorbance'], distance=152, prominence=42, width=6)
    peak_index, _ = find_peaks(x=group['Absorbance'])
    #peak_index, _ = find_peaks(x=group['Absorbance'], prominence=75)

    # Calculate the widths of each peak
    widths += list(peak_widths(group['Absorbance'], peaks=peak_index, rel_height=0.5)[0])

    # Calculate prominence of each peak
    prominences += list(peak_prominences(group['Absorbance'], peaks=peak_index)[0])

    # Find the index of the peak within the full dataframe
    peaks += list(group.iloc[peak_index].index.values)

peaks_df = df.iloc[peaks]

In [36]:
print(len(peaks))
print(len(widths))
print(len(prominences))
peaks_df['PeakWidths'] = widths
peaks_df['PeakProminences'] = prominences

2114865
2114865
2114865


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakWidths'] = widths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakProminences'] = prominences


In [37]:
peaks_df.head()

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,PeakWidths,PeakProminences
7,201210-1-00,7,204.7836,2690.324,201210-1,Normal,0.814277,43.9995
10,201210-1-00,10,206.83371,2639.7104,201210-1,Normal,1.441874,25.309
13,201210-1-00,13,208.88382,2642.4243,201210-1,Normal,2.022615,33.4111
16,201210-1-00,16,210.93394,2614.3574,201210-1,Normal,0.718095,3.5368
21,201210-1-00,21,214.3508,2618.0491,201210-1,Normal,0.93965,23.7469


Assign Peaks to bins

In [100]:
# Define a function to calculate the bin for a given wavenumber with a specified bin size
def calculate_bin_interval(wavenumber, bin_size):
    bin_start = int((wavenumber - 200) / bin_size) * bin_size + 200
    bin_end = bin_start + bin_size
    return f"{bin_start}-{bin_end}"

# Set the bin size
bin_size = 100

# Add a "Bin" column to the DataFrame
peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))


In [101]:
peaks_df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,PeakWidths,PeakProminences,Bin
7,201210-1-00,7,204.78360,2690.32400,201210-1,Normal,0.814277,43.99950,200-300
10,201210-1-00,10,206.83371,2639.71040,201210-1,Normal,1.441874,25.30900,200-300
13,201210-1-00,13,208.88382,2642.42430,201210-1,Normal,2.022615,33.41110,200-300
16,201210-1-00,16,210.93394,2614.35740,201210-1,Normal,0.718095,3.53680,200-300
21,201210-1-00,21,214.35080,2618.04910,201210-1,Normal,0.939650,23.74690,200-300
...,...,...,...,...,...,...,...,...,...
8010384,210526-3-49,2619,1989.74940,849.27698,210526-3,Hyperglycemia,1.360785,12.07178,1900-2000
8010388,210526-3-49,2623,1992.48290,855.50647,210526-3,Hyperglycemia,1.579800,29.82385,1900-2000
8010390,210526-3-49,2625,1993.84960,845.08130,210526-3,Hyperglycemia,1.580559,20.95581,1900-2000
8010393,210526-3-49,2628,1995.89980,828.59833,210526-3,Hyperglycemia,1.442027,4.69702,1900-2000


In [102]:
# Pivot table with 'Absorbance', 'PeakWidths', and 'PeakProminences' as values
peak_bins = peaks_df.pivot_table(index='SpecID', columns='Bin', values=['Absorbance', 'PeakWidths', 'PeakProminences'], aggfunc='mean')
peak_bins.columns = [f"{col[0]}_{col[1]}" for col in peak_bins.columns]  # Combine column names
peak_bins.reset_index(inplace=True)

# Merge with 'Status' information
statuses = peaks_df[['SpecID', 'Status']].drop_duplicates()
peak_bins = pd.merge(peak_bins, statuses, on='SpecID')

# Set 'SpecID' as the index
peak_bins.set_index('SpecID', inplace=True)

# Fill NaN values with False
peak_bins.fillna(False, inplace=True)

In [103]:
peak_bins.head()

Unnamed: 0_level_0,Absorbance_1000-1100,Absorbance_1100-1200,Absorbance_1200-1300,Absorbance_1300-1400,Absorbance_1400-1500,Absorbance_1500-1600,Absorbance_1600-1700,Absorbance_1700-1800,Absorbance_1800-1900,Absorbance_1900-2000,...,PeakWidths_1900-2000,PeakWidths_200-300,PeakWidths_300-400,PeakWidths_400-500,PeakWidths_500-600,PeakWidths_600-700,PeakWidths_700-800,PeakWidths_800-900,PeakWidths_900-1000,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1643.263208,1568.692442,1540.69004,1512.809105,1478.104857,1450.069088,1380.529239,1257.175496,1182.545549,1115.706809,...,1.719697,1.543104,1.655541,2.138065,2.065692,3.586973,1.930686,1.68495,4.62991,Normal
201210-1-01,1807.867253,1738.651865,1704.53576,1681.448245,1664.724962,1658.400558,1556.166932,1410.004725,1329.094592,1276.514236,...,1.832827,1.612727,1.878539,2.361763,1.889456,3.353208,1.782314,3.631895,2.219656,Normal
201210-1-02,2204.669493,2148.067951,2156.962151,2172.336252,2175.853128,2233.065802,2204.005787,2036.851074,1955.91682,1934.555121,...,1.803894,1.835919,1.686544,2.01821,1.775907,3.251167,1.788371,1.784477,2.403071,Normal
201210-1-03,2689.344084,2844.293662,2600.00609,2494.864183,2647.226467,2527.284693,2347.111727,2110.750856,1932.533936,1786.860573,...,1.347389,3.679351,1.164558,2.719875,2.02058,3.603446,1.989175,1.531008,4.571798,Normal
201210-1-04,2176.84241,2215.659108,2253.966385,2284.936166,2281.8311,2356.610905,2289.682702,2068.270985,1917.70177,1798.932077,...,1.557408,1.670538,1.660809,2.431477,1.866395,3.490156,1.95882,2.581125,2.031502,Normal


In [104]:
# Splitting the dataframe into features (X) and target variable (y)
X = peak_bins.drop(['Status'], axis=1)
y = peak_bins['Status']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [105]:
# Initialising and fitting the Random Forest classifier
rf = RandomForestClassifier(random_state=1234)
rf.fit(X_train, y_train)

# Initialising and fitting the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)
et.fit(X_train, y_train)

In [106]:
# Making predictions on the test set
y_pred = rf.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8095238095238095


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.80      0.76      0.78       203
 Hypoglycemia       0.84      0.81      0.83       200
       Normal       0.80      0.85      0.82       206

     accuracy                           0.81       609
    macro avg       0.81      0.81      0.81       609
 weighted avg       0.81      0.81      0.81       609


Confusion Matrix:
[[155  17  31]
 [ 23 163  14]
 [ 16  15 175]]


In [107]:
# Making predictions on the test set
y_pred = et.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8423645320197044


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.84      0.81      0.82       203
 Hypoglycemia       0.85      0.83      0.84       200
       Normal       0.84      0.88      0.86       206

     accuracy                           0.84       609
    macro avg       0.84      0.84      0.84       609
 weighted avg       0.84      0.84      0.84       609


Confusion Matrix:
[[165  17  21]
 [ 19 167  14]
 [ 13  12 181]]


In [108]:
get_feature_importances(rf, X)

Unnamed: 0,Feature,Importance
9,Absorbance_1900-2000,0.040115
8,Absorbance_1800-1900,0.038022
10,Absorbance_200-300,0.034731
28,PeakProminences_200-300,0.034282
52,PeakWidths_800-900,0.028033
46,PeakWidths_200-300,0.027853
7,Absorbance_1700-1800,0.026874
25,PeakProminences_1700-1800,0.02303
37,PeakWidths_1100-1200,0.022657
27,PeakProminences_1900-2000,0.022623


In [109]:
get_feature_importances(et, X)

Unnamed: 0,Feature,Importance
9,Absorbance_1900-2000,0.041998
8,Absorbance_1800-1900,0.03977
10,Absorbance_200-300,0.033539
7,Absorbance_1700-1800,0.029611
46,PeakWidths_200-300,0.02862
28,PeakProminences_200-300,0.028042
12,Absorbance_400-500,0.025215
39,PeakWidths_1300-1400,0.02496
11,Absorbance_300-400,0.024921
15,Absorbance_700-800,0.024167


Merge with the full spectrum.

In [110]:
bins_and_spectrum = peak_bins.merge(wavelength_df.drop(columns='Status'), on='SpecID')

In [111]:
bins_and_spectrum.head()

Unnamed: 0_level_0,Absorbance_1000-1100,Absorbance_1100-1200,Absorbance_1200-1300,Absorbance_1300-1400,Absorbance_1400-1500,Absorbance_1500-1600,Absorbance_1600-1700,Absorbance_1700-1800,Absorbance_1800-1900,Absorbance_1900-2000,...,1993.8496,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1643.263208,1568.692442,1540.69004,1512.809105,1478.104857,1450.069088,1380.529239,1257.175496,1182.545549,1115.706809,...,1060.3231,1100.5006,1088.7416,1092.1083,1104.9304,1084.1281,1076.9363,1089.0814,1092.8083,1086.699
201210-1-01,1807.867253,1738.651865,1704.53576,1681.448245,1664.724962,1658.400558,1556.166932,1410.004725,1329.094592,1276.514236,...,1253.5012,1246.2748,1270.4456,1272.1703,1271.8768,1270.0718,1283.9667,1286.9803,1276.4037,1268.0922
201210-1-02,2204.669493,2148.067951,2156.962151,2172.336252,2175.853128,2233.065802,2204.005787,2036.851074,1955.91682,1934.555121,...,2066.4561,2028.6669,2046.851,2094.8308,2067.8396,2043.0687,2063.5925,2086.6956,2064.7766,2064.2126
201210-1-03,2689.344084,2844.293662,2600.00609,2494.864183,2647.226467,2527.284693,2347.111727,2110.750856,1932.533936,1786.860573,...,1718.3978,1682.3824,1694.845,1710.276,1714.6768,1746.4635,1705.4204,1703.1569,1705.2943,1687.048
201210-1-04,2176.84241,2215.659108,2253.966385,2284.936166,2281.8311,2356.610905,2289.682702,2068.270985,1917.70177,1798.932077,...,1697.4792,1725.4711,1722.2786,1757.0481,1745.6029,1728.0017,1750.2548,1747.0122,1756.1727,1747.9722


In [112]:
# Splitting the dataframe into features (X) and target variable (y)
X = bins_and_spectrum.drop(['Status'], axis=1)
X.columns = X.columns.astype(str)
y = bins_and_spectrum['Status']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [113]:
# Initialising and fitting the Random Forest classifier
rf = RandomForestClassifier(random_state=1234)
rf.fit(X_train, y_train)

# Initialising and fitting the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)
et.fit(X_train, y_train)

In [114]:
# Making predictions on the test set
y_pred = rf.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8653530377668309


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.84      0.85      0.85       203
 Hypoglycemia       0.89      0.85      0.87       200
       Normal       0.86      0.89      0.88       206

     accuracy                           0.87       609
    macro avg       0.87      0.87      0.87       609
 weighted avg       0.87      0.87      0.87       609


Confusion Matrix:
[[172  14  17]
 [ 17 171  12]
 [ 15   7 184]]


In [115]:
# Making predictions on the test set
y_pred = et.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8735632183908046


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.85      0.86      0.86       203
 Hypoglycemia       0.89      0.88      0.88       200
       Normal       0.88      0.89      0.88       206

     accuracy                           0.87       609
    macro avg       0.87      0.87      0.87       609
 weighted avg       0.87      0.87      0.87       609


Confusion Matrix:
[[174  14  15]
 [ 14 175  11]
 [ 16   7 183]]


In [116]:
get_feature_importances(rf, X)

Unnamed: 0,Feature,Importance
37,PeakWidths_1100-1200,0.004057
2645,1970.615,0.00378
2666,1984.9658,0.003329
2657,1978.8154,0.003312
2684,1997.2665,0.003021
2681,1995.2164,0.002906
46,PeakWidths_200-300,0.002567
52,PeakWidths_800-900,0.002539
2646,1971.2985,0.002507
2578,1924.8291,0.002474


In [117]:
get_feature_importances(et, X)

Unnamed: 0,Feature,Importance
2631,1961.0479,0.00306
2593,1935.0797,0.002083
141,259.45331,0.001962
2421,1817.5399,0.001876
2652,1975.3987,0.00173
2518,1883.8269,0.001619
52,PeakWidths_800-900,0.001544
2330,1755.353,0.00153
2608,1945.3303,0.001506
2435,1827.1071,0.001493
