Import Libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.signal import find_peaks
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

Read the spectral data

In [3]:
# Load the raw spectra. Later cells read the SpecID, WaveNumber, Absorbance
# and Status columns of this frame.
spectra_df = pd.read_csv("../data/exosomes.raw_spectrum_1.csv")

#### First we will look at the full spectrum.

Create a column for each wavenumber.

In [4]:
# Reshape to wide format: one row per SpecID, one column per WaveNumber,
# with the Absorbance readings as the cell values.
df = (
    spectra_df
    .pivot(index='SpecID', columns='WaveNumber', values='Absorbance')
    .reset_index()
)
# Drop the leftover 'WaveNumber' label from the column axis.
df.columns.name = None

Add the statuses back.

In [5]:
# One (SpecID, Status) pair per spectrum, taken from the long-format frame.
statuses = spectra_df.loc[:, ['SpecID', 'Status']].drop_duplicates()

# Attach the label to the wide frame and index the rows by SpecID.
df = df.merge(statuses, on='SpecID').set_index('SpecID')

In [6]:
# Preview the wide frame: rows are indexed by SpecID, columns are the
# wavenumbers followed by the Status label.
df.head()

Unnamed: 0_level_0,200.0,200.68336,201.36674,202.05011,202.73349,203.41685,204.10023,204.7836,205.46696,206.15034,...,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,2709.3699,2697.1318,2696.0413,2678.5925,2670.8928,2652.5435,2646.3245,2690.324,2620.3228,2609.0132,...,1100.5006,1088.7416,1092.1083,1104.9304,1084.1281,1076.9363,1089.0814,1092.8083,1086.699,Normal
201210-1-01,2979.3169,2985.707,2970.1677,2947.095,2941.8743,2942.4648,2939.9595,2938.4509,2930.9204,2915.7979,...,1246.2748,1270.4456,1272.1703,1271.8768,1270.0718,1283.9667,1286.9803,1276.4037,1268.0922,Normal
201210-1-02,3702.5627,3592.4902,3640.8423,3593.415,3583.656,3583.479,3554.3279,3507.1514,3494.4998,3490.437,...,2028.6669,2046.851,2094.8308,2067.8396,2043.0687,2063.5925,2086.6956,2064.7766,2064.2126,Normal
201210-1-03,8129.5938,8222.3184,8370.2803,8534.415,8684.1543,8805.7393,8964.5283,9220.3066,9257.7461,9399.7734,...,1682.3824,1694.845,1710.276,1714.6768,1746.4635,1705.4204,1703.1569,1705.2943,1687.048,Normal
201210-1-04,3468.5203,3463.0237,3475.2666,3468.5999,3451.7124,3439.6379,3439.1538,3444.8345,3446.624,3438.2632,...,1725.4711,1722.2786,1757.0481,1745.6029,1728.0017,1750.2548,1747.0122,1756.1727,1747.9722,Normal


##### 1. Training a Random Forest and Extra Trees Classifier on the whole spectrum.

In [7]:
def calculate_metrics(y_test, y_pred):
    """Print evaluation metrics for a set of predictions.

    Writes, in order, the overall accuracy, the per-class classification
    report (precision, recall, F1-score) and the confusion matrix to stdout.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
    """
    # Overall accuracy first
    acc = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {acc}\n")

    # Then each titled section: compute the metric, print its heading, print it
    sections = (
        ("\nClassification Report:", classification_report),
        ("\nConfusion Matrix:", confusion_matrix),
    )
    for heading, metric in sections:
        value = metric(y_test, y_pred)
        print(heading)
        print(value)

In [8]:
# Splitting the dataframe into features (X) and target variable (y)
X = df.drop(['Status'], axis=1)
y = df['Status']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [9]:
# Initialising and fitting the Random Forest classifier
rf = RandomForestClassifier(random_state=1234)
rf.fit(X_train, y_train)

In [10]:
# Initialising and fitting the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)
et.fit(X_train, y_train)

In [11]:
# Making predictions on the test set
y_pred = rf.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8587848932676518


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.84      0.84      0.84       203
 Hypoglycemia       0.88      0.83      0.86       200
       Normal       0.86      0.90      0.88       206

     accuracy                           0.86       609
    macro avg       0.86      0.86      0.86       609
 weighted avg       0.86      0.86      0.86       609


Confusion Matrix:
[[171  16  16]
 [ 18 167  15]
 [ 14   7 185]]


In [12]:
# Making predictions on the test set
y_pred = et.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8850574712643678


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.86      0.88      0.87       203
 Hypoglycemia       0.90      0.88      0.89       200
       Normal       0.89      0.90      0.89       206

     accuracy                           0.89       609
    macro avg       0.89      0.88      0.89       609
 weighted avg       0.89      0.89      0.89       609


Confusion Matrix:
[[178  12  13]
 [ 14 176  10]
 [ 14   7 185]]


#### 2. Training only using the spectral peaks.

In [3]:
# Start again from a copy of the long-format spectra. This rebinds `df`, which
# earlier held the wide per-wavenumber frame, and requires the cell that
# loads `spectra_df` to have been run first.
df = spectra_df.copy()

NameError: name 'spectra_df' is not defined

Set all of the non-peak absorbances to 0

In [4]:
def create_bins(spec_group, bin_width=10):
    """Average one spectrum's absorbances over fixed-width wavenumber bins.

    Bins are anchored at 0: bin ``k`` covers
    ``[k * bin_width, (k + 1) * bin_width)`` and is labelled with its centre.

    Parameters
    ----------
    spec_group : pandas.DataFrame
        Rows of a single spectrum with 'WaveNumber' and 'Absorbance' columns.
        The frame is not modified.
    bin_width : int or float, default 10
        Width of each bin, in the units of 'WaveNumber'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Bin' (centre of each non-empty bin) and 'Absorbance' (mean
        absorbance within that bin), ordered by increasing bin.

    Raises
    ------
    ValueError
        If ``bin_width`` is not positive or any wave number is negative.
    """
    if bin_width <= 0:
        raise ValueError("bin_width must be positive")

    wave_numbers = spec_group['WaveNumber'].to_numpy()
    if wave_numbers.size == 0:
        # Nothing to bin: return an empty frame with the usual columns.
        return pd.DataFrame({
            'Bin': np.array([], dtype=float),
            'Absorbance': np.array([], dtype=float),
        })
    if wave_numbers.min() < 0:
        # A negative value would get bin index -1 and silently wrap around
        # to the last bin label.
        raise ValueError("WaveNumber values must be non-negative")

    # The last edge must lie strictly above the largest wave number. Otherwise
    # a maximum that is an exact multiple of bin_width (e.g. 2000.0 with width
    # 10) is assigned an index one past the final bin label.
    n_bins = int(np.floor(wave_numbers.max() / bin_width)) + 1
    edges = np.arange(n_bins + 1) * bin_width
    centres = edges[:-1] + bin_width / 2

    # Zero-based bin index for every row.
    bin_idx = np.digitize(wave_numbers, edges) - 1

    # Group on the index array directly so no helper column is written into
    # the caller's frame.
    mean_abs = spec_group['Absorbance'].groupby(bin_idx).mean()

    return pd.DataFrame({
        'Bin': centres[mean_abs.index.to_numpy()],
        'Absorbance': mean_abs.to_numpy(),
    })

# Apply the function to each SpecID group. The group key comes back as the
# outer index level named 'SpecID', not as a column.
result = df.groupby('SpecID').apply(create_bins)

# Pivot the result to create the final DataFrame (one row per bin, one column
# per spectrum). SpecID is restored as a column first, because pivot() looks
# up its `columns` argument among the frame's columns and would otherwise
# raise KeyError.
final_df = (
    result
    .reset_index(level='SpecID')
    .pivot(index='Bin', columns='SpecID', values='Absorbance')
    .reset_index()
)

# Display the first rows of the final DataFrame using the notebook's rich
# rendering instead of a plain-text print of the whole frame.
final_df.head()


NameError: name 'df' is not defined

In [None]:
# NOTE(review): `peak_df` is not assigned in any cell shown in this notebook;
# the cell that builds it must be restored before this will run on a fresh
# kernel.
peak_df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,0,200.00000,False,201210-1,Normal
1,201210-1-00,1,200.68336,False,201210-1,Normal
2,201210-1-00,2,201.36674,False,201210-1,Normal
3,201210-1-00,3,202.05011,False,201210-1,Normal
4,201210-1-00,4,202.73349,False,201210-1,Normal
...,...,...,...,...,...,...
8023570,210526-3-09,2630,1997.26650,False,210526-3,Hyperglycemia
8023571,210526-3-09,2631,1997.94980,False,210526-3,Hyperglycemia
8023572,210526-3-09,2632,1998.63330,False,210526-3,Hyperglycemia
8023573,210526-3-09,2633,1999.31670,False,210526-3,Hyperglycemia


In [None]:
# Show only the rows whose Absorbance is not False.
# NOTE(review): presumably non-peak readings were marked False when `peak_df`
# was built - confirm against the cell that creates it.
peak_df[peak_df['Absorbance'] != False]

NameError: name 'df' is not defined

##### Pivot the dataframe again and train the models.

In [None]:
# NOTE(review): `peak_df` is not assigned in any cell shown in this notebook.
# Reshape to wide format: one row per SpecID, one column per WaveNumber.
df = (
    peak_df
    .pivot(index='SpecID', columns='WaveNumber', values='Absorbance')
    .reset_index()
)
# Drop the leftover 'WaveNumber' label from the column axis.
df.columns.name = None

# Re-attach each spectrum's Status label and index the rows by SpecID.
statuses = peak_df.loc[:, ['SpecID', 'Status']].drop_duplicates()
df = df.merge(statuses, on='SpecID').set_index('SpecID')

NameError: name 'peak_df' is not defined

In [None]:
# Preview the wide peak frame: rows are indexed by SpecID, columns are the
# wavenumbers followed by the Status label.
df.head()

Unnamed: 0_level_0,200.0,200.68336,201.36674,202.05011,202.73349,203.41685,204.10023,204.7836,205.46696,206.15034,...,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Normal
201210-1-01,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Normal
201210-1-02,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Normal
201210-1-03,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Normal
201210-1-04,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Normal


In [None]:
# Splitting the dataframe into features (X) and target variable (y)
X = df.drop(['Status'], axis=1)
y = df['Status']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [None]:
# Initialising and fitting the Random Forest classifier
rf = RandomForestClassifier(random_state=1234)
rf.fit(X_train, y_train)

# Initialising and fitting the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)
et.fit(X_train, y_train)

In [None]:
# Making predictions on the test set
y_pred = rf.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.5648604269293924


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.58      0.37      0.45       203
 Hypoglycemia       0.55      0.66      0.60       200
       Normal       0.57      0.67      0.62       206

     accuracy                           0.56       609
    macro avg       0.57      0.56      0.56       609
 weighted avg       0.57      0.56      0.56       609


Confusion Matrix:
[[ 75  69  59]
 [ 26 131  43]
 [ 28  40 138]]


In [None]:
# Making predictions on the test set
y_pred = et.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.5681444991789819


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.57      0.39      0.46       203
 Hypoglycemia       0.53      0.69      0.60       200
       Normal       0.61      0.63      0.62       206

     accuracy                           0.57       609
    macro avg       0.57      0.57      0.56       609
 weighted avg       0.57      0.57      0.56       609


Confusion Matrix:
[[ 79  77  47]
 [ 27 137  36]
 [ 33  43 130]]


#### 3. Use the peaks and their properties

#### 4. Use the Full Spectrum and Peak Properties.