### This aims to classify the exosome staus based on a featureset derrived from the peaks.

Import Libraries

In [1]:
import pandas as pd
from scipy.signal import find_peaks
import seaborn as sns
import numpy as np
from scipy.signal import savgol_filter
from scipy import sparse
from scipy.sparse.linalg import spsolve
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from scipy.signal import peak_widths
from scipy.signal import peak_prominences
from scipy.integrate import simps
from scipy.integrate import trapz
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

Read the spectral data

In [2]:
df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")

In [3]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,0,200.00000,2709.3699,201210-1,Normal
1,201210-1-00,1,200.68336,2697.1318,201210-1,Normal
2,201210-1-00,2,201.36674,2696.0413,201210-1,Normal
3,201210-1-00,3,202.05011,2678.5925,201210-1,Normal
4,201210-1-00,4,202.73349,2670.8928,201210-1,Normal
...,...,...,...,...,...,...
8023570,210526-3-09,2630,1997.26650,1321.0371,210526-3,Hyperglycemia
8023571,210526-3-09,2631,1997.94980,1316.4056,210526-3,Hyperglycemia
8023572,210526-3-09,2632,1998.63330,1311.2640,210526-3,Hyperglycemia
8023573,210526-3-09,2633,1999.31670,1318.0909,210526-3,Hyperglycemia


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8023575 entries, 0 to 8023574
Data columns (total 6 columns):
 #   Column      Dtype  
---  ------      -----  
 0   SpecID      object 
 1   Seq         int64  
 2   WaveNumber  float64
 3   Absorbance  float64
 4   SurID       object 
 5   Status      object 
dtypes: float64(2), int64(1), object(3)
memory usage: 367.3+ MB


Scale the absorbances of each spectra by the max value.

In [19]:
def normalise(absorbances):
    max_value = np.max(absorbances)
    normalized_absorbances = absorbances / max_value
    return normalized_absorbances

df['Scaled_Absorbance'] = df.groupby('SpecID')['Absorbance'].transform(lambda x: normalise(x))

## 1 Sample Example

In [20]:
sample = df[df['SpecID'] == "210526-3-04"]
print(sample)

              SpecID   Seq  WaveNumber     SurID         Status  Absorbance  \
7981415  210526-3-04     0   200.00000  210526-3  Hyperglycemia    0.959731   
7981416  210526-3-04     1   200.68336  210526-3  Hyperglycemia    0.964470   
7981417  210526-3-04     2   201.36674  210526-3  Hyperglycemia    0.943477   
7981418  210526-3-04     3   202.05011  210526-3  Hyperglycemia    0.940760   
7981419  210526-3-04     4   202.73349  210526-3  Hyperglycemia    0.949647   
...              ...   ...         ...       ...            ...         ...   
7984045  210526-3-04  2630  1997.26650  210526-3  Hyperglycemia    0.510802   
7984046  210526-3-04  2631  1997.94980  210526-3  Hyperglycemia    0.513393   
7984047  210526-3-04  2632  1998.63330  210526-3  Hyperglycemia    0.512436   
7984048  210526-3-04  2633  1999.31670  210526-3  Hyperglycemia    0.510150   
7984049  210526-3-04  2634  2000.00000  210526-3  Hyperglycemia    0.508291   

         Scaled_Absorbance  
7981415           0.95

#### Machine Learning

In [21]:
spectra_df = df.drop(columns=['Absorbance'])

In [22]:
spectra_df = spectra_df.rename(columns={'Scaled_Absorbance':'Absorbance'})

#### First we will look at the full wavelength.

Create a field for each wavelength.

In [23]:
wavelength_df = spectra_df.pivot(index='SpecID', columns='WaveNumber', values='Absorbance').reset_index()
wavelength_df.columns.name = None

Add the statuses back.

In [24]:
statuses = spectra_df[['SpecID', 'Status']].drop_duplicates()
wavelength_df = pd.merge(wavelength_df, statuses, on='SpecID')
wavelength_df = wavelength_df.set_index('SpecID')

In [25]:
wavelength_df.head()

Unnamed: 0_level_0,200.0,200.68336,201.36674,202.05011,202.73349,203.41685,204.10023,204.7836,205.46696,206.15034,...,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1.0,0.995483,0.995081,0.98864,0.985799,0.979026,0.976731,0.99297,0.967134,0.962959,...,0.406183,0.401843,0.403086,0.407818,0.40014,0.397486,0.401969,0.403344,0.401089,Normal
201210-1-01,0.99786,1.0,0.994795,0.987068,0.985319,0.985517,0.984678,0.984173,0.98165,0.976585,...,0.417414,0.425509,0.426087,0.425988,0.425384,0.430038,0.431047,0.427505,0.424721,Normal
201210-1-02,1.0,0.970271,0.98333,0.970521,0.967885,0.967837,0.959964,0.947223,0.943806,0.942708,...,0.547909,0.55282,0.565779,0.558489,0.551799,0.557342,0.563581,0.557661,0.557509,Normal
201210-1-03,0.785427,0.794385,0.80868,0.824538,0.839005,0.850751,0.866092,0.890804,0.894421,0.908143,...,0.16254,0.163745,0.165235,0.165661,0.168732,0.164766,0.164548,0.164754,0.162991,Normal
201210-1-04,0.998059,0.996477,1.0,0.998082,0.993222,0.989748,0.989609,0.991243,0.991758,0.989352,...,0.4965,0.495582,0.505587,0.502293,0.497229,0.503632,0.502699,0.505335,0.502975,Normal


##### 1. Training a Random Forest and Extra Trees Classifer on the whole spectrum.

In [26]:
def calculate_metrics(y_test, y_pred):

    # Calculate overall accuracy
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_accuracy}\n")

    # Calculate precision, recall, and F1-score for each class
    report = classification_report(y_test, y_pred)
    print("\nClassification Report:")
    print(report)

    # Show the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(conf_matrix)

In [27]:
def get_feature_importances(model, X):

    # Get feature importances
    feature_importances = model.feature_importances_

    # Creating a DataFrame to display feature importances
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

    # Sorting the DataFrame by importance in descending order
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    # Show the top 10 most important features
    top_10_features = feature_importance_df.head(10)

    return top_10_features

In [28]:
# Splitting the dataframe into features (X) and target variable (y)
X = wavelength_df.drop(['Status'], axis=1)
y = wavelength_df['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.9278 +/- 0.0111
ExtraTreesClassifier Cross-Validation Accuracy: 0.9415 +/- 0.0133


#### 2. Using Peak Statistics

This uses statistical properties of the peaks in each sample to be used as additional features.

In [31]:
peaks = []
widths = []
prominences = []
areas = []

df = spectra_df.copy()

# Find the index and width of each peak
for _, group in df.groupby('SpecID'):

    #peak_index, _ = find_peaks(x=group['Absorbance'], distance=152, prominence=42, width=6)
    peak_index, _ = find_peaks(x=group['Absorbance'])
    #peak_index, _ = find_peaks(x=group['Absorbance'], prominence=75)

    # Calculate the widths of each peak
    widths += list(peak_widths(group['Absorbance'], peaks=peak_index, rel_height=0.5)[0])

    # Calculate prominence of each peak
    prominences += list(peak_prominences(group['Absorbance'], peaks=peak_index)[0])

    # Find the index of the peak within the full dataframe
    peaks += list(group.iloc[peak_index].index.values)

In [34]:
peaks_df = df.iloc[peaks]

In [35]:
print(len(peaks))
print(len(widths))
print(len(prominences))
peaks_df['PeakWidths'] = widths
peaks_df['PeakProminences'] = prominences

2114865
2114865
2114865


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakWidths'] = widths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakProminences'] = prominences


In [36]:
peaks_df.head()

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance,PeakWidths,PeakProminences
7,201210-1-00,7,204.7836,201210-1,Normal,0.99297,0.814277,0.01624
10,201210-1-00,10,206.83371,201210-1,Normal,0.974289,1.441874,0.009341
13,201210-1-00,13,208.88382,201210-1,Normal,0.975291,2.022615,0.012332
16,201210-1-00,16,210.93394,201210-1,Normal,0.964932,0.718095,0.001305
21,201210-1-00,21,214.3508,201210-1,Normal,0.966294,0.93965,0.008765


Standard Deviation of the Absorbances appears to have a positive affect on accuracy.

In [37]:
# Create a new DataFrame for the summary statistics
peak_stats = peaks_df.groupby('SpecID').agg({
    'Absorbance': ['mean', 'std', 'count', 'max', 'min'],
    'PeakWidths': ['mean', 'std', 'max', 'min'],
    'PeakProminences': ['mean', 'std', 'max', 'min']
}).reset_index()

# Flatten the multi-level columns and customize the names
peak_stats.columns = ['SpecID',
                      'PeakAbsorbance_mean', 'PeakAbsorbance_std', 'PeakAbsorbance_count', 'PeakAbsorbance_max', 'PeakAbsorbance_min',
                      'PeakWidths_mean', 'PeakWidths_std', 'PeakWidths_max', 'PeakWidths_min',
                      'PeakProminences_mean', 'PeakProminences_std', 'PeakProminences_max', 'PeakProminences_min']

In [38]:
peak_stats = pd.merge(peak_stats, statuses, on='SpecID')
peak_stats = peak_stats.set_index('SpecID')
peak_stats = peak_stats.fillna(False)

In [39]:
peak_stats.head()

Unnamed: 0_level_0,PeakAbsorbance_mean,PeakAbsorbance_std,PeakAbsorbance_count,PeakAbsorbance_max,PeakAbsorbance_min,PeakWidths_mean,PeakWidths_std,PeakWidths_max,PeakWidths_min,PeakProminences_mean,PeakProminences_std,PeakProminences_max,PeakProminences_min,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
201210-1-00,0.586307,0.109314,774,0.99297,0.403344,2.116394,4.798122,107.305604,0.500097,0.007439,0.00664,0.071446,9.227238e-07,Normal
201210-1-01,0.591854,0.107845,775,1.0,0.421768,2.122257,3.383461,54.616659,0.500664,0.007108,0.007004,0.073636,5.157907e-06,Normal
201210-1-02,0.59111,0.075044,745,0.998339,0.507275,1.990689,2.86513,36.809791,0.503868,0.007488,0.020092,0.475954,4.418561e-05,Normal
201210-1-03,0.246601,0.090337,687,1.0,0.164754,2.214918,4.89446,74.637545,0.502425,0.003391,0.01154,0.214573,3.188238e-06,Normal
201210-1-04,0.639487,0.080673,764,1.0,0.4965,2.319962,8.282523,217.370659,0.50146,0.007233,0.007636,0.128428,4.978035e-06,Normal


In [40]:
# Splitting the dataframe into features (X) and target variable (y)
X = peak_stats.drop(['Status'], axis=1)
y = peak_stats['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.8417 +/- 0.0180
ExtraTreesClassifier Cross-Validation Accuracy: 0.8591 +/- 0.0162


Combine these peak statistics with the full wavelength

In [41]:
stats_and_spectrum = peak_stats.merge(wavelength_df.drop(columns='Status'), on='SpecID')

In [42]:
stats_and_spectrum.head()

Unnamed: 0_level_0,PeakAbsorbance_mean,PeakAbsorbance_std,PeakAbsorbance_count,PeakAbsorbance_max,PeakAbsorbance_min,PeakWidths_mean,PeakWidths_std,PeakWidths_max,PeakWidths_min,PeakProminences_mean,...,1993.8496,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,0.586307,0.109314,774,0.99297,0.403344,2.116394,4.798122,107.305604,0.500097,0.007439,...,0.391354,0.406183,0.401843,0.403086,0.407818,0.40014,0.397486,0.401969,0.403344,0.401089
201210-1-01,0.591854,0.107845,775,1.0,0.421768,2.122257,3.383461,54.616659,0.500664,0.007108,...,0.419834,0.417414,0.425509,0.426087,0.425988,0.425384,0.430038,0.431047,0.427505,0.424721
201210-1-02,0.59111,0.075044,745,0.998339,0.507275,1.990689,2.86513,36.809791,0.503868,0.007488,...,0.558115,0.547909,0.55282,0.565779,0.558489,0.551799,0.557342,0.563581,0.557661,0.557509
201210-1-03,0.246601,0.090337,687,1.0,0.164754,2.214918,4.89446,74.637545,0.502425,0.003391,...,0.16602,0.16254,0.163745,0.165235,0.165661,0.168732,0.164766,0.164548,0.164754,0.162991
201210-1-04,0.639487,0.080673,764,1.0,0.4965,2.319962,8.282523,217.370659,0.50146,0.007233,...,0.488446,0.4965,0.495582,0.505587,0.502293,0.497229,0.503632,0.502699,0.505335,0.502975


In [43]:
# Splitting the dataframe into features (X) and target variable (y)
X = stats_and_spectrum.drop(['Status'], axis=1)
X.columns = X.columns.astype(str)
y = stats_and_spectrum['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.9241 +/- 0.0105
ExtraTreesClassifier Cross-Validation Accuracy: 0.9412 +/- 0.0133


#### 3. Creating a uniform Peak Featureset

This aims to create a featureset using peaks within wavenumber intervals.

First get the peak properties

In [44]:
peaks = []
widths = []
prominences = []
areas = []

df = spectra_df.copy()

# Find the index and width of each peak
for _, group in df.groupby('SpecID'):

    #peak_index, _ = find_peaks(x=group['Absorbance'], distance=152, prominence=42, width=6)
    peak_index, _ = find_peaks(x=group['Absorbance'])
    #peak_index, _ = find_peaks(x=group['Absorbance'], prominence=75)

    # Calculate the widths of each peak
    widths += list(peak_widths(group['Absorbance'], peaks=peak_index, rel_height=0.5)[0])

    # Calculate prominence of each peak
    prominences += list(peak_prominences(group['Absorbance'], peaks=peak_index)[0])

    # Find the index of the peak within the full dataframe
    peaks += list(group.iloc[peak_index].index.values)

peaks_df = df.iloc[peaks]

In [46]:
print(len(peaks))
print(len(widths))
print(len(prominences))
peaks_df['PeakWidths'] = widths
peaks_df['PeakProminences'] = prominences

2114865
2114865
2114865


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakWidths'] = widths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakProminences'] = prominences


In [47]:
peaks_df.head()

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance,PeakWidths,PeakProminences
7,201210-1-00,7,204.7836,201210-1,Normal,0.99297,0.814277,0.01624
10,201210-1-00,10,206.83371,201210-1,Normal,0.974289,1.441874,0.009341
13,201210-1-00,13,208.88382,201210-1,Normal,0.975291,2.022615,0.012332
16,201210-1-00,16,210.93394,201210-1,Normal,0.964932,0.718095,0.001305
21,201210-1-00,21,214.3508,201210-1,Normal,0.966294,0.93965,0.008765


Assign Peaks to bins of a fixed wavelength interval.

In [48]:
# Define a function to calculate the bin for a given wavenumber with a specified bin size
def calculate_bin_interval(wavenumber, bin_size):
    bin_start = int((wavenumber - 200) / bin_size) * bin_size + 200
    bin_end = bin_start + bin_size
    return f"{bin_start}-{bin_end}"

# Set the bin size
bin_size = 25

# Add a "Bin" column to the DataFrame
peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))


In [49]:
peaks_df.head()

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance,PeakWidths,PeakProminences,Bin
7,201210-1-00,7,204.7836,201210-1,Normal,0.99297,0.814277,0.01624,200-225
10,201210-1-00,10,206.83371,201210-1,Normal,0.974289,1.441874,0.009341,200-225
13,201210-1-00,13,208.88382,201210-1,Normal,0.975291,2.022615,0.012332,200-225
16,201210-1-00,16,210.93394,201210-1,Normal,0.964932,0.718095,0.001305,200-225
21,201210-1-00,21,214.3508,201210-1,Normal,0.966294,0.93965,0.008765,200-225


Set the bins as columns with the peak absrobances, widths and prominences as the values.
If no peaks appear in a bin the value is set to False.
If multiple peaks appear their properties are aggregated.

In [50]:
# Pivot table with 'Absorbance', 'PeakWidths', and 'PeakProminences' as values
peak_bins = peaks_df.pivot_table(index='SpecID', columns='Bin', values=['Absorbance', 'PeakWidths', 'PeakProminences'], aggfunc='max')
peak_bins.columns = [f"{col[0]}_{col[1]}" for col in peak_bins.columns]  # Combine column names
peak_bins.reset_index(inplace=True)

# Merge with 'Status' information
statuses = peaks_df[['SpecID', 'Status']].drop_duplicates()
peak_bins = pd.merge(peak_bins, statuses, on='SpecID')

# Set 'SpecID' as the index
peak_bins.set_index('SpecID', inplace=True)

# Fill NaN values with False
peak_bins.fillna(False, inplace=True)

In [51]:
peak_bins.head()

Unnamed: 0_level_0,Absorbance_1000-1025,Absorbance_1025-1050,Absorbance_1050-1075,Absorbance_1075-1100,Absorbance_1100-1125,Absorbance_1125-1150,Absorbance_1150-1175,Absorbance_1175-1200,Absorbance_1200-1225,Absorbance_1225-1250,...,PeakWidths_775-800,PeakWidths_800-825,PeakWidths_825-850,PeakWidths_850-875,PeakWidths_875-900,PeakWidths_900-925,PeakWidths_925-950,PeakWidths_950-975,PeakWidths_975-1000,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,0.627438,0.614806,0.618088,0.601372,0.592746,0.594347,0.587267,0.575622,0.572823,0.579848,...,5.00254,3.623595,5.683584,3.549329,3.704639,107.305604,4.119952,3.331286,26.861586,Normal
201210-1-01,0.624357,0.615726,0.607079,0.605444,0.591233,0.595953,0.589998,0.580315,0.572311,0.581374,...,4.253764,5.433428,5.350486,7.543329,54.616659,2.79402,5.442918,2.987664,21.226474,Normal
201210-1-02,0.614595,0.624772,0.599226,0.587981,0.583814,0.595169,0.589122,0.578443,0.586603,0.592522,...,3.230627,2.798142,3.91982,5.202362,2.780712,26.32483,2.534114,3.877994,2.70726,Normal
201210-1-03,0.273511,0.265451,0.261093,0.256781,0.259645,0.282992,0.302798,0.297568,0.253889,0.251585,...,5.454504,2.339554,5.176415,2.730207,1.975918,74.637545,1.999101,2.047551,40.139866,Normal
201210-1-04,0.644224,0.631643,0.633717,0.628743,0.635447,0.647366,0.649869,0.655588,0.652292,0.65334,...,4.126038,11.403364,12.228331,4.721217,4.000319,6.449576,7.462045,4.183068,3.39072,Normal


In [52]:
# Splitting the dataframe into features (X) and target variable (y)
X = peak_bins.drop(['Status'], axis=1)
y = peak_bins['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.9186 +/- 0.0143
ExtraTreesClassifier Cross-Validation Accuracy: 0.9333 +/- 0.0141


Merge with the full spectrum.

In [53]:
bins_and_spectrum = peak_bins.merge(wavelength_df.drop(columns='Status'), on='SpecID')

  bins_and_spectrum = peak_bins.merge(wavelength_df.drop(columns='Status'), on='SpecID')


In [54]:
bins_and_spectrum.head()

Unnamed: 0_level_0,Absorbance_1000-1025,Absorbance_1025-1050,Absorbance_1050-1075,Absorbance_1075-1100,Absorbance_1100-1125,Absorbance_1125-1150,Absorbance_1150-1175,Absorbance_1175-1200,Absorbance_1200-1225,Absorbance_1225-1250,...,1993.8496,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,0.627438,0.614806,0.618088,0.601372,0.592746,0.594347,0.587267,0.575622,0.572823,0.579848,...,0.391354,0.406183,0.401843,0.403086,0.407818,0.40014,0.397486,0.401969,0.403344,0.401089
201210-1-01,0.624357,0.615726,0.607079,0.605444,0.591233,0.595953,0.589998,0.580315,0.572311,0.581374,...,0.419834,0.417414,0.425509,0.426087,0.425988,0.425384,0.430038,0.431047,0.427505,0.424721
201210-1-02,0.614595,0.624772,0.599226,0.587981,0.583814,0.595169,0.589122,0.578443,0.586603,0.592522,...,0.558115,0.547909,0.55282,0.565779,0.558489,0.551799,0.557342,0.563581,0.557661,0.557509
201210-1-03,0.273511,0.265451,0.261093,0.256781,0.259645,0.282992,0.302798,0.297568,0.253889,0.251585,...,0.16602,0.16254,0.163745,0.165235,0.165661,0.168732,0.164766,0.164548,0.164754,0.162991
201210-1-04,0.644224,0.631643,0.633717,0.628743,0.635447,0.647366,0.649869,0.655588,0.652292,0.65334,...,0.488446,0.4965,0.495582,0.505587,0.502293,0.497229,0.503632,0.502699,0.505335,0.502975


In [55]:
# Splitting the dataframe into features (X) and target variable (y)
X = bins_and_spectrum.drop(['Status'], axis=1)
X.columns = X.columns.astype(str)
y = bins_and_spectrum['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.9376 +/- 0.0116
ExtraTreesClassifier Cross-Validation Accuracy: 0.9481 +/- 0.0141


In [56]:
# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Training the classifier on the entire dataset
    clf.fit(X, y)
    
    # Displaying the cross-validation results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')
    
    # Displaying the top 10 most important features
    feature_importances = clf.feature_importances_
    indices = np.argsort(feature_importances)[::-1][:10]  # Get indices of top 10 features
    top_features = X.columns[indices]
    print(f'Top 10 Features: {top_features}')

RandomForestClassifier Cross-Validation Accuracy: 0.9376 +/- 0.0116
Top 10 Features: Index(['PeakProminences_1000-1025', '277.22095', '271.07062', '280.63782',
       '297.03873', '275.85422', '323.69022', '1670.615', '301.13895',
       '1574.943'],
      dtype='object')
ExtraTreesClassifier Cross-Validation Accuracy: 0.9481 +/- 0.0141
Top 10 Features: Index(['287.47153', '297.03873', '291.57175', '290.20502', '272.43735',
       '1462.1868', '298.40546', '285.42142', '301.82233',
       'PeakProminences_1025-1050'],
      dtype='object')
