### This aims to classify the exosome staus based on a featureset derrived from the peaks.

Import Libraries

In [1]:
import pandas as pd
from scipy.signal import find_peaks
import seaborn as sns
import numpy as np
from scipy.signal import savgol_filter
from scipy import sparse
from scipy.sparse.linalg import spsolve
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from scipy.signal import peak_widths
from scipy.signal import peak_prominences
from scipy.integrate import simps
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

Read the spectral data

In [2]:
df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")

In [3]:
def normalise(absorbances):
    max_value = np.max(absorbances)
    normalized_absorbances = absorbances / max_value
    return normalized_absorbances

df['Scaled_Absorbance'] = df.groupby('SpecID')['Absorbance'].transform(lambda x: normalise(x))

#### Machine Learning

In [5]:
spectra_df = df.drop(columns=['Absorbance'])

In [6]:
spectra_df = spectra_df.rename(columns={'Scaled_Absorbance':'Absorbance'})

In [8]:
#spectra_df.to_csv("../../data/scaled_and_noise_removal.csv")

#### First we will look at the full wavelength.

Create a field for each wavelength.

In [9]:
wavelength_df = spectra_df.pivot(index='SpecID', columns='WaveNumber', values='Absorbance').reset_index()
wavelength_df.columns.name = None

Add the statuses back.

In [10]:
statuses = spectra_df[['SpecID', 'Status']].drop_duplicates()
wavelength_df = pd.merge(wavelength_df, statuses, on='SpecID')
wavelength_df = wavelength_df.set_index('SpecID')

In [11]:
wavelength_df.head()

Unnamed: 0_level_0,200.0,200.68336,201.36674,202.05011,202.73349,203.41685,204.10023,204.7836,205.46696,206.15034,...,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1.0,0.995483,0.995081,0.98864,0.985799,0.979026,0.976731,0.99297,0.967134,0.962959,...,0.406183,0.401843,0.403086,0.407818,0.40014,0.397486,0.401969,0.403344,0.401089,Normal
201210-1-01,0.99786,1.0,0.994795,0.987068,0.985319,0.985517,0.984678,0.984173,0.98165,0.976585,...,0.417414,0.425509,0.426087,0.425988,0.425384,0.430038,0.431047,0.427505,0.424721,Normal
201210-1-02,1.0,0.970271,0.98333,0.970521,0.967885,0.967837,0.959964,0.947223,0.943806,0.942708,...,0.547909,0.55282,0.565779,0.558489,0.551799,0.557342,0.563581,0.557661,0.557509,Normal
201210-1-03,0.785427,0.794385,0.80868,0.824538,0.839005,0.850751,0.866092,0.890804,0.894421,0.908143,...,0.16254,0.163745,0.165235,0.165661,0.168732,0.164766,0.164548,0.164754,0.162991,Normal
201210-1-04,0.998059,0.996477,1.0,0.998082,0.993222,0.989748,0.989609,0.991243,0.991758,0.989352,...,0.4965,0.495582,0.505587,0.502293,0.497229,0.503632,0.502699,0.505335,0.502975,Normal


##### 1. Training a Random Forest and Extra Trees Classifer on the whole spectrum.

In [13]:
def calculate_metrics(y_test, y_pred):

    # Calculate overall accuracy
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_accuracy}\n")

    # Calculate precision, recall, and F1-score for each class
    report = classification_report(y_test, y_pred)
    print("\nClassification Report:")
    print(report)

    # Show the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(conf_matrix)

In [14]:
def get_feature_importances(model, X):

    # Get feature importances
    feature_importances = model.feature_importances_

    # Creating a DataFrame to display feature importances
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

    # Sorting the DataFrame by importance in descending order
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    # Show the top 10 most important features
    top_10_features = feature_importance_df.head(10)

    return top_10_features

In [15]:
# Splitting the dataframe into features (X) and target variable (y)
X = wavelength_df.drop(['Status'], axis=1)
y = wavelength_df['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.9278 +/- 0.0111
ExtraTreesClassifier Cross-Validation Accuracy: 0.9415 +/- 0.0133


#### 2. Using Peak Statistics

This uses statistical properties of the peaks in each sample to be used as additional features.

In [16]:
peaks = []
widths = []
prominences = []
areas = []

df = spectra_df.copy()

# Find the index and width of each peak
for _, group in df.groupby('SpecID'):

    peak_index, _ = find_peaks(x=group['Absorbance'], distance=152, prominence=42/3200, width=6)
    #peak_index, _ = find_peaks(x=group['Absorbance'])
    #peak_index, _ = find_peaks(x=group['Absorbance'], prominence=75)

    # Calculate the widths of each peak
    widths += list(peak_widths(group['Absorbance'], peaks=peak_index, rel_height=0.5)[0])

    # Calculate prominence of each peak
    prominences += list(peak_prominences(group['Absorbance'], peaks=peak_index)[0])

    # Find the index of the peak within the full dataframe
    peaks += list(group.iloc[peak_index].index.values)

peaks_df = df.iloc[peaks]

In [17]:
print(len(peaks))
print(len(widths))
print(len(prominences))
peaks_df['PeakWidths'] = widths
peaks_df['PeakProminences'] = prominences

23374
23374
23374


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakWidths'] = widths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakProminences'] = prominences


In [18]:
peaks_df.head()

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance,PeakWidths,PeakProminences
727,201210-1-00,727,696.81091,201210-1,Normal,0.683524,33.525178,0.071446
1026,201210-1-00,1026,901.13898,201210-1,Normal,0.644579,107.305604,0.05831
1820,201210-1-00,1820,1443.7357,201210-1,Normal,0.567641,13.417171,0.026911
3038,201210-1-01,403,475.39862,201210-1,Normal,0.669348,11.709582,0.024231
3252,201210-1-01,617,621.64008,201210-1,Normal,0.681339,40.674278,0.065589


Standard Deviation of the Absorbances appears to have a positive affect on accuracy.

In [19]:
# Create a new DataFrame for the summary statistics
peak_stats = peaks_df.groupby('SpecID').agg({
    'Absorbance': ['mean', 'std', 'count', 'max', 'min'],
    'PeakWidths': ['mean', 'std', 'max', 'min'],
    'PeakProminences': ['mean', 'std', 'max', 'min']
}).reset_index()

# Flatten the multi-level columns and customize the names
peak_stats.columns = ['SpecID',
                      'PeakAbsorbance_mean', 'PeakAbsorbance_std', 'PeakAbsorbance_count', 'PeakAbsorbance_max', 'PeakAbsorbance_min',
                      'PeakWidths_mean', 'PeakWidths_std', 'PeakWidths_max', 'PeakWidths_min',
                      'PeakProminences_mean', 'PeakProminences_std', 'PeakProminences_max', 'PeakProminences_min']

In [20]:
peak_stats = pd.merge(peak_stats, statuses, on='SpecID')
peak_stats = peak_stats.set_index('SpecID')
peak_stats = peak_stats.fillna(False)

In [21]:
peak_stats.head()

Unnamed: 0_level_0,PeakAbsorbance_mean,PeakAbsorbance_std,PeakAbsorbance_count,PeakAbsorbance_max,PeakAbsorbance_min,PeakWidths_mean,PeakWidths_std,PeakWidths_max,PeakWidths_min,PeakProminences_mean,PeakProminences_std,PeakProminences_max,PeakProminences_min,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
201210-1-00,0.631915,0.05897,3,0.683524,0.567641,51.415985,49.435009,107.305604,13.417171,0.052222,0.022883,0.071446,0.026911,Normal
201210-1-01,0.621718,0.046737,7,0.681339,0.580548,23.416188,17.568953,54.616659,6.661455,0.044112,0.020343,0.073636,0.024231,Normal
201210-1-02,0.703538,0.158656,6,0.998339,0.593027,21.815435,12.237078,36.809791,7.003516,0.149415,0.177653,0.475954,0.024287,Normal
201210-1-03,0.402375,0.265404,7,1.0,0.245046,36.308556,22.580902,74.637545,18.36476,0.094325,0.066908,0.214573,0.013324,Normal
201210-1-04,0.662115,0.025789,8,0.707432,0.623749,45.247401,70.521171,217.370659,8.83833,0.047597,0.034514,0.128428,0.020016,Normal


In [22]:
# Splitting the dataframe into features (X) and target variable (y)
X = peak_stats.drop(['Status'], axis=1)
y = peak_stats['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.7409 +/- 0.0210
ExtraTreesClassifier Cross-Validation Accuracy: 0.7461 +/- 0.0177


Combine these peak statistics with the full wavelength

In [23]:
stats_and_spectrum = peak_stats.merge(wavelength_df.drop(columns='Status'), on='SpecID')

In [24]:
stats_and_spectrum.head()

Unnamed: 0_level_0,PeakAbsorbance_mean,PeakAbsorbance_std,PeakAbsorbance_count,PeakAbsorbance_max,PeakAbsorbance_min,PeakWidths_mean,PeakWidths_std,PeakWidths_max,PeakWidths_min,PeakProminences_mean,...,1993.8496,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,0.631915,0.05897,3,0.683524,0.567641,51.415985,49.435009,107.305604,13.417171,0.052222,...,0.391354,0.406183,0.401843,0.403086,0.407818,0.40014,0.397486,0.401969,0.403344,0.401089
201210-1-01,0.621718,0.046737,7,0.681339,0.580548,23.416188,17.568953,54.616659,6.661455,0.044112,...,0.419834,0.417414,0.425509,0.426087,0.425988,0.425384,0.430038,0.431047,0.427505,0.424721
201210-1-02,0.703538,0.158656,6,0.998339,0.593027,21.815435,12.237078,36.809791,7.003516,0.149415,...,0.558115,0.547909,0.55282,0.565779,0.558489,0.551799,0.557342,0.563581,0.557661,0.557509
201210-1-03,0.402375,0.265404,7,1.0,0.245046,36.308556,22.580902,74.637545,18.36476,0.094325,...,0.16602,0.16254,0.163745,0.165235,0.165661,0.168732,0.164766,0.164548,0.164754,0.162991
201210-1-04,0.662115,0.025789,8,0.707432,0.623749,45.247401,70.521171,217.370659,8.83833,0.047597,...,0.488446,0.4965,0.495582,0.505587,0.502293,0.497229,0.503632,0.502699,0.505335,0.502975


In [25]:
# Splitting the dataframe into features (X) and target variable (y)
X = stats_and_spectrum.drop(['Status'], axis=1)
X.columns = X.columns.astype(str)
y = stats_and_spectrum['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.9258 +/- 0.0126
ExtraTreesClassifier Cross-Validation Accuracy: 0.9412 +/- 0.0111


#### 3. Creating a uniform Peak Featureset

This aims to create a featureset using peaks within wavenumber intervals.

First get the peak properties

In [26]:
peaks = []
widths = []
prominences = []
areas = []

df = spectra_df.copy()

# Find the index and width of each peak
for _, group in df.groupby('SpecID'):

    peak_index, _ = find_peaks(x=group['Absorbance'], distance=152, prominence=42/3200, width=6)
    #peak_index, _ = find_peaks(x=group['Absorbance'])
    #peak_index, _ = find_peaks(x=group['Absorbance'], prominence=75)

    # Calculate the widths of each peak
    widths += list(peak_widths(group['Absorbance'], peaks=peak_index, rel_height=0.5)[0])

    # Calculate prominence of each peak
    prominences += list(peak_prominences(group['Absorbance'], peaks=peak_index)[0])

    # Find the index of the peak within the full dataframe
    peaks += list(group.iloc[peak_index].index.values)

peaks_df = df.iloc[peaks]

In [27]:
print(len(peaks))
print(len(widths))
print(len(prominences))
peaks_df['PeakWidths'] = widths
peaks_df['PeakProminences'] = prominences

23374
23374
23374


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakWidths'] = widths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakProminences'] = prominences


In [28]:
peaks_df.head()

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance,PeakWidths,PeakProminences
727,201210-1-00,727,696.81091,201210-1,Normal,0.683524,33.525178,0.071446
1026,201210-1-00,1026,901.13898,201210-1,Normal,0.644579,107.305604,0.05831
1820,201210-1-00,1820,1443.7357,201210-1,Normal,0.567641,13.417171,0.026911
3038,201210-1-01,403,475.39862,201210-1,Normal,0.669348,11.709582,0.024231
3252,201210-1-01,617,621.64008,201210-1,Normal,0.681339,40.674278,0.065589


Assign Peaks to bins of a fixed wavelength interval.

In [29]:
# Define a function to calculate the bin for a given wavenumber with a specified bin size
def calculate_bin_interval(wavenumber, bin_size):
    bin_start = int((wavenumber - 200) / bin_size) * bin_size + 200
    bin_end = bin_start + bin_size
    return f"{bin_start}-{bin_end}"

# Set the bin size
bin_size = 25

# Add a "Bin" column to the DataFrame
peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))


In [30]:
peaks_df.head()

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance,PeakWidths,PeakProminences,Bin
727,201210-1-00,727,696.81091,201210-1,Normal,0.683524,33.525178,0.071446,675-700
1026,201210-1-00,1026,901.13898,201210-1,Normal,0.644579,107.305604,0.05831,900-925
1820,201210-1-00,1820,1443.7357,201210-1,Normal,0.567641,13.417171,0.026911,1425-1450
3038,201210-1-01,403,475.39862,201210-1,Normal,0.669348,11.709582,0.024231,475-500
3252,201210-1-01,617,621.64008,201210-1,Normal,0.681339,40.674278,0.065589,600-625


Set the bins as columns with the peak absrobances, widths and prominences as the values.
If no peaks appear in a bin the value is set to False.
If multiple peaks appear their properties are aggregated.

In [31]:
# Pivot table with 'Absorbance', 'PeakWidths', and 'PeakProminences' as values
peak_bins = peaks_df.pivot_table(index='SpecID', columns='Bin', values=['Absorbance', 'PeakWidths', 'PeakProminences'], aggfunc='max')
peak_bins.columns = [f"{col[0]}_{col[1]}" for col in peak_bins.columns]  # Combine column names
peak_bins.reset_index(inplace=True)

# Merge with 'Status' information
statuses = peaks_df[['SpecID', 'Status']].drop_duplicates()
peak_bins = pd.merge(peak_bins, statuses, on='SpecID')

# Set 'SpecID' as the index
peak_bins.set_index('SpecID', inplace=True)

# Fill NaN values with False
peak_bins.fillna(False, inplace=True)

In [32]:
peak_bins.head()

Unnamed: 0_level_0,Absorbance_1000-1025,Absorbance_1025-1050,Absorbance_1050-1075,Absorbance_1075-1100,Absorbance_1100-1125,Absorbance_1125-1150,Absorbance_1150-1175,Absorbance_1175-1200,Absorbance_1200-1225,Absorbance_1225-1250,...,PeakWidths_775-800,PeakWidths_800-825,PeakWidths_825-850,PeakWidths_850-875,PeakWidths_875-900,PeakWidths_900-925,PeakWidths_925-950,PeakWidths_950-975,PeakWidths_975-1000,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,107.305604,False,False,False,Normal
201210-1-01,False,False,False,False,False,0.595953,False,False,False,0.581374,...,False,False,False,False,54.616659,False,False,False,False,Normal
201210-1-02,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,26.32483,False,False,False,Normal
201210-1-03,False,False,False,False,False,False,0.302798,False,False,False,...,False,False,False,False,False,74.637545,False,False,False,Normal
201210-1-04,0.644224,False,False,False,False,False,False,0.655588,False,False,...,False,False,False,False,False,False,False,False,False,Normal


In [33]:
# Splitting the dataframe into features (X) and target variable (y)
X = peak_bins.drop(['Status'], axis=1)
y = peak_bins['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.8010 +/- 0.0223
ExtraTreesClassifier Cross-Validation Accuracy: 0.7816 +/- 0.0180


Merge with the full spectrum.

In [34]:
bins_and_spectrum = peak_bins.merge(wavelength_df.drop(columns='Status'), on='SpecID')

  bins_and_spectrum = peak_bins.merge(wavelength_df.drop(columns='Status'), on='SpecID')


In [35]:
bins_and_spectrum.head()

Unnamed: 0_level_0,Absorbance_1000-1025,Absorbance_1025-1050,Absorbance_1050-1075,Absorbance_1075-1100,Absorbance_1100-1125,Absorbance_1125-1150,Absorbance_1150-1175,Absorbance_1175-1200,Absorbance_1200-1225,Absorbance_1225-1250,...,1993.8496,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,False,False,False,False,False,False,False,False,False,False,...,0.391354,0.406183,0.401843,0.403086,0.407818,0.40014,0.397486,0.401969,0.403344,0.401089
201210-1-01,False,False,False,False,False,0.595953,False,False,False,0.581374,...,0.419834,0.417414,0.425509,0.426087,0.425988,0.425384,0.430038,0.431047,0.427505,0.424721
201210-1-02,False,False,False,False,False,False,False,False,False,False,...,0.558115,0.547909,0.55282,0.565779,0.558489,0.551799,0.557342,0.563581,0.557661,0.557509
201210-1-03,False,False,False,False,False,False,0.302798,False,False,False,...,0.16602,0.16254,0.163745,0.165235,0.165661,0.168732,0.164766,0.164548,0.164754,0.162991
201210-1-04,0.644224,False,False,False,False,False,False,0.655588,False,False,...,0.488446,0.4965,0.495582,0.505587,0.502293,0.497229,0.503632,0.502699,0.505335,0.502975


In [36]:
# Splitting the dataframe into features (X) and target variable (y)
X = bins_and_spectrum.drop(['Status'], axis=1)
X.columns = X.columns.astype(str)
y = bins_and_spectrum['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.9284 +/- 0.0128
ExtraTreesClassifier Cross-Validation Accuracy: 0.9455 +/- 0.0138


In [37]:
# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Training the classifier on the entire dataset
    clf.fit(X, y)
    
    # Displaying the cross-validation results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')
    
    # Displaying the top 10 most important features
    feature_importances = clf.feature_importances_
    indices = np.argsort(feature_importances)[::-1][:10]  # Get indices of top 10 features
    top_features = X.columns[indices]
    print(f'Top 10 Features: {top_features}')

RandomForestClassifier Cross-Validation Accuracy: 0.9284 +/- 0.0128
Top 10 Features: Index(['333.94077', 'PeakProminences_1000-1025', '272.43735', '273.80411',
       '299.77222', '301.13895', '275.85422', '295.67197', '279.27106',
       '332.57404'],
      dtype='object')
ExtraTreesClassifier Cross-Validation Accuracy: 0.9455 +/- 0.0138
Top 10 Features: Index(['297.03873', '292.93851', '302.50571', '287.47153', '291.57175',
       '303.18906', '290.20502', '1462.1868', '295.67197',
       'Absorbance_1000-1025'],
      dtype='object')
