Import Libraries

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from scipy.signal import find_peaks
from scipy.signal import peak_widths
from scipy.signal import peak_prominences
from scipy.integrate import simps
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold
import numpy as np

Read the spectral data

In [2]:
spectra_df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")

#### First we will look at the full wavelength.

Create a field for each wavelength.

In [3]:
wavelength_df = spectra_df.pivot(index='SpecID', columns='WaveNumber', values='Absorbance').reset_index()
wavelength_df.columns.name = None

Add the statuses back.

In [4]:
statuses = spectra_df[['SpecID', 'Status']].drop_duplicates()
wavelength_df = pd.merge(wavelength_df, statuses, on='SpecID')
wavelength_df = wavelength_df.set_index('SpecID')

In [5]:
wavelength_df.head()

Unnamed: 0_level_0,200.0,200.68336,201.36674,202.05011,202.73349,203.41685,204.10023,204.7836,205.46696,206.15034,...,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,2709.3699,2697.1318,2696.0413,2678.5925,2670.8928,2652.5435,2646.3245,2690.324,2620.3228,2609.0132,...,1100.5006,1088.7416,1092.1083,1104.9304,1084.1281,1076.9363,1089.0814,1092.8083,1086.699,Normal
201210-1-01,2979.3169,2985.707,2970.1677,2947.095,2941.8743,2942.4648,2939.9595,2938.4509,2930.9204,2915.7979,...,1246.2748,1270.4456,1272.1703,1271.8768,1270.0718,1283.9667,1286.9803,1276.4037,1268.0922,Normal
201210-1-02,3702.5627,3592.4902,3640.8423,3593.415,3583.656,3583.479,3554.3279,3507.1514,3494.4998,3490.437,...,2028.6669,2046.851,2094.8308,2067.8396,2043.0687,2063.5925,2086.6956,2064.7766,2064.2126,Normal
201210-1-03,8129.5938,8222.3184,8370.2803,8534.415,8684.1543,8805.7393,8964.5283,9220.3066,9257.7461,9399.7734,...,1682.3824,1694.845,1710.276,1714.6768,1746.4635,1705.4204,1703.1569,1705.2943,1687.048,Normal
201210-1-04,3468.5203,3463.0237,3475.2666,3468.5999,3451.7124,3439.6379,3439.1538,3444.8345,3446.624,3438.2632,...,1725.4711,1722.2786,1757.0481,1745.6029,1728.0017,1750.2548,1747.0122,1756.1727,1747.9722,Normal


##### 1. Training a Random Forest and Extra Trees Classifer on the whole spectrum.

In [6]:
def calculate_metrics(y_test, y_pred):

    # Calculate overall accuracy
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_accuracy}\n")

    # Calculate precision, recall, and F1-score for each class
    report = classification_report(y_test, y_pred)
    print("\nClassification Report:")
    print(report)

    # Show the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(conf_matrix)

In [7]:
def get_feature_importances(model, X):

    # Get feature importances
    feature_importances = model.feature_importances_

    # Creating a DataFrame to display feature importances
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

    # Sorting the DataFrame by importance in descending order
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    # Show the top 10 most important features
    top_10_features = feature_importance_df.head(10)

    return top_10_features

In [8]:
# Splitting the dataframe into features (X) and target variable (y)
X = wavelength_df.drop(['Status'], axis=1)
y = wavelength_df['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.8535 +/- 0.0193
ExtraTreesClassifier Cross-Validation Accuracy: 0.8834 +/- 0.0167


#### 2. Using Peak Statistics

This uses statistical properties of the peaks in each sample to be used as additional features.

In [43]:
peaks = []
widths = []
prominences = []
areas = []

df = spectra_df.copy()

# Find the index and width of each peak
for _, group in df.groupby('SpecID'):

    # peak_index, _ = find_peaks(x=group['Absorbance'], distance=152, prominence=42, width=6)
    peak_index, _ = find_peaks(x=group['Absorbance'])
    #peak_index, _ = find_peaks(x=group['Absorbance'], prominence=75)

    # Calculate the widths of each peak
    widths += list(peak_widths(group['Absorbance'], peaks=peak_index, rel_height=0.5)[0])

    # Calculate prominence of each peak
    prominences += list(peak_prominences(group['Absorbance'], peaks=peak_index)[0])

    # Find the index of the peak within the full dataframe
    peaks += list(group.iloc[peak_index].index.values)

peaks_df = df.iloc[peaks]

In [44]:
print(len(peaks))
print(len(widths))
print(len(prominences))
peaks_df['PeakWidths'] = widths
peaks_df['PeakProminences'] = prominences

2114865
2114865
2114865


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakWidths'] = widths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakProminences'] = prominences


In [45]:
peaks_df.head()

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,PeakWidths,PeakProminences
7,201210-1-00,7,204.7836,2690.324,201210-1,Normal,0.814277,43.9995
10,201210-1-00,10,206.83371,2639.7104,201210-1,Normal,1.441874,25.309
13,201210-1-00,13,208.88382,2642.4243,201210-1,Normal,2.022615,33.4111
16,201210-1-00,16,210.93394,2614.3574,201210-1,Normal,0.718095,3.5368
21,201210-1-00,21,214.3508,2618.0491,201210-1,Normal,0.93965,23.7469


Standard Deviation of the Absorbances appears to have a positive affect on accuracy.

In [46]:
# Create a new DataFrame for the summary statistics
peak_stats = peaks_df.groupby('SpecID').agg({
    'Absorbance': ['mean', 'std', 'count', 'max', 'min'],
    'PeakWidths': ['mean', 'std', 'max', 'min'],
    'PeakProminences': ['mean', 'std', 'max', 'min']
}).reset_index()

# Flatten the multi-level columns and customize the names
peak_stats.columns = ['SpecID',
                      'PeakAbsorbance_mean', 'PeakAbsorbance_std', 'PeakAbsorbance_count', 'PeakAbsorbance_max', 'PeakAbsorbance_min',
                      'PeakWidths_mean', 'PeakWidths_std', 'PeakWidths_max', 'PeakWidths_min',
                      'PeakProminences_mean', 'PeakProminences_std', 'PeakProminences_max', 'PeakProminences_min']

In [47]:
peak_stats = pd.merge(peak_stats, statuses, on='SpecID')
peak_stats = peak_stats.set_index('SpecID')
peak_stats = peak_stats.fillna(False)

In [48]:
peak_stats.head()

Unnamed: 0_level_0,PeakAbsorbance_mean,PeakAbsorbance_std,PeakAbsorbance_count,PeakAbsorbance_max,PeakAbsorbance_min,PeakWidths_mean,PeakWidths_std,PeakWidths_max,PeakWidths_min,PeakProminences_mean,PeakProminences_std,PeakProminences_max,PeakProminences_min,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
201210-1-00,1588.522535,296.171266,774,2690.324,1092.8083,2.116394,4.798122,107.305604,0.500097,20.156168,17.98927,193.5749,0.0025,Normal
201210-1-01,1767.102055,321.993283,775,2985.707,1259.2762,2.122257,3.383461,54.616659,0.500664,21.222345,20.910573,219.855,0.0154,Normal
201210-1-02,2188.622821,277.853989,745,3696.4109,1878.2166,1.990689,2.86513,36.809791,0.503868,27.725293,74.390633,1762.2488,0.1636,Normal
201210-1-03,2552.458041,935.040474,687,10350.545,1705.2943,2.214918,4.89446,74.637545,0.502425,35.095653,119.446485,2220.9512,0.033,Normal
201210-1-04,2222.388623,280.358726,764,3475.2666,1725.4711,2.319962,8.282523,217.370659,0.50146,25.13681,26.535681,446.3222,0.0173,Normal


In [49]:
# Splitting the dataframe into features (X) and target variable (y)
X = peak_stats.drop(['Status'], axis=1)
y = peak_stats['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.8319 +/- 0.0186
ExtraTreesClassifier Cross-Validation Accuracy: 0.8401 +/- 0.0135


Combine these peak statistics with the full wavelength

In [50]:
stats_and_spectrum = peak_stats.merge(wavelength_df.drop(columns='Status'), on='SpecID')

In [51]:
stats_and_spectrum.head()

Unnamed: 0_level_0,PeakAbsorbance_mean,PeakAbsorbance_std,PeakAbsorbance_count,PeakAbsorbance_max,PeakAbsorbance_min,PeakWidths_mean,PeakWidths_std,PeakWidths_max,PeakWidths_min,PeakProminences_mean,...,1993.8496,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1588.522535,296.171266,774,2690.324,1092.8083,2.116394,4.798122,107.305604,0.500097,20.156168,...,1060.3231,1100.5006,1088.7416,1092.1083,1104.9304,1084.1281,1076.9363,1089.0814,1092.8083,1086.699
201210-1-01,1767.102055,321.993283,775,2985.707,1259.2762,2.122257,3.383461,54.616659,0.500664,21.222345,...,1253.5012,1246.2748,1270.4456,1272.1703,1271.8768,1270.0718,1283.9667,1286.9803,1276.4037,1268.0922
201210-1-02,2188.622821,277.853989,745,3696.4109,1878.2166,1.990689,2.86513,36.809791,0.503868,27.725293,...,2066.4561,2028.6669,2046.851,2094.8308,2067.8396,2043.0687,2063.5925,2086.6956,2064.7766,2064.2126
201210-1-03,2552.458041,935.040474,687,10350.545,1705.2943,2.214918,4.89446,74.637545,0.502425,35.095653,...,1718.3978,1682.3824,1694.845,1710.276,1714.6768,1746.4635,1705.4204,1703.1569,1705.2943,1687.048
201210-1-04,2222.388623,280.358726,764,3475.2666,1725.4711,2.319962,8.282523,217.370659,0.50146,25.13681,...,1697.4792,1725.4711,1722.2786,1757.0481,1745.6029,1728.0017,1750.2548,1747.0122,1756.1727,1747.9722


In [52]:
# Splitting the dataframe into features (X) and target variable (y)
X = stats_and_spectrum.drop(['Status'], axis=1)
X.columns = X.columns.astype(str)
y = stats_and_spectrum['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.8614 +/- 0.0206
ExtraTreesClassifier Cross-Validation Accuracy: 0.8847 +/- 0.0195


#### 3. Creating a uniform Peak Featureset

This aims to create a featureset using peaks within wavenumber intervals.

First get the peak properties

In [15]:
peaks = []
widths = []
prominences = []
areas = []

df = spectra_df.copy()

# Find the index and width of each peak
for _, group in df.groupby('SpecID'):

    #peak_index, _ = find_peaks(x=group['Absorbance'], distance=152, prominence=42, width=6)
    peak_index, _ = find_peaks(x=group['Absorbance'])
    #peak_index, _ = find_peaks(x=group['Absorbance'], prominence=75)

    # Calculate the widths of each peak
    widths += list(peak_widths(group['Absorbance'], peaks=peak_index, rel_height=0.5)[0])

    # Calculate prominence of each peak
    prominences += list(peak_prominences(group['Absorbance'], peaks=peak_index)[0])

    # Find the index of the peak within the full dataframe
    peaks += list(group.iloc[peak_index].index.values)

peaks_df = df.iloc[peaks]

In [16]:
print(len(peaks))
print(len(widths))
print(len(prominences))
peaks_df['PeakWidths'] = widths
peaks_df['PeakProminences'] = prominences

2114865
2114865
2114865


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakWidths'] = widths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakProminences'] = prominences


In [66]:
peaks_df.head()

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,PeakWidths,PeakProminences
727,201210-1-00,727,696.81091,1851.9185,201210-1,Normal,33.525178,193.5749
1026,201210-1-00,1026,901.13898,1746.4041,201210-1,Normal,107.305604,157.983
1820,201210-1-00,1820,1443.7357,1537.9485,201210-1,Normal,13.417171,72.912
3038,201210-1-01,403,475.39862,1998.4773,201210-1,Normal,11.709582,72.348
3252,201210-1-01,617,621.64008,2034.2784,201210-1,Normal,40.674278,195.8289


Assign Peaks to bins

In [67]:
# Define a function to calculate the bin for a given wavenumber with a specified bin size
def calculate_bin_interval(wavenumber, bin_size):
    bin_start = int((wavenumber - 200) / bin_size) * bin_size + 200
    bin_end = bin_start + bin_size
    return f"{bin_start}-{bin_end}"

# Set the bin size
bin_size = 100

# Add a "Bin" column to the DataFrame
peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))


In [68]:
peaks_df.head()

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,PeakWidths,PeakProminences,Bin
727,201210-1-00,727,696.81091,1851.9185,201210-1,Normal,33.525178,193.5749,600-700
1026,201210-1-00,1026,901.13898,1746.4041,201210-1,Normal,107.305604,157.983,900-1000
1820,201210-1-00,1820,1443.7357,1537.9485,201210-1,Normal,13.417171,72.912,1400-1500
3038,201210-1-01,403,475.39862,1998.4773,201210-1,Normal,11.709582,72.348,400-500
3252,201210-1-01,617,621.64008,2034.2784,201210-1,Normal,40.674278,195.8289,600-700


In [69]:
# Pivot table with 'Absorbance', 'PeakWidths', and 'PeakProminences' as values
peak_bins = peaks_df.pivot_table(index='SpecID', columns='Bin', values=['Absorbance', 'PeakWidths', 'PeakProminences'], aggfunc='mean')
peak_bins.columns = [f"{col[0]}_{col[1]}" for col in peak_bins.columns]  # Combine column names
peak_bins.reset_index(inplace=True)

# Merge with 'Status' information
statuses = peaks_df[['SpecID', 'Status']].drop_duplicates()
peak_bins = pd.merge(peak_bins, statuses, on='SpecID')

# Set 'SpecID' as the index
peak_bins.set_index('SpecID', inplace=True)

# Fill NaN values with False
peak_bins.fillna(False, inplace=True)

In [70]:
peak_bins.head()

Unnamed: 0_level_0,Absorbance_1000-1100,Absorbance_1100-1200,Absorbance_1200-1300,Absorbance_1300-1400,Absorbance_1400-1500,Absorbance_1500-1600,Absorbance_1600-1700,Absorbance_1700-1800,Absorbance_1800-1900,Absorbance_1900-2000,...,PeakWidths_1900-2000,PeakWidths_200-300,PeakWidths_300-400,PeakWidths_400-500,PeakWidths_500-600,PeakWidths_600-700,PeakWidths_700-800,PeakWidths_800-900,PeakWidths_900-1000,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,False,False,False,False,1537.9485,False,False,False,False,False,...,False,False,False,False,False,33.525178,False,False,107.305604,Normal
201210-1-01,False,1779.342,1735.8127,False,1735.0914,False,1733.3473,False,False,False,...,False,False,False,11.709582,False,40.674278,False,54.616659,False,Normal
201210-1-02,False,False,2195.7212,False,2257.9094,False,2418.2576,False,False,False,...,False,False,False,7.003516,False,18.177925,False,False,26.32483,Normal
201210-1-03,False,3134.1235,2910.6362,False,3426.8677,False,2552.4478,False,False,False,...,False,19.520211,False,20.201528,False,22.928061,False,False,74.637545,Normal
201210-1-04,2238.8494,2278.3433,2366.2205,False,2356.8567,False,2458.5142,False,False,False,...,False,False,False,15.704823,False,46.155512,8.83833,False,False,Normal


In [9]:
# Splitting the dataframe into features (X) and target variable (y)
X = peak_bins.drop(['Status'], axis=1)
y = peak_bins['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

NameError: name 'peak_bins' is not defined

Merge with the full spectrum.

In [72]:
bins_and_spectrum = peak_bins.merge(wavelength_df.drop(columns='Status'), on='SpecID')

In [73]:
bins_and_spectrum.head()

Unnamed: 0_level_0,Absorbance_1000-1100,Absorbance_1100-1200,Absorbance_1200-1300,Absorbance_1300-1400,Absorbance_1400-1500,Absorbance_1500-1600,Absorbance_1600-1700,Absorbance_1700-1800,Absorbance_1800-1900,Absorbance_1900-2000,...,1993.8496,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,False,False,False,False,1537.9485,False,False,False,False,False,...,1060.3231,1100.5006,1088.7416,1092.1083,1104.9304,1084.1281,1076.9363,1089.0814,1092.8083,1086.699
201210-1-01,False,1779.342,1735.8127,False,1735.0914,False,1733.3473,False,False,False,...,1253.5012,1246.2748,1270.4456,1272.1703,1271.8768,1270.0718,1283.9667,1286.9803,1276.4037,1268.0922
201210-1-02,False,False,2195.7212,False,2257.9094,False,2418.2576,False,False,False,...,2066.4561,2028.6669,2046.851,2094.8308,2067.8396,2043.0687,2063.5925,2086.6956,2064.7766,2064.2126
201210-1-03,False,3134.1235,2910.6362,False,3426.8677,False,2552.4478,False,False,False,...,1718.3978,1682.3824,1694.845,1710.276,1714.6768,1746.4635,1705.4204,1703.1569,1705.2943,1687.048
201210-1-04,2238.8494,2278.3433,2366.2205,False,2356.8567,False,2458.5142,False,False,False,...,1697.4792,1725.4711,1722.2786,1757.0481,1745.6029,1728.0017,1750.2548,1747.0122,1756.1727,1747.9722


In [74]:
# Splitting the dataframe into features (X) and target variable (y)
X = bins_and_spectrum.drop(['Status'], axis=1)
X.columns = X.columns.astype(str)
y = bins_and_spectrum['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.8646 +/- 0.0256
ExtraTreesClassifier Cross-Validation Accuracy: 0.8840 +/- 0.0206


In [20]:
# Define the function to calculate the bin for a given wavenumber with a specified bin size
def calculate_bin_interval(wavenumber, bin_size):
    bin_start = int((wavenumber - 200) / bin_size) * bin_size + 200
    bin_end = bin_start + bin_size
    return f"{bin_start}-{bin_end}"

# Set the range of bin sizes
bin_sizes_range = range(250, 500, 25)

# Create a dictionary to store cross-validation results for each bin size
cv_results = {}

# Iterate through different bin sizes
for bin_size in bin_sizes_range:

    # Add a "Bin" column to the DataFrame
    peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))

    # Pivot table with 'Absorbance', 'PeakWidths', and 'PeakProminences' as values
    peak_bins = peaks_df.pivot_table(index='SpecID', columns='Bin', values=['Absorbance', 'PeakWidths', 'PeakProminences'], aggfunc='mean')
    peak_bins.columns = [f"{col[0]}_{col[1]}" for col in peak_bins.columns]  # Combine column names
    peak_bins.reset_index(inplace=True)

    # Merge with 'Status' information
    statuses = peaks_df[['SpecID', 'Status']].drop_duplicates()
    peak_bins = pd.merge(peak_bins, statuses, on='SpecID')

    # Set 'SpecID' as the index
    peak_bins.set_index('SpecID', inplace=True)

    # Fill NaN values with False
    peak_bins.fillna(False, inplace=True)

    bins_and_spectrum = peak_bins.merge(wavelength_df.drop(columns='Status'), on='SpecID')

    # Splitting the dataframe into features (X) and target variable (y)
    X = bins_and_spectrum.drop(['Status'], axis=1)
    X.columns = X.columns.astype(str)
    y = bins_and_spectrum['Status']

    # Creating the classifiers
    rf = RandomForestClassifier(random_state=1234)
    et = ExtraTreesClassifier(random_state=1234)

    # Combining the classifiers into a list
    classifiers = [rf, et]

    # Store cross-validation results for each classifier and bin size
    for clf in classifiers:
        # Using StratifiedKFold for classification tasks
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
        
        # Getting cross-validation scores
        scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')

        # Store the results
        key = (clf.__class__.__name__, bin_size)
        cv_results[key] = {'Accuracy': np.mean(scores), 'StdDev': np.std(scores)}

# Displaying the results
for key, result in cv_results.items():
    print(f'{key[0]} Cross-Validation Accuracy (Bin Size={key[1]}): {result["Accuracy"]:.4f} +/- {result["StdDev"]:.4f}')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['Bin'] =

RandomForestClassifier Cross-Validation Accuracy (Bin Size=250): 0.8598 +/- 0.0213
ExtraTreesClassifier Cross-Validation Accuracy (Bin Size=250): 0.8880 +/- 0.0143
RandomForestClassifier Cross-Validation Accuracy (Bin Size=275): 0.8594 +/- 0.0225
ExtraTreesClassifier Cross-Validation Accuracy (Bin Size=275): 0.8867 +/- 0.0179
RandomForestClassifier Cross-Validation Accuracy (Bin Size=300): 0.8604 +/- 0.0221
ExtraTreesClassifier Cross-Validation Accuracy (Bin Size=300): 0.8860 +/- 0.0218
RandomForestClassifier Cross-Validation Accuracy (Bin Size=325): 0.8575 +/- 0.0195
ExtraTreesClassifier Cross-Validation Accuracy (Bin Size=325): 0.8870 +/- 0.0172
RandomForestClassifier Cross-Validation Accuracy (Bin Size=350): 0.8568 +/- 0.0226
ExtraTreesClassifier Cross-Validation Accuracy (Bin Size=350): 0.8841 +/- 0.0211
RandomForestClassifier Cross-Validation Accuracy (Bin Size=375): 0.8571 +/- 0.0222
ExtraTreesClassifier Cross-Validation Accuracy (Bin Size=375): 0.8867 +/- 0.0145
RandomForestClas