#### **This notebook contains the functions used to clean the spectra.**

Import Libraries

In [3]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import accuracy_score
from Spectra_Preparation_Functions import *

Read the spectral data

In [71]:
df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")
df = df[(df['WaveNumber'] >= 400) & (df['WaveNumber'] <= 1800)]

In [72]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
293,201210-1-00,293,400.22778,1765.6628,201210-1,Normal
294,201210-1-00,294,400.91116,1774.7809,201210-1,Normal
295,201210-1-00,295,401.59454,1769.0302,201210-1,Normal
296,201210-1-00,296,402.27789,1756.4220,201210-1,Normal
297,201210-1-00,297,402.96127,1758.8690,201210-1,Normal
...,...,...,...,...,...,...
8023277,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia
8023278,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia
8023279,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia
8023280,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia


>Evaluate an Extra Trees Classifier on the Spectrum

In [73]:
def evaluate_extra_trees(df):

    # Set the Surfaces as groups
    groups = df['SurID']
    X = df.drop(['Status', 'SurID'], axis=1)
    y = df['Status']

    # Creating the Extra Trees classifier
    et = ExtraTreesClassifier(random_state=1234)
    
    # Using GroupKFold for classification tasks
    cv = GroupKFold(n_splits=10)

    scores = []
    for train_index, test_index in cv.split(X, y, groups):

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the Extra Trees classifier
        et.fit(X_train, y_train)
        predictions = et.predict(X_test)
        
        # Evaluate the model
        score = accuracy_score(y_test, predictions)
        scores.append(score)
    
    # Displaying the results
    print(f'{et.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

>##### **First Try it without Despiking for Comparison**

#### **Select the chosen cleaning parameters then run the functions**

Choose the Parameters

In [74]:
# # Best Full Spectrum Parameters

lam = 10 ** 8
p = 0.01
window_size = 51
poly_order = 3

In [75]:
df['Baseline'] = df.groupby('SpecID')['Absorbance'].transform(lambda x: asls_baseline_correction(x, lam=lam, p=p))
df['Baseline_Corrected_Absorbance'] = df['Absorbance'] - df['Baseline']
df['Smooth_Baseline_Corrected'] = df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(lambda x: savgol_filter(x, window_size, poly_order, deriv=0))
df['Scaled_Smooth_Baseline_Corrected'] = df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(lambda x: svn_normalise(x))

In [76]:
wavelength_df = prepare_wavelength_df(df, 'Smooth_Baseline_Corrected')
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.6004 +/- 0.1028


In [77]:
wavelength_df = prepare_wavelength_df(df, 'Scaled_Smooth_Baseline_Corrected')
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.5522 +/- 0.1244


### **Try with Despiking**

In [128]:
# https://towardsdatascience.com/data-science-for-raman-spectroscopy-a-practical-example-e81c56cf25f

def modified_z_score(ys):
    ysb = np.diff(ys) # Differentiated intensity values
    median_y = np.median(ysb) # Median of the intensity values
    median_absolute_deviation_y = np.median([np.abs(y - median_y) for y in ysb]) # median_absolute_deviation of the differentiated intensity values
    modified_z_scores = [0.6745 * (y - median_y) / median_absolute_deviation_y for y in ysb] # median_absolute_deviationmodified z scores
    return modified_z_scores
    
# The next function calculates the average values around the point to be replaced.
def fixer(y,ma):
    threshold = 7 # binarisation threshold
    #threshold = 3.5 # binarisation threshold
    spikes = abs(np.array(modified_z_score(y))) > threshold
    y_out = y.copy()
    for i in np.arange(len(spikes)):
        
        if spikes[i] != 0:
            # Calculate the window range, ensuring it stays within the bounds of the spectrum
            w_start = max(i - ma, 0)
            w_end = min(i + ma + 1, len(y))
            w = np.arange(w_start, w_end)
            
            valid_w = w[w < len(spikes)]  # Ensure w doesn't go beyond the length of spikes
            
            # Indices within the window that do not correspond to spikes
            valid_indices = valid_w[~spikes[valid_w]]
            
            # If there are valid indices, calculate the mean of 'y' over these indices
            if len(valid_indices) > 0:
                y_out[i] = np.mean(y[valid_indices])
            else:
                y_out[i] = y[i]
    return y_out

def despike_group(absorbances, ma=20):
    absorbance_data = absorbances.to_numpy()
    despiked_absorbance = fixer(absorbance_data, ma=ma)
    return despiked_absorbance

df['Despiked_Absorbance'] = df.groupby('SpecID')['Absorbance'].transform(lambda x: despike_group(x))

In [129]:
df['Baseline'] = df.groupby('SpecID')['Despiked_Absorbance'].transform(lambda x: asls_baseline_correction(x, lam=lam, p=p))
df['Baseline_Corrected_Absorbance'] = df['Despiked_Absorbance'] - df['Baseline']
df['Smooth_Baseline_Corrected'] = df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(lambda x: savgol_filter(x, window_size, poly_order, deriv=0))
df['Scaled_Smooth_Baseline_Corrected'] = df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(lambda x: svn_normalise(x))

In [130]:
wavelength_df = prepare_wavelength_df(df, 'Smooth_Baseline_Corrected')
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.6045 +/- 0.1170


In [131]:
wavelength_df = prepare_wavelength_df(df, 'Scaled_Smooth_Baseline_Corrected')
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.5435 +/- 0.1272


Compare the raw and despiked spectra.

### **Compare this with SpectraPepper**

In [82]:
import spectrapepper as spep

In [83]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,Baseline,Baseline_Corrected_Absorbance,Smooth_Baseline_Corrected,Scaled_Smooth_Baseline_Corrected,Despiked_Absorbance
293,201210-1-00,293,400.22778,1765.6628,201210-1,Normal,1722.845684,42.817116,41.863303,-0.106297,1765.6628
294,201210-1-00,294,400.91116,1774.7809,201210-1,Normal,1722.675011,52.105889,41.803843,0.145411,1774.7809
295,201210-1-00,295,401.59454,1769.0302,201210-1,Normal,1722.504338,46.525862,41.741884,-0.005797,1769.0302
296,201210-1-00,296,402.27789,1756.4220,201210-1,Normal,1722.333665,34.088335,41.677722,-0.342830,1756.4220
297,201210-1-00,297,402.96127,1758.8690,201210-1,Normal,1722.162992,36.706008,41.611654,-0.271896,1758.8690
...,...,...,...,...,...,...,...,...,...,...,...
8023277,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia,1617.650379,-0.257779,12.378163,-1.299204,1617.3926
8023278,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia,1616.718912,16.372188,13.269937,-0.865022,1633.0911
8023279,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia,1615.787446,17.520154,14.199285,-0.835050,1633.3076
8023280,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia,1614.855979,27.010521,15.166531,-0.587272,1641.8665


First create a wrapper to properly input the data into spectrapep.

In [123]:
def apply_cosmicdd_to_spec(absorbances, th=100, asy=0.6745, m=20):

    # Convert the Series to a list of lists for cosmicdd to read it
    prepared_absorbances = [absorbances.tolist()]

    # Apply cosmicdd
    despiked_data = spep.cosmicdd(prepared_absorbances, th=th, asy=asy, m=m)

    return despiked_data[0]

df['Despiked_Absorbance'] = df.groupby('SpecID')['Absorbance'].transform(lambda x: apply_cosmicdd_to_spec(x, th=7))

In [124]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,Baseline,Baseline_Corrected_Absorbance,Smooth_Baseline_Corrected,Scaled_Smooth_Baseline_Corrected,Despiked_Absorbance
293,201210-1-00,293,400.22778,1765.6628,201210-1,Normal,1722.529077,43.133723,42.847287,-0.083159,1765.6628
294,201210-1-00,294,400.91116,1774.7809,201210-1,Normal,1722.360770,52.420130,42.587567,0.171006,1774.7809
295,201210-1-00,295,401.59454,1769.0302,201210-1,Normal,1722.192462,46.837738,42.348074,0.018219,1769.0302
296,201210-1-00,296,402.27789,1756.4220,201210-1,Normal,1722.024155,34.397845,42.128165,-0.322256,1756.4220
297,201210-1-00,297,402.96127,1758.8690,201210-1,Normal,1721.855848,37.013152,41.927198,-0.250676,1758.8690
...,...,...,...,...,...,...,...,...,...,...,...
8023277,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia,1618.519145,-1.126545,11.378191,-1.307718,1617.3926
8023278,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia,1617.584792,15.506308,12.348481,-0.868940,1633.0911
8023279,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia,1616.650438,16.657162,13.376423,-0.838580,1633.3076
8023280,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia,1615.716085,26.150415,14.463451,-0.588147,1641.8665


In [125]:
df['Baseline'] = df.groupby('SpecID')['Despiked_Absorbance'].transform(lambda x: asls_baseline_correction(x, lam=lam, p=p))
df['Baseline_Corrected_Absorbance'] = df['Despiked_Absorbance'] - df['Baseline']
df['Smooth_Baseline_Corrected'] = df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(lambda x: savgol_filter(x, window_size, poly_order, deriv=0))
df['Scaled_Smooth_Baseline_Corrected'] = df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(lambda x: svn_normalise(x))

In [126]:
wavelength_df = prepare_wavelength_df(df, 'Smooth_Baseline_Corrected')
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.6022 +/- 0.1092


In [127]:
wavelength_df = prepare_wavelength_df(df, 'Scaled_Smooth_Baseline_Corrected')
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.5589 +/- 0.1281


---

### **Test Outlier Removal in the whole dataset**

First lets look at the approach from this [paper](https://www.researchgate.net/publication/255178361_Raman_Spectroscopy_and_Chemometrics_for_Identification_and_Strain_Discrimination_of_the_Wine_Spoilage_Yeasts_Saccharomyces_cerevisiae_Zygosaccharomyces_bailii_and_Brettanomyces_bruxellensis/link/53ee3f020cf2981ada175f52/download).

In [6]:
from sklearn.decomposition import PCA
from scipy.spatial.distance import mahalanobis
from scipy.stats import chi2

In [43]:
df = pd.read_csv("../../data/current_clean_spectrum.csv")
svn_df = pd.read_csv("../../data/svn_spectrum.csv")

In [5]:
svn_df

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
0,201210-1-00,293,400.22778,201210-1,Normal,-0.141836
1,201210-1-00,294,400.91116,201210-1,Normal,-0.143564
2,201210-1-00,295,401.59454,201210-1,Normal,-0.145366
3,201210-1-00,296,402.27789,201210-1,Normal,-0.147231
4,201210-1-00,297,402.96127,201210-1,Normal,-0.149152
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,210526-3,Hyperglycemia,-1.073592
6239201,210526-3-09,2338,1797.72200,210526-3,Hyperglycemia,-1.047798
6239202,210526-3-09,2339,1798.40550,210526-3,Hyperglycemia,-1.020916
6239203,210526-3-09,2340,1799.08890,210526-3,Hyperglycemia,-0.992939


In [37]:
wavelength_df = prepare_wavelength_df(svn_df, 'Absorbance')

In [38]:
def find_outliers(X):
    
    # Apply PCA
    pca = PCA().fit(X)
    components = pca.transform(X)
    
    # Keep components with eigenvalues > 1
    important_components = components[:, pca.explained_variance_ > 1]
    
    # Calculate Mahalanobis distance for each sample
    cov_matrix = np.cov(important_components, rowvar=False)
    inv_cov_matrix = np.linalg.inv(cov_matrix)
    mean_important_components = np.mean(important_components, axis=0)
    
    distances = np.array([mahalanobis(sample, mean_important_components, inv_cov_matrix) for sample in important_components])
    
    # Calculate the threshold for being an outlier (3 standard deviations away)
    threshold = np.mean(distances) + 3*np.std(distances)
    
    # Identify outliers
    outliers = distances > threshold
    
    # Get DataFrame indices of outliers
    outlier_indices = X.index[outliers]
    
    return outlier_indices

In [39]:
X = wavelength_df.drop(['Status', 'SurID'], axis=1)
outliers = find_outliers(X)

In [40]:
wavelength_df = wavelength_df.drop(outliers)

Try the scaled spectra with outliers removed.

In [41]:
groups = wavelength_df['SurID']
X = wavelength_df.drop(['Status', 'SurID'], axis=1)
y = wavelength_df['Status']

# Creating the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)

# Using GroupKFold for classification tasks
cv = GroupKFold(n_splits=10)

scores = []
for train_index, test_index in cv.split(X, y, groups):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the Extra Trees classifier
    et.fit(X_train, y_train)
    predictions = et.predict(X_test)
    
    # Evaluate the model
    score = accuracy_score(y_test, predictions)
    scores.append(score)

# Displaying the results
print(f'{et.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

ExtraTreesClassifier Cross-Validation Accuracy: 0.5596 +/- 0.1060


Try on the unscaled spectra.

In [47]:
wavelength_df = prepare_wavelength_df(df, 'Absorbance')

wavelength_df = wavelength_df.drop(outliers)
groups = wavelength_df['SurID']
X = wavelength_df.drop(['Status', 'SurID'], axis=1)
y = wavelength_df['Status']

# Creating the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)

# Using GroupKFold for classification tasks
cv = GroupKFold(n_splits=10)

scores = []
for train_index, test_index in cv.split(X, y, groups):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the Extra Trees classifier
    et.fit(X_train, y_train)
    predictions = et.predict(X_test)
    
    # Evaluate the model
    score = accuracy_score(y_test, predictions)
    scores.append(score)

# Displaying the results
print(f'{et.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

KeyboardInterrupt: 

### **Test Outlier Removal within each Surface**

In [65]:
def find_outliers_grouped(X):
    outlier_indices = []

    # Iterate over each group
    for _, group in X.groupby('SurID'):
        # Drop the 'SurID' column to only include features for PCA
        features = group.drop(['SurID'], axis=1)

        # Apply PCA
        pca = PCA().fit(features)
        components = pca.transform(features)

        # Keep components with eigenvalues > 1
        important_components = components[:, pca.explained_variance_ > 1]

        # Calculate Mahalanobis distance for each sample
        if important_components.size > 0:  # Proceed only if there are components meeting the criteria
            cov_matrix = np.cov(important_components, rowvar=False)
            if np.linalg.cond(cov_matrix) < 1 / np.finfo(cov_matrix.dtype).eps:
                # Only invert the covariance matrix if it's not singular
                inv_cov_matrix = np.linalg.inv(cov_matrix)
                mean_important_components = np.mean(important_components, axis=0)
                
                distances = np.array([mahalanobis(sample, mean_important_components, inv_cov_matrix) for sample in important_components])
                
                # Calculate the threshold for being an outlier
                #threshold = np.mean(distances) + 3*np.std(distances)
                threshold = np.mean(distances) + 1*np.std(distances)
                
                # Identify outliers
                outliers = distances > threshold
                
                # Get DataFrame indices of outliers in this group and add to the list
                outlier_indices.extend(group.index[outliers])

    return outlier_indices


In [66]:
wavelength_df = prepare_wavelength_df(svn_df, 'Absorbance')

# Prepare features, assuming 'Status' is not a feature but a label
X = wavelength_df.drop(['Status'], axis=1)

# Find outliers within each group
outliers = find_outliers_grouped(X)

# Drop outliers from the dataframe
wavelength_df = wavelength_df.drop(index=outliers)

In [67]:
wavelength_df

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-09,-0.687654,-0.705050,-0.717160,-0.724166,-0.726253,-0.723603,-0.716399,-0.704824,-0.689062,-0.669294,...,-1.339581,-1.344419,-1.351360,-1.360580,-1.372255,-1.386559,-1.403669,-1.423760,201210-1,Normal
201210-1-12,-1.349118,-1.354587,-1.357775,-1.358748,-1.357572,-1.354312,-1.349034,-1.341804,-1.332688,-1.321750,...,-1.399717,-1.416670,-1.437766,-1.463267,-1.493434,-1.528528,-1.568812,-1.614546,201210-1,Normal
201210-1-13,-0.900826,-0.982382,-1.055508,-1.120497,-1.177641,-1.227231,-1.269561,-1.304921,-1.333604,-1.355902,...,-1.570046,-1.581210,-1.593075,-1.605696,-1.619128,-1.633427,-1.648646,-1.664842,201210-1,Normal
201210-1-14,-1.667936,-1.648257,-1.628019,-1.607192,-1.585747,-1.563655,-1.540887,-1.517414,-1.493207,-1.468238,...,-1.648230,-1.646069,-1.643273,-1.639845,-1.635792,-1.631119,-1.625830,-1.619931,201210-1,Normal
201210-1-15,-1.324278,-1.372435,-1.411937,-1.443110,-1.466280,-1.481773,-1.489915,-1.491033,-1.485453,-1.473502,...,-1.736159,-1.722004,-1.704979,-1.684980,-1.661903,-1.635643,-1.606098,-1.573163,201210-1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-44,-1.508912,-1.486979,-1.464204,-1.440635,-1.416319,-1.391303,-1.365634,-1.339360,-1.312528,-1.285186,...,-1.584857,-1.539453,-1.490837,-1.438975,-1.383837,-1.325391,-1.263607,-1.198451,210526-3,Hyperglycemia
210526-3-45,-1.220935,-1.170299,-1.126399,-1.088894,-1.057444,-1.031708,-1.011344,-0.996011,-0.985369,-0.979076,...,-1.439151,-1.412652,-1.386760,-1.361571,-1.337184,-1.313695,-1.291201,-1.269800,210526-3,Hyperglycemia
210526-3-46,-1.050248,-1.083828,-1.115930,-1.146473,-1.175372,-1.202545,-1.227909,-1.251382,-1.272881,-1.292322,...,-1.682668,-1.651643,-1.615758,-1.574789,-1.528511,-1.476699,-1.419129,-1.355575,210526-3,Hyperglycemia
210526-3-47,-0.886523,-0.875867,-0.869625,-0.867470,-0.869075,-0.874114,-0.882261,-0.893190,-0.906574,-0.922088,...,-1.481694,-1.432694,-1.380981,-1.326563,-1.269448,-1.209645,-1.147161,-1.082005,210526-3,Hyperglycemia


In [68]:
groups = wavelength_df['SurID']
X = wavelength_df.drop(['Status', 'SurID'], axis=1)
y = wavelength_df['Status']

# Creating the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)

# Using GroupKFold for classification tasks
cv = GroupKFold(n_splits=10)

scores = []
for train_index, test_index in cv.split(X, y, groups):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the Extra Trees classifier
    et.fit(X_train, y_train)
    predictions = et.predict(X_test)
    
    # Evaluate the model
    score = accuracy_score(y_test, predictions)
    scores.append(score)

# Displaying the results
print(f'{et.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

ExtraTreesClassifier Cross-Validation Accuracy: 0.5698 +/- 0.1047


Try on the unscaled spectra.

In [69]:
wavelength_df = prepare_wavelength_df(df, 'Absorbance')

wavelength_df = wavelength_df.drop(outliers)
groups = wavelength_df['SurID']
X = wavelength_df.drop(['Status', 'SurID'], axis=1)
y = wavelength_df['Status']

# Creating the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)

# Using GroupKFold for classification tasks
cv = GroupKFold(n_splits=10)

scores = []
for train_index, test_index in cv.split(X, y, groups):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the Extra Trees classifier
    et.fit(X_train, y_train)
    predictions = et.predict(X_test)
    
    # Evaluate the model
    score = accuracy_score(y_test, predictions)
    scores.append(score)

# Displaying the results
print(f'{et.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

ExtraTreesClassifier Cross-Validation Accuracy: 0.5867 +/- 0.0890


### **Test Outlier Removal within each fold**

In [70]:
wavelength_df = prepare_wavelength_df(svn_df, 'Absorbance')

wavelength_df = wavelength_df.drop(outliers)
groups = wavelength_df['SurID']
X = wavelength_df.drop(['Status', 'SurID'], axis=1)
y = wavelength_df['Status']

# Creating the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)

# Using GroupKFold for classification tasks
cv = GroupKFold(n_splits=10)

scores = []

for train_index, test_index in cv.split(X, y, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    groups_train = groups.iloc[train_index]
    
    # Combine X_train and y_train to detect and remove outliers
    train_df = X_train.copy()
    train_df['Status'] = y_train
    train_df['SurID'] = groups_train
    
    # Detect outliers in the training set
    outlier_indices = remove_outliers(train_df.drop(['Status', 'SurID'], axis=1))
    
    # Drop outliers from the training set
    train_df = train_df.drop(index=outlier_indices)
    
    # Split the cleaned training set back into X and y
    X_train_cleaned = train_df.drop(['Status', 'SurID'], axis=1)
    y_train_cleaned = train_df['Status']
    
    # Train the Extra Trees classifier on the cleaned training set
    et = ExtraTreesClassifier(random_state=1234)
    et.fit(X_train_cleaned, y_train_cleaned)
    
    # Make predictions and evaluate the model on the untouched test set
    predictions = et.predict(X_test)
    score = accuracy_score(y_test, predictions)
    scores.append(score)

# Displaying the results
print(f'{et.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

KeyError: '[16, 17, 93, 94, 217, 219, 518, 656, 657, 886, 945, 948, 949, 957, 960, 972, 1004, 1005, 1011, 1075, 1076, 1081, 1119, 1141, 1246, 1359, 1369, 2114, 2237, 2238] not found in axis'