#### **This notebook contains the functions used to clean the spectra.**

Import Libraries

In [70]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import accuracy_score
from Spectra_Preparation_Functions import *

Read the spectral data

In [71]:
df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")
df = df[(df['WaveNumber'] >= 400) & (df['WaveNumber'] <= 1800)]

In [72]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
293,201210-1-00,293,400.22778,1765.6628,201210-1,Normal
294,201210-1-00,294,400.91116,1774.7809,201210-1,Normal
295,201210-1-00,295,401.59454,1769.0302,201210-1,Normal
296,201210-1-00,296,402.27789,1756.4220,201210-1,Normal
297,201210-1-00,297,402.96127,1758.8690,201210-1,Normal
...,...,...,...,...,...,...
8023277,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia
8023278,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia
8023279,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia
8023280,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia


>Evaluate an Extra Trees Classifier on the Spectrum

In [73]:
def evaluate_extra_trees(df):

    # Set the Surfaces as groups
    groups = df['SurID']
    X = df.drop(['Status', 'SurID'], axis=1)
    y = df['Status']

    # Creating the Extra Trees classifier
    et = ExtraTreesClassifier(random_state=1234)
    
    # Using GroupKFold for classification tasks
    cv = GroupKFold(n_splits=10)

    scores = []
    for train_index, test_index in cv.split(X, y, groups):

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the Extra Trees classifier
        et.fit(X_train, y_train)
        predictions = et.predict(X_test)
        
        # Evaluate the model
        score = accuracy_score(y_test, predictions)
        scores.append(score)
    
    # Displaying the results
    print(f'{et.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

>##### **First Try it without Despiking for Comparison**

#### **Select the chosen cleaning parameters then run the functions**

Choose the Parameters

In [74]:
# # Best Full Spectrum Parameters

lam = 10 ** 8
p = 0.01
window_size = 51
poly_order = 3

In [75]:
df['Baseline'] = df.groupby('SpecID')['Absorbance'].transform(lambda x: asls_baseline_correction(x, lam=lam, p=p))
df['Baseline_Corrected_Absorbance'] = df['Absorbance'] - df['Baseline']
df['Smooth_Baseline_Corrected'] = df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(lambda x: savgol_filter(x, window_size, poly_order, deriv=0))
df['Scaled_Smooth_Baseline_Corrected'] = df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(lambda x: svn_normalise(x))

In [76]:
wavelength_df = prepare_wavelength_df(df, 'Smooth_Baseline_Corrected')
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.6004 +/- 0.1028


In [77]:
wavelength_df = prepare_wavelength_df(df, 'Scaled_Smooth_Baseline_Corrected')
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.5522 +/- 0.1244


### **Try with Despiking**

In [128]:
# https://towardsdatascience.com/data-science-for-raman-spectroscopy-a-practical-example-e81c56cf25f

def modified_z_score(ys):
    ysb = np.diff(ys) # Differentiated intensity values
    median_y = np.median(ysb) # Median of the intensity values
    median_absolute_deviation_y = np.median([np.abs(y - median_y) for y in ysb]) # median_absolute_deviation of the differentiated intensity values
    modified_z_scores = [0.6745 * (y - median_y) / median_absolute_deviation_y for y in ysb] # median_absolute_deviationmodified z scores
    return modified_z_scores
    
# The next function calculates the average values around the point to be replaced.
def fixer(y,ma):
    threshold = 7 # binarisation threshold
    #threshold = 3.5 # binarisation threshold
    spikes = abs(np.array(modified_z_score(y))) > threshold
    y_out = y.copy()
    for i in np.arange(len(spikes)):
        
        if spikes[i] != 0:
            # Calculate the window range, ensuring it stays within the bounds of the spectrum
            w_start = max(i - ma, 0)
            w_end = min(i + ma + 1, len(y))
            w = np.arange(w_start, w_end)
            
            valid_w = w[w < len(spikes)]  # Ensure w doesn't go beyond the length of spikes
            
            # Indices within the window that do not correspond to spikes
            valid_indices = valid_w[~spikes[valid_w]]
            
            # If there are valid indices, calculate the mean of 'y' over these indices
            if len(valid_indices) > 0:
                y_out[i] = np.mean(y[valid_indices])
            else:
                y_out[i] = y[i]
    return y_out

def despike_group(absorbances, ma=20):
    absorbance_data = absorbances.to_numpy()
    despiked_absorbance = fixer(absorbance_data, ma=ma)
    return despiked_absorbance

df['Despiked_Absorbance'] = df.groupby('SpecID')['Absorbance'].transform(lambda x: despike_group(x))

In [129]:
df['Baseline'] = df.groupby('SpecID')['Despiked_Absorbance'].transform(lambda x: asls_baseline_correction(x, lam=lam, p=p))
df['Baseline_Corrected_Absorbance'] = df['Despiked_Absorbance'] - df['Baseline']
df['Smooth_Baseline_Corrected'] = df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(lambda x: savgol_filter(x, window_size, poly_order, deriv=0))
df['Scaled_Smooth_Baseline_Corrected'] = df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(lambda x: svn_normalise(x))

In [130]:
wavelength_df = prepare_wavelength_df(df, 'Smooth_Baseline_Corrected')
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.6045 +/- 0.1170


In [131]:
wavelength_df = prepare_wavelength_df(df, 'Scaled_Smooth_Baseline_Corrected')
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.5435 +/- 0.1272


Compare the raw and despiked spectra.

### **Compare this with SpectraPepper**

In [82]:
import spectrapepper as spep

In [83]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,Baseline,Baseline_Corrected_Absorbance,Smooth_Baseline_Corrected,Scaled_Smooth_Baseline_Corrected,Despiked_Absorbance
293,201210-1-00,293,400.22778,1765.6628,201210-1,Normal,1722.845684,42.817116,41.863303,-0.106297,1765.6628
294,201210-1-00,294,400.91116,1774.7809,201210-1,Normal,1722.675011,52.105889,41.803843,0.145411,1774.7809
295,201210-1-00,295,401.59454,1769.0302,201210-1,Normal,1722.504338,46.525862,41.741884,-0.005797,1769.0302
296,201210-1-00,296,402.27789,1756.4220,201210-1,Normal,1722.333665,34.088335,41.677722,-0.342830,1756.4220
297,201210-1-00,297,402.96127,1758.8690,201210-1,Normal,1722.162992,36.706008,41.611654,-0.271896,1758.8690
...,...,...,...,...,...,...,...,...,...,...,...
8023277,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia,1617.650379,-0.257779,12.378163,-1.299204,1617.3926
8023278,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia,1616.718912,16.372188,13.269937,-0.865022,1633.0911
8023279,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia,1615.787446,17.520154,14.199285,-0.835050,1633.3076
8023280,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia,1614.855979,27.010521,15.166531,-0.587272,1641.8665


First create a wrapper to properly input the data into spectrapep.

In [123]:
def apply_cosmicdd_to_spec(absorbances, th=100, asy=0.6745, m=20):

    # Convert the Series to a list of lists for cosmicdd to read it
    prepared_absorbances = [absorbances.tolist()]

    # Apply cosmicdd
    despiked_data = spep.cosmicdd(prepared_absorbances, th=th, asy=asy, m=m)

    return despiked_data[0]

df['Despiked_Absorbance'] = df.groupby('SpecID')['Absorbance'].transform(lambda x: apply_cosmicdd_to_spec(x, th=7))

In [124]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,Baseline,Baseline_Corrected_Absorbance,Smooth_Baseline_Corrected,Scaled_Smooth_Baseline_Corrected,Despiked_Absorbance
293,201210-1-00,293,400.22778,1765.6628,201210-1,Normal,1722.529077,43.133723,42.847287,-0.083159,1765.6628
294,201210-1-00,294,400.91116,1774.7809,201210-1,Normal,1722.360770,52.420130,42.587567,0.171006,1774.7809
295,201210-1-00,295,401.59454,1769.0302,201210-1,Normal,1722.192462,46.837738,42.348074,0.018219,1769.0302
296,201210-1-00,296,402.27789,1756.4220,201210-1,Normal,1722.024155,34.397845,42.128165,-0.322256,1756.4220
297,201210-1-00,297,402.96127,1758.8690,201210-1,Normal,1721.855848,37.013152,41.927198,-0.250676,1758.8690
...,...,...,...,...,...,...,...,...,...,...,...
8023277,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia,1618.519145,-1.126545,11.378191,-1.307718,1617.3926
8023278,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia,1617.584792,15.506308,12.348481,-0.868940,1633.0911
8023279,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia,1616.650438,16.657162,13.376423,-0.838580,1633.3076
8023280,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia,1615.716085,26.150415,14.463451,-0.588147,1641.8665


In [125]:
df['Baseline'] = df.groupby('SpecID')['Despiked_Absorbance'].transform(lambda x: asls_baseline_correction(x, lam=lam, p=p))
df['Baseline_Corrected_Absorbance'] = df['Despiked_Absorbance'] - df['Baseline']
df['Smooth_Baseline_Corrected'] = df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(lambda x: savgol_filter(x, window_size, poly_order, deriv=0))
df['Scaled_Smooth_Baseline_Corrected'] = df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(lambda x: svn_normalise(x))

In [126]:
wavelength_df = prepare_wavelength_df(df, 'Smooth_Baseline_Corrected')
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.6022 +/- 0.1092


In [127]:
wavelength_df = prepare_wavelength_df(df, 'Scaled_Smooth_Baseline_Corrected')
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.5589 +/- 0.1281
