# Median Outlier Removal

This notebook computes the median spectrum of each surface and then trains a model on these per-surface median spectra. The aim is to suppress noisy or outlier spectra, leaving a single spectrum per surface that is representative of the true Raman fingerprint of that surface.

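This notebook uses plain KFold cross-validation rather than a grouped split. After taking the median, each surface is represented by exactly one spectrum, so no surface can appear in both the training and the test fold and there is no data leakage between surfaces.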

Import Libraries

In [1]:
import sys
sys.path.append('..')  # Adds the parent directory to the path so Python can find the `Cleaning_and_Evaluation` package
from Cleaning_and_Evaluation import *
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

Read the spectral data

In [2]:
# Load the raw spectra from a CSV file addressed by a relative path.
RAW_SPECTRA_CSV = "../data/exosomes.raw_spectrum_400-1800.csv"
df = pd.read_csv(RAW_SPECTRA_CSV)

Clean the spectra

In [3]:
# Options forwarded as keyword arguments to `spectra_cleaning`, which comes from
# the star-import of `Cleaning_and_Evaluation`.
# NOTE(review): the per-key meanings noted below are inferred from the key names
# only — confirm them against the helper's implementation.
cleaning_params = dict(
    despike=False,          # the despike_* values below are still passed along
    baseline_correct=True,
    smoothing=True,
    scaling=False,
    despike_ma=10,
    despike_threshold=7,
    lam=10**9,              # presumably a baseline-fit parameter — TODO confirm
    p=0.05,                 # presumably a baseline-fit parameter — TODO confirm
    window_size=35,         # presumably the smoothing window — TODO confirm
    poly_order=3,           # presumably the smoothing polynomial order — TODO confirm
)

# The return value is not captured, so later cells rely on `df` itself holding
# the cleaned spectra.
# NOTE(review): verify that spectra_cleaning modifies `df` in place.
spectra_cleaning(df, **cleaning_params)

Preparing spectra and getting median absorbance at each wavenumber within each surface

In [4]:
def prepare_wavelength_df(df, absorbance_col, status_col='Status'):
    """Collapse every surface to its median spectrum, one row per surface.

    For each (SurID, WaveNumber) pair the median of ``absorbance_col`` is taken
    over all matching rows of ``df``. The result is pivoted to wide format
    (one column per wavenumber) and the surface's label is attached.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format spectra containing 'SurID' and 'WaveNumber' columns plus
        the columns named by ``absorbance_col`` and ``status_col``.
    absorbance_col : str
        Name of the column whose per-surface, per-wavenumber median is taken.
    status_col : str, default 'Status'
        Name of the column holding the label of each row.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'SurID'; one column per distinct wavenumber followed by
        ``status_col``.

    Raises
    ------
    ValueError
        If any surface carries more than one distinct value in ``status_col``.
        Merging such data would silently emit the same surface several times
        with conflicting labels.
    """
    # Median absorbance for every (surface, wavenumber) combination.
    median_absorbance = (
        df.groupby(['SurID', 'WaveNumber'])[absorbance_col].median().reset_index()
    )

    # Wide format: 'SurID' as index, one column per wavenumber.
    wavelength_df = median_absorbance.pivot(
        index='SurID', columns='WaveNumber', values=absorbance_col
    )

    # One (surface, status) pair per surface. Rows without a SurID can never
    # match a pivoted surface (groupby drops missing keys), so they are
    # excluded before the consistency check below.
    statuses_and_surface = (
        df[['SurID', status_col]].dropna(subset=['SurID']).drop_duplicates()
    )

    # A surface that still appears more than once has conflicting labels.
    is_repeated = statuses_and_surface['SurID'].duplicated()
    if is_repeated.any():
        conflicting = statuses_and_surface.loc[is_repeated, 'SurID'].unique().tolist()
        raise ValueError(
            f"Each SurID must map to exactly one '{status_col}' value; "
            f"conflicting surfaces: {conflicting[:10]}"
        )

    # Attach the label by SurID. The index is reset first so that both frames
    # expose 'SurID' as an ordinary column for the merge.
    wavelength_df = pd.merge(wavelength_df.reset_index(), statuses_and_surface, on='SurID')

    # Restore SurID as the index so only features and the label remain as columns.
    return wavelength_df.set_index('SurID')

In [5]:
def evaluate_extra_trees(df, status_col='Status', n_splits=10, random_state=1234):
    """Cross-validate an ExtraTreesClassifier and print its mean accuracy.

    Every column of ``df`` except ``status_col`` is used as a feature and
    ``status_col`` is the target. Folds come from a shuffled ``KFold``, so
    they are neither stratified by class nor grouped.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample; feature columns plus the label column.
    status_col : str, default 'Status'
        Name of the label column. It mirrors the ``status_col`` argument of
        ``prepare_wavelength_df`` so a non-default label name works in both.
    n_splits : int, default 10
        Number of cross-validation folds.
    random_state : int, default 1234
        Seed used for both the classifier and the fold shuffling.

    Returns
    -------
    None
        The mean and standard deviation of the fold accuracies are printed.
    """
    # Split the frame into the feature matrix and the target vector.
    X = df.drop(columns=[status_col])
    y = df[status_col]

    # Creating the Extra Trees classifier
    et = ExtraTreesClassifier(random_state=random_state)

    # Shuffled, unstratified K-fold split.
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Getting cross-validation scores (one accuracy value per fold)
    scores = cross_val_score(et, X, y, cv=cv, scoring='accuracy')

    # Displaying the results
    print(f'{et.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

In [6]:
# Build the per-surface median table from the 'Absorbance' column of `df`.
prep_df = prepare_wavelength_df(df, absorbance_col='Absorbance')

# Cross-validate the classifier on that table; the accuracy summary is printed.
evaluate_extra_trees(df=prep_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.6071 +/- 0.1701
