### This aims to classify the exosome status based on a feature set derived from the peaks.

Let's test different spectral cleaning parameters.

Import Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter
from scipy import sparse
from scipy.sparse.linalg import spsolve
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold, cross_validate


Read the spectral data

In [2]:
# Load the long-format spectra table (columns include SpecID, WaveNumber, Absorbance, Status).
df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")

In [3]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,0,200.00000,2709.3699,201210-1,Normal
1,201210-1-00,1,200.68336,2697.1318,201210-1,Normal
2,201210-1-00,2,201.36674,2696.0413,201210-1,Normal
3,201210-1-00,3,202.05011,2678.5925,201210-1,Normal
4,201210-1-00,4,202.73349,2670.8928,201210-1,Normal
...,...,...,...,...,...,...
8023570,210526-3-09,2630,1997.26650,1321.0371,210526-3,Hyperglycemia
8023571,210526-3-09,2631,1997.94980,1316.4056,210526-3,Hyperglycemia
8023572,210526-3-09,2632,1998.63330,1311.2640,210526-3,Hyperglycemia
8023573,210526-3-09,2633,1999.31670,1318.0909,210526-3,Hyperglycemia


In [4]:
df['SpecID'].unique()

array(['201210-1-00', '201210-1-01', '201210-1-02', ..., '210526-3-07',
       '210526-3-08', '210526-3-09'], dtype=object)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8023575 entries, 0 to 8023574
Data columns (total 6 columns):
 #   Column      Dtype  
---  ------      -----  
 0   SpecID      object 
 1   Seq         int64  
 2   WaveNumber  float64
 3   Absorbance  float64
 4   SurID       object 
 5   Status      object 
dtypes: float64(2), int64(1), object(3)
memory usage: 367.3+ MB


#### Train an Extra Trees Classifier on the full spectrum.

In [6]:
def prepare_wavelength_df(df, absorbance_col, status_col='Status'):
    """Reshape long-format spectra into one row per spectrum.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with 'SpecID' and 'WaveNumber' columns plus the
        columns named by ``absorbance_col`` and ``status_col``.
    absorbance_col : str
        Column whose values fill the wide table.
    status_col : str, default 'Status'
        Column holding the label to attach to every spectrum.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'SpecID', with one column per distinct WaveNumber
        followed by the ``status_col`` column.
    """
    # Spread the readings out: one row per SpecID, one column per WaveNumber.
    spectra_wide = (
        df.pivot(index='SpecID', columns='WaveNumber', values=absorbance_col)
        .reset_index()
        .rename_axis(None, axis='columns')
    )

    # One (SpecID, status) pair per spectrum, attached via an inner join on SpecID.
    labels = df[['SpecID', status_col]].drop_duplicates()
    labelled = spectra_wide.merge(labels, on='SpecID')

    return labelled.set_index('SpecID')

In [7]:
# Build the wide table from the raw absorbances: one row per SpecID, one column per wavenumber, plus Status.
wavelength_df = prepare_wavelength_df(df, 'Absorbance')

In [8]:
wavelength_df.head()

Unnamed: 0_level_0,200.0,200.68336,201.36674,202.05011,202.73349,203.41685,204.10023,204.7836,205.46696,206.15034,...,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,2709.3699,2697.1318,2696.0413,2678.5925,2670.8928,2652.5435,2646.3245,2690.324,2620.3228,2609.0132,...,1100.5006,1088.7416,1092.1083,1104.9304,1084.1281,1076.9363,1089.0814,1092.8083,1086.699,Normal
201210-1-01,2979.3169,2985.707,2970.1677,2947.095,2941.8743,2942.4648,2939.9595,2938.4509,2930.9204,2915.7979,...,1246.2748,1270.4456,1272.1703,1271.8768,1270.0718,1283.9667,1286.9803,1276.4037,1268.0922,Normal
201210-1-02,3702.5627,3592.4902,3640.8423,3593.415,3583.656,3583.479,3554.3279,3507.1514,3494.4998,3490.437,...,2028.6669,2046.851,2094.8308,2067.8396,2043.0687,2063.5925,2086.6956,2064.7766,2064.2126,Normal
201210-1-03,8129.5938,8222.3184,8370.2803,8534.415,8684.1543,8805.7393,8964.5283,9220.3066,9257.7461,9399.7734,...,1682.3824,1694.845,1710.276,1714.6768,1746.4635,1705.4204,1703.1569,1705.2943,1687.048,Normal
201210-1-04,3468.5203,3463.0237,3475.2666,3468.5999,3451.7124,3439.6379,3439.1538,3444.8345,3446.624,3438.2632,...,1725.4711,1722.2786,1757.0481,1745.6029,1728.0017,1750.2548,1747.0122,1756.1727,1747.9722,Normal


In [9]:
def normalise(absorbances):
    """Divide every value in ``absorbances`` by the largest value.

    Parameters
    ----------
    absorbances : array-like (NumPy array or pandas Series)
        Intensities of a single spectrum.

    Returns
    -------
    Same type as the input, with each value divided by ``np.max(absorbances)``.
    """
    return absorbances / np.max(absorbances)

>Calculate the baseline using Asymmetric Least Squares, then subtract it from the spectrum.

In [10]:
def baseline_als_optimized(y, lam, p, niter=10):
    """Estimate the baseline of a spectrum with Asymmetric Least Squares.

    Each iteration solves the sparse system ``(W + lam * D D^T) z = W y``,
    where ``D`` is a second-difference operator and ``W`` is a diagonal
    weight matrix. After each solve, points above the current baseline ``z``
    get weight ``p``, points below it get ``1 - p``, and points exactly on
    it get weight 0.

    Parameters
    ----------
    y : array-like of shape (n,)
        Spectrum intensities (list, NumPy array or pandas Series).
    lam : float
        Multiplier on the second-difference penalty term.
    p : float
        Weight given to points lying above the fitted baseline.
    niter : int, default 10
        Number of re-weighting iterations; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Baseline estimate with the same length as ``y``.

    Raises
    ------
    ValueError
        If ``niter`` is smaller than 1 (no baseline would be computed).
    """
    if niter < 1:
        # Without at least one solve there is nothing to return.
        raise ValueError("niter must be at least 1 to estimate a baseline")

    # Work on a plain float array so the arithmetic below never depends on a
    # pandas index (groupby().transform passes Series with non-zero-based labels).
    y = np.asarray(y, dtype=float)

    L = len(y)
    D = sparse.diags([1, -2, 1], [0, -1, -2], shape=(L, L - 2))
    D = lam * D.dot(D.transpose())  # Precompute this term since it does not depend on `w`
    w = np.ones(L)
    W = sparse.spdiags(w, 0, L, L)
    for _ in range(niter):
        W.setdiag(w)  # Do not create a new matrix, just update diagonal values
        Z = W + D
        z = spsolve(Z, w * y)
        w = p * (y > z) + (1 - p) * (y < z)
    return z

# lam = 10 ** 8
# p = 0.05
# df['Baseline_Corrected_Absorbance'] = df.groupby('SpecID')['Despiked_Absorbance'].transform(lambda x: baseline_als_optimized(x, lam=lam, p=p))

>#### Perform Grid-Search to find the best Asymmetric Least Squares Parameters

Set the Baseline Correction and Smoothing Parameters to Search

In [13]:
# Alternative, wider grids (commented out; swap one in to search it instead).
# lam_values = [10**4, 10**5, 10**6, 10**7]
# p_values = [0.001, 0.005, 0.01, 0.05, 0.1]
# window_size = [5, 7, 9, 19, 25, 51, 101, 151, 251]
# poly_order = [1, 2, 3]
#
# lam_values = [10**7, 10**8, 10**9]
# p_values = [0.001, 0.005, 0.01, 0.05, 0.1]
# window_size = [9, 19, 25, 51, 101, 151]
# poly_order = [1, 2]

# Active grid.
lam_values = [10 ** exponent for exponent in (7, 8)]  # `lam` values passed to baseline_als_optimized
p_values = [1e-3]                                     # `p` values passed to baseline_als_optimized
window_size = [9, 101]                                # window lengths passed to savgol_filter
poly_order = [1]                                      # polynomial orders passed to savgol_filter

In [12]:
# Grid search over the ALS parameters (lam, p): baseline-correct every spectrum,
# then score an Extra Trees classifier on the corrected spectra with 10-fold CV.
results = []

# Reported column name -> key used by cross_validate for that metric.
metric_keys = {
    'Accuracy': 'test_accuracy',
    'Precision': 'test_precision_macro',
    'Recall': 'test_recall_macro',
    'F1': 'test_f1_macro',
}

# Neither object depends on the grid point, so build them once.
# cross_validate clones the estimator for every fold, so reuse is safe.
et = ExtraTreesClassifier(random_state=1234)
# NOTE(review): KFold splits individual spectra; if spectra sharing a SurID are
# replicates of one sample, a grouped splitter may be needed to avoid leakage - confirm.
cv = KFold(n_splits=10, shuffle=True, random_state=1234)

for lam in lam_values:
    for p in p_values:
        # Subtract each spectrum's ALS baseline from its raw absorbance.
        df['Baseline_Corrected_Absorbance'] = df['Absorbance'] - df.groupby('SpecID')['Absorbance'].transform(
            lambda x: baseline_als_optimized(x, lam=lam, p=p)
        )

        baseline_corrected = prepare_wavelength_df(df, 'Baseline_Corrected_Absorbance')
        X = baseline_corrected.drop(['Status'], axis=1)
        y = baseline_corrected['Status']

        # The folds are independent, so evaluate them in parallel; the fixed
        # random_state values keep the scores identical to a serial run.
        scores = cross_validate(
            et, X, y, cv=cv,
            scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
            n_jobs=-1,
        )

        # Means first, then standard deviations, to keep the result column order.
        row = {'lam': lam, 'p': p}
        row.update({name: np.mean(scores[key]) for name, key in metric_keys.items()})
        row.update({f'{name} Std': np.std(scores[key]) for name, key in metric_keys.items()})
        results.append(row)

KeyboardInterrupt: 

Try it with Parallel Processing

In [22]:
from joblib import Parallel, delayed

def process_combination(lam, p, df):
    """Score one (lam, p) ALS setting with 10-fold cross-validation.

    Baseline-corrects every spectrum in ``df`` with
    ``baseline_als_optimized(lam=lam, p=p)``, pivots the result to one row per
    spectrum and cross-validates an Extra Trees classifier on it.

    Parameters
    ----------
    lam, p : float
        ALS parameters forwarded to ``baseline_als_optimized``.
    df : pandas.DataFrame
        Long-format spectra with 'SpecID', 'WaveNumber', 'Absorbance' and
        'Status' columns. It is not modified.

    Returns
    -------
    dict
        ``lam`` and ``p``, followed by the mean and then the standard deviation
        of accuracy, macro precision, macro recall and macro F1 across folds.
    """
    baseline = df.groupby('SpecID')['Absorbance'].transform(
        lambda x: baseline_als_optimized(x, lam=lam, p=p)
    )

    # Build a narrow frame holding only what the pivot needs instead of
    # deep-copying the whole input in every worker; the caller's frame stays untouched.
    corrected = pd.DataFrame({
        'SpecID': df['SpecID'],
        'WaveNumber': df['WaveNumber'],
        'Status': df['Status'],
        'Baseline_Corrected_Absorbance': df['Absorbance'] - baseline,
    })

    baseline_corrected = prepare_wavelength_df(corrected, 'Baseline_Corrected_Absorbance')
    X = baseline_corrected.drop(['Status'], axis=1)
    y = baseline_corrected['Status']

    et = ExtraTreesClassifier(random_state=1234)
    # NOTE(review): KFold splits individual spectra; if spectra sharing a SurID are
    # replicates of one sample, a grouped splitter may be needed to avoid leakage - confirm.
    cv = KFold(n_splits=10, shuffle=True, random_state=1234)

    scores = cross_validate(et, X, y, cv=cv, scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'])

    # Reported column name -> key used by cross_validate for that metric.
    metric_keys = {
        'Accuracy': 'test_accuracy',
        'Precision': 'test_precision_macro',
        'Recall': 'test_recall_macro',
        'F1': 'test_f1_macro',
    }

    # Means first, then standard deviations, to keep the result column order.
    summary = {'lam': lam, 'p': p}
    summary.update({name: np.mean(scores[key]) for name, key in metric_keys.items()})
    summary.update({f'{name} Std': np.std(scores[key]) for name, key in metric_keys.items()})
    return summary

# ALS grid for the parallel run. This rebinds lam_values / p_values,
# so any cell executed afterwards sees these lists.
lam_values = [10**7, 10**8, 10**9]
p_values = [0.001, 0.005, 0.01, 0.05, 0.1]

# One task per (lam, p) pair, spread over all available cores.
results = Parallel(n_jobs=-1)(
    delayed(process_combination)(lam_value, p_value, df)
    for lam_value in lam_values
    for p_value in p_values
)


In [14]:
# Collect the per-combination metric dicts into one table (one row per grid point).
results_df = pd.DataFrame(results)
if results_df.empty:
    # An interrupted or skipped search leaves `results` empty; fail with a clear
    # message instead of a KeyError on the missing 'Accuracy' column.
    raise ValueError("No grid-search results to summarise - run the search cell to completion first.")

# Pick the grid point with the highest mean cross-validated accuracy.
best_idx = results_df['Accuracy'].idxmax()
best_row = results_df.loc[best_idx]

# Read parameters column-wise: pulling them from the row Series upcasts integer
# settings to float (printing e.g. lam=1000000000.0).
print(f"Best Parameters: lam={results_df.at[best_idx, 'lam']}, p={results_df.at[best_idx, 'p']} with Accuracy: {best_row['Accuracy']:.4f}")

# Save the results to csv, best accuracy first.
results_df.sort_values('Accuracy', ascending=False).to_csv("Baseline_Corrected_Results.csv", index=False)

Best Parameters: lam=1000000000.0, p=0.05 with Accuracy: 0.9343


>#### Try with Different Smoothing Parameters

In [15]:
# Grid search over the Savitzky-Golay parameters (window, poly): smooth every raw
# spectrum, then score an Extra Trees classifier on the result with 10-fold CV.
results = []

# Reported column name -> key used by cross_validate for that metric.
metric_keys = {
    'Accuracy': 'test_accuracy',
    'Precision': 'test_precision_macro',
    'Recall': 'test_recall_macro',
    'F1': 'test_f1_macro',
}

# Neither object depends on the grid point, so build them once.
# cross_validate clones the estimator for every fold, so reuse is safe.
et = ExtraTreesClassifier(random_state=1234)
cv = KFold(n_splits=10, shuffle=True, random_state=1234)

for window in window_size:
    for poly in poly_order:
        # Smooth each spectrum independently.
        df['Smoothed_Absorbance'] = df.groupby('SpecID')['Absorbance'].transform(
            lambda x: savgol_filter(x, window, poly, deriv=0)
        )

        smoothed_absorbance = prepare_wavelength_df(df, 'Smoothed_Absorbance')
        X = smoothed_absorbance.drop(['Status'], axis=1)
        y = smoothed_absorbance['Status']

        # The folds are independent, so evaluate them in parallel; the fixed
        # random_state values keep the scores identical to a serial run.
        scores = cross_validate(
            et, X, y, cv=cv,
            scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
            n_jobs=-1,
        )

        # Means first, then standard deviations, to keep the result column order.
        row = {'window': window, 'poly': poly}
        row.update({name: np.mean(scores[key]) for name, key in metric_keys.items()})
        row.update({f'{name} Std': np.std(scores[key]) for name, key in metric_keys.items()})
        results.append(row)

KeyboardInterrupt: 

In [None]:
# Collect the per-combination metric dicts into one table (one row per grid point).
results_df = pd.DataFrame(results)
if results_df.empty:
    # An interrupted or skipped search leaves `results` empty; fail with a clear
    # message instead of a KeyError on the missing 'Accuracy' column.
    raise ValueError("No grid-search results to summarise - run the search cell to completion first.")

# Pick the grid point with the highest mean cross-validated accuracy.
best_idx = results_df['Accuracy'].idxmax()
best_row = results_df.loc[best_idx]

# Read parameters column-wise: pulling them from the row Series upcasts integer
# settings to float (printing e.g. window_size=7.0).
print(f"Best Parameters: window_size={results_df.at[best_idx, 'window']}, poly_order={results_df.at[best_idx, 'poly']} with Accuracy: {best_row['Accuracy']:.4f}")

# Save the results to csv, best accuracy first.
results_df.sort_values('Accuracy', ascending=False).to_csv("Smoothed_Absorbance_Results.csv", index=False)

Best Parameters: window_size=7.0, poly_order=2.0 with Accuracy: 0.8831


>##### Try with both

In [None]:
# Joint grid search: ALS baseline correction (lam, p) followed by Savitzky-Golay
# smoothing (window, poly), scored with an Extra Trees classifier under 10-fold CV.
results = []

# Reported column name -> key used by cross_validate for that metric.
metric_keys = {
    'Accuracy': 'test_accuracy',
    'Precision': 'test_precision_macro',
    'Recall': 'test_recall_macro',
    'F1': 'test_f1_macro',
}

# Neither object depends on the grid point, so build them once.
# cross_validate clones the estimator for every fold, so reuse is safe.
et = ExtraTreesClassifier(random_state=1234)
cv = KFold(n_splits=10, shuffle=True, random_state=1234)

for lam in lam_values:
    for p in p_values:
        # The baseline depends only on (lam, p), so compute it once per pair
        # and reuse it for every smoothing setting below.
        df['Baseline_Corrected_Absorbance'] = df['Absorbance'] - df.groupby('SpecID')['Absorbance'].transform(
            lambda x: baseline_als_optimized(x, lam=lam, p=p)
        )

        for window in window_size:
            for poly in poly_order:
                # Smooth each baseline-corrected spectrum independently.
                df['Smoothed_Baseline'] = df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(
                    lambda x: savgol_filter(x, window, poly, deriv=0)
                )

                smoothed_baseline = prepare_wavelength_df(df, 'Smoothed_Baseline')
                X = smoothed_baseline.drop(['Status'], axis=1)
                y = smoothed_baseline['Status']

                # The folds are independent, so evaluate them in parallel; the fixed
                # random_state values keep the scores identical to a serial run.
                scores = cross_validate(
                    et, X, y, cv=cv,
                    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
                    n_jobs=-1,
                )

                # Means first, then standard deviations, to keep the result column order.
                row = {'lam': lam, 'p': p, 'window': window, 'poly': poly}
                row.update({name: np.mean(scores[key]) for name, key in metric_keys.items()})
                row.update({f'{name} Std': np.std(scores[key]) for name, key in metric_keys.items()})
                results.append(row)

In [None]:
# Collect the per-combination metric dicts into one table (one row per grid point).
results_df = pd.DataFrame(results)
if results_df.empty:
    # An interrupted or skipped search leaves `results` empty; fail with a clear
    # message instead of a KeyError on the missing 'Accuracy' column.
    raise ValueError("No grid-search results to summarise - run the search cell to completion first.")

# Pick the grid point with the highest mean cross-validated accuracy.
best_idx = results_df['Accuracy'].idxmax()
best_row = results_df.loc[best_idx]

# Read parameters column-wise: pulling them from the row Series upcasts integer
# settings to float (printing e.g. window_size=51.0).
print(f"Best Parameters: lam={results_df.at[best_idx, 'lam']}, p={results_df.at[best_idx, 'p']}, window_size={results_df.at[best_idx, 'window']}, poly_order={results_df.at[best_idx, 'poly']} with Accuracy: {best_row['Accuracy']:.4f}")

# Save the results to csv, best accuracy first.
results_df.sort_values('Accuracy', ascending=False).to_csv("Smoothed_Baseline_Results.csv", index=False)

Best Parameters: lam=10000000.0, p=0.05, window_size=51.0, poly_order=2.0 with Accuracy: 0.9524


>#### Finally, try all of this again with scaling

In [16]:
# Grid search over the ALS parameters (lam, p) with per-spectrum scaling:
# baseline-correct, divide each spectrum by its maximum, then score an
# Extra Trees classifier with 10-fold CV.
results = []

# Reported column name -> key used by cross_validate for that metric.
metric_keys = {
    'Accuracy': 'test_accuracy',
    'Precision': 'test_precision_macro',
    'Recall': 'test_recall_macro',
    'F1': 'test_f1_macro',
}

# Neither object depends on the grid point, so build them once.
# cross_validate clones the estimator for every fold, so reuse is safe.
et = ExtraTreesClassifier(random_state=1234)
cv = KFold(n_splits=10, shuffle=True, random_state=1234)

for lam in lam_values:
    for p in p_values:
        # Subtract each spectrum's ALS baseline from its raw absorbance.
        df['Baseline_Corrected_Absorbance'] = df['Absorbance'] - df.groupby('SpecID')['Absorbance'].transform(
            lambda x: baseline_als_optimized(x, lam=lam, p=p)
        )
        # Scale each corrected spectrum by its own maximum.
        df['Scaled_Baseline'] = df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(
            lambda x: normalise(x)
        )

        baseline_corrected = prepare_wavelength_df(df, 'Scaled_Baseline')
        X = baseline_corrected.drop(['Status'], axis=1)
        y = baseline_corrected['Status']

        # The folds are independent, so evaluate them in parallel; the fixed
        # random_state values keep the scores identical to a serial run.
        scores = cross_validate(
            et, X, y, cv=cv,
            scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
            n_jobs=-1,
        )

        # Means first, then standard deviations, to keep the result column order.
        row = {'lam': lam, 'p': p}
        row.update({name: np.mean(scores[key]) for name, key in metric_keys.items()})
        row.update({f'{name} Std': np.std(scores[key]) for name, key in metric_keys.items()})
        results.append(row)

In [17]:
# Collect the per-combination metric dicts into one table (one row per grid point).
results_df = pd.DataFrame(results)
if results_df.empty:
    # An interrupted or skipped search leaves `results` empty; fail with a clear
    # message instead of a KeyError on the missing 'Accuracy' column.
    raise ValueError("No grid-search results to summarise - run the search cell to completion first.")

# Pick the grid point with the highest mean cross-validated accuracy.
best_idx = results_df['Accuracy'].idxmax()
best_row = results_df.loc[best_idx]

# Read parameters column-wise: pulling them from the row Series upcasts integer
# settings to float (printing e.g. lam=10000000.0).
print(f"Best Parameters: lam={results_df.at[best_idx, 'lam']}, p={results_df.at[best_idx, 'p']} with Accuracy: {best_row['Accuracy']:.4f}")

# Save the results to csv, best accuracy first.
results_df.sort_values('Accuracy', ascending=False).to_csv("Scaled_Baseline_Corrected_Results.csv", index=False)

Best Parameters: lam=10000000.0, p=0.001 with Accuracy: 0.8959


>#### Try with Different Smoothing Parameters (with scaling)

In [18]:
# Grid search over the Savitzky-Golay parameters (window, poly) with per-spectrum
# scaling: smooth, divide each spectrum by its maximum, then score an
# Extra Trees classifier with 10-fold CV.
results = []

# Reported column name -> key used by cross_validate for that metric.
metric_keys = {
    'Accuracy': 'test_accuracy',
    'Precision': 'test_precision_macro',
    'Recall': 'test_recall_macro',
    'F1': 'test_f1_macro',
}

# Neither object depends on the grid point, so build them once.
# cross_validate clones the estimator for every fold, so reuse is safe.
et = ExtraTreesClassifier(random_state=1234)
cv = KFold(n_splits=10, shuffle=True, random_state=1234)

for window in window_size:
    for poly in poly_order:
        # Smooth each spectrum independently.
        df['Smoothed_Absorbance'] = df.groupby('SpecID')['Absorbance'].transform(
            lambda x: savgol_filter(x, window, poly, deriv=0)
        )
        # Scale each smoothed spectrum by its own maximum.
        df['Scaled_Smooth'] = df.groupby('SpecID')['Smoothed_Absorbance'].transform(
            lambda x: normalise(x)
        )

        smoothed_absorbance = prepare_wavelength_df(df, 'Scaled_Smooth')
        X = smoothed_absorbance.drop(['Status'], axis=1)
        y = smoothed_absorbance['Status']

        # The folds are independent, so evaluate them in parallel; the fixed
        # random_state values keep the scores identical to a serial run.
        scores = cross_validate(
            et, X, y, cv=cv,
            scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
            n_jobs=-1,
        )

        # Means first, then standard deviations, to keep the result column order.
        row = {'window': window, 'poly': poly}
        row.update({name: np.mean(scores[key]) for name, key in metric_keys.items()})
        row.update({f'{name} Std': np.std(scores[key]) for name, key in metric_keys.items()})
        results.append(row)

In [19]:
# Collect the per-combination metric dicts into one table (one row per grid point).
results_df = pd.DataFrame(results)
if results_df.empty:
    # An interrupted or skipped search leaves `results` empty; fail with a clear
    # message instead of a KeyError on the missing 'Accuracy' column.
    raise ValueError("No grid-search results to summarise - run the search cell to completion first.")

# Pick the grid point with the highest mean cross-validated accuracy.
best_idx = results_df['Accuracy'].idxmax()
best_row = results_df.loc[best_idx]

# Read parameters column-wise: pulling them from the row Series upcasts integer
# settings to float (printing e.g. window_size=101.0).
print(f"Best Parameters: window_size={results_df.at[best_idx, 'window']}, poly_order={results_df.at[best_idx, 'poly']} with Accuracy: {best_row['Accuracy']:.4f}")

# Save the results to csv, best accuracy first.
results_df.sort_values('Accuracy', ascending=False).to_csv("Scaled_Smoothed_Absorbance_Results.csv", index=False)

Best Parameters: window_size=101.0, poly_order=2.0 with Accuracy: 0.9507


>##### Try with both

In [20]:
# Joint grid search with per-spectrum scaling: ALS baseline correction (lam, p),
# Savitzky-Golay smoothing (window, poly), division by each spectrum's maximum,
# then an Extra Trees classifier scored with 10-fold CV.
results = []

# Reported column name -> key used by cross_validate for that metric.
metric_keys = {
    'Accuracy': 'test_accuracy',
    'Precision': 'test_precision_macro',
    'Recall': 'test_recall_macro',
    'F1': 'test_f1_macro',
}

# Neither object depends on the grid point, so build them once.
# cross_validate clones the estimator for every fold, so reuse is safe.
et = ExtraTreesClassifier(random_state=1234)
cv = KFold(n_splits=10, shuffle=True, random_state=1234)

for lam in lam_values:
    for p in p_values:
        # The baseline depends only on (lam, p), so compute it once per pair
        # and reuse it for every smoothing setting below.
        df['Baseline_Corrected_Absorbance'] = df['Absorbance'] - df.groupby('SpecID')['Absorbance'].transform(
            lambda x: baseline_als_optimized(x, lam=lam, p=p)
        )

        for window in window_size:
            for poly in poly_order:
                # Smooth each baseline-corrected spectrum independently.
                df['Smoothed_Baseline'] = df.groupby('SpecID')['Baseline_Corrected_Absorbance'].transform(
                    lambda x: savgol_filter(x, window, poly, deriv=0)
                )
                # Scale each smoothed spectrum by its own maximum.
                df['Scaled_Smooth_Baseline'] = df.groupby('SpecID')['Smoothed_Baseline'].transform(
                    lambda x: normalise(x)
                )

                smoothed_baseline = prepare_wavelength_df(df, 'Scaled_Smooth_Baseline')
                X = smoothed_baseline.drop(['Status'], axis=1)
                y = smoothed_baseline['Status']

                # The folds are independent, so evaluate them in parallel; the fixed
                # random_state values keep the scores identical to a serial run.
                scores = cross_validate(
                    et, X, y, cv=cv,
                    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
                    n_jobs=-1,
                )

                # Means first, then standard deviations, to keep the result column order.
                row = {'lam': lam, 'p': p, 'window': window, 'poly': poly}
                row.update({name: np.mean(scores[key]) for name, key in metric_keys.items()})
                row.update({f'{name} Std': np.std(scores[key]) for name, key in metric_keys.items()})
                results.append(row)

In [21]:
# Collect the per-combination metric dicts into one table (one row per grid point).
results_df = pd.DataFrame(results)
if results_df.empty:
    # An interrupted or skipped search leaves `results` empty; fail with a clear
    # message instead of a KeyError on the missing 'Accuracy' column.
    raise ValueError("No grid-search results to summarise - run the search cell to completion first.")

# Pick the grid point with the highest mean cross-validated accuracy.
best_idx = results_df['Accuracy'].idxmax()
best_row = results_df.loc[best_idx]

# Read parameters column-wise: pulling them from the row Series upcasts integer
# settings to float (printing e.g. window_size=51.0).
print(f"Best Parameters: lam={results_df.at[best_idx, 'lam']}, p={results_df.at[best_idx, 'p']}, window_size={results_df.at[best_idx, 'window']}, poly_order={results_df.at[best_idx, 'poly']} with Accuracy: {best_row['Accuracy']:.4f}")

# Save the results to csv, best accuracy first.
results_df.sort_values('Accuracy', ascending=False).to_csv("Scaled_Smoothed_Baseline_Results.csv", index=False)

Best Parameters: lam=10000000.0, p=0.005, window_size=51.0, poly_order=2.0 with Accuracy: 0.9370
