### **This notebook will attempt to remove outliers from each group** 

In [477]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_validate, GroupKFold
from sklearn.metrics import accuracy_score

In [478]:
# Load the cleaned spectra (long format: one row per SpecID / WaveNumber pair).
# `spectra_df` is the frame that is filtered later on to build `clean_df`.
spectra_df = pd.read_csv("../../data/current_clean_spectrum.csv")
# Alternative input (raw spectra), kept for reference:
#spectra_df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")

In [479]:
def normalise(absorbances):
    """Scale a set of absorbances by their maximum value.

    Parameters
    ----------
    absorbances : array-like of numbers (e.g. a numpy array or pandas Series)
        The values to rescale.

    Returns
    -------
    Same container type as the input, with every value divided by
    ``np.max(absorbances)``, so the largest input value maps to 1.

    Notes
    -----
    There is no guard for a maximum of zero; the division is performed as-is.
    """
    return absorbances / np.max(absorbances)

# Build `df` explicitly so the notebook survives Restart & Run All: every later
# cell reads `df`, but no cell assigned it. Work on a copy so that `spectra_df`
# keeps the original absorbances, then scale each spectrum (SpecID) by its own
# maximum absorbance.
df = spectra_df.copy()
df['Absorbance'] = df.groupby('SpecID')['Absorbance'].transform(normalise)

In [480]:
#sns.lineplot(data=df, x='WaveNumber', y='Absorbance', hue='Status', errorbar=('ci', False))

In [481]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
0,201210-1-00,293,400.22778,201210-1,Normal,1.000000
1,201210-1-00,294,400.91116,201210-1,Normal,0.995483
2,201210-1-00,295,401.59454,201210-1,Normal,0.995081
3,201210-1-00,296,402.27789,201210-1,Normal,0.988640
4,201210-1-00,297,402.96127,201210-1,Normal,0.985799
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,210526-3,Hyperglycemia,0.204714
6239201,210526-3-09,2338,1797.72200,210526-3,Hyperglycemia,0.202320
6239202,210526-3-09,2339,1798.40550,210526-3,Hyperglycemia,0.203623
6239203,210526-3-09,2340,1799.08890,210526-3,Hyperglycemia,0.202461


Test on one surface

In [482]:
#df = df[df['SurID'] == '201210-1']

##### **Function to calculate the Absorbance quartiles for each SurID group at each WaveNumber**


In [483]:
# Correctly define the function to apply multiple aggregations

def get_surface_wavenumber_quartiles(df):
    """Summarise Absorbance per (SurID, WaveNumber) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'SurID', 'WaveNumber' and 'Absorbance'.

    Returns
    -------
    pandas.DataFrame
        One row per (SurID, WaveNumber) pair with the columns 'SurID',
        'WaveNumber', 'Lower_Quartile' (0.25 quantile), 'Median' and
        'Upper_Quartile' (0.75 quantile) of 'Absorbance'.
    """
    # Output column name -> (source column, aggregation)
    absorbance_stats = {
        'Lower_Quartile': ('Absorbance', lambda values: values.quantile(0.25)),
        'Median': ('Absorbance', 'median'),
        'Upper_Quartile': ('Absorbance', lambda values: values.quantile(0.75)),
    }

    per_surface_wavenumber = df.groupby(['SurID', 'WaveNumber'])
    summary = per_surface_wavenumber.agg(**absorbance_stats)

    # Turn the group keys back into ordinary columns
    return summary.reset_index()

# Summarise every (SurID, WaveNumber) group and append the interquartile range
# (upper quartile minus lower quartile) as an extra column.
quartile_data = get_surface_wavenumber_quartiles(df).assign(
    IQR=lambda stats: stats['Upper_Quartile'] - stats['Lower_Quartile']
)

In [484]:
quartile_data

Unnamed: 0,SurID,WaveNumber,Lower_Quartile,Median,Upper_Quartile,IQR
0,201210-1,400.22778,0.526941,0.601204,0.712468,0.185527
1,201210-1,400.91116,0.528315,0.602977,0.703482,0.175167
2,201210-1,401.59454,0.531546,0.603377,0.703626,0.172080
3,201210-1,402.27789,0.528924,0.598372,0.705787,0.176863
4,201210-1,402.96127,0.526553,0.601487,0.712526,0.185973
...,...,...,...,...,...,...
129082,210526-3,1797.03870,0.710877,0.818616,0.865193,0.154316
129083,210526-3,1797.72200,0.713776,0.815285,0.865033,0.151256
129084,210526-3,1798.40550,0.714516,0.815624,0.862259,0.147743
129085,210526-3,1799.08890,0.708963,0.818961,0.861205,0.152242


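Next we flag outlier points: absorbance values that lie more than 1.5 times the IQR below the lower quartile or above the upper quartile at each WaveNumber within their SurID group. Spectra containing too many of these points are removed further down.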

In [485]:
# Attach each group's quartile statistics to every row belonging to that group
merged_df = pd.merge(df, quartile_data, on=['SurID', 'WaveNumber'])
merged_df

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance,Lower_Quartile,Median,Upper_Quartile,IQR
0,201210-1-00,293,400.22778,201210-1,Normal,1.000000,0.526941,0.601204,0.712468,0.185527
1,201210-1-00,294,400.91116,201210-1,Normal,0.995483,0.528315,0.602977,0.703482,0.175167
2,201210-1-00,295,401.59454,201210-1,Normal,0.995081,0.531546,0.603377,0.703626,0.172080
3,201210-1-00,296,402.27789,201210-1,Normal,0.988640,0.528924,0.598372,0.705787,0.176863
4,201210-1-00,297,402.96127,201210-1,Normal,0.985799,0.526553,0.601487,0.712526,0.185973
...,...,...,...,...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,210526-3,Hyperglycemia,0.204714,0.710877,0.818616,0.865193,0.154316
6239201,210526-3-09,2338,1797.72200,210526-3,Hyperglycemia,0.202320,0.713776,0.815285,0.865033,0.151256
6239202,210526-3-09,2339,1798.40550,210526-3,Hyperglycemia,0.203623,0.714516,0.815624,0.862259,0.147743
6239203,210526-3-09,2340,1799.08890,210526-3,Hyperglycemia,0.202461,0.708963,0.818961,0.861205,0.152242


In [486]:
# Keep only the rows whose Absorbance falls beyond the 1.5 * IQR fences of
# their (SurID, WaveNumber) group: below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
outliers = merged_df.loc[
    lambda m: (m['Absorbance'] < (m['Lower_Quartile'] - 1.5 * m['IQR']))
    | (m['Absorbance'] > (m['Upper_Quartile'] + 1.5 * m['IQR']))
]

outliers

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance,Lower_Quartile,Median,Upper_Quartile,IQR
0,201210-1-00,293,400.22778,201210-1,Normal,1.000000,0.526941,0.601204,0.712468,0.185527
1,201210-1-00,294,400.91116,201210-1,Normal,0.995483,0.528315,0.602977,0.703482,0.175167
2,201210-1-00,295,401.59454,201210-1,Normal,0.995081,0.531546,0.603377,0.703626,0.172080
3,201210-1-00,296,402.27789,201210-1,Normal,0.988640,0.528924,0.598372,0.705787,0.176863
7,201210-1-00,300,405.01138,201210-1,Normal,0.992970,0.530212,0.603619,0.709135,0.178923
...,...,...,...,...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,210526-3,Hyperglycemia,0.204714,0.710877,0.818616,0.865193,0.154316
6239201,210526-3-09,2338,1797.72200,210526-3,Hyperglycemia,0.202320,0.713776,0.815285,0.865033,0.151256
6239202,210526-3-09,2339,1798.40550,210526-3,Hyperglycemia,0.203623,0.714516,0.815624,0.862259,0.147743
6239203,210526-3-09,2340,1799.08890,210526-3,Hyperglycemia,0.202461,0.708963,0.818961,0.861205,0.152242


**If more than a defined proportion of a spectrum's points are outliers, drop that spectrum from the dataframe**

In [487]:
# Number of distinct WaveNumber values across the whole frame; used below as
# the denominator when turning per-spectrum outlier counts into proportions.
# NOTE(review): this equals the length of an individual spectrum only if every
# SpecID is sampled on the same WaveNumber grid - not checked here, TODO confirm.
spectra_length = df['WaveNumber'].nunique()  # This should be consistent for all spectra
spectra_length

2049

In [488]:
# Share of each spectrum's points that were flagged as outliers.
# Spectra without any flagged point do not appear in this Series.
outlier_proportions = outliers['SpecID'].value_counts().div(spectra_length)
outlier_proportions

SpecID
210407-1-08    0.945827
210526-3-08    0.939483
210519-3-08    0.935090
210526-3-09    0.934114
210526-1-07    0.927282
                 ...   
210429-1-36    0.000488
210504-1-24    0.000488
210429-1-44    0.000488
210519-3-19    0.000488
210526-1-06    0.000488
Name: count, Length: 2038, dtype: float64

**Set the proportion of outlier points above which a spectrum is dropped.**

In [559]:
# A spectrum is dropped when strictly more than this share of its points are outliers
proportion_cutoff = 0.6
outlier_SpecIDs = outlier_proportions.loc[lambda share: share > proportion_cutoff].index
print(outlier_SpecIDs, len(outlier_SpecIDs), sep='\n')

Index(['210407-1-08', '210526-3-08', '210519-3-08', '210526-3-09',
       '210526-1-07', '201210-1-04', '210304-2-36', '210526-3-07',
       '210304-2-37', '210526-1-09', '210519-3-09', '210407-1-09',
       '210526-1-08', '210304-2-38'],
      dtype='object', name='SpecID')
14


Drop the outlier spectra from the dataset

In [560]:
# Filter the loaded spectra (`spectra_df`), keeping every row whose SpecID was
# not flagged as an outlier spectrum.
clean_df = spectra_df.loc[lambda spectra: ~spectra['SpecID'].isin(outlier_SpecIDs)]

In [561]:
clean_df

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
0,201210-1-00,293,400.22778,201210-1,Normal,41.863303
1,201210-1-00,294,400.91116,201210-1,Normal,41.803843
2,201210-1-00,295,401.59454,201210-1,Normal,41.741884
3,201210-1-00,296,402.27789,201210-1,Normal,41.677722
4,201210-1-00,297,402.96127,201210-1,Normal,41.611654
...,...,...,...,...,...,...
6233053,210526-3-06,2337,1797.03870,210526-3,Hyperglycemia,4.724717
6233054,210526-3-06,2338,1797.72200,210526-3,Hyperglycemia,5.836103
6233055,210526-3-06,2339,1798.40550,210526-3,Hyperglycemia,7.071687
6233056,210526-3-06,2340,1799.08890,210526-3,Hyperglycemia,8.436967


In [562]:
# clean_df.to_csv("../../data/outlier_spectra_removed.csv", index=False)

#### **Check model performance when outliers are removed**

In [563]:
clean_df

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
0,201210-1-00,293,400.22778,201210-1,Normal,41.863303
1,201210-1-00,294,400.91116,201210-1,Normal,41.803843
2,201210-1-00,295,401.59454,201210-1,Normal,41.741884
3,201210-1-00,296,402.27789,201210-1,Normal,41.677722
4,201210-1-00,297,402.96127,201210-1,Normal,41.611654
...,...,...,...,...,...,...
6233053,210526-3-06,2337,1797.03870,210526-3,Hyperglycemia,4.724717
6233054,210526-3-06,2338,1797.72200,210526-3,Hyperglycemia,5.836103
6233055,210526-3-06,2339,1798.40550,210526-3,Hyperglycemia,7.071687
6233056,210526-3-06,2340,1799.08890,210526-3,Hyperglycemia,8.436967


In [564]:
def prepare_wavelength_df(df, absorbance_col, status_col='Status'):
    """Reshape long-format spectra into a wide frame indexed by SpecID.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with the columns 'SpecID', 'WaveNumber', 'SurID',
        ``absorbance_col`` and ``status_col``.
    absorbance_col : str
        Name of the column whose values fill the wide table.
    status_col : str, default 'Status'
        Name of the label column carried over to the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'SpecID', with one column per WaveNumber value followed by
        the 'SurID' and ``status_col`` columns.
    """
    # Long -> wide: one column per WaveNumber, SpecID back as a regular column
    spectra_wide = df.pivot(index='SpecID', columns='WaveNumber', values=absorbance_col)
    spectra_wide = spectra_wide.reset_index()
    spectra_wide.columns.name = None  # drop the leftover 'WaveNumber' axis label

    # Distinct (SpecID, SurID, status) combinations; SurID is carried along so
    # that callers can use it as the grouping key for GroupKFold CV.
    spectrum_labels = df.loc[:, ['SpecID', 'SurID', status_col]].drop_duplicates()

    labelled = spectra_wide.merge(spectrum_labels, on='SpecID')
    return labelled.set_index('SpecID')

In [565]:
def evaluate_extra_trees(df):
    """Cross-validate an ExtraTreesClassifier with folds grouped by surface.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample. The 'Status' column is the target, the 'SurID'
        column is the grouping key, and every remaining column is a feature.

    Returns
    -------
    None
        The mean and standard deviation of the per-fold accuracies are printed.
    """
    # Split the frame into grouping key, target and feature matrix
    surfaces = df['SurID']
    labels = df['Status']
    features = df.drop(['Status', 'SurID'], axis=1)

    et = ExtraTreesClassifier(random_state=1234)

    # GroupKFold keeps all rows sharing a SurID inside the same fold
    splitter = GroupKFold(n_splits=10)

    scores = []
    for train_index, test_index in splitter.split(features, labels, surfaces):
        # Refit the classifier on this fold's training rows
        et.fit(features.iloc[train_index], labels.iloc[train_index])

        # Score the held-out rows
        fold_predictions = et.predict(features.iloc[test_index])
        scores.append(accuracy_score(labels.iloc[test_index], fold_predictions))

    # Displaying the results
    print(f'{et.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

In [566]:
# Pivot the outlier-filtered spectra into a wide frame indexed by SpecID
# (one column per WaveNumber plus SurID and Status).
wavelength_df = prepare_wavelength_df(clean_df, 'Absorbance')

In [567]:
wavelength_df

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,41.863303,41.803843,41.741884,41.677722,41.611654,41.543974,41.474980,41.404968,41.334234,41.263073,...,6.280946,5.549559,4.745724,3.866578,2.909255,1.870891,0.748623,-0.460415,201210-1,Normal
201210-1-01,46.314608,47.323684,48.299209,49.241395,50.150457,51.026608,51.870063,52.681035,53.459738,54.206386,...,6.769011,7.280928,7.861246,8.512936,9.238972,10.042323,10.925962,11.892860,201210-1,Normal
201210-1-02,118.159018,114.686240,111.563911,108.777452,106.312282,104.153823,102.287493,100.698715,99.372907,98.295491,...,-4.633601,-4.557349,-4.439365,-4.278894,-4.075180,-3.827470,-3.535010,-3.197043,201210-1,Normal
201210-1-03,175.466997,174.846086,174.188020,173.498226,172.782129,172.045155,171.292728,170.530275,169.763222,168.996993,...,-10.801936,-10.349539,-9.864191,-9.347124,-8.799567,-8.222752,-7.617909,-6.986269,201210-1,Normal
201210-1-05,143.834972,140.283156,137.083266,134.224502,131.696062,129.487147,127.586955,125.984687,124.669540,123.630716,...,-14.814007,-14.904465,-14.998892,-15.100839,-15.213854,-15.341490,-15.487294,-15.654819,201210-1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,11.003178,12.008451,12.879986,13.624553,14.248922,14.759862,15.164145,15.468539,15.679816,15.804744,...,6.671003,7.197072,7.711103,8.211164,8.695324,9.161650,9.608210,10.033072,210526-3,Hyperglycemia
210526-3-46,14.239653,13.596345,12.981340,12.396227,11.842591,11.322020,10.836101,10.386421,9.974566,9.602123,...,2.124074,2.718437,3.405898,4.190762,5.077335,6.069920,7.172824,8.390352,210526-3,Hyperglycemia
210526-3-47,17.165901,17.349777,17.457499,17.494696,17.466999,17.380039,17.239446,17.050852,16.819887,16.552181,...,6.895403,7.740965,8.633346,9.572405,10.558000,11.589990,12.668233,13.792588,210526-3,Hyperglycemia
210526-3-48,15.865434,14.551958,13.412815,12.441007,11.629535,10.971400,10.459605,10.087150,9.847038,9.732269,...,8.137265,9.444800,10.919724,12.570428,14.405304,16.432743,18.661137,21.098876,210526-3,Hyperglycemia


In [568]:
# Run the surface-grouped cross-validation; prints mean accuracy +/- std over the folds
evaluate_extra_trees(wavelength_df)

ExtraTreesClassifier Cross-Validation Accuracy: 0.6056 +/- 0.1322
