# Outlier Removal

This notebook investigates whether model performance improves when we drop spectra in which more than a given proportion of the absorbance values are considered outliers.

Import relevant libraries

In [54]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_validate, GroupKFold
from sklearn.metrics import accuracy_score
import sys
sys.path.append('..')  # Adds the parent directory to the path so Python can find the `Cleaning_and_Evaluation` package
from Cleaning_and_Evaluation import *

Import and clean raw spectra data

In [55]:
# Load the spectra. `df` and `spectra_df` are two names for the SAME DataFrame
# object (no copy is made), so any in-place change made through one name is
# visible through the other.
df = spectra_df = pd.read_csv("../data/exosomes.raw_spectrum_400-1800.csv")

In [56]:
# Options forwarded as keyword arguments to `spectra_cleaning`.
cleaning_params = dict(
    # Boolean switches
    despike=True,
    baseline_correct=True,
    smoothing=True,
    scaling=False,
    # Despiking settings
    despike_ma=20,
    despike_threshold=7,
    # NOTE(review): presumably baseline-correction (lam, p) and smoothing
    # (window_size, poly_order) settings — confirm against `spectra_cleaning`.
    lam=10**8,
    p=0.01,
    window_size=51,
    poly_order=3,
)
# The return value is not captured here; later cells keep working with `df`.
spectra_cleaning(df, **cleaning_params)

In [57]:
# Preview the first rows of `df` after the cleaning call.
df.head()

Unnamed: 0,index,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,293,201210-1-00,293,400.22778,41.863303,201210-1,Normal
1,294,201210-1-00,294,400.91116,41.803843,201210-1,Normal
2,295,201210-1-00,295,401.59454,41.741884,201210-1,Normal
3,296,201210-1-00,296,402.27789,41.677722,201210-1,Normal
4,297,201210-1-00,297,402.96127,41.611654,201210-1,Normal


Here we show the performance using the entire spectra without dropping outliers. This serves as a baseline to compare to.

In [58]:
# Baseline run: every spectrum is kept, nothing is filtered out.
# The classifier is seeded and reused by the later evaluation cells.
et = ExtraTreesClassifier(random_state=1234)

wavelength_df = prepare_wavelength_df(df, "Absorbance")
evaluate_model(wavelength_df, et)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.6119 +/- 0.1114
ExtraTreesClassifier Cross-Validation Precision: 0.6103 +/- 0.1103
ExtraTreesClassifier Cross-Validation Recall: 0.6215 +/- 0.1349
ExtraTreesClassifier Cross-Validation F1-Score: 0.5949 +/- 0.1254


Function to calculate the absorbance quartiles (lower quartile, median, upper quartile) for each combination of SurID and WaveNumber


In [60]:
# Quartile summary of absorbance per surface and wavenumber

def get_surface_wavenumber_quartiles(df):
    """Summarise 'Absorbance' for every (SurID, WaveNumber) combination.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns 'SurID', 'WaveNumber' and 'Absorbance'.

    Returns
    -------
    DataFrame
        One row per (SurID, WaveNumber) pair with the columns 'SurID',
        'WaveNumber', 'Lower_Quartile' (0.25 quantile), 'Median' and
        'Upper_Quartile' (0.75 quantile).
    """

    def _lower_quartile(values):
        return values.quantile(0.25)

    def _upper_quartile(values):
        return values.quantile(0.75)

    # Named aggregation: each keyword becomes an output column.
    per_group = df.groupby(['SurID', 'WaveNumber'])
    summary = per_group.agg(
        Lower_Quartile=('Absorbance', _lower_quartile),
        Median=('Absorbance', 'median'),
        Upper_Quartile=('Absorbance', _upper_quartile),
    )
    # Turn the group keys back into ordinary columns.
    return summary.reset_index()

# Quartiles of `df` per (SurID, WaveNumber), with the interquartile range
# (upper quartile minus lower quartile) appended as the 'IQR' column.
quartile_data = get_surface_wavenumber_quartiles(df).assign(
    IQR=lambda stats: stats['Upper_Quartile'] - stats['Lower_Quartile']
)

In [61]:
# Preview the per-(SurID, WaveNumber) quartiles and IQR.
quartile_data.head()

Unnamed: 0,SurID,WaveNumber,Lower_Quartile,Median,Upper_Quartile,IQR
0,201210-1,400.22778,-4.960245,5.810807,22.825529,27.785774
1,201210-1,400.91116,-4.334017,4.900007,21.091229,25.425247
2,201210-1,401.59454,-4.300642,4.760513,20.074852,24.375494
3,201210-1,402.27789,-3.981677,3.272608,19.067517,23.049194
4,201210-1,402.96127,-3.755034,2.603336,18.284681,22.039715


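Now we identify outlying absorbance values: those that lie more than 1.5 times the IQR below the lower quartile or above the upper quartile for their SurID and WaveNumber. Spectra with too many such values are removed further below.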

In [62]:
# Attach the quartile statistics of the matching (SurID, WaveNumber) group
# to every individual absorbance measurement.
merged_df = pd.merge(df, quartile_data, on=['SurID', 'WaveNumber'])
merged_df.head()

Unnamed: 0,index,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,Lower_Quartile,Median,Upper_Quartile,IQR
0,293,201210-1-00,293,400.22778,41.863303,201210-1,Normal,-4.960245,5.810807,22.825529,27.785774
1,2928,201210-1-01,293,400.22778,46.314608,201210-1,Normal,-4.960245,5.810807,22.825529,27.785774
2,5563,201210-1-02,293,400.22778,118.159018,201210-1,Normal,-4.960245,5.810807,22.825529,27.785774
3,8198,201210-1-03,293,400.22778,175.466997,201210-1,Normal,-4.960245,5.810807,22.825529,27.785774
4,10833,201210-1-04,293,400.22778,111.814973,201210-1,Normal,-4.960245,5.810807,22.825529,27.785774


In [63]:
# Fences at 1.5 * IQR beyond the quartiles of each (SurID, WaveNumber) group
iqr_margin = 1.5 * merged_df['IQR']
lower_fence = merged_df['Lower_Quartile'] - iqr_margin
upper_fence = merged_df['Upper_Quartile'] + iqr_margin

# A measurement is an outlier when it falls strictly outside the fences
is_outlier = (merged_df['Absorbance'] < lower_fence) | (merged_df['Absorbance'] > upper_fence)
outliers = merged_df.loc[is_outlier]

outliers.head()

Unnamed: 0,index,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,Lower_Quartile,Median,Upper_Quartile,IQR
2,5563,201210-1-02,293,400.22778,118.159018,201210-1,Normal,-4.960245,5.810807,22.825529,27.785774
3,8198,201210-1-03,293,400.22778,175.466997,201210-1,Normal,-4.960245,5.810807,22.825529,27.785774
4,10833,201210-1-04,293,400.22778,111.814973,201210-1,Normal,-4.960245,5.810807,22.825529,27.785774
5,13468,201210-1-05,293,400.22778,143.834972,201210-1,Normal,-4.960245,5.810807,22.825529,27.785774
6,16103,201210-1-06,293,400.22778,96.603892,201210-1,Normal,-4.960245,5.810807,22.825529,27.785774


Calculate the proportion of each spectrum's absorbance values that are outliers

In [65]:
# Number of distinct wavenumbers in the dataset, used as the length of every spectrum.
# NOTE(review): this assumes all spectra have the same number of points — not verified here.
spectra_length = df['WaveNumber'].nunique()

# Outlier measurements per spectrum, as a fraction of the spectrum length.
# Spectra without any outlier measurement do not appear in the result.
outlier_counts = outliers['SpecID'].value_counts()
outlier_proportions = outlier_counts.div(spectra_length)
outlier_proportions

SpecID
210114-2-41    0.973646
210114-2-33    0.956564
210114-2-35    0.910688
210114-2-34    0.899463
210114-2-32    0.877501
                 ...   
210325-2-10    0.000488
210325-1-38    0.000488
210428-1-13    0.000488
210415-2-11    0.000488
210421-1-40    0.000488
Name: count, Length: 2179, dtype: float64

Set the proportion of outlying values above which a spectrum is dropped.

In [89]:
# Spectra whose share of outlying values is strictly above the cutoff are dropped.
proportion_cutoff = 0.1
exceeds_cutoff = outlier_proportions.gt(proportion_cutoff)
outlier_SpecIDs = outlier_proportions[exceeds_cutoff].index

# One print call, one item per line
print(
    "Outlier Spectra:",
    outlier_SpecIDs,
    f"Number of spectra dropped: {len(outlier_SpecIDs)}",
    sep="\n",
)

Outlier Spectra:
Index(['210114-2-41', '210114-2-33', '210114-2-35', '210114-2-34',
       '210114-2-32', '210325-1-33', '210519-2-37', '210114-1-25',
       '210114-2-40', '210414-1-01',
       ...
       '210217-2-49', '210428-3-34', '210310-1-07', '210505-1-31',
       '210114-1-44', '210318-2-35', '210217-2-38', '210311-1-12',
       '210428-1-01', '210211-1-03'],
      dtype='object', name='SpecID', length=561)
Number of spectra dropped: 561


Drop the outlier spectra from the dataset

In [90]:
# Keep only the rows whose spectrum is not on the outlier list.
# NOTE(review): `spectra_df` is the same object as `df` (aliased at load time),
# so this filters the very frame the outliers were computed from — confirm that is intended.
is_outlier_spectrum = spectra_df['SpecID'].isin(outlier_SpecIDs)
clean_df = spectra_df.loc[~is_outlier_spectrum]

In [68]:
# Optional export of the filtered spectra (left disabled).
# NOTE(review): this path goes up two directories ("../../data"), whereas the input
# CSV is read from "../data" — confirm the intended location before enabling.
# clean_df.to_csv("../../data/outlier_spectra_removed.csv", index=False)

#### **Check model performance when outliers are removed**

In [91]:
# Rebuild the model input from the filtered spectra. This rebinds `wavelength_df`,
# which previously held the baseline (unfiltered) data.
wavelength_df = prepare_wavelength_df(clean_df, 'Absorbance')

Before evaluating model, check that we haven't dropped any surfaces.

In [92]:
# Compare against the number of surfaces actually present in the full dataset
# instead of a hard-coded total, so the check stays correct if the input data changes.
total_surfaces = df['SurID'].nunique()
remaining_surfaces = wavelength_df['SurID'].nunique()
print(f"Surfaces remaining: {remaining_surfaces} out of {total_surfaces}")

Surfaces remaining: 63 out of 63


In [93]:
# Same classifier object and evaluation routine as the baseline run, now on the filtered spectra.
evaluate_model(wavelength_df, et)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.5771 +/- 0.1689
ExtraTreesClassifier Cross-Validation Precision: 0.5471 +/- 0.1592
ExtraTreesClassifier Cross-Validation Recall: 0.5533 +/- 0.1806
ExtraTreesClassifier Cross-Validation F1-Score: 0.5177 +/- 0.1768


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Check a range of outlier proportions

In [94]:
# Sweep the drop cutoff from 0.10 to 0.90 in steps of 0.05 and evaluate each setting.
for cutoff_percent in range(10, 91, 5):
    sweep_cutoff = cutoff_percent / 100  # Convert to proportion

    # Loop-local names are used on purpose: the single-cutoff results from the
    # earlier cells (proportion_cutoff, outlier_SpecIDs, clean_df, wavelength_df)
    # stay intact, so the notebook state keeps matching the outputs shown above.
    sweep_outlier_ids = outlier_proportions[outlier_proportions > sweep_cutoff].index
    sweep_clean_df = spectra_df[~spectra_df['SpecID'].isin(sweep_outlier_ids)]
    sweep_wavelength_df = prepare_wavelength_df(sweep_clean_df, 'Absorbance')
    print(sweep_cutoff)
    evaluate_model(sweep_wavelength_df, et)

0.1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.5771 +/- 0.1689
ExtraTreesClassifier Cross-Validation Precision: 0.5471 +/- 0.1592
ExtraTreesClassifier Cross-Validation Recall: 0.5533 +/- 0.1806
ExtraTreesClassifier Cross-Validation F1-Score: 0.5177 +/- 0.1768
0.15


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.5969 +/- 0.1737
ExtraTreesClassifier Cross-Validation Precision: 0.5820 +/- 0.1711
ExtraTreesClassifier Cross-Validation Recall: 0.5930 +/- 0.1684
ExtraTreesClassifier Cross-Validation F1-Score: 0.5708 +/- 0.1765
0.2


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.5847 +/- 0.1060
ExtraTreesClassifier Cross-Validation Precision: 0.5816 +/- 0.1151
ExtraTreesClassifier Cross-Validation Recall: 0.5581 +/- 0.1562
ExtraTreesClassifier Cross-Validation F1-Score: 0.5438 +/- 0.1411
0.25
ExtraTreesClassifier Cross-Validation Accuracy: 0.6366 +/- 0.1277
ExtraTreesClassifier Cross-Validation Precision: 0.6402 +/- 0.1141
ExtraTreesClassifier Cross-Validation Recall: 0.6599 +/- 0.1393
ExtraTreesClassifier Cross-Validation F1-Score: 0.6244 +/- 0.1266
0.3


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.5707 +/- 0.1396
ExtraTreesClassifier Cross-Validation Precision: 0.5168 +/- 0.1502
ExtraTreesClassifier Cross-Validation Recall: 0.4891 +/- 0.1814
ExtraTreesClassifier Cross-Validation F1-Score: 0.4823 +/- 0.1587
0.35


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.5778 +/- 0.1338
ExtraTreesClassifier Cross-Validation Precision: 0.5741 +/- 0.1408
ExtraTreesClassifier Cross-Validation Recall: 0.5849 +/- 0.1718
ExtraTreesClassifier Cross-Validation F1-Score: 0.5397 +/- 0.1482
0.4


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.5874 +/- 0.1243
ExtraTreesClassifier Cross-Validation Precision: 0.5391 +/- 0.1034
ExtraTreesClassifier Cross-Validation Recall: 0.5386 +/- 0.1248
ExtraTreesClassifier Cross-Validation F1-Score: 0.5047 +/- 0.1220
0.45


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.5791 +/- 0.1174
ExtraTreesClassifier Cross-Validation Precision: 0.5553 +/- 0.1315
ExtraTreesClassifier Cross-Validation Recall: 0.5089 +/- 0.1269
ExtraTreesClassifier Cross-Validation F1-Score: 0.5038 +/- 0.1210
0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.5864 +/- 0.0759
ExtraTreesClassifier Cross-Validation Precision: 0.5644 +/- 0.0822
ExtraTreesClassifier Cross-Validation Recall: 0.5747 +/- 0.0893
ExtraTreesClassifier Cross-Validation F1-Score: 0.5371 +/- 0.0873
0.55


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.5757 +/- 0.1500
ExtraTreesClassifier Cross-Validation Precision: 0.5661 +/- 0.1445
ExtraTreesClassifier Cross-Validation Recall: 0.5357 +/- 0.1670
ExtraTreesClassifier Cross-Validation F1-Score: 0.5279 +/- 0.1555
0.6


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.6101 +/- 0.0973
ExtraTreesClassifier Cross-Validation Precision: 0.5741 +/- 0.0850
ExtraTreesClassifier Cross-Validation Recall: 0.5839 +/- 0.1038
ExtraTreesClassifier Cross-Validation F1-Score: 0.5622 +/- 0.0873
0.65


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.6034 +/- 0.0880
ExtraTreesClassifier Cross-Validation Precision: 0.6011 +/- 0.0904
ExtraTreesClassifier Cross-Validation Recall: 0.5923 +/- 0.0990
ExtraTreesClassifier Cross-Validation F1-Score: 0.5650 +/- 0.0845
0.7


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.6035 +/- 0.0738
ExtraTreesClassifier Cross-Validation Precision: 0.5529 +/- 0.0827
ExtraTreesClassifier Cross-Validation Recall: 0.5480 +/- 0.1259
ExtraTreesClassifier Cross-Validation F1-Score: 0.5276 +/- 0.0972
0.75


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.6028 +/- 0.1069
ExtraTreesClassifier Cross-Validation Precision: 0.5669 +/- 0.0959
ExtraTreesClassifier Cross-Validation Recall: 0.5607 +/- 0.1276
ExtraTreesClassifier Cross-Validation F1-Score: 0.5450 +/- 0.1071
0.8


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.6151 +/- 0.1243
ExtraTreesClassifier Cross-Validation Precision: 0.5654 +/- 0.1191
ExtraTreesClassifier Cross-Validation Recall: 0.5638 +/- 0.1271
ExtraTreesClassifier Cross-Validation F1-Score: 0.5431 +/- 0.1183
0.85


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.5827 +/- 0.1148
ExtraTreesClassifier Cross-Validation Precision: 0.5810 +/- 0.1062
ExtraTreesClassifier Cross-Validation Recall: 0.5852 +/- 0.1474
ExtraTreesClassifier Cross-Validation F1-Score: 0.5591 +/- 0.1235
0.9


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ExtraTreesClassifier Cross-Validation Accuracy: 0.5857 +/- 0.1148
ExtraTreesClassifier Cross-Validation Precision: 0.5836 +/- 0.1080
ExtraTreesClassifier Cross-Validation Recall: 0.5877 +/- 0.1473
ExtraTreesClassifier Cross-Validation F1-Score: 0.5631 +/- 0.1261
