## This notebook will construct the nodes and edges that we will build our graph with

Import relevant libraries and import raw data

In [2]:
import seaborn as sns
import pandas as pd
from scipy.signal import find_peaks
import numpy as np
import matplotlib.pyplot as plt

In [223]:
df = pd.read_csv("../../data/current_clean_spectrum.csv")

In [224]:
def normalise(absorbances):
    max_value = np.max(absorbances)
    normalized_absorbances = absorbances / max_value
    return normalized_absorbances

df['Absorbance'] = df.groupby('SpecID')['Absorbance'].transform(lambda x: normalise(x))

In [5]:
#sns.lineplot(data=df, x='WaveNumber', y='Absorbance', hue='Status', errorbar=('ci', False))

Here we use scipy's function "find_peaks" to find all peaks in each individual spectra sample as well as pulling featurse such as prominence and width of peaks

In [225]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
0,201210-1-00,293,400.22778,201210-1,Normal,0.203556
1,201210-1-00,294,400.91116,201210-1,Normal,0.203010
2,201210-1-00,295,401.59454,201210-1,Normal,0.202464
3,201210-1-00,296,402.27789,201210-1,Normal,0.201917
4,201210-1-00,297,402.96127,201210-1,Normal,0.201370
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,210526-3,Hyperglycemia,-0.078901
6239201,210526-3-09,2338,1797.72200,210526-3,Hyperglycemia,-0.074710
6239202,210526-3-09,2339,1798.40550,210526-3,Hyperglycemia,-0.070333
6239203,210526-3-09,2340,1799.08890,210526-3,Hyperglycemia,-0.065772


In [226]:
# Assuming 'df' is your DataFrame
# Step 1: Group by 'SurID'
groups = df.groupby('SurID')

In [227]:
print(groups['Absorbance'])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000023E0489B580>


In [228]:
# Assuming 'df' is your DataFrame
# Function to identify samples where at least 25% of values are outliers
def identify_outliers(group):
    Q1 = group['Absorbance'].quantile(0.25)
    Q3 = group['Absorbance'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Determine if a measurement is an outlier
    is_outlier = (group['Absorbance'] < lower_bound) | (group['Absorbance'] > upper_bound)
    outlier_prop = is_outlier.mean()  # Proportion of measurements that are outliers
    
    # Return the SpecID if outlier proportion is >= 0.25
    if outlier_prop >= 0.04:
        return group['SpecID'].iloc[0]
    return pd.NA  # Use pandas NA for non-outliers or when the condition is not met

# Apply the function to each 'SpecID' group within each 'SurID' and flatten the result
outlier_specids = df.groupby('SurID').apply(lambda x: x.groupby('SpecID').apply(identify_outliers)).dropna()

# Convert to a flat list of unique SpecIDs
outlier_specids = pd.Series(outlier_specids.values.flatten()).dropna().unique()

# Remove samples (SpecID) where ≥ 25% of 'Absorbance' values are outliers
clean_df = df[~df['SpecID'].isin(outlier_specids)]

In [229]:
print(clean_df['SpecID'].nunique())

2065


In [230]:
df = clean_df

In [231]:
df = df.groupby(["SpecID", "WaveNumber", "Status", "SurID"])["Absorbance"].median().reset_index()

In [232]:
pivoted_df = df.pivot(index='SpecID', columns='WaveNumber', values='Absorbance').reset_index()
pivoted_df.columns.name = None
statuses = df[['SpecID', 'Status']].drop_duplicates()
pivoted_df = pd.merge(pivoted_df, statuses, on='SpecID')
pivoted_df = pivoted_df.set_index('SpecID')

In [213]:
pivoted_df

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.3053,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,0.203556,0.203010,0.202464,0.201917,0.201370,0.200822,0.200273,0.199724,0.199174,0.198624,...,-0.075285,-0.080143,-0.085290,-0.090727,-0.096452,-0.102466,-0.108769,-0.115361,-0.122242,Normal
201210-1-01,0.207050,0.212595,0.217953,0.223125,0.228109,0.232907,0.237518,0.241943,0.246181,0.250232,...,-0.072436,-0.071108,-0.069651,-0.068065,-0.066350,-0.064505,-0.062531,-0.060428,-0.058195,Normal
201210-1-05,0.499901,0.494222,0.488921,0.483997,0.479451,0.475281,0.471489,0.468074,0.465036,0.462376,...,-0.165273,-0.165654,-0.165905,-0.166024,-0.166013,-0.165872,-0.165600,-0.165197,-0.164664,Normal
201210-1-10,0.089658,0.088674,0.088423,0.088906,0.090124,0.092075,0.094759,0.098178,0.102330,0.107216,...,-0.139571,-0.139367,-0.138950,-0.138322,-0.137482,-0.136430,-0.135166,-0.133690,-0.132002,Normal
201210-1-12,0.095981,0.094277,0.092913,0.091889,0.091205,0.090861,0.090856,0.091191,0.091866,0.092881,...,-0.132303,-0.134703,-0.137284,-0.140048,-0.142993,-0.146120,-0.149429,-0.152919,-0.156592,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,-0.009309,-0.009209,-0.009153,-0.009140,-0.009172,-0.009247,-0.009367,-0.009530,-0.009737,-0.009989,...,-0.208941,-0.204031,-0.199054,-0.194012,-0.188903,-0.183729,-0.178489,-0.173182,-0.167810,Hyperglycemia
210526-3-46,0.008168,-0.007448,-0.022068,-0.035692,-0.048320,-0.059951,-0.070587,-0.080226,-0.088869,-0.096515,...,-0.214451,-0.214824,-0.215151,-0.215431,-0.215664,-0.215850,-0.215989,-0.216081,-0.216126,Hyperglycemia
210526-3-47,0.094002,0.083090,0.072727,0.062912,0.053645,0.044925,0.036754,0.029131,0.022056,0.015529,...,-0.225522,-0.218086,-0.210190,-0.201834,-0.193018,-0.183743,-0.174007,-0.163812,-0.153157,Hyperglycemia
210526-3-48,-0.057456,-0.061159,-0.064344,-0.067009,-0.069154,-0.070781,-0.071888,-0.072476,-0.072544,-0.072094,...,-0.144897,-0.133755,-0.121894,-0.109314,-0.096016,-0.082000,-0.067265,-0.051811,-0.035638,Hyperglycemia


In [120]:
from pycaret.classification import *

In [233]:
surface = df.groupby(['SurID', 'SpecID']).first().reset_index()

In [234]:
surface.drop(columns=['WaveNumber', 'Status', 'Absorbance'], inplace=True)

In [235]:
pivoted_df.reset_index(inplace=True)

In [236]:
pivoted_df = pd.merge(pivoted_df, surface, on='SpecID', how='left')

In [237]:
pivoted_df

Unnamed: 0,SpecID,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,...,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,Status,SurID
0,201210-1-00,0.203556,0.203010,0.202464,0.201917,0.201370,0.200822,0.200273,0.199724,0.199174,...,-0.080143,-0.085290,-0.090727,-0.096452,-0.102466,-0.108769,-0.115361,-0.122242,Normal,201210-1
1,201210-1-01,0.207050,0.212595,0.217953,0.223125,0.228109,0.232907,0.237518,0.241943,0.246181,...,-0.071108,-0.069651,-0.068065,-0.066350,-0.064505,-0.062531,-0.060428,-0.058195,Normal,201210-1
2,201210-1-05,0.499901,0.494222,0.488921,0.483997,0.479451,0.475281,0.471489,0.468074,0.465036,...,-0.165654,-0.165905,-0.166024,-0.166013,-0.165872,-0.165600,-0.165197,-0.164664,Normal,201210-1
3,201210-1-10,0.089658,0.088674,0.088423,0.088906,0.090124,0.092075,0.094759,0.098178,0.102330,...,-0.139367,-0.138950,-0.138322,-0.137482,-0.136430,-0.135166,-0.133690,-0.132002,Normal,201210-1
4,201210-1-11,-0.027146,-0.024271,-0.021382,-0.018480,-0.015564,-0.012634,-0.009691,-0.006735,-0.003765,...,-0.066071,-0.067065,-0.068063,-0.069065,-0.070071,-0.071081,-0.072095,-0.073112,Normal,201210-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2060,210526-3-45,-0.009309,-0.009209,-0.009153,-0.009140,-0.009172,-0.009247,-0.009367,-0.009530,-0.009737,...,-0.204031,-0.199054,-0.194012,-0.188903,-0.183729,-0.178489,-0.173182,-0.167810,Hyperglycemia,210526-3
2061,210526-3-46,0.008168,-0.007448,-0.022068,-0.035692,-0.048320,-0.059951,-0.070587,-0.080226,-0.088869,...,-0.214824,-0.215151,-0.215431,-0.215664,-0.215850,-0.215989,-0.216081,-0.216126,Hyperglycemia,210526-3
2062,210526-3-47,0.094002,0.083090,0.072727,0.062912,0.053645,0.044925,0.036754,0.029131,0.022056,...,-0.218086,-0.210190,-0.201834,-0.193018,-0.183743,-0.174007,-0.163812,-0.153157,Hyperglycemia,210526-3
2063,210526-3-48,-0.057456,-0.061159,-0.064344,-0.067009,-0.069154,-0.070781,-0.071888,-0.072476,-0.072544,...,-0.133755,-0.121894,-0.109314,-0.096016,-0.082000,-0.067265,-0.051811,-0.035638,Hyperglycemia,210526-3


In [238]:
caret = setup(data=pivoted_df, target='Status', session_id=1234, fold_strategy='groupkfold', fold_groups='SurID')

Unnamed: 0,Description,Value
0,Session id,1234
1,Target,Status
2,Target type,Multiclass
3,Target mapping,"Hyperglycemia: 0, Hypoglycemia: 1, Normal: 2"
4,Original data shape,"(2065, 2052)"
5,Transformed data shape,"(2065, 2052)"
6,Transformed train set shape,"(1445, 2052)"
7,Transformed test set shape,"(620, 2052)"
8,Numeric features,2049
9,Categorical features,2


In [239]:
compare_models(exclude=['lightgbm', 'gbc'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.6254,0.0,0.6254,0.674,0.6166,0.4135,0.4402,0.641
rf,Random Forest Classifier,0.5961,0.0,0.5961,0.6455,0.5842,0.3594,0.3876,1.019
knn,K Neighbors Classifier,0.5847,0.0,0.5847,0.6062,0.5702,0.3502,0.3738,0.514
ridge,Ridge Classifier,0.449,0.0,0.449,0.4271,0.4077,0.1263,0.1365,0.53
svm,SVM - Linear Kernel,0.4397,0.0,0.4397,0.4865,0.399,0.1495,0.1737,0.582
lr,Logistic Regression,0.432,0.0,0.432,0.5388,0.3691,0.1335,0.1839,1.029
nb,Naive Bayes,0.3472,0.0,0.3472,0.1373,0.1927,0.0,0.0,0.51
dt,Decision Tree Classifier,0.3472,0.0,0.3472,0.1373,0.1927,0.0,0.0,0.6
qda,Quadratic Discriminant Analysis,0.3472,0.0,0.3472,0.1373,0.1927,0.0,0.0,0.74
ada,Ada Boost Classifier,0.3472,0.0,0.3472,0.1373,0.1927,0.0,0.0,3.972
