## This notebook will construct the nodes and edges that we will build our graph with

Import relevant libraries and import raw data

In [24]:
import seaborn as sns
import pandas as pd
from scipy.signal import find_peaks
import numpy as np
import matplotlib.pyplot as plt

In [25]:
# Load the spectra table; the relative path resolves against the current working directory.
spectrum_csv_path = "../../data/current_clean_spectrum.csv"
df = pd.read_csv(spectrum_csv_path)

In [3]:
def normalise(absorbances):
    """Scale absorbance values by their maximum.

    Parameters
    ----------
    absorbances : array-like
        Numeric values, e.g. a NumPy array or a pandas Series.

    Returns
    -------
    array-like
        ``absorbances / max(absorbances)``. When the maximum is exactly zero
        the input is returned unscaled, because dividing by zero would fill
        the result with NaN/inf.

    Notes
    -----
    The divisor is the signed maximum (not the maximum absolute value), so a
    negative maximum flips the sign of every value.
    """
    max_value = np.max(absorbances)
    # Guard against 0/0 and x/0, which would otherwise silently yield NaN/inf.
    if max_value == 0:
        return absorbances
    return absorbances / max_value

# Rescale every spectrum (one 'SpecID' group) independently with normalise().
absorbance_by_spectrum = df.groupby('SpecID')['Absorbance']
df['Absorbance'] = absorbance_by_spectrum.transform(normalise)

In [5]:
#sns.lineplot(data=df, x='WaveNumber', y='Absorbance', hue='Status', errorbar=('ci', False))

Here we use scipy's `find_peaks` function to find all peaks in each individual spectrum, as well as to extract peak features such as prominence and width.

In [26]:
# Preview the loaded table (one row per SpecID / WaveNumber measurement).
df

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
0,201210-1-00,293,400.22778,201210-1,Normal,32.316184
1,201210-1-00,294,400.91116,201210-1,Normal,32.229535
2,201210-1-00,295,401.59454,201210-1,Normal,32.142799
3,201210-1-00,296,402.27789,201210-1,Normal,32.055975
4,201210-1-00,297,402.96127,201210-1,Normal,31.969064
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,210526-3,Hyperglycemia,-16.873032
6239201,210526-3-09,2338,1797.72200,210526-3,Hyperglycemia,-15.976760
6239202,210526-3-09,2339,1798.40550,210526-3,Hyperglycemia,-15.040871
6239203,210526-3-09,2340,1799.08890,210526-3,Hyperglycemia,-14.065367


In [27]:
# Group the rows of df by surface identifier ('SurID').
groups = df.groupby('SurID')

In [28]:
# Prints only the SeriesGroupBy object's repr, not the grouped values themselves.
print(groups['Absorbance'])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fe8f8791390>


In [29]:
# Flag spectra whose 'Absorbance' values contain too many IQR outliers.
def identify_outliers(group, threshold=0.04):
    """Return the group's SpecID when its share of outliers reaches ``threshold``.

    A measurement counts as an outlier when it lies outside the Tukey fences
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` computed from the group's own
    'Absorbance' values.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single spectrum; must contain 'Absorbance' and 'SpecID'.
    threshold : float, default 0.04
        Minimum proportion of outlying measurements (0-1) for the spectrum
        to be flagged.

    Returns
    -------
    object
        The first 'SpecID' value of the group if it is flagged, else ``pd.NA``.
    """
    absorbance = group['Absorbance']
    Q1 = absorbance.quantile(0.25)
    Q3 = absorbance.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Boolean mask of measurements outside the fences
    is_outlier = (absorbance < lower_bound) | (absorbance > upper_bound)
    outlier_prop = is_outlier.mean()  # Proportion of measurements that are outliers

    # Flag the spectrum when the outlier proportion reaches the threshold
    if outlier_prop >= threshold:
        return group['SpecID'].iloc[0]
    return pd.NA  # Use pandas NA when the spectrum is not flagged

# Evaluate every (SurID, SpecID) spectrum once. A single multi-key groupby always
# yields a flat Series of results, whereas nesting two groupby-apply calls can
# change shape (Series vs. DataFrame) depending on the data, which would make the
# subsequent dropna() discard whole rows of results.
outlier_flags = df.groupby(['SurID', 'SpecID']).apply(identify_outliers)

# Keep the unique SpecIDs that were flagged (unflagged spectra come back as pd.NA)
outlier_specids = outlier_flags.dropna().unique()

# Remove every flagged spectrum; the outlier-proportion cut-off is defined inside
# identify_outliers
clean_df = df[~df['SpecID'].isin(outlier_specids)]

In [30]:
# Number of distinct spectra (SpecID values) left after outlier removal.
print(clean_df['SpecID'].nunique())

2065


In [31]:
# Rebind df to the filtered frame. NOTE: from here on df no longer refers to the
# table loaded from CSV, so re-running earlier cells without reloading will
# operate on the already-filtered data.
df = clean_df

In [32]:
# Collapse repeated measurements: median 'Absorbance' per spectrum/wavenumber key.
spectrum_keys = ["SpecID", "WaveNumber", "Status", "SurID"]
median_absorbance = df.groupby(spectrum_keys)["Absorbance"].median()
df = median_absorbance.reset_index()

In [33]:
# Reshape to wide format: one row per spectrum, one column per wavenumber.
pivoted_df = df.pivot(index='SpecID', columns='WaveNumber', values='Absorbance')
pivoted_df = pivoted_df.reset_index()
pivoted_df.columns.name = None  # drop the 'WaveNumber' label left on the column axis

# Attach each spectrum's 'Status' label, then restore 'SpecID' as the index.
statuses = df[['SpecID', 'Status']].drop_duplicates()
pivoted_df = pivoted_df.merge(statuses, on='SpecID').set_index('SpecID')

In [34]:
# Preview the wide table: indexed by SpecID, wavenumber columns plus 'Status'.
pivoted_df

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.3053,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,32.316184,32.229535,32.142799,32.055975,31.969064,31.882065,31.794978,31.707804,31.620543,31.533194,...,-11.952036,-12.723337,-13.540514,-14.403567,-15.312496,-16.267300,-17.267980,-18.314536,-19.406968,Normal
201210-1-01,39.811601,40.877763,41.908020,42.902373,43.860821,44.783365,45.670004,46.520739,47.335569,48.114495,...,-13.927993,-13.672706,-13.392564,-13.087568,-12.757716,-12.403010,-12.023449,-11.619033,-11.189762,Normal
201210-1-05,189.995888,187.837705,185.822902,183.951479,182.223436,180.638774,179.197492,177.899590,176.745069,175.733928,...,-62.814992,-62.959766,-63.054903,-63.100402,-63.096265,-63.042490,-62.939078,-62.786029,-62.583343,Normal
201210-1-10,14.304090,14.147067,14.107112,14.184225,14.378407,14.689656,15.117974,15.663361,16.325815,17.105338,...,-22.267236,-22.234664,-22.168275,-22.068068,-21.934042,-21.766199,-21.564538,-21.329059,-21.059762,Normal
201210-1-11,-10.199739,-9.119337,-8.033871,-6.943341,-5.847746,-4.747088,-3.641365,-2.530578,-1.414727,-0.293811,...,-24.452758,-24.824830,-25.198372,-25.573386,-25.949872,-26.327829,-26.707257,-27.088156,-27.470527,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,-0.768060,-0.759782,-0.755132,-0.754107,-0.756709,-0.762937,-0.772792,-0.786274,-0.803382,-0.824116,...,-17.238661,-16.833514,-16.422926,-16.006897,-15.585426,-15.158515,-14.726162,-14.288368,-13.845133,Hyperglycemia
210526-3-46,0.678560,-0.618710,-1.833223,-2.964977,-4.013973,-4.980212,-5.863693,-6.664416,-7.382381,-8.017588,...,-17.814559,-17.845597,-17.872739,-17.895985,-17.915336,-17.930791,-17.942350,-17.950013,-17.953780,Hyperglycemia
210526-3-47,7.029672,6.213698,5.438707,4.704698,4.011673,3.359630,2.748569,2.178492,1.649397,1.161285,...,-16.865098,-16.309001,-15.718517,-15.093644,-14.434382,-13.740733,-13.012695,-12.250268,-11.453454,Hyperglycemia
210526-3-48,-4.485428,-4.774563,-5.023158,-5.231212,-5.398725,-5.525698,-5.612130,-5.658022,-5.663373,-5.628184,...,-11.311757,-10.441917,-9.515975,-8.533931,-7.495785,-6.401538,-5.251188,-4.044737,-2.782184,Hyperglycemia


In [13]:
from pycaret.classification import *

In [35]:
# One row per (SurID, SpecID) pair, taking the first entry of each remaining column.
surface_groups = df.groupby(['SurID', 'SpecID'])
surface = surface_groups.first().reset_index()

In [36]:
# Discard the measurement columns so only the identifier columns remain.
# Rebinding with errors='ignore' (instead of an in-place drop) lets this cell be
# re-run without raising KeyError once the columns are already gone.
surface = surface.drop(columns=['WaveNumber', 'Status', 'Absorbance'], errors='ignore')

In [37]:
# Move 'SpecID' from the index back into a column so it can serve as a merge key.
# The guard keeps the cell safe to re-run: an unconditional in-place reset would
# add a spurious 'index' column the second time it is executed.
if pivoted_df.index.name == 'SpecID':
    pivoted_df = pivoted_df.reset_index()

In [38]:
# Add the 'SurID' of every spectrum; the left join keeps all rows of pivoted_df.
pivoted_df = pivoted_df.merge(surface, on='SpecID', how='left')

In [39]:
# Number of distinct surfaces ('SurID' values) present in the wide table.
pivoted_df['SurID'].nunique()

62

In [40]:
# Configure the PyCaret experiment: predict 'Status' using grouped k-fold
# cross-validation with 'SurID' as the grouping column.
# NOTE(review): the setup summary reports 2 categorical features - presumably the
# 'SpecID' and 'SurID' identifier columns are being used as predictors; confirm
# whether they should be excluded from the feature set.
caret = setup(
    data=pivoted_df,
    target='Status',
    session_id=1234,
    fold_strategy='groupkfold',
    fold_groups='SurID',
)

Unnamed: 0,Description,Value
0,Session id,1234
1,Target,Status
2,Target type,Multiclass
3,Target mapping,"Hyperglycemia: 0, Hypoglycemia: 1, Normal: 2"
4,Original data shape,"(2065, 2052)"
5,Transformed data shape,"(2065, 2052)"
6,Transformed train set shape,"(1445, 2052)"
7,Transformed test set shape,"(620, 2052)"
8,Numeric features,2049
9,Categorical features,2


In [47]:
# Cross-validate the available estimators (skipping the excluded ones) and keep the top two.
excluded_estimators = ['lightgbm', 'gbc']
models = compare_models(exclude=excluded_estimators, n_select=2)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.6441,0.8119,0.6441,0.7066,0.6407,0.4519,0.477,0.954
rf,Random Forest Classifier,0.6352,0.8155,0.6352,0.6963,0.6303,0.433,0.4579,1.383
knn,K Neighbors Classifier,0.5826,0.7353,0.5826,0.6139,0.5753,0.3511,0.3665,0.777
lr,Logistic Regression,0.5473,0.6684,0.5473,0.5836,0.5521,0.2874,0.2958,5.121
svm,SVM - Linear Kernel,0.4915,0.0,0.4915,0.5357,0.4695,0.179,0.1862,0.86
nb,Naive Bayes,0.4066,0.57,0.4066,0.4606,0.3185,0.0718,0.0938,0.792
ridge,Ridge Classifier,0.406,0.0,0.406,0.4548,0.4098,0.1004,0.1072,0.827
qda,Quadratic Discriminant Analysis,0.3509,0.524,0.3509,0.4168,0.3532,0.0443,0.0489,1.054
dt,Decision Tree Classifier,0.3472,0.5,0.3472,0.1373,0.1927,0.0,0.0,0.88
ada,Ada Boost Classifier,0.3472,0.5,0.3472,0.1373,0.1927,0.0,0.0,4.588


In [48]:
# Show the estimators returned by compare_models.
models

[ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                      criterion='gini', max_depth=None, max_features='sqrt',
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=1234, verbose=0, warm_start=False),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='sqrt',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_samples_leaf=1,
                        min_samples_split=2, min_weight_fraction_leaf=0.0,
                        n_estimators=100, n_jobs=-1, oob_score=False,
                        random_state=1234, verbose=0, warm_start=False)]

In [49]:
# Combine the selected estimators into one ensemble with PyCaret's blend_models.
blended = blend_models(models)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8462,0.9708,0.8462,0.8481,0.839,0.7641,0.7707
1,0.6993,0.9521,0.6993,0.7766,0.696,0.54,0.5653
2,0.5473,0.8379,0.5473,0.5974,0.5077,0.2843,0.3173
3,0.4552,0.633,0.4552,0.4993,0.4712,0.1438,0.1461
4,0.4444,0.6069,0.4444,0.4829,0.4179,0.1772,0.1985
5,0.7746,0.7905,0.7746,0.7697,0.7675,0.6348,0.6392
6,0.5594,0.806,0.5594,0.579,0.5427,0.3429,0.3629
7,0.7113,0.9022,0.7113,0.7948,0.7422,0.4839,0.4985
8,0.75,0.9198,0.75,0.8655,0.7628,0.6332,0.6801
9,0.5782,0.7922,0.5782,0.7874,0.5834,0.3833,0.4678


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

In [50]:
# Build a stacked ensemble from the same selected estimators with PyCaret's stack_models.
stacked = stack_models(models)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6434,0.7688,0.6434,0.6329,0.6346,0.4546,0.4571
1,0.6643,0.7986,0.6643,0.7095,0.6594,0.4941,0.5125
2,0.527,0.6324,0.527,0.537,0.5249,0.2369,0.2408
3,0.3448,0.4326,0.3448,0.3428,0.3372,-0.0936,-0.0962
4,0.4444,0.5529,0.4444,0.4791,0.4458,0.1576,0.1639
5,0.5493,0.6934,0.5493,0.5495,0.5465,0.2738,0.2752
6,0.4895,0.5815,0.4895,0.5043,0.4928,0.2182,0.2202
7,0.5211,0.744,0.5211,0.6841,0.5766,0.2147,0.2325
8,0.6689,0.7702,0.6689,0.6669,0.6677,0.4778,0.478
9,0.5102,0.6576,0.5102,0.624,0.5172,0.2773,0.3126


In [46]:
# `model` is not assigned anywhere in the visible cells, so this cell only worked
# with leftover kernel state and fails with NameError on a fresh kernel. Tune the
# first (top-ranked) estimator returned by compare_models instead.
tuned_model = tune_model(models[0])

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8601,0.9663,0.8601,0.8608,0.8523,0.7856,0.7923
1,0.7692,0.9568,0.7692,0.8278,0.7697,0.6482,0.669
2,0.5338,0.8338,0.5338,0.6388,0.4946,0.2711,0.3087
3,0.4345,0.5997,0.4345,0.5041,0.4538,0.1289,0.1342
4,0.4792,0.6114,0.4792,0.5046,0.4606,0.2191,0.2346
5,0.6761,0.7401,0.6761,0.675,0.6745,0.4896,0.4904
6,0.6364,0.7781,0.6364,0.6442,0.6279,0.4567,0.4666
7,0.6831,0.9152,0.6831,0.7815,0.716,0.4478,0.4671
8,0.7635,0.8976,0.7635,0.869,0.7762,0.6514,0.6937
9,0.5986,0.7707,0.5986,0.8022,0.5911,0.4013,0.4812


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
