## This notebook will construct the nodes and edges that we will build our graph with

Import relevant libraries and import raw data

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.signal import find_peaks
from scipy.spatial.distance import pdist, squareform

In [2]:
# Read the cleaned spectrum CSV; every later cell derives its data from `df`.
spectrum_csv_path = "../../data/current_clean_spectrum.csv"
df = pd.read_csv(spectrum_csv_path)

In [3]:
#sns.lineplot(data=df, x='WaveNumber', y='Absorbance', hue='Status', errorbar=('ci', False))

Here we use SciPy's `find_peaks` function to find all peaks in each individual spectrum, as well as extracting features such as the prominence and width of each peak.

In [4]:
df

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
0,201210-1-00,293,400.22778,201210-1,Normal,41.863303
1,201210-1-00,294,400.91116,201210-1,Normal,41.803843
2,201210-1-00,295,401.59454,201210-1,Normal,41.741884
3,201210-1-00,296,402.27789,201210-1,Normal,41.677722
4,201210-1-00,297,402.96127,201210-1,Normal,41.611654
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,210526-3,Hyperglycemia,12.378163
6239201,210526-3-09,2338,1797.72200,210526-3,Hyperglycemia,13.269937
6239202,210526-3-09,2339,1798.40550,210526-3,Hyperglycemia,14.199285
6239203,210526-3-09,2340,1799.08890,210526-3,Hyperglycemia,15.166531


In [5]:
# One (SpecID, Status) row per distinct pair, used to label each pivoted row.
statuses = df[['SpecID', 'Status']].drop_duplicates()

# Reshape to one row per SpecID with one column per WaveNumber (values are
# Absorbance), clear the leftover 'WaveNumber' columns-axis name, attach the
# Status column, and index the result by SpecID.
pivoted_df = (
    df.pivot(index='SpecID', columns='WaveNumber', values='Absorbance')
    .rename_axis(None, axis='columns')
    .reset_index()
    .merge(statuses, on='SpecID')
    .set_index('SpecID')
)

In [6]:
pivoted_df

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.3053,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,41.863303,41.803843,41.741884,41.677722,41.611654,41.543974,41.474980,41.404968,41.334234,41.263073,...,6.942749,6.280946,5.549559,4.745724,3.866578,2.909255,1.870891,0.748623,-0.460415,Normal
201210-1-01,46.314608,47.323684,48.299209,49.241395,50.150457,51.026608,51.870063,52.681035,53.459738,54.206386,...,6.322524,6.769011,7.280928,7.861246,8.512936,9.238972,10.042323,10.925962,11.892860,Normal
201210-1-02,118.159018,114.686240,111.563911,108.777452,106.312282,104.153823,102.287493,100.698715,99.372907,98.295491,...,-4.668874,-4.633601,-4.557349,-4.439365,-4.278894,-4.075180,-3.827470,-3.535010,-3.197043,Normal
201210-1-03,175.466997,174.846086,174.188020,173.498226,172.782129,172.045155,171.292728,170.530275,169.763222,168.996993,...,-11.220152,-10.801936,-10.349539,-9.864191,-9.347124,-8.799567,-8.222752,-7.617909,-6.986269,Normal
201210-1-04,111.814973,106.629998,101.867380,97.512673,93.551430,89.969205,86.751551,83.884023,81.352173,79.141556,...,-11.600625,-11.689508,-11.752441,-11.789205,-11.799583,-11.783357,-11.740310,-11.670224,-11.572882,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,11.003178,12.008451,12.879986,13.624553,14.248922,14.759862,15.164145,15.468539,15.679816,15.804744,...,6.134829,6.671003,7.197072,7.711103,8.211164,8.695324,9.161650,9.608210,10.033072,Hyperglycemia
210526-3-46,14.239653,13.596345,12.981340,12.396227,11.842591,11.322020,10.836101,10.386421,9.974566,9.602123,...,1.618505,2.124074,2.718437,3.405898,4.190762,5.077335,6.069920,7.172824,8.390352,Hyperglycemia
210526-3-47,17.165901,17.349777,17.457499,17.494696,17.466999,17.380039,17.239446,17.050852,16.819887,16.552181,...,6.096801,6.895403,7.740965,8.633346,9.572405,10.558000,11.589990,12.668233,13.792588,Hyperglycemia
210526-3-48,15.865434,14.551958,13.412815,12.441007,11.629535,10.971400,10.459605,10.087150,9.847038,9.732269,...,6.988727,8.137265,9.444800,10.919724,12.570428,14.405304,16.432743,18.661137,21.098876,Hyperglycemia


In [7]:
# Pull out the Status column as a Series indexed by SpecID.
status_df = pivoted_df.loc[:, 'Status']

In [8]:
# Build the (SpecID, Status) table directly from `pivoted_df` instead of
# re-assigning `status_df` from itself: the self-referential form is not
# idempotent (running the cell a second time adds a spurious 'index' column).
status_df = pivoted_df['Status'].reset_index()

In [9]:
status_df

Unnamed: 0,SpecID,Status
0,201210-1-00,Normal
1,201210-1-01,Normal
2,201210-1-02,Normal
3,201210-1-03,Normal
4,201210-1-04,Normal
...,...,...
3040,210526-3-45,Hyperglycemia
3041,210526-3-46,Hyperglycemia
3042,210526-3-47,Hyperglycemia
3043,210526-3-48,Hyperglycemia


In [10]:
# Collapse `df` to one row per (SpecID, SurID) pair, keeping the first value of
# every other column; as_index=False leaves the group keys as ordinary columns.
surface = df.groupby(['SpecID', 'SurID'], as_index=False).first()

In [11]:
# Keep only the SpecID -> SurID mapping. Re-assigning (rather than dropping
# in place) and ignoring already-missing columns keeps this cell re-runnable:
# the in-place version raises KeyError on a second execution.
surface = surface.drop(
    columns=['Seq', 'WaveNumber', 'Status', 'Absorbance'],
    errors='ignore',
)

In [12]:
surface

Unnamed: 0,SpecID,SurID
0,201210-1-00,201210-1
1,201210-1-01,201210-1
2,201210-1-02,201210-1
3,201210-1-03,201210-1
4,201210-1-04,201210-1
...,...,...
3040,210526-3-45,210526-3
3041,210526-3-46,210526-3
3042,210526-3-47,210526-3
3043,210526-3-48,210526-3


In [13]:
# Attach SurID to every (SpecID, Status) row.
# Selecting only SpecID/Status on the left keeps the cell idempotent: merging an
# already-merged `status_df` again would otherwise produce SurID_x / SurID_y.
# validate='one_to_one' raises pandas.errors.MergeError if a SpecID appears more
# than once on either side, instead of silently duplicating rows.
status_df = pd.merge(
    status_df[['SpecID', 'Status']],
    surface,
    on='SpecID',
    how='inner',
    validate='one_to_one',
)

In [14]:
status_df

Unnamed: 0,SpecID,Status,SurID
0,201210-1-00,Normal,201210-1
1,201210-1-01,Normal,201210-1
2,201210-1-02,Normal,201210-1
3,201210-1-03,Normal,201210-1
4,201210-1-04,Normal,201210-1
...,...,...,...
3040,210526-3-45,Hyperglycemia,210526-3
3041,210526-3-46,Hyperglycemia,210526-3
3042,210526-3-47,Hyperglycemia,210526-3
3043,210526-3-48,Hyperglycemia,210526-3


In [18]:
# Output directory that receives the CSV files written by this notebook.
directory = '../../data/graph_2_8/'

# exist_ok=True creates the directory (and any missing parents) and is a no-op
# when it already exists, so no separate existence check is needed.
os.makedirs(directory, exist_ok=True)

# Export the SpecID / Status / SurID table (the DataFrame index is written too).
status_df.to_csv(os.path.join(directory, 'status.csv'))


In [19]:
# Feature matrix: every column of `pivoted_df` except the non-numeric 'Status'.
X = pivoted_df.drop(columns=['Status'])

# Condensed vector of pairwise Euclidean distances between the rows of X.
distances = pdist(X.to_numpy(), metric='euclidean')

# Expand the condensed vector into a square, symmetric distance matrix whose
# row/column order follows the row order of `pivoted_df`.
distance_matrix = squareform(distances)

In [20]:
def gaussian_kernel(distances, sigma):
    """Map distances to similarities with a Gaussian (RBF) kernel.

    Computes ``exp(-distances**2 / (2 * sigma**2))`` element-wise, so a
    distance of 0 maps to 1 and larger distances decay towards 0.

    Parameters
    ----------
    distances : array-like or scalar
        Distance value(s). Lists and tuples are converted to NumPy arrays;
        other inputs are used as given.
    sigma : scalar or array-like
        Kernel bandwidth. Must be non-zero.

    Returns
    -------
    Kernel value(s) with the same shape as ``distances``.

    Raises
    ------
    ValueError
        If ``sigma`` is zero (or contains a zero), which would otherwise
        divide by zero and silently produce NaN/0 values.
    """
    if np.any(np.asarray(sigma) == 0):
        raise ValueError("sigma must be non-zero")
    # Plain Python sequences do not support ** and unary minus; convert them.
    if isinstance(distances, (list, tuple)):
        distances = np.asarray(distances)
    return np.exp(-distances**2 / (2 * sigma**2))

# Bandwidth of the Gaussian kernel; tune as needed.
sigma = 3000
kernel_matrix = gaussian_kernel(distance_matrix, sigma)

# Label both axes of the kernel matrix with the SpecIDs from `pivoted_df`.
spec_ids = pivoted_df.index
kernel_df = pd.DataFrame(data=kernel_matrix, index=spec_ids, columns=spec_ids)

In [21]:
kernel_df

SpecID,201210-1-00,201210-1-01,201210-1-02,201210-1-03,201210-1-04,201210-1-05,201210-1-06,201210-1-07,201210-1-09,201210-1-10,...,210526-3-40,210526-3-41,210526-3-42,210526-3-43,210526-3-44,210526-3-45,210526-3-46,210526-3-47,210526-3-48,210526-3-49
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1.000000,0.930380,0.050248,0.000683,0.734352,0.577033,0.575171,0.726200,0.782364,0.858996,...,0.893850,0.892015,0.875925,0.890022,0.894600,0.889022,0.886525,0.881538,0.887171,0.883364
201210-1-01,0.930380,1.000000,0.074513,0.001152,0.811391,0.667950,0.643591,0.754042,0.793578,0.830592,...,0.807723,0.810600,0.789138,0.793076,0.791789,0.782451,0.779693,0.769265,0.776751,0.772518
201210-1-02,0.050248,0.074513,1.000000,0.014906,0.050390,0.047160,0.039163,0.045003,0.049491,0.043247,...,0.028118,0.028786,0.027547,0.027126,0.026874,0.025970,0.026254,0.027386,0.026160,0.025609
201210-1-03,0.000683,0.001152,0.014906,1.000000,0.000901,0.000987,0.001021,0.000995,0.001046,0.000714,...,0.000385,0.000414,0.000361,0.000338,0.000325,0.000302,0.000299,0.000308,0.000305,0.000295
201210-1-04,0.734352,0.811391,0.050390,0.000901,1.000000,0.903939,0.731659,0.751390,0.745852,0.760661,...,0.665539,0.668372,0.644293,0.643143,0.630909,0.621433,0.621072,0.601957,0.609388,0.614546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,0.889022,0.782451,0.025970,0.000302,0.621433,0.487391,0.535738,0.691829,0.742746,0.838303,...,0.985119,0.983558,0.980999,0.994355,0.996607,1.000000,0.997227,0.993936,0.995941,0.993156
210526-3-46,0.886525,0.779693,0.026254,0.000299,0.621072,0.487937,0.538401,0.695711,0.746990,0.842395,...,0.981686,0.981521,0.977403,0.993630,0.995435,0.997227,1.000000,0.995409,0.996831,0.995024
210526-3-47,0.881538,0.769265,0.027386,0.000308,0.601957,0.468924,0.520432,0.679664,0.733206,0.827908,...,0.973235,0.971413,0.968805,0.987506,0.991228,0.993936,0.995409,1.000000,0.996084,0.992372
210526-3-48,0.887171,0.776751,0.026160,0.000305,0.609388,0.474359,0.528116,0.687050,0.740156,0.836655,...,0.977826,0.976824,0.972111,0.991547,0.994602,0.995941,0.996831,0.996084,1.000000,0.994815


In [22]:
# Unpivot the square kernel matrix into an edge list with one row per
# (row SpecID, column SpecID) pair, then name the columns for export:
# ':START_ID' (row SpecID), ':END_ID' (column SpecID), 'Weight:float'
# (kernel value) and a constant ':TYPE' of 'LINK'.
relationship_df = (
    kernel_df
    .reset_index()
    .melt(id_vars='SpecID', var_name=':END_ID', value_name='Weight:float')
    .rename(columns={'SpecID': ':START_ID'})
    .assign(**{':TYPE': 'LINK'})
)

In [23]:
# Keep each unordered pair exactly once by requiring START < END.
# The strict inequality also removes self-loops (START == END), so a separate
# `!=` filter is unnecessary and would only add another pass over the frame.
is_forward_edge = relationship_df[":START_ID"] < relationship_df[":END_ID"]
relationship_df = relationship_df[is_forward_edge]

In [24]:
relationship_df

Unnamed: 0,:START_ID,:END_ID,Weight:float,:TYPE
3045,201210-1-00,201210-1-01,0.930380,LINK
6090,201210-1-00,201210-1-02,0.050248,LINK
6091,201210-1-01,201210-1-02,0.074513,LINK
9135,201210-1-00,201210-1-03,0.000683,LINK
9136,201210-1-01,201210-1-03,0.001152,LINK
...,...,...,...,...
9272019,210526-3-44,210526-3-49,0.993261,LINK
9272020,210526-3-45,210526-3-49,0.993156,LINK
9272021,210526-3-46,210526-3-49,0.995024,LINK
9272022,210526-3-47,210526-3-49,0.992372,LINK


In [25]:
# Export the edge list (the DataFrame index is written as the first column).
relationships_csv_path = '../../data/graph_2_8/relationships.csv'
relationship_df.to_csv(relationships_csv_path)

In [26]:
# Node table: everything in `status_df` except the SurID column.
node_df = status_df.drop(labels=['SurID'], axis='columns')

In [27]:
# Rename the columns to the 'SpecID:ID' / 'Status:LABEL' headers used in the
# exported node file (presumably a graph bulk-import header convention — confirm).
# Re-assigning instead of inplace=True avoids mutating a frame shared across cells.
node_df = node_df.rename(columns={'SpecID': 'SpecID:ID', 'Status': 'Status:LABEL'})

In [28]:
# Export the node table (the DataFrame index is written as the first column).
nodes_csv_path = '../../data/graph_2_8/nodes.csv'
node_df.to_csv(nodes_csv_path)