#### Machine Learning

In [4]:
import pandas as pd

In [5]:
spectra_df = pd.read_csv("../../data/scaled_and_noise_removal.csv", index_col=0)

In [6]:
spectra_df

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
0,201210-1-00,0,200.00000,201210-1,Normal,1.000000
1,201210-1-00,1,200.68336,201210-1,Normal,0.982870
2,201210-1-00,2,201.36674,201210-1,Normal,0.966481
3,201210-1-00,3,202.05011,201210-1,Normal,0.950833
4,201210-1-00,4,202.73349,201210-1,Normal,0.935925
...,...,...,...,...,...,...
8023570,210526-3-09,2630,1997.26650,210526-3,Hyperglycemia,0.087327
8023571,210526-3-09,2631,1997.94980,210526-3,Hyperglycemia,0.090222
8023572,210526-3-09,2632,1998.63330,210526-3,Hyperglycemia,0.091124
8023573,210526-3-09,2633,1999.31670,210526-3,Hyperglycemia,0.090032


#### First we will look at the full wavelength.

Create a field for each wavelength.

In [7]:
wavelength_df = spectra_df.pivot(index='SpecID', columns='WaveNumber', values='Absorbance').reset_index()
wavelength_df.columns.name = None

Add the statuses back.

In [8]:
statuses = spectra_df[['SpecID', 'Status']].drop_duplicates()
wavelength_df = pd.merge(wavelength_df, statuses, on='SpecID')
wavelength_df = wavelength_df.set_index('SpecID')

In [9]:
wavelength_df.head()

Unnamed: 0_level_0,200.0,200.68336,201.36674,202.05011,202.73349,203.41685,204.10023,204.7836,205.46696,206.15034,...,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1.0,0.98287,0.966481,0.950833,0.935925,0.927346,0.902431,0.889797,0.880243,0.864841,...,0.07069,0.082414,0.09097,0.09289,0.08323,0.082239,0.082605,0.084328,0.087409,Normal
201210-1-01,1.0,0.98055,0.964007,0.950373,0.939647,0.930871,0.928574,0.925606,0.91491,0.90003,...,0.113337,0.117351,0.131463,0.146428,0.158236,0.161601,0.160516,0.15498,0.144994,Normal
201210-1-02,0.502527,0.491051,0.47954,0.467993,0.45641,0.450161,0.431959,0.424441,0.417415,0.408823,...,0.144569,0.141991,0.139207,0.144627,0.149526,0.150847,0.1506,0.148786,0.145404,Normal
201210-1-03,0.492251,0.418315,0.375666,0.364304,0.38423,0.461173,0.570848,0.577491,0.675831,0.807648,...,0.005644,0.007747,0.008992,0.010157,0.011991,0.011822,0.010642,0.008451,0.005248,Normal
201210-1-04,1.0,0.994372,0.989487,0.985346,0.981948,0.976845,0.970762,0.977536,0.981513,0.963198,...,0.019359,0.028442,0.041845,0.05873,0.063852,0.070008,0.07453,0.077417,0.078669,Normal


In [22]:
X = wavelength_df.drop(columns=['Status'])
y = pd.get_dummies(wavelength_df['Status'])  # One-hot encode target variable

CNN Training

In [37]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

In [32]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# Reshape data for 1D convolution
X_train = X_train.values.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.values.reshape(X_test.shape[0], X_test.shape[1], 1)

In [33]:
# # Define CNN architecture
# model = Sequential([
#     Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
#     MaxPooling1D(pool_size=2),
#     Conv1D(filters=64, kernel_size=3, activation='relu'),
#     MaxPooling1D(pool_size=2),
#     Flatten(),
#     Dense(128, activation='relu'),
#     Dense(3, activation='softmax')
# ])

In [34]:
# Define CNN architecture with dropout
model = Sequential([
    Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Dropout(0.25),  # Adding dropout layer with dropout rate of 0.25
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.25),  # Adding dropout layer with dropout rate of 0.25
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),   # Adding dropout layer with dropout rate of 0.5
    Dense(3, activation='softmax')  
])

In [35]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [45]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.5179959535598755, Test Accuracy: 0.8932676315307617
