#### Machine Learning

I got this running on an AMD GPU using DirectML in WSL.

In [24]:
import pandas as pd

In [25]:
# Alternative input, kept for reference (judging by its name, a scaled / de-noised
# version of the spectra -- TODO confirm):
# spectra_df = pd.read_csv("../../data/scaled_and_noise_removal.csv", index_col=0)

# Load the raw spectra in long format.
raw_spectra_path = "../../data/exosomes.raw_spectrum_400-1800.csv"
spectra_df = pd.read_csv(raw_spectra_path)

In [26]:
# Show the long-format table (pandas abbreviates the middle rows in the rendered output).
spectra_df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,293,400.22778,1765.6628,201210-1,Normal
1,201210-1-00,294,400.91116,1774.7809,201210-1,Normal
2,201210-1-00,295,401.59454,1769.0302,201210-1,Normal
3,201210-1-00,296,402.27789,1756.4220,201210-1,Normal
4,201210-1-00,297,402.96127,1758.8690,201210-1,Normal
...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia
6239201,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia
6239202,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia
6239203,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia


#### **Train a Neural Network on the full spectrum**

Pivot the long-format table so that each wavenumber becomes its own column, with one row per spectrum (`SpecID`).

In [27]:
# Long -> wide: one row per SpecID, one column per WaveNumber, cells hold Absorbance.
# SpecID is turned back into an ordinary column and the leftover 'WaveNumber'
# label on the column axis is cleared.
wavelength_df = (
    spectra_df
    .pivot(index='SpecID', columns='WaveNumber', values='Absorbance')
    .reset_index()
    .rename_axis(columns=None)
)

Add the `Status` and `SurID` of each spectrum back.

In [28]:
# One (SpecID, Status, SurID) row per distinct combination in the long table.
label_columns = ['SpecID', 'Status', 'SurID']
surface_and_statuses = spectra_df[label_columns].drop_duplicates()

# Attach the labels to the wide table, then make SpecID the row index.
wavelength_df = (
    pd.merge(wavelength_df, surface_and_statuses, on='SpecID')
    .set_index('SpecID')
)

In [29]:
# Preview the first rows of the wide table: wavenumber columns followed by Status and SurID.
wavelength_df.head()

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,Status,SurID
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1765.6628,1774.7809,1769.0302,1756.422,1758.869,1763.23,1745.2285,1773.3534,1774.7166,1753.3281,...,1210.4993,1213.9619,1225.2153,1210.001,1210.6858,1194.4679,1195.1451,1189.8683,Normal,201210-1
201210-1-01,1966.993,1962.4237,1954.5616,1954.3228,1963.0917,1975.0807,1979.3162,1963.4561,1968.4587,1964.0,...,1382.6973,1363.7004,1360.621,1354.0477,1353.0381,1353.9978,1361.2426,1370.2874,Normal,201210-1
201210-1-02,2182.6694,2149.6565,2146.0227,2159.3459,2167.291,2160.9861,2145.6575,2134.2004,2142.8303,2138.6309,...,1976.207,1989.0183,1996.2838,1979.3507,1976.2002,1994.9839,1974.203,1971.188,Normal,201210-1
201210-1-03,2445.0837,2430.4973,2422.7927,2434.3433,2454.97,2462.8245,2454.7007,2467.7329,2449.5161,2421.3474,...,1992.3817,2022.6331,2001.8311,2010.0946,2006.4933,2017.2891,2038.1699,2000.6475,Normal,201210-1
201210-1-04,2250.4536,2248.6235,2245.0984,2242.7173,2235.2803,2228.9585,2236.0095,2229.6091,2225.9231,2211.0359,...,2009.0385,1953.3303,1963.5698,1964.5299,1969.5634,1986.6266,1970.1484,2007.0848,Normal,201210-1


In [30]:
# Target: one indicator column per distinct Status value (one-hot encoding).
status_labels = wavelength_df['Status']
y = pd.get_dummies(status_labels)

# Features: everything except Status. SurID is still included at this point
# because the group-aware split needs it; it is removed before training.
X = wavelength_df.drop(columns='Status')

#### **CNN Training**

In [31]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization

In [40]:
from sklearn.model_selection import GroupShuffleSplit

# Hold out 20% of the surfaces as a test set. Splitting by SurID keeps every
# spectrum of a surface in the same set, so no surface is shared between sets.
group_kfold = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=1234)

# Group label (surface ID) of every spectrum. This has to be a real statement:
# if it is folded into a comment, `groups` is undefined on a fresh kernel and
# the split below raises NameError.
groups = X['SurID']

# n_splits=1, so the splitter yields exactly one (train, test) pair of row positions.
train_idx, test_idx = next(group_kfold.split(X, y, groups))
X_train_temp, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train_temp, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Split the preliminary training set again, by surface, into training and validation.
# 0.125 of the remaining 80% of surfaces is roughly 10% of all surfaces.
group_kfold_val = GroupShuffleSplit(test_size=0.125, n_splits=1, random_state=1234)
train_idx, val_idx = next(
    group_kfold_val.split(X_train_temp, y_train_temp, X_train_temp['SurID'])
)
X_train, X_val = X_train_temp.iloc[train_idx], X_train_temp.iloc[val_idx]
y_train, y_val = y_train_temp.iloc[train_idx], y_train_temp.iloc[val_idx]

# Sanity check while SurID is still available: the three sets share no surface.
train_surfaces = set(X_train['SurID'])
assert train_surfaces.isdisjoint(X_val['SurID']), "surface shared by train and validation"
assert train_surfaces.isdisjoint(X_test['SurID']), "surface shared by train and test"
assert set(X_val['SurID']).isdisjoint(X_test['SurID']), "surface shared by validation and test"

# SurID was only needed for grouping; it is not a feature.
X_train = X_train.drop(columns=['SurID'])
X_test = X_test.drop(columns=['SurID'])
X_val = X_val.drop(columns=['SurID'])

# Reshape to (samples, steps, 1): Conv1D expects a trailing channel axis.
X_train = X_train.values.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.values.reshape(X_test.shape[0], X_test.shape[1], 1)
X_val = X_val.values.reshape(X_val.shape[0], X_val.shape[1], 1)

In [41]:
# # Define CNN architecture
# model = Sequential([
#     Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
#     MaxPooling1D(pool_size=2),
#     Conv1D(filters=64, kernel_size=3, activation='relu'),
#     MaxPooling1D(pool_size=2),
#     Flatten(),
#     Dense(128, activation='relu'),
#     Dense(3, activation='softmax')
# ])

In [42]:
# Width of the softmax output = number of one-hot columns in the target,
# so the network follows the label set instead of a hard-coded class count.
n_classes = y_train.shape[1]

# Two conv blocks (conv -> conv -> pool -> dropout -> batch norm) followed by a dense head.
model = Sequential([
    # Explicit Input layer: Keras warns when `input_shape` is passed to the
    # first layer of a Sequential model.
    Input(shape=(X_train.shape[1], 1)),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.5),
    BatchNormalization(),

    Conv1D(filters=128, kernel_size=3, activation='relu'),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.5),
    BatchNormalization(),

    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(n_classes, activation='softmax')
])

  super().__init__(


In [43]:
# # Define CNN architecture with dropout
# model = Sequential([
#     Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
#     MaxPooling1D(pool_size=2),
#     Dropout(0.25),  # Adding dropout layer with dropout rate of 0.25
#     Conv1D(filters=64, kernel_size=3, activation='relu'),
#     MaxPooling1D(pool_size=2),
#     Dropout(0.25),  # Adding dropout layer with dropout rate of 0.25
#     Flatten(),
#     Dense(128, activation='relu'),
#     Dropout(0.5),   # Adding dropout layer with dropout rate of 0.5
#     Dense(3, activation='softmax')  
# ])

In [44]:
# Configure training: Adam optimizer, categorical cross-entropy on the
# one-hot targets, and accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [45]:
# Fit on the training split; the validation split is scored after every epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
)

# Score the fitted model on the held-out test split and report both numbers.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Epoch 1/10


[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 305ms/step - accuracy: 0.4323 - loss: 11.2005 - val_accuracy: 0.5914 - val_loss: 1.8067
Epoch 2/10
[1m20/65[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m13s[0m 298ms/step - accuracy: 0.4433 - loss: 3.0320