### This notebook classifies the samples based on their spectral peaks.

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from ast import literal_eval


A function for examining model metrics

In [2]:
def calculate_metrics(y_test, y_pred):

    # Calculate overall accuracy
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_accuracy}\n")

    # Calculate precision, recall, and F1-score for each class
    report = classification_report(y_test, y_pred)
    print("\nClassification Report:")
    print(report)

    # Show the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(conf_matrix)


Read in the peak featureset

In [3]:
df = pd.read_csv("../../data/full_spectra.csv")

In [4]:
df.head()

Unnamed: 0,SpecID,Status,Seq,WaveNumber,Absorbance
0,201210-1-00,Normal,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[200.0, 200.68336, 201.36674, 202.05011, 202.7...","[2709.3699, 2697.1318, 2696.0413, 2678.5925, 2..."
1,201210-1-01,Normal,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[200.0, 200.68336, 201.36674, 202.05011, 202.7...","[2979.3169, 2985.707, 2970.1677, 2947.095, 294..."
2,201210-1-02,Normal,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[200.0, 200.68336, 201.36674, 202.05011, 202.7...","[3702.5627, 3592.4902, 3640.8423, 3593.415, 35..."
3,201210-1-03,Normal,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[200.0, 200.68336, 201.36674, 202.05011, 202.7...","[8129.5938, 8222.3184, 8370.2803, 8534.415, 86..."
4,201210-1-04,Normal,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[200.0, 200.68336, 201.36674, 202.05011, 202.7...","[3468.5203, 3463.0237, 3475.2666, 3468.5999, 3..."


Extract the features and labels.

In [5]:
X = df['Absorbance']
y = df['Status']

Pad the feature lists to make them of equal length.

In [6]:
from keras.preprocessing.sequence import pad_sequences

2024-01-17 14:53:58.343578: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-17 14:53:58.373593: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-17 14:53:58.373623: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-17 14:53:58.374445: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-17 14:53:58.378916: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-17 14:53:58.380503: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [7]:
# Convert features to lists
X = X.apply(literal_eval)

In [8]:
# Padding sequences with zeros
X = pad_sequences(X, padding='post', dtype='float32')

In [9]:
print(X)

[[2709.3699  2697.1318  2696.0413  ... 1089.0814  1092.8083  1086.699  ]
 [2979.317   2985.707   2970.1677  ... 1286.9803  1276.4037  1268.0922 ]
 [3702.5627  3592.4902  3640.8423  ... 2086.6956  2064.7766  2064.2126 ]
 ...
 [1924.9346  1880.2902  1873.738   ...  890.5974   896.7415   907.45355]
 [1834.2257  1827.9034  1830.4606  ...  859.38354  853.8679   858.3631 ]
 [1792.1012  1771.9321  1791.6136  ...  822.06915  824.7414   829.86   ]]


Create the training and test sets.

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

Select and train the model

#### Random Forest

In [11]:
rf = RandomForestClassifier(random_state=1234)
rf.fit(X_train, y_train)

In [12]:
y_pred = rf.predict(X_test)
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8587848932676518


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.84      0.84      0.84       203
 Hypoglycemia       0.88      0.83      0.86       200
       Normal       0.86      0.90      0.88       206

     accuracy                           0.86       609
    macro avg       0.86      0.86      0.86       609
 weighted avg       0.86      0.86      0.86       609


Confusion Matrix:
[[171  16  16]
 [ 18 167  15]
 [ 14   7 185]]


#### Tuning Random Forest

In [13]:
rf = RandomForestClassifier(random_state=1234, n_estimators=500)
rf.fit(X_train, y_train)

In [14]:
y_pred = rf.predict(X_test)
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8637110016420362


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.86      0.84      0.85       203
 Hypoglycemia       0.87      0.85      0.86       200
       Normal       0.86      0.90      0.88       206

     accuracy                           0.86       609
    macro avg       0.86      0.86      0.86       609
 weighted avg       0.86      0.86      0.86       609


Confusion Matrix:
[[170  18  15]
 [ 14 171  15]
 [ 14   7 185]]


Cross-validation

In [15]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import StratifiedKFold

# # The parameters that will be tested
# param_grid = {
#     'n_estimators': [500],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt', 'log2']
# }

# # Create the RandomForestClassifier
# rf = RandomForestClassifier(random_state=1234)

# # Use StratifiedKFold for cross-validation, especially useful for imbalanced datasets
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

# # Create the GridSearchCV object
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, scoring='accuracy', n_jobs=-1)

# # Fit the GridSearchCV to the data
# grid_search.fit(X_train, y_train)

# # Print the best hyperparameters
# print("Best Hyperparameters:", grid_search.best_params_)

# # Get the best model
# best_rf_model = grid_search.best_estimator_

# # Evaluate the model on the test set
# test_accuracy = best_rf_model.score(X_test, y_test)
# print(f"Test Accuracy: {test_accuracy:.4f}")

#### XGBoost

In [16]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# XGBoost requires numeric labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Create the XGBoost classifier
xgb = XGBClassifier(random_state=1234, n_estimators=500)

# Fit the model to the training data
xgb.fit(X_train, y_train_encoded)

In [17]:
y_test_encoded = label_encoder.transform(y_test)
y_pred = xgb.predict(X_test)
calculate_metrics(y_test_encoded, y_pred)

Overall Accuracy: 0.8604269293924466


Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.83      0.85       203
           1       0.86      0.86      0.86       200
           2       0.84      0.89      0.87       206

    accuracy                           0.86       609
   macro avg       0.86      0.86      0.86       609
weighted avg       0.86      0.86      0.86       609


Confusion Matrix:
[[168  17  18]
 [ 12 172  16]
 [ 11  11 184]]


#### Logistic Regression

In [18]:
lr = LogisticRegression(multi_class="multinomial", max_iter=1000)
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
y_pred = lr.predict(X_test)
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.7274220032840722


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.75      0.64      0.69       203
 Hypoglycemia       0.73      0.81      0.76       200
       Normal       0.71      0.74      0.72       206

     accuracy                           0.73       609
    macro avg       0.73      0.73      0.73       609
 weighted avg       0.73      0.73      0.73       609


Confusion Matrix:
[[129  30  44]
 [ 19 161  20]
 [ 23  30 153]]


#### SVM

In [14]:
from sklearn.svm import SVC

svm = SVC(kernel='linear', random_state=1234)
svm.fit(X_train, y_train)

In [16]:
y_pred = svm.predict(X_test)
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.7142857142857143


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.67      0.67      0.67       203
 Hypoglycemia       0.74      0.74      0.74       200
       Normal       0.74      0.73      0.73       206

     accuracy                           0.71       609
    macro avg       0.71      0.71      0.71       609
 weighted avg       0.71      0.71      0.71       609


Confusion Matrix:
[[137  29  37]
 [ 35 148  17]
 [ 33  23 150]]


#### Neural Approach

In [59]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [82]:
X_train = np.array(X_train)
y_train = np.array(y_train)

Encode the labels as numerical values

In [84]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

One-hot encoding should be used for multiclass classification

In [85]:
y_train_one_hot = tf.keras.utils.to_categorical(y_train_encoded, num_classes=3)

Create a validation set

In [86]:
print(len(y_train_one_hot))

2436


In [87]:
X_train, X_val, y_train_one_hot, y_val_one_hot = train_test_split(X_train, y_train_one_hot, test_size=0.2, random_state=1234)

Standardise the features

In [133]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

Define the model parameters

In [180]:
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(3, activation='softmax'))

In [181]:
# from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D

# model = Sequential()
# model.add(Conv1D(64, 3, activation='relu', input_shape=(X_train.shape[1], 1)))
# model.add(MaxPooling1D(2))
# model.add(Conv1D(128, 3, activation='relu'))
# model.add(MaxPooling1D(2))
# model.add(GlobalAveragePooling1D())
# model.add(Dense(64, activation='relu'))
# model.add(Dense(3, activation='softmax'))

Compile the model

In [182]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

Train the model

In [185]:
model.fit(X_train_scaled, y_train_one_hot, epochs=100, batch_size=32, validation_data=(X_val_scaled, y_val_one_hot))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

Evaluate the model

In [None]:
y_pred = model.predict(X_test_scaled)
decoded_predictions = np.argmax(y_pred, axis=1)
y_pred_labels = label_encoder.inverse_transform(decoded_predictions.round().astype(int))  # Convert the labels back to strings
calculate_metrics(y_test, y_pred_labels)

Overall Accuracy: 0.6272577996715928


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.55      0.63      0.59       203
 Hypoglycemia       0.75      0.56      0.64       200
       Normal       0.63      0.68      0.66       206

     accuracy                           0.63       609
    macro avg       0.64      0.63      0.63       609
 weighted avg       0.64      0.63      0.63       609


Confusion Matrix:
[[128  29  46]
 [ 50 113  37]
 [ 56   9 141]]
