### This notebook classifies the samples based on their spectral peaks.

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from ast import literal_eval


A function for examining model metrics

In [2]:
def calculate_metrics(y_test, y_pred):

    # Calculate overall accuracy
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_accuracy}\n")

    # Calculate precision, recall, and F1-score for each class
    report = classification_report(y_test, y_pred)
    print("\nClassification Report:")
    print(report)

    # Show the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(conf_matrix)


Read in the peak featureset

In [3]:
df = pd.read_csv("../../data/Thomas_peaks.csv")

In [4]:
df.head()

Unnamed: 0,SpecID,Status,Seq,WaveNumber,Absorbance
0,201210-1-00,Normal,[ 727 1026 1820],"[696.81091, 901.13898, 1443.7357]","[1851.9185, 1746.4041, 1537.9485]"
1,201210-1-01,Normal,[ 403 617 1021 1378 1532 1818 2074],"[475.39862, 621.64008, 897.72211, 1141.6857, 1...","[1998.4773, 2034.2784, 1977.515, 1779.342, 173..."
2,201210-1-02,Normal,[ 412 600 1049 1552 1812 2079],"[481.54898, 610.02277, 916.85651, 1260.5923, 1...","[2214.0876, 2846.9824, 3696.4109, 2195.7212, 2..."
3,201210-1-03,Normal,[ 17 385 591 1047 1411 1590 1884 2052],"[211.61731, 463.09796, 603.87244, 915.48975, 1...","[10350.545, 2536.3599, 3342.7229, 3452.3679, 3..."
4,201210-1-04,Normal,[ 368 616 785 1173 1445 1602 1808 2090],"[451.48065, 620.95673, 736.44647, 1001.5945, 1...","[2277.2156, 2264.5063, 2167.6958, 2238.8494, 2..."


Extract the features and labels.

In [5]:
# The lists must be extracted from the strings
wavelengths = df['WaveNumber'].apply(literal_eval)
absorbances = df['Absorbance'].apply(literal_eval)

Create a 1 dimensional representation of the peaks containing both the wavenumber and absorbance.

In [6]:
peak_list = []

# Iterate over the samples
for i in range(len(wavelengths)):
    peak_vector = []

    # Iterate over the values
    for j in range(len(wavelengths[i])):
        peak_vector.append(wavelengths[i][j])
        peak_vector.append(absorbances[i][j])
    peak_list.append(peak_vector)

In [7]:
print(peak_list)

[[696.81091, 1851.9185, 901.13898, 1746.4041, 1443.7357, 1537.9485], [475.39862, 1998.4773, 621.64008, 2034.2784, 897.72211, 1977.515, 1141.6857, 1779.342, 1246.9248, 1735.8127, 1442.369, 1735.0914, 1617.3121, 1733.3473], [481.54898, 2214.0876, 610.02277, 2846.9824, 916.85651, 3696.4109, 1260.5923, 2195.7212, 1438.2688, 2257.9094, 1620.7289, 2418.2576], [211.61731, 10350.545, 463.09796, 2536.3599, 603.87244, 3342.7229, 915.48975, 3452.3679, 1164.2369, 3134.1235, 1286.5604, 2910.6362, 1487.4716, 3426.8677, 1602.278, 2552.4478], [451.48065, 2277.2156, 620.95673, 2264.5063, 736.44647, 2167.6958, 1001.5945, 2238.8494, 1187.4716, 2278.3433, 1294.7609, 2366.2205, 1435.5353, 2356.8567, 1628.246, 2458.5142], [615.48975, 2414.5728, 880.63782, 2218.6753, 1001.5945, 2287.447, 1249.6583, 2368.3093, 1378.1321, 2539.3604, 1624.1458, 2680.7439], [449.43051, 2787.6904, 689.29382, 2352.1768, 1002.2779, 2218.677, 1154.6697, 2173.0259, 1296.8109, 2288.8823, 1436.9021, 2283.9773, 1617.9955, 2340.7083, 193

In [8]:
X = peak_list
y = df['Status']

Pad the feature lists to make them of equal length.

In [9]:
from keras.preprocessing.sequence import pad_sequences

2024-01-16 22:10:15.387833: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-16 22:10:15.414298: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-16 22:10:15.414329: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-16 22:10:15.415046: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-16 22:10:15.418903: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-16 22:10:15.419753: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [10]:
# Padding sequences with zeros
X = pad_sequences(X, padding='post', dtype='float32')

In [11]:
print(X)

[[ 696.8109  1851.9185   901.139   ...    0.         0.         0.     ]
 [ 475.39862 1998.4773   621.6401  ...    0.         0.         0.     ]
 [ 481.54898 2214.0876   610.02277 ...    0.         0.         0.     ]
 ...
 [ 280.63782 1906.7185   519.81775 ...    0.         0.         0.     ]
 [ 271.07062 1850.5511   519.81775 ...    0.         0.         0.     ]
 [ 450.79727 1482.5781   705.0114  ...    0.         0.         0.     ]]


Create the training and test sets.

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

Select and train the model

##### Random Forest

In [17]:
rf = RandomForestClassifier(random_state=1234, n_estimators=500)
rf.fit(X_train, y_train)

In [18]:
y_pred = rf.predict(X_test)
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.7651888341543513


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.82      0.72      0.76       203
 Hypoglycemia       0.74      0.75      0.74       200
       Normal       0.75      0.83      0.79       206

     accuracy                           0.77       609
    macro avg       0.77      0.76      0.76       609
 weighted avg       0.77      0.77      0.76       609


Confusion Matrix:
[[146  30  27]
 [ 20 150  30]
 [ 13  23 170]]


#### Logistic Regression

In [15]:
lr = LogisticRegression(multi_class="multinomial", max_iter=1000)
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
y_pred = lr.predict(X_test)
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.46798029556650245


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.39      0.13      0.20       203
 Hypoglycemia       0.48      0.57      0.52       200
       Normal       0.48      0.69      0.56       206

     accuracy                           0.47       609
    macro avg       0.45      0.47      0.43       609
 weighted avg       0.45      0.47      0.43       609


Confusion Matrix:
[[ 27  73 103]
 [ 30 115  55]
 [ 12  51 143]]
