### This notebook classifies the samples based on their spectral peaks.

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from ast import literal_eval


A function for examining model metrics

In [2]:
def calculate_metrics(y_test, y_pred):
    """Print an evaluation summary for a set of predictions.

    Writes three things to stdout, in this order: the overall accuracy,
    scikit-learn's per-class classification report, and the confusion
    matrix. Nothing is returned.

    Args:
        y_test: Labels passed to the scikit-learn metric functions as the
            ground truth.
        y_pred: Predicted labels compared against ``y_test``.
    """
    # Share of predictions that equal the corresponding true label.
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_accuracy}\n")

    # Per-class precision / recall / F1 table, formatted by scikit-learn.
    class_report = classification_report(y_test, y_pred)
    print("\nClassification Report:", class_report, sep="\n")

    # Confusion matrix as computed by scikit-learn for the same label pair.
    matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:", matrix, sep="\n")


Read in the peak feature set.

In [3]:
# The CSV stores its row index as an unnamed first column, which pandas would
# otherwise load as a data column called "Unnamed: 0". Use it as the DataFrame
# index so only real fields (SpecID, Status, Seq, ...) remain as columns.
df = pd.read_csv("../../data/Thomas_peaks.csv", index_col=0)

In [4]:
# Preview the first rows to check the columns and how the peak lists are stored.
df.head()

Unnamed: 0,SpecID,Status,Seq,WaveNumber,Absorbance
0,201210-1-00,Normal,[ 727 1026 1820],"[696.81091, 901.13898, 1443.7357]","[1851.9185, 1746.4041, 1537.9485]"
1,201210-1-01,Normal,[ 403 617 1021 1378 1532 1818 2074],"[475.39862, 621.64008, 897.72211, 1141.6857, 1...","[1998.4773, 2034.2784, 1977.515, 1779.342, 173..."
2,201210-1-02,Normal,[ 412 600 1049 1552 1812 2079],"[481.54898, 610.02277, 916.85651, 1260.5923, 1...","[2214.0876, 2846.9824, 3696.4109, 2195.7212, 2..."
3,201210-1-03,Normal,[ 17 385 591 1047 1411 1590 1884 2052],"[211.61731, 463.09796, 603.87244, 915.48975, 1...","[10350.545, 2536.3599, 3342.7229, 3452.3679, 3..."
4,201210-1-04,Normal,[ 368 616 785 1173 1445 1602 1808 2090],"[451.48065, 620.95673, 736.44647, 1001.5945, 1...","[2277.2156, 2264.5063, 2167.6958, 2238.8494, 2..."


Extract the features and labels.

In [5]:
# Features: the per-spectrum list of peak absorbances (still stored as text at
# this point). Labels: the class recorded in the Status column.
X, y = df['Absorbance'], df['Status']

Pad the feature lists to make them of equal length.

In [6]:
from keras.preprocessing.sequence import pad_sequences

2024-01-16 17:42:45.768801: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-16 17:42:45.801562: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-16 17:42:45.801595: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-16 17:42:45.802321: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-16 17:42:45.806880: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-16 17:42:45.807571: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [7]:
# Parse the stringified peak lists into real Python lists. Reading from the
# DataFrame column (rather than from X itself) keeps this cell idempotent:
# re-running it no longer feeds already-parsed lists back into literal_eval,
# which would raise.
X = df['Absorbance'].apply(literal_eval)

In [8]:
# Right-pad every peak list with zeros so all rows share one length, giving a
# rectangular float32 matrix that the scikit-learn estimators can consume.
# NOTE(review): column j then means "j-th peak in the list", not a fixed
# wavenumber -- confirm this is the intended feature layout.
X = pad_sequences(
    X,
    padding='post',
    dtype='float32',
)

In [9]:
# Show the padded feature matrix; the trailing zeros in each row are padding.
print(X)

[[1851.9185 1746.404  1537.9485 ...    0.        0.        0.    ]
 [1998.4773 2034.2784 1977.515  ...    0.        0.        0.    ]
 [2214.0876 2846.9824 3696.411  ...    0.        0.        0.    ]
 ...
 [1906.7185 1571.1713 1579.2836 ...    0.        0.        0.    ]
 [1850.5511 1533.9598 1549.0426 ...    0.        0.        0.    ]
 [1482.5781 1507.8912 1505.2144 ...    0.        0.        0.    ]]


Create the training and test sets.

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
# NOTE(review): SpecID values such as 201210-1-00 / 201210-1-01 look like
# repeated spectra of one sample. If so, a plain row-wise split can place
# replicates of the same sample in both sets -- confirm, and consider a
# group-aware split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

Select and train the model

#### Random Forest

In [11]:
# Seed the forest so that a fresh Restart & Run All reproduces the same model
# and metrics; the seed matches the one used for the train/test split.
rf = RandomForestClassifier(random_state=1234)
rf.fit(X_train, y_train)

In [12]:
# Predict the held-out labels with the random forest and print its metrics.
y_pred = rf.predict(X_test)
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.729064039408867


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.72      0.74      0.73       203
 Hypoglycemia       0.71      0.70      0.71       200
       Normal       0.75      0.74      0.75       206

     accuracy                           0.73       609
    macro avg       0.73      0.73      0.73       609
 weighted avg       0.73      0.73      0.73       609


Confusion Matrix:
[[151  28  24]
 [ 34 140  26]
 [ 25  28 153]]


#### Logistic Regression

In [13]:
# `multi_class` is deprecated in recent scikit-learn releases. With the default
# lbfgs solver and more than two classes, the multinomial formulation is what
# gets fitted anyway, so the argument is dropped without changing the model.
# NOTE(review): the absorbance features are fed in unscaled; standardising them
# may help this model -- confirm before comparing it with the random forest.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

In [14]:
# Predict the held-out labels with the logistic regression and print its
# metrics; y_pred is overwritten with this model's predictions.
y_pred = lr.predict(X_test)
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.44991789819376027


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.41      0.13      0.19       203
 Hypoglycemia       0.48      0.48      0.48       200
       Normal       0.44      0.73      0.55       206

     accuracy                           0.45       609
    macro avg       0.44      0.45      0.41       609
 weighted avg       0.44      0.45      0.41       609


Confusion Matrix:
[[ 26  53 124]
 [ 35  97  68]
 [  3  52 151]]
