#### Machine Learning

In [15]:
# Load the spectra table from CSV; the file's first column becomes the row index.
spectra_path = "../../data/scaled_and_noise_removal.csv"
spectra_df = pd.read_csv(spectra_path, index_col=0)

In [17]:
# Show the loaded long-format table: one row per (SpecID, WaveNumber) reading,
# with the sample's Status repeated on every row.
spectra_df

Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
0,201210-1-00,0,200.00000,201210-1,Normal,1.000000
1,201210-1-00,1,200.68336,201210-1,Normal,0.982870
2,201210-1-00,2,201.36674,201210-1,Normal,0.966481
3,201210-1-00,3,202.05011,201210-1,Normal,0.950833
4,201210-1-00,4,202.73349,201210-1,Normal,0.935925
...,...,...,...,...,...,...
8023570,210526-3-09,2630,1997.26650,210526-3,Hyperglycemia,0.087327
8023571,210526-3-09,2631,1997.94980,210526-3,Hyperglycemia,0.090222
8023572,210526-3-09,2632,1998.63330,210526-3,Hyperglycemia,0.091124
8023573,210526-3-09,2633,1999.31670,210526-3,Hyperglycemia,0.090032


#### First we will look at the full wavenumber range.

Create a column for each wavenumber.

In [18]:
# Reshape long -> wide: one row per SpecID, one column per WaveNumber,
# cell values are the Absorbance readings. SpecID is moved back to a column
# so it can be used as a merge key in the next step.
wavelength_df = (
    spectra_df
    .pivot(index='SpecID', columns='WaveNumber', values='Absorbance')
    .reset_index()
)
# Drop the leftover 'WaveNumber' label on the column axis.
wavelength_df.columns.name = None

Add the statuses back.

In [19]:
# One (SpecID, Status) pair per spectrum, taken from the long-format table.
statuses = spectra_df.loc[:, ['SpecID', 'Status']].drop_duplicates()

# Attach the Status label to each wide row, then index the result by SpecID.
wavelength_df = (
    wavelength_df
    .merge(statuses, on='SpecID')
    .set_index('SpecID')
)

In [20]:
# Preview the wide table: one column per wavenumber plus the Status label.
wavelength_df.head()

Unnamed: 0_level_0,200.0,200.68336,201.36674,202.05011,202.73349,203.41685,204.10023,204.7836,205.46696,206.15034,...,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1.0,0.98287,0.966481,0.950833,0.935925,0.927346,0.902431,0.889797,0.880243,0.864841,...,0.07069,0.082414,0.09097,0.09289,0.08323,0.082239,0.082605,0.084328,0.087409,Normal
201210-1-01,1.0,0.98055,0.964007,0.950373,0.939647,0.930871,0.928574,0.925606,0.91491,0.90003,...,0.113337,0.117351,0.131463,0.146428,0.158236,0.161601,0.160516,0.15498,0.144994,Normal
201210-1-02,0.502527,0.491051,0.47954,0.467993,0.45641,0.450161,0.431959,0.424441,0.417415,0.408823,...,0.144569,0.141991,0.139207,0.144627,0.149526,0.150847,0.1506,0.148786,0.145404,Normal
201210-1-03,0.492251,0.418315,0.375666,0.364304,0.38423,0.461173,0.570848,0.577491,0.675831,0.807648,...,0.005644,0.007747,0.008992,0.010157,0.011991,0.011822,0.010642,0.008451,0.005248,Normal
201210-1-04,1.0,0.994372,0.989487,0.985346,0.981948,0.976845,0.970762,0.977536,0.981513,0.963198,...,0.019359,0.028442,0.041845,0.05873,0.063852,0.070008,0.07453,0.077417,0.078669,Normal


##### 1. Training a Random Forest and Extra Trees Classifier on the whole spectrum.

In [21]:
def calculate_metrics(y_test, y_pred):
    """Print accuracy, a per-class report and the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels of the evaluated samples.
    y_pred : array-like
        Labels predicted for the same samples.

    Returns
    -------
    None
        Everything is written to stdout; nothing is returned.
    """
    # Fraction of predictions that match the true labels.
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_accuracy}\n")

    # Per-class precision, recall and F1-score as a text table.
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Counts of true labels against predicted labels.
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

In [36]:
# Target is the Status label; features are all remaining (wavenumber) columns.
y = wavelength_df['Status']
X = wavelength_df.drop(columns=['Status'])

et = ExtraTreesClassifier(random_state=1234)

# Stratified 10-fold cross-validation; the shuffle is seeded so the folds
# are the same on every run.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

# Per-fold scores; one value is appended to each list per fold.
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

for train_index, test_index in cv.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Refit on this fold's training rows, then predict its held-out rows.
    y_pred = et.fit(X_train, y_train).predict(X_test)

    accuracy_scores.append(accuracy_score(y_test, y_pred))
    # Multi-class scores are averaged over classes, weighted by class support.
    precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
    recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    # Detailed report for this fold.
    calculate_metrics(y_test, y_pred)

# Summary across folds: mean +/- standard deviation of each metric.
print(f'Accuracy: {np.mean(accuracy_scores):.4f} +/- {np.std(accuracy_scores):.4f}')
print(f'Precision: {np.mean(precision_scores):.4f} +/- {np.std(precision_scores):.4f}')
print(f'Recall: {np.mean(recall_scores):.4f} +/- {np.std(recall_scores):.4f}')
print(f'F1-Score: {np.mean(f1_scores):.4f} +/- {np.std(f1_scores):.4f}')

Overall Accuracy: 0.9049180327868852


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.86      0.91      0.88        91
 Hypoglycemia       0.91      0.91      0.91       107
       Normal       0.95      0.90      0.92       107

     accuracy                           0.90       305
    macro avg       0.90      0.91      0.90       305
 weighted avg       0.91      0.90      0.91       305


Confusion Matrix:
[[83  5  3]
 [ 8 97  2]
 [ 6  5 96]]
Overall Accuracy: 0.898360655737705


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.90      0.90      0.90        91
 Hypoglycemia       0.90      0.89      0.89       107
       Normal       0.90      0.91      0.90       107

     accuracy                           0.90       305
    macro avg       0.90      0.90      0.90       305
 weighted avg       0.90      0.90      0.90       305


Confusion Matrix:
[[82  6  3]
 [ 4 95  8]
 [

#### Create a smaller feature set based on feature importance.

In [37]:
# Number of highest-importance wavenumber columns to keep.
# The earlier comments and the variable names below say "top 100", but the
# code selects 500 features; that value is kept here so results are unchanged.
N_TOP_FEATURES = 500

# Fit the Extra Trees classifier on the complete dataset to obtain importances.
# NOTE(review): the importances are computed from every sample, so any
# cross-validation run afterwards on the selected columns has already "seen"
# its test folds during selection; its scores are likely optimistic.
et.fit(X, y)

feature_importances = et.feature_importances_

# Pair each feature (column of X) with its importance.
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

# Most important features first.
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Keep the N_TOP_FEATURES most important features.
# (The name `top_100_features` is retained for compatibility with other cells.)
top_100_features = importance_df.head(N_TOP_FEATURES)['Feature'].tolist()

# Restrict X to the selected columns, ordered by decreasing importance.
# (The name `X_top_100` is retained because later cells reference it.)
X_top_100 = X[top_100_features]


In [38]:
# Inspect the reduced feature matrix; columns are ordered by decreasing importance.
X_top_100

Unnamed: 0_level_0,303.87244,317.53986,303.18906,299.08884,301.82233,329.15717,326.42368,1002.27790,1000.91110,296.35535,...,230.75171,1829.84060,1382.91580,588.15491,1637.12990,1015.94530,945.55811,373.57632,1022.09560,931.89069
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,0.319164,0.096823,0.329429,0.409527,0.344268,-0.052282,-0.010254,0.206219,0.210676,0.464530,...,0.768649,0.028030,0.057157,0.085592,0.211194,0.145592,0.030310,0.012798,0.122545,0.046868
201210-1-01,0.383976,0.091685,0.400264,0.377546,0.396996,-0.048396,-0.039092,0.156963,0.166336,0.434901,...,0.818957,0.019413,0.024127,0.050148,0.122180,0.141411,0.157287,-0.012970,0.140546,0.122122
201210-1-02,0.231887,0.109133,0.243603,0.281808,0.259837,0.001198,0.021070,0.091076,0.084114,0.294395,...,0.343334,0.016673,0.048397,0.045721,0.073404,0.052077,0.080405,-0.041060,0.047488,0.226354
201210-1-03,0.226318,0.096593,0.229848,0.276090,0.242202,0.054859,0.076625,0.056651,0.057868,0.308309,...,0.674799,0.000052,-0.003062,0.166534,0.021564,0.046300,0.045248,-0.076380,0.043907,0.094997
201210-1-04,0.535764,0.221975,0.549599,0.756738,0.620397,0.118391,0.164102,0.213213,0.188476,0.785093,...,0.833014,0.040811,0.097929,0.128002,0.457447,0.094489,0.010984,-0.078523,0.077247,0.073969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,0.704459,0.517387,0.736961,0.811766,0.728110,0.364634,0.355404,0.303084,0.347489,0.782819,...,0.597141,0.044218,0.098223,0.085358,0.022304,0.193695,0.003181,-0.028731,0.105207,-0.016260
210526-3-46,0.739683,0.564024,0.771741,0.808510,0.796618,0.304239,0.351558,0.293491,0.283642,0.881355,...,0.672655,0.067240,0.108499,0.144087,0.032482,0.136228,0.017082,-0.038187,0.118498,-0.033198
210526-3-47,0.726187,0.502791,0.704471,0.765395,0.667502,0.255526,0.282177,0.323448,0.290828,0.800473,...,0.646719,0.073813,0.099596,0.189581,0.072421,0.102537,0.028638,-0.040858,0.099619,0.045504
210526-3-48,0.717998,0.508047,0.732956,0.763370,0.792811,0.339413,0.421507,0.245229,0.231915,0.819220,...,0.677658,0.038455,0.100970,0.128714,0.030327,0.102641,-0.005548,-0.126089,0.067985,0.004472


In [39]:
# Repeat the cross-validation on the reduced feature set.
# NOTE(review): this rebinds `X`, so re-running the feature-selection cell
# afterwards would operate on the already-reduced matrix.
# NOTE(review): X_top_100 was chosen from importances fit on all samples, so
# the scores below are likely optimistic (the selection saw the test folds).
X = X_top_100
et = ExtraTreesClassifier(random_state=1234)

# Stratified 10-fold cross-validation; the shuffle is seeded so the folds
# are the same on every run.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

# Per-fold scores; one value is appended to each list per fold.
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

for train_index, test_index in cv.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Refit on this fold's training rows, then predict its held-out rows.
    y_pred = et.fit(X_train, y_train).predict(X_test)

    accuracy_scores.append(accuracy_score(y_test, y_pred))
    # Multi-class scores are averaged over classes, weighted by class support.
    precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
    recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    # Detailed report for this fold.
    calculate_metrics(y_test, y_pred)

# Summary across folds: mean +/- standard deviation of each metric.
print(f'Accuracy: {np.mean(accuracy_scores):.4f} +/- {np.std(accuracy_scores):.4f}')
print(f'Precision: {np.mean(precision_scores):.4f} +/- {np.std(precision_scores):.4f}')
print(f'Recall: {np.mean(recall_scores):.4f} +/- {np.std(recall_scores):.4f}')
print(f'F1-Score: {np.mean(f1_scores):.4f} +/- {np.std(f1_scores):.4f}')

Overall Accuracy: 0.8950819672131147


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.86      0.88      0.87        91
 Hypoglycemia       0.90      0.89      0.90       107
       Normal       0.92      0.92      0.92       107

     accuracy                           0.90       305
    macro avg       0.89      0.89      0.89       305
 weighted avg       0.90      0.90      0.90       305


Confusion Matrix:
[[80  5  6]
 [ 9 95  3]
 [ 4  5 98]]
Overall Accuracy: 0.9016393442622951


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.93      0.87      0.90        91
 Hypoglycemia       0.91      0.87      0.89       107
       Normal       0.87      0.96      0.92       107

     accuracy                           0.90       305
    macro avg       0.90      0.90      0.90       305
 weighted avg       0.90      0.90      0.90       305


Confusion Matrix:
[[ 79   6   6]
 [  5  93 