#### Machine Learning

In [22]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [23]:
# Read the label table and the graph-metrics table; both are joined on the
# SpecID column in the next cell.
statuses_df = pd.read_csv("../../data/SpecIDs_and_Status.csv")
graph_metrics_df = pd.read_csv("../../data/current_graph_metrics.csv")

In [24]:
# Join metrics with labels on SpecID, order the rows by SpecID, then use
# SpecID as the index so only feature columns and Status remain as columns.
df = (
    pd.merge(graph_metrics_df, statuses_df, on="SpecID")
    .sort_values("SpecID")
    .set_index("SpecID")
)
df

Unnamed: 0_level_0,PageRank,DegreeCentrality,EigenvectorCentrality,ArticleRank,LabelPropagation,Leiden,Louvain,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
201210-1-00,0.830275,1170.371464,0.014458,0.195683,1,1,2283,Normal
201210-1-01,0.811969,1134.613827,0.014017,0.194311,1,1,2283,Normal
201210-1-02,0.580317,733.700940,0.009064,0.178658,1,195,356,Normal
201210-1-03,0.744123,1017.430766,0.012569,0.189725,1,195,2283,Normal
201210-1-04,0.794119,1107.509457,0.013682,0.193235,1,1,2283,Normal
...,...,...,...,...,...,...,...,...
210526-3-45,1.128517,1689.660278,0.020874,0.215918,1,1,2283,Hyperglycemia
210526-3-46,1.110504,1658.692469,0.020491,0.214711,1,1,2283,Hyperglycemia
210526-3-47,1.145219,1718.172663,0.021226,0.217025,1,1,2283,Hyperglycemia
210526-3-48,1.084825,1613.192575,0.019929,0.212946,1,1,2283,Hyperglycemia


##### 1. Training a Random Forest and Extra Trees Classifier on the whole spectrum.

In [25]:
def calculate_metrics(y_test, y_pred):
    """Print the accuracy, classification report and confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        Everything is written to stdout; nothing is returned.
    """
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_accuracy}\n")

    # Each section is computed first and then printed under its heading:
    # per-class precision/recall/F1, followed by the confusion matrix.
    sections = (
        ("\nClassification Report:", classification_report),
        ("\nConfusion Matrix:", confusion_matrix),
    )
    for heading, compute in sections:
        rendered = compute(y_test, y_pred)
        print(heading)
        print(rendered)

In [26]:
# Labels are the Status column; every other column is a feature.
y = df['Status']
X = df.drop(columns=['Status'])

et = ExtraTreesClassifier(random_state=1234)

# Stratified 10-fold cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

for train_index, test_index in cv.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # fit() returns the estimator itself, so the prediction can be chained.
    y_pred = et.fit(X_train, y_train).predict(X_test)

    accuracy_scores.append(accuracy_score(y_test, y_pred))
    # Class-weighted averages, computed in the order precision, recall, F1.
    for fold_scores, scorer in (
        (precision_scores, precision_score),
        (recall_scores, recall_score),
        (f1_scores, f1_score),
    ):
        fold_scores.append(scorer(y_test, y_pred, average='weighted'))

    # Per-fold report: accuracy, classification report, confusion matrix.
    calculate_metrics(y_test, y_pred)

# Mean and standard deviation of each metric across the folds.
print(f'Accuracy: {np.mean(accuracy_scores):.4f} +/- {np.std(accuracy_scores):.4f}')
print(f'Precision: {np.mean(precision_scores):.4f} +/- {np.std(precision_scores):.4f}')
print(f'Recall: {np.mean(recall_scores):.4f} +/- {np.std(recall_scores):.4f}')
print(f'F1-Score: {np.mean(f1_scores):.4f} +/- {np.std(f1_scores):.4f}')

Overall Accuracy: 0.46885245901639344


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.43      0.46      0.45        91
 Hypoglycemia       0.46      0.49      0.47       107
       Normal       0.52      0.46      0.49       107

     accuracy                           0.47       305
    macro avg       0.47      0.47      0.47       305
 weighted avg       0.47      0.47      0.47       305


Confusion Matrix:
[[42 28 21]
 [30 52 25]
 [25 33 49]]
Overall Accuracy: 0.4491803278688525


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.39      0.38      0.39        91
 Hypoglycemia       0.43      0.44      0.43       107
       Normal       0.52      0.51      0.52       107

     accuracy                           0.45       305
    macro avg       0.45      0.45      0.45       305
 weighted avg       0.45      0.45      0.45       305


Confusion Matrix:
[[35 34 22]
 [32 47 28]


#### Try scaling the features

In [27]:
# All names used here (StratifiedKFold, ExtraTreesClassifier, the metric
# functions, StandardScaler, np) come from the import cell at the top of the
# notebook, so this cell no longer repeats them.

# Features are every column of df except the Status label.
X = df.drop(['Status'], axis=1)
y = df['Status']

et = ExtraTreesClassifier(random_state=1234)

# Stratified 10-fold cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

# Per-fold scores
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in cv.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # The scaler is fitted on the training fold only and then applied to the
    # held-out fold, so no test-fold statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    et.fit(X_train_scaled, y_train)
    y_pred = et.predict(X_test_scaled)

    # Accuracy plus class-weighted precision, recall and F1 for this fold.
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
    recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

# Mean and standard deviation of each metric across the folds.
print(f'Accuracy: {np.mean(accuracy_scores):.4f} +/- {np.std(accuracy_scores):.4f}')
print(f'Precision: {np.mean(precision_scores):.4f} +/- {np.std(precision_scores):.4f}')
print(f'Recall: {np.mean(recall_scores):.4f} +/- {np.std(recall_scores):.4f}')
print(f'F1-Score: {np.mean(f1_scores):.4f} +/- {np.std(f1_scores):.4f}')


Accuracy: 0.4443 +/- 0.0164
Precision: 0.4449 +/- 0.0173
Recall: 0.4443 +/- 0.0164
F1-Score: 0.4439 +/- 0.0167


#### Create a smaller feature set based on feature importance.

In [28]:
# Rank the features with an Extra Trees classifier fitted on all rows.
# A fresh estimator is created here so the cell does not depend on the state
# that `et` was left in by an earlier cell.
# NOTE(review): the ranking uses every row, including rows that later serve as
# test folds when X_top_3 is cross-validated, so those scores may be
# optimistic. Consider selecting features inside each training fold.
et = ExtraTreesClassifier(random_state=1234)
et.fit(X, y)

feature_importances = et.feature_importances_

# One row per feature, ordered from most to least important.
importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Names of the three highest-ranked features.
top_3_features = importance_df.head(3)['Feature'].tolist()

# Reduced feature matrix containing only those three columns.
X_top_3 = X[top_3_features]


In [29]:
# Display the reduced feature matrix (top three features by importance).
X_top_3

Unnamed: 0_level_0,PageRank,DegreeCentrality,EigenvectorCentrality
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
201210-1-00,0.830275,1170.371464,0.014458
201210-1-01,0.811969,1134.613827,0.014017
201210-1-02,0.580317,733.700940,0.009064
201210-1-03,0.744123,1017.430766,0.012569
201210-1-04,0.794119,1107.509457,0.013682
...,...,...,...
210526-3-45,1.128517,1689.660278,0.020874
210526-3-46,1.110504,1658.692469,0.020491
210526-3-47,1.145219,1718.172663,0.021226
210526-3-48,1.084825,1613.192575,0.019929


In [30]:
# Cross-validate on the reduced feature matrix directly instead of rebinding
# `X`. Overwriting `X` with X_top_3 would make the feature-importance cell
# rank only three columns if it were ever re-run.
# NOTE(review): X_top_3 was chosen from importances fitted on all rows, so
# these fold scores may be optimistic (feature selection saw the test folds).
et = ExtraTreesClassifier(random_state=1234)

# Stratified 10-fold cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in cv.split(X_top_3, y):
    X_train, X_test = X_top_3.iloc[train_index], X_top_3.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    et.fit(X_train, y_train)
    y_pred = et.predict(X_test)

    # Accuracy plus class-weighted precision, recall and F1 for this fold.
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
    recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    # Per-fold report: accuracy, classification report, confusion matrix.
    calculate_metrics(y_test, y_pred)

# Mean and standard deviation of each metric across the folds.
print(f'Accuracy: {np.mean(accuracy_scores):.4f} +/- {np.std(accuracy_scores):.4f}')
print(f'Precision: {np.mean(precision_scores):.4f} +/- {np.std(precision_scores):.4f}')
print(f'Recall: {np.mean(recall_scores):.4f} +/- {np.std(recall_scores):.4f}')
print(f'F1-Score: {np.mean(f1_scores):.4f} +/- {np.std(f1_scores):.4f}')

Overall Accuracy: 0.49508196721311476


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.43      0.47      0.45        91
 Hypoglycemia       0.53      0.55      0.54       107
       Normal       0.52      0.46      0.49       107

     accuracy                           0.50       305
    macro avg       0.49      0.49      0.49       305
 weighted avg       0.50      0.50      0.50       305


Confusion Matrix:
[[43 25 23]
 [25 59 23]
 [31 27 49]]
Overall Accuracy: 0.4426229508196721


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.43      0.42      0.42        91
 Hypoglycemia       0.43      0.40      0.42       107
       Normal       0.47      0.50      0.48       107

     accuracy                           0.44       305
    macro avg       0.44      0.44      0.44       305
 weighted avg       0.44      0.44      0.44       305


Confusion Matrix:
[[38 26 27]
 [29 43 35]


#### Classify based on Node2Vec Embeddings

In [31]:
# Read the label table and the Node2Vec embedding table; both are joined on
# the SpecID column in the next cell.
statuses_df = pd.read_csv("../../data/SpecIDs_and_Status.csv")
node2vec_df = pd.read_csv("../../data/node2vec_embeddings.csv")

In [32]:
# Join embeddings with labels on SpecID, order the rows by SpecID, then use
# SpecID as the index so only embedding columns and Status remain as columns.
df = (
    pd.merge(node2vec_df, statuses_df, on="SpecID")
    .sort_values("SpecID")
    .set_index("SpecID")
)
df

Unnamed: 0_level_0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,0.086032,-0.185270,0.033179,-0.047239,0.065701,-0.009305,-0.100501,-0.027420,0.112858,-0.107585,...,-0.062231,-0.145350,-0.052798,0.161506,0.028158,-0.021661,-0.059368,0.109874,-0.007128,Normal
201210-1-01,0.062818,-0.176356,0.025358,-0.037777,0.063226,-0.007464,-0.093705,-0.024278,0.109136,-0.099892,...,-0.059595,-0.132377,-0.045025,0.162551,0.025750,-0.021477,-0.046240,0.107082,-0.014551,Normal
201210-1-02,0.159142,-0.171132,0.030563,-0.032695,0.067244,-0.016539,-0.087670,-0.028249,0.108239,-0.104724,...,-0.066370,-0.140389,-0.046804,0.155917,0.027292,-0.023651,-0.060648,0.114323,-0.016155,Normal
201210-1-03,0.120780,-0.178257,0.027309,-0.035732,0.062834,-0.012190,-0.098334,-0.026911,0.115594,-0.107886,...,-0.065155,-0.146154,-0.061851,0.150272,0.030853,-0.015814,-0.054435,0.117707,-0.025257,Normal
201210-1-04,-0.056296,-0.158114,0.027014,-0.052280,0.056288,-0.002634,-0.088209,-0.017488,0.122248,-0.099836,...,-0.060374,-0.140649,-0.045893,0.172345,0.045012,-0.016170,-0.046348,0.119398,-0.010119,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,-0.079046,-0.167850,0.028405,-0.055476,0.068646,-0.005401,-0.092621,-0.022042,0.136452,-0.108054,...,-0.043312,-0.131374,-0.054812,0.160865,0.035409,-0.007019,-0.049420,0.109533,-0.011968,Hyperglycemia
210526-3-46,-0.064139,-0.173517,0.027888,-0.053396,0.055405,-0.016901,-0.080149,-0.025213,0.109773,-0.106112,...,-0.055176,-0.130427,-0.056682,0.167461,0.027530,-0.009512,-0.066765,0.109252,-0.026612,Hyperglycemia
210526-3-47,-0.004329,-0.165524,0.019107,-0.059162,0.060154,-0.006467,-0.081637,-0.030399,0.116801,-0.104015,...,-0.056407,-0.145853,-0.050214,0.154296,0.036782,-0.021398,-0.058882,0.115405,-0.019428,Hyperglycemia
210526-3-48,-0.009657,-0.156114,0.018858,-0.039965,0.049768,0.006564,-0.099949,-0.032901,0.109168,-0.100484,...,-0.050094,-0.131716,-0.039294,0.140912,0.039759,-0.018700,-0.054027,0.103762,-0.009763,Hyperglycemia


In [33]:
# Features are every column except the Status label.
X = df.drop(columns='Status')
y = df['Status']

et = ExtraTreesClassifier(random_state=1234)

# Stratified 10-fold cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

# Precision, recall and F1 are all averaged across classes weighted by support.
weighted = {'average': 'weighted'}

for fold in cv.split(X, y):
    train_index, test_index = fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    et.fit(X_train, y_train)
    y_pred = et.predict(X_test)

    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, **weighted))
    recall_scores.append(recall_score(y_test, y_pred, **weighted))
    f1_scores.append(f1_score(y_test, y_pred, **weighted))

    # Per-fold report: accuracy, classification report, confusion matrix.
    calculate_metrics(y_test, y_pred)

# Mean and standard deviation of each metric across the folds.
print(f'Accuracy: {np.mean(accuracy_scores):.4f} +/- {np.std(accuracy_scores):.4f}')
print(f'Precision: {np.mean(precision_scores):.4f} +/- {np.std(precision_scores):.4f}')
print(f'Recall: {np.mean(recall_scores):.4f} +/- {np.std(recall_scores):.4f}')
print(f'F1-Score: {np.mean(f1_scores):.4f} +/- {np.std(f1_scores):.4f}')

Overall Accuracy: 0.419672131147541


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.35      0.24      0.29        91
 Hypoglycemia       0.42      0.50      0.46       107
       Normal       0.45      0.50      0.47       107

     accuracy                           0.42       305
    macro avg       0.41      0.41      0.41       305
 weighted avg       0.41      0.42      0.41       305


Confusion Matrix:
[[22 36 33]
 [23 53 31]
 [18 36 53]]
Overall Accuracy: 0.3704918032786885


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.33      0.23      0.27        91
 Hypoglycemia       0.40      0.41      0.41       107
       Normal       0.37      0.45      0.40       107

     accuracy                           0.37       305
    macro avg       0.36      0.36      0.36       305
 weighted avg       0.37      0.37      0.36       305


Confusion Matrix:
[[21 23 47]
 [27 44 36]
 [

#### Classify based on FastRP Embeddings

In [37]:
# Read the label table and the FastRP embedding table; both are joined on the
# SpecID column in the next cell.
statuses_df = pd.read_csv("../../data/SpecIDs_and_Status.csv")
fastRP_df = pd.read_csv("../../data/fastRP_embeddings.csv")

In [38]:
# Join embeddings with labels on SpecID, order the rows by SpecID, then use
# SpecID as the index so only embedding columns and Status remain as columns.
df = (
    pd.merge(fastRP_df, statuses_df, on="SpecID")
    .sort_values("SpecID")
    .set_index("SpecID")
)
df

Unnamed: 0_level_0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,-0.080063,0.134346,0.311409,0.068268,0.220651,0.020809,-0.157989,-0.023693,0.139395,0.119681,...,0.062691,-0.112076,0.184696,0.327179,-0.075120,0.091235,-0.209512,0.040763,-0.094208,Normal
201210-1-01,-0.080101,0.134585,0.310871,0.067332,0.220452,0.020230,-0.158860,-0.023553,0.139669,0.118961,...,0.063693,-0.111742,0.186010,0.326752,-0.076596,0.091260,-0.211565,0.040145,-0.094633,Normal
201210-1-02,-0.079054,0.138156,0.306293,0.055854,0.212508,0.017010,-0.174445,-0.019908,0.143086,0.116697,...,0.065156,-0.112998,0.196053,0.328230,-0.083976,0.093691,-0.220491,0.040576,-0.086324,Normal
201210-1-03,-0.079920,0.136971,0.307921,0.059693,0.213916,0.018670,-0.169574,-0.020787,0.141887,0.117424,...,0.063974,-0.112868,0.192851,0.328275,-0.082046,0.093300,-0.217178,0.040930,-0.088214,Normal
201210-1-04,-0.079154,0.133878,0.312250,0.070663,0.223848,0.020361,-0.154181,-0.024949,0.138894,0.120166,...,0.062828,-0.111242,0.182163,0.326090,-0.072420,0.090532,-0.207949,0.039952,-0.096966,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,-0.077160,0.133637,0.313199,0.071438,0.226589,0.019494,-0.154102,-0.025647,0.139182,0.121855,...,0.062112,-0.111478,0.180060,0.326621,-0.067590,0.089882,-0.204368,0.040522,-0.096711,Hyperglycemia
210526-3-46,-0.077198,0.133361,0.313462,0.072150,0.227169,0.019669,-0.153159,-0.025882,0.138980,0.122001,...,0.062088,-0.111391,0.179395,0.326495,-0.067051,0.089678,-0.203765,0.040492,-0.097252,Hyperglycemia
210526-3-47,-0.076899,0.134395,0.312508,0.069708,0.225286,0.019181,-0.156088,-0.025093,0.139682,0.121436,...,0.062091,-0.111497,0.181595,0.326650,-0.068859,0.090412,-0.205910,0.040582,-0.095570,Hyperglycemia
210526-3-48,-0.077383,0.132969,0.313717,0.072891,0.227793,0.019769,-0.152275,-0.026112,0.138768,0.122063,...,0.062284,-0.111309,0.178866,0.326396,-0.066739,0.089389,-0.203345,0.040377,-0.097908,Hyperglycemia


In [39]:
# Split the merged frame into the feature matrix and the Status labels.
X, y = df.drop(['Status'], axis=1), df['Status']

et = ExtraTreesClassifier(random_state=1234)

# Stratified 10-fold cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

for train_index, test_index in cv.split(X, y):
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    et.fit(X_train, y_train)
    y_pred = et.predict(X_test)

    # Score the fold first, then record the four values.
    fold_accuracy = accuracy_score(y_test, y_pred)
    fold_precision = precision_score(y_test, y_pred, average='weighted')
    fold_recall = recall_score(y_test, y_pred, average='weighted')
    fold_f1 = f1_score(y_test, y_pred, average='weighted')

    accuracy_scores.append(fold_accuracy)
    precision_scores.append(fold_precision)
    recall_scores.append(fold_recall)
    f1_scores.append(fold_f1)

    # Per-fold report: accuracy, classification report, confusion matrix.
    calculate_metrics(y_test, y_pred)

# Mean and standard deviation of each metric across the folds.
print(f'Accuracy: {np.mean(accuracy_scores):.4f} +/- {np.std(accuracy_scores):.4f}')
print(f'Precision: {np.mean(precision_scores):.4f} +/- {np.std(precision_scores):.4f}')
print(f'Recall: {np.mean(recall_scores):.4f} +/- {np.std(recall_scores):.4f}')
print(f'F1-Score: {np.mean(f1_scores):.4f} +/- {np.std(f1_scores):.4f}')

Overall Accuracy: 0.7934426229508197


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.72      0.74      0.73        91
 Hypoglycemia       0.81      0.78      0.79       107
       Normal       0.84      0.86      0.85       107

     accuracy                           0.79       305
    macro avg       0.79      0.79      0.79       305
 weighted avg       0.79      0.79      0.79       305


Confusion Matrix:
[[67 15  9]
 [15 83  9]
 [11  4 92]]
Overall Accuracy: 0.7672131147540984


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.73      0.79      0.76        91
 Hypoglycemia       0.76      0.69      0.73       107
       Normal       0.81      0.82      0.81       107

     accuracy                           0.77       305
    macro avg       0.77      0.77      0.77       305
 weighted avg       0.77      0.77      0.77       305


Confusion Matrix:
[[72 12  7]
 [19 74 14]
 