#### Machine Learning

In [111]:
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

In [112]:
# Read the two input tables: graph metrics and status labels (both are later joined on SpecID).
graph_metrics_df = pd.read_csv(filepath_or_buffer="../../data/current_graph_metrics.csv")
statuses_df = pd.read_csv(filepath_or_buffer="../../data/SpecIDs_and_Status.csv")

In [113]:
# Join metrics with statuses on SpecID (pandas' default inner join),
# order the rows by SpecID, and use SpecID as the index.
df = (
    graph_metrics_df
    .merge(statuses_df, on="SpecID")
    .sort_values(by="SpecID")
    .set_index("SpecID")
)
df

Unnamed: 0_level_0,PageRank,DegreeCentrality,EigenvectorCentrality,ArticleRank,LabelPropagation,Leiden,Louvain,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
201210-1-00,0.400982,38.599201,0.002301,0.151601,8,0,27,Normal
201210-1-01,0.361817,28.561028,0.001729,0.151186,8,0,27,Normal
201210-1-02,0.185693,1.846121,0.000129,0.150076,8,2,27,Normal
201210-1-03,0.311787,16.620121,0.001146,0.150689,8,2,27,Normal
201210-1-04,0.282971,23.934548,0.001388,0.150993,8,0,27,Normal
...,...,...,...,...,...,...,...,...
210526-3-45,1.183442,235.278916,0.013616,0.159743,8,0,27,Hyperglycemia
210526-3-46,1.162865,227.672284,0.013214,0.159432,8,0,27,Hyperglycemia
210526-3-47,1.095440,222.732215,0.012793,0.159213,8,0,27,Hyperglycemia
210526-3-48,1.081279,205.327842,0.011956,0.158511,8,0,27,Hyperglycemia


##### 1. Training a Random Forest and Extra Trees Classifier on the whole spectrum.

In [114]:
def calculate_metrics(y_test, y_pred):
    """Print accuracy, a classification report and a confusion matrix.

    Args:
        y_test: True labels of the evaluated samples.
        y_pred: Predicted labels for the same samples, in the same order.

    Returns:
        None. The three summaries are written to standard output.
    """
    # Fraction of predictions that match the true label.
    print(f"Overall Accuracy: {accuracy_score(y_test, y_pred)}\n")

    # Per-class precision, recall, F1-score and support.
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Counts of true class (rows) versus predicted class (columns).
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

In [115]:
# Target is the Status label; every other column of `df` is a feature.
y = df['Status']
X = df.drop(columns=['Status'])

et = ExtraTreesClassifier(random_state=1234)

# Stratified, shuffled 10-fold cross-validation with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

# Per-fold scores; each list receives one entry per fold.
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

for train_index, test_index in cv.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # fit() returns the estimator, so training and prediction can be chained.
    y_pred = et.fit(X_train, y_train).predict(X_test)

    accuracy_scores.append(accuracy_score(y_test, y_pred))
    # Multi-class scores are averaged over the classes, weighted by class support.
    precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
    recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    # Detailed per-fold report (accuracy, per-class report, confusion matrix).
    calculate_metrics(y_test, y_pred)

# Mean and standard deviation of each score across the folds.
print(f'Accuracy: {np.mean(accuracy_scores):.4f} +/- {np.std(accuracy_scores):.4f}')
print(f'Precision: {np.mean(precision_scores):.4f} +/- {np.std(precision_scores):.4f}')
print(f'Recall: {np.mean(recall_scores):.4f} +/- {np.std(recall_scores):.4f}')
print(f'F1-Score: {np.mean(f1_scores):.4f} +/- {np.std(f1_scores):.4f}')

Overall Accuracy: 0.5147540983606558


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.45      0.47      0.46        91
 Hypoglycemia       0.50      0.49      0.49       107
       Normal       0.59      0.58      0.58       107

     accuracy                           0.51       305
    macro avg       0.51      0.51      0.51       305
 weighted avg       0.52      0.51      0.52       305


Confusion Matrix:
[[43 28 20]
 [32 52 23]
 [21 24 62]]
Overall Accuracy: 0.5475409836065573


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.49      0.49      0.49        91
 Hypoglycemia       0.57      0.53      0.55       107
       Normal       0.57      0.61      0.59       107

     accuracy                           0.55       305
    macro avg       0.54      0.54      0.54       305
 weighted avg       0.55      0.55      0.55       305


Confusion Matrix:
[[45 23 23]
 [24 57 26]
 

#### Try scaling the features

In [128]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# 10-fold evaluation on all features of `df`, standardizing the features inside each fold.
y = df['Status']
X = df.drop(columns=['Status'])

et = ExtraTreesClassifier(random_state=1234)

# Stratified, shuffled 10-fold cross-validation with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

# Per-fold scores; each list receives one entry per fold.
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

for train_index, test_index in cv.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # The scaler is fitted on the training fold only and then applied to both
    # folds, so no statistics of the held-out fold enter the scaling.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    y_pred = et.fit(X_train_scaled, y_train).predict(X_test_scaled)

    accuracy_scores.append(accuracy_score(y_test, y_pred))
    # Multi-class scores are averaged over the classes, weighted by class support.
    precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
    recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

# Mean and standard deviation of each score across the folds.
print(f'Accuracy: {np.mean(accuracy_scores):.4f} +/- {np.std(accuracy_scores):.4f}')
print(f'Precision: {np.mean(precision_scores):.4f} +/- {np.std(precision_scores):.4f}')
print(f'Recall: {np.mean(recall_scores):.4f} +/- {np.std(recall_scores):.4f}')
print(f'F1-Score: {np.mean(f1_scores):.4f} +/- {np.std(f1_scores):.4f}')


Accuracy: 0.5264 +/- 0.0194
Precision: 0.5271 +/- 0.0198
Recall: 0.5264 +/- 0.0194
F1-Score: 0.5259 +/- 0.0195


#### Create a smaller featureset based on feature importance.

In [116]:
# Rank the features by Extra Trees importance and keep the strongest ones.
#
# The full feature matrix is rebuilt from `df` instead of reusing `X`: a later
# cell rebinds `X` to the reduced feature set, so re-running this cell after
# that one would otherwise rank only the already-selected columns.
n_top_features = 3
X_full = df.drop(['Status'], axis=1)

# A fresh estimator keeps this cell independent of whichever cell last assigned `et`.
et = ExtraTreesClassifier(random_state=1234)
et.fit(X_full, df['Status'])

feature_importances = et.feature_importances_

# One row per feature, paired with its importance, strongest first.
importance_df = pd.DataFrame({'Feature': X_full.columns, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Names of the `n_top_features` most important features.
top_3_features = importance_df.head(n_top_features)['Feature'].tolist()

# Keep only the selected feature columns.
# NOTE(review): the importances are fitted on every row, including rows that
# later serve as test folds, so cross-validated scores on `X_top_3` may be
# optimistically biased; consider selecting features inside each fold.
X_top_3 = X_full[top_3_features]


In [117]:
# Display the reduced feature matrix (only the selected top-importance columns).
X_top_3

Unnamed: 0_level_0,PageRank,EigenvectorCentrality,DegreeCentrality
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
201210-1-00,0.400982,0.002301,38.599201
201210-1-01,0.361817,0.001729,28.561028
201210-1-02,0.185693,0.000129,1.846121
201210-1-03,0.311787,0.001146,16.620121
201210-1-04,0.282971,0.001388,23.934548
...,...,...,...
210526-3-45,1.183442,0.013616,235.278916
210526-3-46,1.162865,0.013214,227.672284
210526-3-47,1.095440,0.012793,222.732215
210526-3-48,1.081279,0.011956,205.327842


In [118]:
# Repeat the 10-fold evaluation using only the selected top-importance features.
# `y` is the Status label series defined in an earlier cell.
X = X_top_3
et = ExtraTreesClassifier(random_state=1234)

# Stratified, shuffled 10-fold cross-validation with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

# Per-fold scores; each list receives one entry per fold.
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

for train_index, test_index in cv.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # fit() returns the estimator, so training and prediction can be chained.
    y_pred = et.fit(X_train, y_train).predict(X_test)

    accuracy_scores.append(accuracy_score(y_test, y_pred))
    # Multi-class scores are averaged over the classes, weighted by class support.
    precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
    recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    # Detailed per-fold report (accuracy, per-class report, confusion matrix).
    calculate_metrics(y_test, y_pred)

# Mean and standard deviation of each score across the folds.
print(f'Accuracy: {np.mean(accuracy_scores):.4f} +/- {np.std(accuracy_scores):.4f}')
print(f'Precision: {np.mean(precision_scores):.4f} +/- {np.std(precision_scores):.4f}')
print(f'Recall: {np.mean(recall_scores):.4f} +/- {np.std(recall_scores):.4f}')
print(f'F1-Score: {np.mean(f1_scores):.4f} +/- {np.std(f1_scores):.4f}')

Overall Accuracy: 0.5114754098360655


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.43      0.44      0.43        91
 Hypoglycemia       0.51      0.52      0.52       107
       Normal       0.59      0.56      0.58       107

     accuracy                           0.51       305
    macro avg       0.51      0.51      0.51       305
 weighted avg       0.51      0.51      0.51       305


Confusion Matrix:
[[40 30 21]
 [31 56 20]
 [23 24 60]]
Overall Accuracy: 0.5180327868852459


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.47      0.41      0.44        91
 Hypoglycemia       0.55      0.53      0.54       107
       Normal       0.52      0.60      0.56       107

     accuracy                           0.52       305
    macro avg       0.51      0.51      0.51       305
 weighted avg       0.52      0.52      0.52       305


Confusion Matrix:
[[37 23 31]
 [23 57 27]
 