#### This Version Creates the Graph in Neo4J then runs the Queries.

In [2]:
from neo4j import GraphDatabase
import os
import pandas as pd
import shutil
import subprocess

This moves the nodes and relationships files from data/current_working_graph into the DBMS import folder.

At the moment this uses local paths, but the rest should be automated.

In [3]:
# Define the source directory
source_dir = os.path.abspath('../../data/graph_2_0/train')

# Define the destination directory
# This should be the import directory of your graph database
destination_dir = '/home/stang/.config/Neo4j Desktop/Application/relate-data/dbmss/dbms-114869c1-0bb9-4380-9f3b-e6c3cdb29b3b/import'

# Specify the filenames to move
filenames = ['nodes.csv', 'relationships.csv']

for filename in filenames:
    source_path = os.path.join(source_dir, filename)
    destination_path = os.path.join(destination_dir, filename)
    
    # Move each file
    shutil.copy(source_path, destination_path)

Then run the admin import command.

In [4]:
# Define the working directory
working_dir = '/home/stang/.config/Neo4j Desktop/Application/relate-data/dbmss/dbms-114869c1-0bb9-4380-9f3b-e6c3cdb29b3b'

# Construct the command
command = [
    './bin/neo4j-admin', 'database', 'import', 'full',
    '--nodes=import/nodes.csv',
    '--relationships=import/relationships.csv', 'neo4j'
]

#command = 'neo4j-admin database import full --nodes=import/nodes.csv --relationships=import/relationships.csv neo4j'

# Execute the command

result = subprocess.run(command, cwd=working_dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# Check if the command was successful
if result.returncode == 0:
    print("Import successful")
    print(result.stdout)
else:
    print("Error during import:")
    print(result.stderr)

Import successful
b"Neo4j version: 5.12.0\nImporting the contents of these files into /home/stang/.config/Neo4j Desktop/Application/relate-data/dbmss/dbms-114869c1-0bb9-4380-9f3b-e6c3cdb29b3b/data/databases/neo4j:\nNodes:\n  /home/stang/.config/Neo4j Desktop/Application/relate-data/dbmss/dbms-114869c1-0bb9-4380-9f3b-e6c3cdb29b3b/import/nodes.csv\n\nRelationships:\n  /home/stang/.config/Neo4j Desktop/Application/relate-data/dbmss/dbms-114869c1-0bb9-4380-9f3b-e6c3cdb29b3b/import/relationships.csv\n\n\nAvailable resources:\n  Total machine memory: 15.47GiB\n  Free machine memory: 515.2MiB\n  Max heap memory : 910.5MiB\n  Max worker threads: 8\n  Configured max memory: 247.8MiB\n  High parallel IO: true\n\nCypher type normalization is enabled (disable with --normalize-types=false):\n  Property type of 'Weight' normalized from 'float' --> 'double' in /home/stang/.config/Neo4j Desktop/Application/relate-data/dbmss/dbms-114869c1-0bb9-4380-9f3b-e6c3cdb29b3b/import/relationships.csv\n\nImport s

Run and connect to the Neo4j Database

In [33]:
from neo4j import GraphDatabase

uri = "neo4j://localhost:7687"
username = "neo4j"              # Neo4J username
#password = os.environ['NEO4J_Password']           # Neo4J password
password = '26622002'

# Create a driver instance
driver = GraphDatabase.driver(uri, auth=(username, password))

# Ensure you close the driver connection when your program ends
def close_driver():
    driver.close()

**First create the gds Graph Projection**

In [34]:
def project_graph(tx):
    query = """
    CALL gds.graph.project(
      'myGraph', 
      ['Normal', 'Hyperglycemia', 'Hypoglycemia'],
      {
        LINK: {
          orientation: 'UNDIRECTED',
          properties: 'Weight'
        }
      }
    )
    """
    tx.run(query)

# Use a session to execute the graph projection
with driver.session() as session:
    session.execute_write(project_graph)

Define the graph algorithms.

In [19]:
def run_pagerank_centrality(tx):
    query = """
    CALL gds.pageRank.stream('myGraph', {
        relationshipWeightProperty: 'Weight'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, score
    ORDER BY score DESC, name ASC
    """
    results = tx.run(query)
    return [(record["name"], record["score"]) for record in results]

In [20]:
def run_degree_centrality(tx):
    query = """
    CALL gds.degree.stream('myGraph', {
        relationshipWeightProperty: 'Weight'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, score
    ORDER BY score DESC, name ASC
    """
    results = tx.run(query)
    return [(record["name"], record["score"]) for record in results]

In [10]:
def run_eigenvector_centrality(tx):
    query = """
    CALL gds.eigenvector.stream('myGraph', {
        relationshipWeightProperty: 'Weight'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, score
    ORDER BY score DESC, name ASC
    """
    results = tx.run(query)
    return [(record["name"], record["score"]) for record in results]

In [11]:
def run_articlerank_centrality(tx):
    query = """
    CALL gds.articleRank.stream('myGraph', {
        relationshipWeightProperty: 'Weight'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, score
    ORDER BY score DESC, name ASC
    """
    results = tx.run(query)
    return [(record["name"], record["score"]) for record in results]

In [12]:
def run_label_propagation_algorithm(tx):
    query = """
    CALL gds.labelPropagation.stream('myGraph', { relationshipWeightProperty: 'Weight' })
    YIELD nodeId, communityId AS Community
    RETURN gds.util.asNode(nodeId).SpecID AS name, Community
    ORDER BY Community, name
    """
    results = tx.run(query)
    return [(record["name"], record["Community"]) for record in results]

In [18]:
def run_leiden_algorithm(tx):
    query = """
    CALL gds.leiden.stream('myGraph', { relationshipWeightProperty: 'Weight' })
    YIELD nodeId, communityId AS Community
    RETURN gds.util.asNode(nodeId).SpecID AS name, Community
    ORDER BY Community, name
    """
    results = tx.run(query)
    return [(record["name"], record["Community"]) for record in results]

In [19]:
def run_louvain_algorithm(tx):
    query = """
    CALL gds.louvain.stream('myGraph', { relationshipWeightProperty: 'Weight' })
    YIELD nodeId, communityId AS Community
    RETURN gds.util.asNode(nodeId).SpecID AS name, Community
    ORDER BY Community, name
    """
    results = tx.run(query)
    return [(record["name"], record["Community"]) for record in results]

In [14]:
def run_node2vec_algorithm(tx):
    query = """
    CALL gds.node2vec.stream('myGraph', { relationshipWeightProperty: 'Weight' })
    YIELD nodeId, embedding
    RETURN gds.util.asNode(nodeId).SpecID AS name, embedding
    """
    results = tx.run(query)
    return [(record["name"], record["embedding"]) for record in results]

In [None]:
# def run_fastRP_algorithm(tx):
#     query = """
#     CALL gds.fastRP.stream('myGraph',
#         { relationshipWeightProperty: 'Weight',
#          randomSeed:1234,
#          embeddingDimension: 128
#         }
#     )
#     YIELD nodeId, embedding
#     RETURN gds.util.asNode(nodeId).SpecID AS name, embedding
#     """
#     results = tx.run(query)
#     return [(record["name"], record["embedding"]) for record in results]

In [37]:
def run_fastRP_algorithm(tx):
    query = """
    CALL gds.fastRP.stream('myGraph',
        { relationshipWeightProperty: 'Weight',
         randomSeed:1234,
         embeddingDimension: 128,
         propertyRatio:1.0
        }
    )
    YIELD nodeId, embedding
    RETURN gds.util.asNode(nodeId).SpecID AS name, embedding
    """
    results = tx.run(query)
    return [(record["name"], record["embedding"]) for record in results]

Execute the algorithms and store the results in a Dataframe.

In [20]:
# Use a session to execute the queries and retrieve the results
with driver.session() as session:
    pagerank_results = session.execute_read(run_pagerank_centrality)
    degree_results = session.execute_read(run_degree_centrality)
    eigenvector_results = session.execute_read(run_eigenvector_centrality)
    articlerank_results = session.execute_read(run_articlerank_centrality)
    label_propagation_results = session.execute_read(run_label_propagation_algorithm)
    leiden_results = session.execute_read(run_leiden_algorithm)
    louvain_results = session.execute_read(run_louvain_algorithm)

In [22]:
pagerank_df = pd.DataFrame(pagerank_results, columns=['name', 'PageRank'])
degree_df = pd.DataFrame(degree_results, columns=['name', 'DegreeCentrality'])
eigenvector_df = pd.DataFrame(eigenvector_results, columns=['name', 'EigenvectorCentrality'])
articlerank_df = pd.DataFrame(articlerank_results, columns=['name', 'ArticleRank'])
label_propagation_df = pd.DataFrame(label_propagation_results, columns=['name' 'LabelPropagation'])
leiden_df = pd.DataFrame(leiden_results, columns=['name', 'Leiden'])
louvain_df = pd.DataFrame(louvain_results, columns=['name', 'Louvain'])

ValueError: 1 columns passed, passed data had 2 columns

In [24]:
merged_df = pagerank_df
for df in [degree_df, eigenvector_df, articlerank_df, leiden_df, louvain_df]:
    merged_df = pd.merge(merged_df, df, on=['id', 'name'], how='left')

In [25]:
df = merged_df.rename(columns={'name' : 'SpecID'})
df

Unnamed: 0,SpecID,id,PageRank,DegreeCentrality,EigenvectorCentrality,ArticleRank,Leiden,Louvain
0,210407-2-08,5289652,1.487887,543.682362,0.008894,0.298572,46,31753
1,210304-2-20,2554528,1.466770,299.735350,0.008258,0.236928,8,23569
2,210304-2-16,2543947,1.462001,268.882009,0.008094,0.228633,0,23569
3,210304-2-16,2543999,1.460277,268.982704,0.008088,0.228650,0,23569
4,210407-2-08,5289244,1.447841,500.355360,0.008408,0.288114,46,31594
...,...,...,...,...,...,...,...,...
41539,201210-1-06,18353,0.216328,23.347484,0.000296,0.156496,45,16100
41540,201210-1-09,23620,0.214901,18.060851,0.000213,0.155111,19,31642
41541,210114-2-34,484813,0.214737,32.948812,0.000389,0.158864,45,1277
41542,210414-3-34,5607088,0.213060,19.766993,0.000296,0.155656,46,31594


In [36]:
with driver.session() as session:
    fastRP_results = session.execute_read(run_fastRP_algorithm)

fastRP_df = pd.DataFrame(fastRP_results, columns=['SpecID', 'embeddings'])

# Expand the embeddings list into separate columns
embeddings_df = pd.DataFrame(fastRP_df['embeddings'].tolist(), index=fastRP_df.index)

# Optionally, rename the new columns
embeddings_df.columns = [f'embedding_{i}' for i in range(embeddings_df.shape[1])]

# Join the new embeddings columns to the original DataFrame
fastRP_df = pd.concat([fastRP_df.drop(['embeddings'], axis=1), embeddings_df], axis=1)
fastRP_df.to_csv('../../data/fastRP_embeddings.csv', index=False)
fastRP_df.head()

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `gds.fastRP.stream`: Caused by: java.lang.IllegalArgumentException: When `propertyRatio` is non-zero, `featureProperties` may not be empty.}

Delete the projection

In [41]:
def delete_projection(tx):
    query = """
    CALL gds.graph.drop('myGraph')
    """
    tx.run(query)

# Use a session to execute the graph projection
with driver.session() as session:
    session.execute_write(delete_projection)

In [88]:
close_driver()

In [10]:
status_df = pd.read_csv('../../data/graph_2_0/train/nodes.csv')

In [48]:
df['id'] = df['id'].astype(int)

NameError: name 'df' is not defined

In [33]:
graph_df = pd.merge(df, status_df, left_on="id", right_on="Node_ID:ID", how="inner")
graph_df.drop(columns=["Node_ID:ID"], inplace=True)

In [11]:
fast_graph_df = pd.merge(fastRP_df, status_df, left_on="SpecID", right_on="SpecID:ID", how="inner")
fast_graph_df.drop(columns=["SpecID:ID"], inplace=True)

In [35]:
graph_df.rename(columns={"Status:LABEL":"Status"}, inplace=True)
graph_df.drop(columns=['SpecID_y','Absorbance:float', 'Widths:float', 'Prominences:float', 'GridSlot'], inplace=True)
graph_df.rename(columns={'SpecID_x':'SpecID'}, inplace=True)

In [12]:
fast_graph_df.rename(columns={":LABEL":"Status"}, inplace=True)

In [37]:
fast_graph_df.drop(columns=['SpecID_y', 'WaveNumber:float', 'Absorbance:float', 'Widths:float', 'Prominences:float', 'GridSlot'], inplace=True)
fast_graph_df.rename(columns={'SpecID_x':'SpecID'}, inplace=True)

In [51]:
fast_graph_df

Unnamed: 0,SpecID,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127,Status
0,210211-2-15,-0.028350,0.035089,0.370113,0.024668,0.269506,0.261418,-0.096397,-0.106100,0.016672,...,0.072223,-0.012611,0.243187,0.110617,-0.151607,-0.116713,-0.214596,-0.031656,-0.323186,Hyperglycemia
1,210415-1-20,-0.013020,0.067876,0.389106,0.042684,0.322279,0.256548,-0.091430,-0.112429,-0.024678,...,0.078344,-0.011562,0.209786,0.077543,-0.164601,-0.115064,-0.212487,-0.017329,-0.294613,Hypoglycemia
2,210225-2-42,-0.013049,0.067798,0.389069,0.042666,0.322151,0.256592,-0.091410,-0.112421,-0.024595,...,0.078332,-0.011565,0.209861,0.077641,-0.164575,-0.115054,-0.212487,-0.017388,-0.294712,Hypoglycemia
3,210311-2-11,-0.013340,0.066986,0.388651,0.042478,0.320819,0.257037,-0.091202,-0.112357,-0.023709,...,0.078224,-0.011591,0.210632,0.078670,-0.164308,-0.114920,-0.212488,-0.018000,-0.295773,Normal
4,210419-2-15,-0.012985,0.067971,0.389156,0.042707,0.322431,0.256497,-0.091454,-0.112441,-0.024784,...,0.078357,-0.011561,0.209695,0.077426,-0.164633,-0.115076,-0.212488,-0.017257,-0.294491,Hypoglycemia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2431,210415-2-42,-0.013014,0.067894,0.389115,0.042686,0.322305,0.256539,-0.091435,-0.112432,-0.024699,...,0.078348,-0.011563,0.209769,0.077521,-0.164607,-0.115068,-0.212486,-0.017316,-0.294589,Hypoglycemia
2432,210224-1-15,-0.013023,0.067867,0.389104,0.042682,0.322262,0.256554,-0.091428,-0.112428,-0.024670,...,0.078343,-0.011565,0.209795,0.077556,-0.164598,-0.115061,-0.212487,-0.017336,-0.294622,Hypoglycemia
2433,210318-1-19,-0.013446,0.066721,0.388511,0.042413,0.320376,0.257168,-0.091161,-0.112284,-0.023383,...,0.078186,-0.011605,0.210916,0.078987,-0.164197,-0.114929,-0.212485,-0.018202,-0.296097,Hypoglycemia
2434,210225-1-24,-0.013164,0.067484,0.388910,0.042595,0.321636,0.256763,-0.091334,-0.112387,-0.024245,...,0.078290,-0.011576,0.210167,0.078038,-0.164467,-0.115012,-0.212489,-0.017626,-0.295119,Hypoglycemia


In [39]:
# Define a function to calculate the bin for a given wavenumber with a specified bin size
def calculate_bin_interval(wavenumber, bin_size):
    bin_start = int((wavenumber - 200) / bin_size) * bin_size + 200
    bin_end = bin_start + bin_size
    return f"{bin_start}-{bin_end}"

# Set the bin size
bin_size = 25

# Add a "Bin" column to the DataFrame
graph_df['Bin'] = graph_df['WaveNumber:float'].apply(lambda x: calculate_bin_interval(x, bin_size))

In [40]:
# Pivot table with 'Absorbance', 'PeakWidths', and 'PeakProminences' as values
peak_bins = graph_df.pivot_table(index='SpecID', columns='Bin', values=['PageRank', 'DegreeCentrality', 'EigenvectorCentrality', 'ArticleRank', 'Leiden', 'Louvain'], aggfunc='mean')
peak_bins.columns = [f"{col[0]}_{col[1]}" for col in peak_bins.columns]  # Combine column names
peak_bins.reset_index(inplace=True)

# Merge with 'Status' information
statuses = graph_df[['SpecID', 'Status']].drop_duplicates()
peak_bins = pd.merge(peak_bins, statuses, on='SpecID')

# Set 'SpecID' as the index
peak_bins.set_index('SpecID', inplace=True)

# Fill NaN values with False
peak_bins.fillna(False, inplace=True)

In [41]:
graph_bins = peak_bins.reset_index()

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Splitting the dataframe into features (X) and target variable (y)
X = peak_bins.drop(['Status'], axis=1)
y = peak_bins['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.7406 +/- 0.0352
ExtraTreesClassifier Cross-Validation Accuracy: 0.7499 +/- 0.0267


In [60]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# Random Forest model
graph_rf_model = RandomForestClassifier(n_estimators=100, random_state=1234)
graph_rf_model.fit(X_train, y_train)

# Extra Trees model
graph_et_model = ExtraTreesClassifier(n_estimators=100, random_state=1234)
graph_et_model.fit(X_train, y_train)

# Predictions
rf_predictions = graph_rf_model.predict(X_test)
et_predictions = graph_et_model.predict(X_test)

# Evaluate accuracy
rf_accuracy = accuracy_score(y_test, rf_predictions)
et_accuracy = accuracy_score(y_test, et_predictions)

In [16]:
def calculate_metrics(y_test, y_pred):

    # Calculate overall accuracy
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_accuracy}\n")

    # Calculate precision, recall, and F1-score for each class
    report = classification_report(y_test, y_pred)
    print("\nClassification Report:")
    print(report)

    # Show the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(conf_matrix)

In [17]:
def get_feature_importances(model, X):

    # Get feature importances
    feature_importances = model.feature_importances_

    # Creating a DataFrame to display feature importances
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

    # Sorting the DataFrame by importance in descending order
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    # Show the top 10 most important features
    top_10_features = feature_importance_df.head(10)

    return top_10_features

In [47]:
calculate_metrics(y_test, rf_predictions)

Overall Accuracy: 0.710691823899371


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.78      0.61      0.68       152
 Hypoglycemia       0.66      0.78      0.71       165
       Normal       0.73      0.74      0.73       160

     accuracy                           0.71       477
    macro avg       0.72      0.71      0.71       477
 weighted avg       0.72      0.71      0.71       477


Confusion Matrix:
[[ 93  38  21]
 [ 14 128  23]
 [ 13  29 118]]


In [48]:
calculate_metrics(y_test, et_predictions)

Overall Accuracy: 0.7232704402515723


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.79      0.63      0.70       152
 Hypoglycemia       0.67      0.82      0.74       165
       Normal       0.73      0.71      0.72       160

     accuracy                           0.72       477
    macro avg       0.73      0.72      0.72       477
 weighted avg       0.73      0.72      0.72       477


Confusion Matrix:
[[ 96  33  23]
 [ 11 136  18]
 [ 14  33 113]]


In [61]:
get_feature_importances(graph_rf_model, X)

Unnamed: 0,Feature,Importance
72,DegreeCentrality_1000-1025,0.010568
0,ArticleRank_1000-1025,0.010037
402,PageRank_250-275,0.009209
42,ArticleRank_250-275,0.009035
114,DegreeCentrality_250-275,0.009007
186,EigenvectorCentrality_250-275,0.008902
144,EigenvectorCentrality_1000-1025,0.008265
330,Louvain_250-275,0.008262
57,ArticleRank_625-650,0.008161
153,EigenvectorCentrality_1225-1250,0.007628


In [62]:
get_feature_importances(graph_et_model, X)

Unnamed: 0,Feature,Importance
72,DegreeCentrality_1000-1025,0.007336
0,ArticleRank_1000-1025,0.007066
402,PageRank_250-275,0.006929
124,DegreeCentrality_500-525,0.006928
42,ArticleRank_250-275,0.006751
330,Louvain_250-275,0.006363
67,ArticleRank_875-900,0.006345
216,Leiden_1000-1025,0.006344
153,EigenvectorCentrality_1225-1250,0.006169
258,Leiden_250-275,0.00608


## FastRp

In [22]:
fast_graph_df

Unnamed: 0,SpecID,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127,Status
0,210211-2-15,0.014328,-0.005595,0.387151,-0.050839,0.129597,0.298107,-0.048177,-0.158895,-0.013871,...,0.051712,0.080105,0.178417,0.151002,-0.136364,-0.082126,-0.179031,-0.093109,-0.406810,Hyperglycemia
1,210415-1-20,0.044169,0.004059,0.410680,-0.053372,0.153831,0.291841,-0.013853,-0.202882,-0.028705,...,0.053677,0.100474,0.141583,0.133880,-0.132757,-0.038408,-0.170813,-0.067152,-0.392311,Hypoglycemia
2,210225-2-42,0.027787,-0.009399,0.401244,-0.050182,0.124582,0.302917,-0.022936,-0.183222,-0.033359,...,0.048686,0.096251,0.160653,0.158651,-0.130441,-0.048910,-0.180367,-0.087850,-0.405900,Hypoglycemia
3,210311-2-11,0.013703,-0.023615,0.391859,-0.048160,0.096460,0.311672,-0.027233,-0.168651,-0.039395,...,0.043169,0.094690,0.174415,0.182139,-0.126789,-0.052705,-0.188362,-0.105241,-0.414027,Normal
4,210419-2-15,0.054518,0.012325,0.415422,-0.054542,0.171389,0.283712,-0.008851,-0.213223,-0.025861,...,0.055895,0.102198,0.130378,0.117745,-0.134066,-0.032238,-0.164766,-0.053761,-0.381899,Hypoglycemia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2431,210415-2-42,0.060462,0.019052,0.414260,-0.056107,0.184144,0.275341,-0.003606,-0.222517,-0.023648,...,0.060053,0.104070,0.119559,0.103650,-0.133797,-0.028468,-0.158630,-0.043036,-0.371681,Hypoglycemia
2432,210224-1-15,0.041505,0.002882,0.408486,-0.052802,0.150492,0.292945,-0.015601,-0.199920,-0.028877,...,0.053843,0.099586,0.144179,0.136263,-0.132367,-0.041070,-0.171574,-0.069877,-0.394148,Hypoglycemia
2433,210318-1-19,0.023414,-0.011599,0.397291,-0.049093,0.118051,0.304459,-0.028183,-0.174043,-0.032666,...,0.047435,0.092910,0.167522,0.163167,-0.130441,-0.055624,-0.184225,-0.092807,-0.408004,Hypoglycemia
2434,210225-1-24,0.024418,-0.011768,0.398874,-0.049341,0.118865,0.304472,-0.025293,-0.178815,-0.034180,...,0.047548,0.095160,0.164923,0.163345,-0.129594,-0.051393,-0.182249,-0.091844,-0.407828,Hypoglycemia


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming graph_df is your DataFrame

# Drop the SpecID column
X_train = fast_graph_df.drop(columns=['SpecID', 'Status'])

# Target variable
y_train= fast_graph_df['Status']

# Random Forest model
fastrp_rf_model = RandomForestClassifier(n_estimators=100, random_state=1234)
fastrp_rf_model.fit(X_train, y_train)

# Extra Trees model
fastrp_et_model = ExtraTreesClassifier(n_estimators=100, random_state=1234)
fastrp_et_model.fit(X_train, y_train)

In [18]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import numpy as np

et = ExtraTreesClassifier(random_state=1234)

# Performing 10-fold cross-validation for the classifier
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in cv.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    et.fit(X_train, y_train)
    y_pred = et.predict(X_test)
    
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
    recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    calculate_metrics(y_test, y_pred)

# Displaying the results
print(f'Accuracy: {np.mean(accuracy_scores):.4f} +/- {np.std(accuracy_scores):.4f}')
print(f'Precision: {np.mean(precision_scores):.4f} +/- {np.std(precision_scores):.4f}')
print(f'Recall: {np.mean(recall_scores):.4f} +/- {np.std(recall_scores):.4f}')
print(f'F1-Score: {np.mean(f1_scores):.4f} +/- {np.std(f1_scores):.4f}')

Overall Accuracy: 0.7459016393442623


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.69      0.62      0.66        72
 Hypoglycemia       0.73      0.79      0.76        86
       Normal       0.80      0.80      0.80        86

     accuracy                           0.75       244
    macro avg       0.74      0.74      0.74       244
 weighted avg       0.74      0.75      0.74       244


Confusion Matrix:
[[45 18  9]
 [10 68  8]
 [10  7 69]]
Overall Accuracy: 0.6762295081967213


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.62      0.62      0.62        72
 Hypoglycemia       0.62      0.67      0.65        86
       Normal       0.78      0.72      0.75        86

     accuracy                           0.68       244
    macro avg       0.68      0.67      0.67       244
 weighted avg       0.68      0.68      0.68       244


Confusion Matrix:
[[45 19  8]
 [19 58  9]
 

Fast rp mixed with traditional graph metrics

In [53]:
joined_graph = pd.merge(graph_bins, fast_graph_df, on=['SpecID', 'Status'], how='inner')

In [55]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming graph_df is your DataFrame

# Drop the SpecID column
X = joined_graph.drop(columns=['SpecID', 'Status'])

# Target variable
y = joined_graph['Status']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=1234)
rf_model.fit(X_train, y_train)

# Extra Trees model
et_model = ExtraTreesClassifier(n_estimators=100, random_state=1234)
et_model.fit(X_train, y_train)

# Predictions
rf_predictions = rf_model.predict(X_test)
et_predictions = et_model.predict(X_test)

# Evaluate accuracy
rf_accuracy = accuracy_score(y_test, rf_predictions)
et_accuracy = accuracy_score(y_test, et_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Extra Trees Accuracy:", et_accuracy)

Random Forest Accuracy: 0.710691823899371
Extra Trees Accuracy: 0.7358490566037735


## Full Graph

In [54]:
# Define the source directory
source_dir = os.path.abspath('../../data/graph_2_0/full')

# Define the destination directory
# This should be the import directory of your graph database
destination_dir = '/home/stang/.config/Neo4j Desktop/Application/relate-data/dbmss/dbms-04f654c4-5261-43ac-84f9-c95b72cce995/import'

# Specify the filenames to move
filenames = ['nodes.csv', 'relationships.csv']

for filename in filenames:
    source_path = os.path.join(source_dir, filename)
    destination_path = os.path.join(destination_dir, filename)
    
    # Move each file
    shutil.copy(source_path, destination_path)

Then run the admin import command.

In [55]:
# Define the working directory
working_dir = '/home/stang/.config/Neo4j Desktop/Application/relate-data/dbmss/dbms-04f654c4-5261-43ac-84f9-c95b72cce995'

# Construct the command
command = [
    './bin/neo4j-admin', 'database', 'import', 'full',
    '--nodes=import/nodes.csv',
    '--relationships=import/relationships.csv', 'neo4j'
]

#command = 'neo4j-admin database import full --nodes=import/nodes.csv --relationships=import/relationships.csv neo4j'

# Execute the command


result = subprocess.run(command, cwd=working_dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Check if the command was successful
if result.returncode == 0:
    print("Import successful")
    print(result.stdout)
else:
    print("Error during import:")
    print(result.stderr)

Import successful
b"Neo4j version: 5.12.0\nImporting the contents of these files into /home/stang/.config/Neo4j Desktop/Application/relate-data/dbmss/dbms-04f654c4-5261-43ac-84f9-c95b72cce995/data/databases/neo4j:\nNodes:\n  /home/stang/.config/Neo4j Desktop/Application/relate-data/dbmss/dbms-04f654c4-5261-43ac-84f9-c95b72cce995/import/nodes.csv\n\nRelationships:\n  /home/stang/.config/Neo4j Desktop/Application/relate-data/dbmss/dbms-04f654c4-5261-43ac-84f9-c95b72cce995/import/relationships.csv\n\n\nAvailable resources:\n  Total machine memory: 15.47GiB\n  Free machine memory: 1.959GiB\n  Max heap memory : 910.5MiB\n  Max worker threads: 8\n  Configured max memory: 1007MiB\n  High parallel IO: true\n\nCypher type normalization is enabled (disable with --normalize-types=false):\n  Property type of 'Weight' normalized from 'float' --> 'double' in /home/stang/.config/Neo4j Desktop/Application/relate-data/dbmss/dbms-04f654c4-5261-43ac-84f9-c95b72cce995/import/relationships.csv\n\nImport st

Run and connect to the Neo4j Database

In [24]:
from neo4j import GraphDatabase

uri = "neo4j://localhost:7687"
username = "neo4j"              # Neo4J username
#password = os.environ['NEO4J_Password']           # Neo4J password
password = '26622002'

# Create a driver instance
driver = GraphDatabase.driver(uri, auth=(username, password))

# Ensure you close the driver connection when your program ends
def close_driver():
    driver.close()

**First create the gds Graph Projection**

In [25]:
def project_graph(tx):
    query = """
    CALL gds.graph.project(
      'myGraph', 
      ['Normal', 'Hyperglycemia', 'Hypoglycemia'],
      {
        LINK: {
          orientation: 'UNDIRECTED',
          properties: 'Weight'
        }
      }
    )
    """
    tx.run(query)

# Use a session to execute the graph projection
with driver.session() as session:
    session.execute_write(project_graph)

Define the graph algorithms.

In [67]:
def run_pagerank_centrality(tx):
    query = """
    CALL gds.pageRank.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    results = tx.run(query)
    return [(record["name"], record["id"], record["score"]) for record in results]

In [68]:
def run_degree_centrality(tx):
    query = """
    CALL gds.degree.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    results = tx.run(query)
    return [(record["name"], record["id"], record["score"]) for record in results]

In [71]:
def run_eigenvector_centrality(tx):
    query = """
    CALL gds.eigenvector.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    results = tx.run(query)
    return [(record["name"], record["id"], record["score"]) for record in results]

In [72]:
def run_articlerank_centrality(tx):
    query = """
    CALL gds.articleRank.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    results = tx.run(query)
    return [(record["name"], record["id"], record["score"]) for record in results]

In [73]:
def run_label_propagation_algorithm(tx):
    query = """
    CALL gds.labelPropagation.stream('myGraph', { relationshipWeightProperty: 'DIST' })
    YIELD nodeId, communityId AS Community
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, Community
    ORDER BY Community, name
    """
    results = tx.run(query)
    return [(record["name"], record["id"], record["Community"]) for record in results]

In [74]:
def run_leiden_algorithm(tx):
    query = """
    CALL gds.leiden.stream('myGraph', { relationshipWeightProperty: 'DIST' })
    YIELD nodeId, communityId AS Community
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, Community
    ORDER BY Community, name
    """
    results = tx.run(query)
    return [(record["name"], record["id"], record["Community"]) for record in results]

In [75]:
def run_louvain_algorithm(tx):
    query = """
    CALL gds.louvain.stream('myGraph', { relationshipWeightProperty: 'DIST' })
    YIELD nodeId, communityId AS Community
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, Community
    ORDER BY Community, name
    """
    results = tx.run(query)
    return [(record["name"], record["id"], record["Community"]) for record in results]

In [None]:
def run_node2vec_algorithm(tx):
    query = """
    CALL gds.node2vec.stream('myGraph', { relationshipWeightProperty: 'Weight' })
    YIELD nodeId, embedding
    RETURN gds.util.asNode(nodeId).SpecID AS name, embedding
    """
    results = tx.run(query)
    return [(record["name"], record["embedding"]) for record in results]

In [26]:
def run_fastRP_algorithm(tx):
    query = """
    CALL gds.fastRP.stream('myGraph',
        { relationshipWeightProperty: 'Weight',
         randomSeed:1234,
         embeddingDimension: 128
        }
    )
    YIELD nodeId, embedding
    RETURN gds.util.asNode(nodeId).SpecID AS name, embedding
    """
    results = tx.run(query)
    return [(record["name"], record["embedding"]) for record in results]

Execute the algorithms and store the results in a Dataframe.

In [77]:
# Use a session to execute the queries and retrieve the results
with driver.session() as session:
    pagerank_results = session.execute_read(run_pagerank_centrality)
    degree_results = session.execute_read(run_degree_centrality)
    eigenvector_results = session.execute_read(run_eigenvector_centrality)
    articlerank_results = session.execute_read(run_articlerank_centrality)
    label_propagation_results = session.execute_read(run_label_propagation_algorithm)
    leiden_results = session.execute_read(run_leiden_algorithm)
    louvain_results = session.execute_read(run_louvain_algorithm)

In [78]:
pagerank_df = pd.DataFrame(pagerank_results, columns=['name', 'id', 'PageRank'])
degree_df = pd.DataFrame(degree_results, columns=['name', 'id', 'DegreeCentrality'])
eigenvector_df = pd.DataFrame(eigenvector_results, columns=['name', 'id', 'EigenvectorCentrality'])
articlerank_df = pd.DataFrame(articlerank_results, columns=['name', 'id', 'ArticleRank'])
label_propagation_df = pd.DataFrame(label_propagation_results, columns=['name', 'id', 'LabelPropagation'])
leiden_df = pd.DataFrame(leiden_results, columns=['name', 'id', 'Leiden'])
louvain_df = pd.DataFrame(louvain_results, columns=['name', 'id', 'Louvain'])

In [27]:
with driver.session() as session:
    fastRP_results = session.execute_read(run_fastRP_algorithm)

fastRP_df = pd.DataFrame(fastRP_results, columns=['SpecID', 'embeddings'])

# Expand the embeddings list into separate columns
embeddings_df = pd.DataFrame(fastRP_df['embeddings'].tolist(), index=fastRP_df.index)

# Optionally, rename the new columns
embeddings_df.columns = [f'embedding_{i}' for i in range(embeddings_df.shape[1])]

# Join the new embeddings columns to the original DataFrame
fastRP_df = pd.concat([fastRP_df.drop(['embeddings'], axis=1), embeddings_df], axis=1)
fastRP_df.to_csv('../../data/fastRP_embeddings.csv', index=False)
fastRP_df.head()

Unnamed: 0,SpecID,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_118,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127
0,201210-1-00,-0.048401,0.142808,0.31777,0.072214,0.210107,0.017737,-0.14429,-0.024043,0.150054,...,-0.059152,0.046251,-0.109424,0.194681,0.332609,-0.051543,0.10677,-0.21342,0.062702,-0.065898
1,201210-1-01,-0.048564,0.135493,0.321331,0.084843,0.219233,0.018701,-0.131932,-0.0291,0.146555,...,-0.067605,0.052296,-0.108939,0.188099,0.331918,-0.045744,0.102739,-0.21088,0.059606,-0.0775
2,201210-1-02,-0.050721,0.152951,0.310272,0.049624,0.194169,0.014977,-0.167032,-0.015844,0.155717,...,-0.045421,0.042401,-0.109429,0.209265,0.331881,-0.068219,0.112845,-0.224969,0.064168,-0.048032
3,201210-1-03,-0.049832,0.151239,0.31164,0.053972,0.195877,0.015656,-0.162691,-0.017073,0.15459,...,-0.04744,0.042123,-0.109135,0.207048,0.332527,-0.065328,0.112514,-0.223284,0.064342,-0.051069
4,201210-1-04,-0.047833,0.143938,0.317608,0.071591,0.211687,0.015489,-0.144303,-0.024923,0.150893,...,-0.059021,0.046932,-0.109002,0.194342,0.3317,-0.051336,0.107158,-0.213859,0.061699,-0.065709


In [28]:
fast_graph_df = pd.merge(fastRP_df, status_df, left_on="SpecID", right_on="SpecID:ID", how="inner")
fast_graph_df.drop(columns=["SpecID:ID"], inplace=True)

In [29]:
fast_graph_df.rename(columns={":LABEL":"Status"}, inplace=True)

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming graph_df is your DataFrame

# Drop the SpecID column
X = fast_graph_df.drop(columns=['SpecID', 'Status'])

# Target variable
y = fast_graph_df['Status']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest model
fastrp_rf_model = RandomForestClassifier(n_estimators=100, random_state=1234)
fastrp_rf_model.fit(X_train, y_train)

# Extra Trees model
fastrp_et_model = ExtraTreesClassifier(n_estimators=100, random_state=1234)
fastrp_et_model.fit(X_train, y_train)

# Predictions
rf_predictions = fastrp_rf_model.predict(X_test)
et_predictions = fastrp_et_model.predict(X_test)

# Evaluate accuracy
rf_accuracy = accuracy_score(y_test, rf_predictions)
et_accuracy = accuracy_score(y_test, et_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Extra Trees Accuracy:", et_accuracy)

Random Forest Accuracy: 0.7684426229508197
Extra Trees Accuracy: 0.7192622950819673


In [67]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import numpy as np

et = ExtraTreesClassifier(random_state=1234)

# Performing 10-fold cross-validation for the classifier
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in cv.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    et.fit(X_train, y_train)
    y_pred = et.predict(X_test)
    
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
    recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    calculate_metrics(y_test, y_pred)

# Displaying the results
print(f'Accuracy: {np.mean(accuracy_scores):.4f} +/- {np.std(accuracy_scores):.4f}')
print(f'Precision: {np.mean(precision_scores):.4f} +/- {np.std(precision_scores):.4f}')
print(f'Recall: {np.mean(recall_scores):.4f} +/- {np.std(recall_scores):.4f}')
print(f'F1-Score: {np.mean(f1_scores):.4f} +/- {np.std(f1_scores):.4f}')

Overall Accuracy: 0.7418032786885246


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.63      0.72      0.68        72
 Hypoglycemia       0.74      0.74      0.74        86
       Normal       0.86      0.76      0.80        86

     accuracy                           0.74       244
    macro avg       0.74      0.74      0.74       244
 weighted avg       0.75      0.74      0.74       244


Confusion Matrix:
[[52 16  4]
 [15 64  7]
 [15  6 65]]
Overall Accuracy: 0.6926229508196722


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.60      0.70      0.65        71
 Hypoglycemia       0.70      0.69      0.69        87
       Normal       0.79      0.69      0.73        86

     accuracy                           0.69       244
    macro avg       0.70      0.69      0.69       244
 weighted avg       0.70      0.69      0.69       244


Confusion Matrix:
[[50 13  8]
 [19 60  8]
 

In [81]:
merged_df = pagerank_df
for df in [degree_df, eigenvector_df, articlerank_df, leiden_df, louvain_df]:
    merged_df = pd.merge(merged_df, df, on=['id', 'name'], how='left')
df = merged_df.rename(columns={'name' : 'SpecID'})
df

Unnamed: 0,SpecID,id,PageRank,DegreeCentrality,EigenvectorCentrality,ArticleRank,Leiden,Louvain
0,210304-2-20,2554528,1.472942,346.263893,0.007547,0.232032,2,479
1,210407-2-08,5289652,1.458982,684.913033,0.007900,0.301263,1,17979
2,210304-2-42,2612504,1.434791,341.089958,0.007298,0.230766,2,479
3,210304-2-16,2543947,1.429760,290.571562,0.007041,0.219674,2,10190
4,210304-2-16,2543999,1.427356,290.504637,0.007031,0.219649,2,10190
...,...,...,...,...,...,...,...,...
51922,201210-1-22,57890,0.214485,25.412934,0.000221,0.155686,26,16940
51923,210114-1-38,361035,0.211477,9.794399,0.000170,0.152430,7,13914
51924,210114-2-34,484813,0.202604,32.973024,0.000282,0.157219,6,39991
51925,210414-3-34,5607088,0.202541,19.781042,0.000215,0.154603,1,31970


In [31]:
status_df = pd.read_csv('../../data/graph_2_0/test/nodes.csv')
fast_graph_df = pd.merge(fastRP_df, status_df, left_on="SpecID", right_on="SpecID", how="inner")
#fast_graph_df.drop(columns=["SpecID_y"], inplace=True)
#fast_graph_df.rename(columns={'SpecID_x':'SpecID'})
#fast_graph_df.rename(columns={"Status:LABEL":"Status"}, inplace=True)
fast_graph_df

Unnamed: 0,SpecID,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127,Status
0,201210-1-10,-0.049247,0.136851,0.320830,0.081677,0.215771,0.018689,-0.136461,-0.027345,0.147537,...,0.050779,-0.109583,0.190551,0.333719,-0.047825,0.104064,-0.211469,0.061279,-0.073389,Normal
1,201210-1-12,-0.049302,0.132280,0.322864,0.088900,0.221245,0.019831,-0.129085,-0.030131,0.145416,...,0.054422,-0.109105,0.186548,0.333036,-0.044077,0.101322,-0.209376,0.059840,-0.080133,Normal
2,201210-1-14,-0.049057,0.137773,0.320450,0.080246,0.215331,0.018585,-0.137373,-0.026965,0.148035,...,0.050209,-0.109431,0.191006,0.333335,-0.048218,0.104382,-0.211619,0.061593,-0.072310,Normal
3,201210-1-21,-0.049475,0.133764,0.321982,0.086393,0.218668,0.019633,-0.131797,-0.029024,0.145862,...,0.053301,-0.109267,0.188538,0.333553,-0.046072,0.102364,-0.211061,0.060279,-0.077818,Normal
4,201210-1-22,-0.049738,0.133778,0.322370,0.086193,0.218985,0.018968,-0.132734,-0.029072,0.146436,...,0.053522,-0.109616,0.188593,0.333871,-0.046118,0.102496,-0.210602,0.060235,-0.077248,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,210526-3-19,-0.047175,0.129351,0.326794,0.097359,0.234479,0.015212,-0.121396,-0.035370,0.146334,...,0.055477,-0.109846,0.174982,0.331237,-0.030639,0.097982,-0.195014,0.059053,-0.084741,Hyperglycemia
605,210526-3-23,-0.047099,0.126843,0.327889,0.101273,0.237793,0.015756,-0.117212,-0.037009,0.145452,...,0.056998,-0.109637,0.171946,0.330536,-0.027734,0.096525,-0.192637,0.058415,-0.088222,Hyperglycemia
606,210526-3-37,-0.046699,0.137921,0.321895,0.083423,0.223042,0.014868,-0.133682,-0.029693,0.148923,...,0.049788,-0.109455,0.184589,0.331630,-0.040115,0.102911,-0.203325,0.061236,-0.073934,Hyperglycemia
607,210526-3-38,-0.046819,0.135989,0.323057,0.086629,0.225646,0.015151,-0.130716,-0.030981,0.148303,...,0.051050,-0.109526,0.182410,0.331569,-0.037945,0.101820,-0.201542,0.060765,-0.076559,Hyperglycemia


In [86]:
# Define a function to calculate the bin for a given wavenumber with a specified bin size
def calculate_bin_interval(wavenumber, bin_size):
    bin_start = int((wavenumber - 200) / bin_size) * bin_size + 200
    bin_end = bin_start + bin_size
    return f"{bin_start}-{bin_end}"

# Set the bin size
bin_size = 25

# Add a "Bin" column to the DataFrame
graph_df['Bin'] = graph_df['WaveNumber:float'].apply(lambda x: calculate_bin_interval(x, bin_size))
# Pivot table with 'Absorbance', 'PeakWidths', and 'PeakProminences' as values
peak_bins = graph_df.pivot_table(index='SpecID', columns='Bin', values=['PageRank', 'DegreeCentrality', 'EigenvectorCentrality', 'ArticleRank', 'Leiden', 'Louvain'], aggfunc='mean')
peak_bins.columns = [f"{col[0]}_{col[1]}" for col in peak_bins.columns]  # Combine column names
peak_bins.reset_index(inplace=True)

# Merge with 'Status' information
statuses = graph_df[['SpecID', 'Status']].drop_duplicates()
peak_bins = pd.merge(peak_bins, statuses, on='SpecID')

# Set 'SpecID' as the index
peak_bins.set_index('SpecID', inplace=True)

# Fill NaN values with False
peak_bins.fillna(False, inplace=True)
graph_bins = peak_bins.reset_index()

In [87]:
X_test = peak_bins.drop(['Status'], axis=1)
y_test = peak_bins['Status']

# Predictions
rf_predictions = graph_rf_model.predict(X_test)
et_predictions = graph_et_model.predict(X_test)

# Evaluate accuracy
rf_accuracy = accuracy_score(y_test, rf_predictions)
et_accuracy = accuracy_score(y_test, et_predictions)

In [88]:
calculate_metrics(y_test, rf_predictions)

Overall Accuracy: 0.6879194630872483


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.72      0.63      0.67       195
 Hypoglycemia       0.65      0.74      0.69       196
       Normal       0.70      0.70      0.70       205

     accuracy                           0.69       596
    macro avg       0.69      0.69      0.69       596
 weighted avg       0.69      0.69      0.69       596


Confusion Matrix:
[[122  37  36]
 [ 26 145  25]
 [ 21  41 143]]


In [89]:
calculate_metrics(y_test, et_predictions)

Overall Accuracy: 0.6694630872483222


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.68      0.63      0.65       195
 Hypoglycemia       0.62      0.70      0.66       196
       Normal       0.72      0.68      0.70       205

     accuracy                           0.67       596
    macro avg       0.67      0.67      0.67       596
 weighted avg       0.67      0.67      0.67       596


Confusion Matrix:
[[122  44  29]
 [ 34 137  25]
 [ 24  41 140]]


In [90]:
fast_graph_df.drop(columns=['id'], inplace=True)
fast_graph_df = fast_graph_df.groupby(['SpecID', 'Status']).max().reset_index()

In [32]:
# Drop the SpecID column
X_test = fast_graph_df.drop(columns=['SpecID', 'Status'])

# Target variable
y_test = fast_graph_df['Status']

# Predictions
rf_predictions = fastrp_rf_model.predict(X_test)
et_predictions = fastrp_et_model.predict(X_test)

# Evaluate accuracy
rf_accuracy = accuracy_score(y_test, rf_predictions)
et_accuracy = accuracy_score(y_test, et_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Extra Trees Accuracy:", et_accuracy)

Random Forest Accuracy: 0.33825944170771755
Extra Trees Accuracy: 0.33825944170771755


In [92]:
joined_graph = pd.merge(graph_bins, fast_graph_df, on=['SpecID', 'Status'], how='inner')

In [93]:
joined_graph

Unnamed: 0,SpecID,ArticleRank_1000-1025,ArticleRank_1025-1050,ArticleRank_1050-1075,ArticleRank_1075-1100,ArticleRank_1100-1125,ArticleRank_1125-1150,ArticleRank_1150-1175,ArticleRank_1175-1200,ArticleRank_1200-1225,...,embedding_118,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127
0,201210-1-10,0.327194,False,False,False,False,False,False,False,False,...,0.174145,0.306576,0.190821,0.293303,0.208554,0.228081,0.225902,0.219529,0.189622,0.382869
1,201210-1-12,0.319221,False,False,False,False,False,False,False,False,...,0.129365,0.096373,0.183873,0.171630,0.031642,0.288138,0.226378,0.217474,0.089820,0.176072
2,201210-1-14,0.310665,False,False,False,False,False,0.235115,False,False,...,0.168452,0.109232,0.415056,0.370557,0.238806,0.286417,0.322876,0.215117,0.257098,0.355038
3,201210-1-21,False,False,False,False,False,False,False,False,False,...,0.335456,0.152292,0.261854,0.099105,0.160350,0.286880,0.234223,0.339082,0.188879,0.155013
4,201210-1-22,0.355218,False,False,False,False,0.281222,False,False,False,...,0.334157,0.325833,0.061598,0.354299,0.253228,0.284353,0.239339,0.207170,0.249457,0.348569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,210526-3-05,0.343468,False,False,0.240963,False,0.246008,0.241891,False,0.284997,...,0.340696,0.236452,0.404684,0.335716,0.270184,0.279762,0.330756,0.199885,0.258199,0.351869
592,210526-3-06,0.349153,False,False,0.24265,False,0.264708,False,False,0.296188,...,0.341664,0.324421,0.227505,0.334419,0.267027,0.289566,0.332709,0.362922,0.236892,0.335285
593,210526-3-10,0.349359,False,False,False,False,False,False,0.268924,False,...,0.332579,0.135524,0.417246,0.360561,0.247998,0.284100,0.236567,0.292642,0.252750,0.414588
594,210526-3-16,False,False,False,False,False,False,0.269604,False,False,...,0.139024,0.115723,0.195422,0.201630,0.176378,0.275663,0.159476,0.283438,0.249698,0.044350


In [None]:
# Drop the SpecID column
X_test = joined_graph.drop(columns=['SpecID', 'Status'])

# Target variable
y_test = joined_graph['Status']


# Predictions
rf_predictions = rf_model.predict(X_test)
et_predictions = et_model.predict(X_test)

# Evaluate accuracy
rf_accuracy = accuracy_score(y_test, rf_predictions)
et_accuracy = accuracy_score(y_test, et_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Extra Trees Accuracy:", et_accuracy)