#### This Version Creates the Graph in Neo4J then runs the Queries.

In [2]:
from neo4j import GraphDatabase
import os
import pandas as pd
import shutil
import subprocess

This copies the nodes and relationships files from `data/graph_1_7` into the DBMS import folder.

At the moment this uses local paths, but the rest should be automated.

In [3]:
# Directory holding the exported graph CSVs (resolved to an absolute path)
source_dir = os.path.abspath('../../data/graph_1_7')

# Import directory of the target Neo4j DBMS.
# This is a machine-specific path and must be adjusted per installation.
destination_dir = 'C:/Users/stang/.Neo4jDesktop/relate-data/dbmss/dbms-9d28ea38-eb69-4f60-bf9f-cb475b02884e/import'

# Files that the admin import expects to find in the import directory
filenames = ['nodes.csv', 'relationships.csv']

# Copy (not move) each file, so the originals remain in source_dir
for filename in filenames:
    shutil.copy(
        os.path.join(source_dir, filename),
        os.path.join(destination_dir, filename),
    )

Then run the admin import command.

In [4]:
# The import command is launched from the DBMS `bin` directory (machine-specific path)
working_dir = 'C:/Users/stang/.Neo4jDesktop/relate-data/dbmss/dbms-9d28ea38-eb69-4f60-bf9f-cb475b02884e/bin'

# Earlier argv-list form of the same command, kept for reference:
#command = [
#    './bin/neo4j-admin', 'database', 'import', 'full',
#    '--nodes=import/nodes.csv',
#    '--relationships=import/relationships.csv', 'neo4j'
#]

# Full import of the node and relationship CSVs into the `neo4j` database
command = 'neo4j-admin database import full --nodes=import/nodes.csv --relationships=import/relationships.csv neo4j'

# Run through the shell, capturing stdout/stderr as text
result = subprocess.run(command, shell=True, cwd=working_dir, capture_output=True, text=True)

# A non-zero exit code means the import failed; show stderr in that case
if result.returncode != 0:
    print("Error during import:")
    print(result.stderr)
else:
    print("Import successful")
    print(result.stdout)

Import successful
Neo4j version: 5.12.0
Importing the contents of these files into C:\Users\stang\.Neo4jDesktop\relate-data\dbmss\dbms-9d28ea38-eb69-4f60-bf9f-cb475b02884e\data\databases\neo4j:
Nodes:
  C:\Users\stang\.Neo4jDesktop\relate-data\dbmss\dbms-9d28ea38-eb69-4f60-bf9f-cb475b02884e\import\nodes.csv

Relationships:
  C:\Users\stang\.Neo4jDesktop\relate-data\dbmss\dbms-9d28ea38-eb69-4f60-bf9f-cb475b02884e\import\relationships.csv


Available resources:
  Total machine memory: 31.86GiB
  Free machine memory: 17.16GiB
  Max heap memory : 910.5MiB
  Max worker threads: 8
  Configured max memory: 14.72GiB
  High parallel IO: true

Cypher type normalization is enabled (disable with --normalize-types=false):
  Property type of 'WaveNumber' normalized from 'float' --> 'double' in C:\Users\stang\.Neo4jDesktop\relate-data\dbmss\dbms-9d28ea38-eb69-4f60-bf9f-cb475b02884e\import\nodes.csv
  Property type of 'Absorbance' normalized from 'float' --> 'double' in C:\Users\stang\.Neo4jDesktop\re

Run and connect to the Neo4j Database

In [89]:
from neo4j import GraphDatabase

uri = "neo4j://localhost:7687"
username = "neo4j"              # Neo4J username

# Read the password from the environment so that no credential is stored in
# the notebook or in its version-control history.
password = os.environ.get('NEO4J_Password')
if not password:
    raise RuntimeError(
        "Set the NEO4J_Password environment variable before connecting to Neo4j."
    )

# Create a driver instance
driver = GraphDatabase.driver(uri, auth=(username, password))

# Ensure you close the driver connection when your program ends
def close_driver():
    """Close the module-level Neo4j driver."""
    driver.close()

**First create the gds Graph Projection**

In [90]:
def project_graph(tx):
    """Create the in-memory GDS projection named 'myGraph'.

    The projection covers the 'Normal', 'Hyperglycemia' and 'Hypoglycemia'
    node labels and the LINK relationship type, treated as undirected and
    carrying the 'DIST' relationship property.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        None. The result of ``tx.run`` is not consumed or returned.
    """
    projection_cypher = """
    CALL gds.graph.project(
      'myGraph', 
      ['Normal', 'Hyperglycemia', 'Hypoglycemia'],
      {
        LINK: {
          orientation: 'UNDIRECTED',
          properties: 'DIST'
        }
      }
    )
    """
    tx.run(projection_cypher)

# Use a session to execute the graph projection.
# project_graph is submitted through execute_write, i.e. as a managed write transaction.
with driver.session() as session:
    session.execute_write(project_graph)

Define the graph algorithms.

In [100]:
def run_pagerank_centrality(tx):
    """Stream PageRank scores from the 'myGraph' projection, weighted by 'DIST'.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        list[tuple]: One ``(name, id, score)`` tuple per record, in the order
        the query returns them (score descending, then name ascending).
    """
    cypher = """
    CALL gds.pageRank.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    rows = []
    for record in tx.run(cypher):
        rows.append((record["name"], record["id"], record["score"]))
    return rows

In [101]:
def run_degree_centrality(tx):
    """Stream degree centrality scores from the 'myGraph' projection, weighted by 'DIST'.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        list[tuple]: One ``(name, id, score)`` tuple per record, in the order
        the query returns them (score descending, then name ascending).
    """
    cypher = """
    CALL gds.degree.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    rows = []
    for record in tx.run(cypher):
        rows.append((record["name"], record["id"], record["score"]))
    return rows

In [102]:
def run_eigenvector_centrality(tx):
    """Stream eigenvector centrality scores from the 'myGraph' projection, weighted by 'DIST'.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        list[tuple]: One ``(name, id, score)`` tuple per record, in the order
        the query returns them (score descending, then name ascending).
    """
    cypher = """
    CALL gds.eigenvector.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    rows = []
    for record in tx.run(cypher):
        rows.append((record["name"], record["id"], record["score"]))
    return rows

In [103]:
def run_articlerank_centrality(tx):
    """Stream ArticleRank scores from the 'myGraph' projection, weighted by 'DIST'.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        list[tuple]: One ``(name, id, score)`` tuple per record, in the order
        the query returns them (score descending, then name ascending).
    """
    cypher = """
    CALL gds.articleRank.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    rows = []
    for record in tx.run(cypher):
        rows.append((record["name"], record["id"], record["score"]))
    return rows

In [104]:
def run_label_propagation_algorithm(tx):
    """Stream label-propagation community ids from the 'myGraph' projection.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        list[tuple]: One ``(name, id, Community)`` tuple per record, in the
        order the query returns them (by Community, then name).
    """
    cypher = """
    CALL gds.labelPropagation.stream('myGraph', { relationshipWeightProperty: 'DIST' })
    YIELD nodeId, communityId AS Community
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, Community
    ORDER BY Community, name
    """
    rows = []
    for record in tx.run(cypher):
        rows.append((record["name"], record["id"], record["Community"]))
    return rows

In [105]:
def run_leiden_algorithm(tx):
    """Stream Leiden community ids from the 'myGraph' projection.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        list[tuple]: One ``(name, id, Community)`` tuple per record, in the
        order the query returns them (by Community, then name).
    """
    cypher = """
    CALL gds.leiden.stream('myGraph', { relationshipWeightProperty: 'DIST' })
    YIELD nodeId, communityId AS Community
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, Community
    ORDER BY Community, name
    """
    rows = []
    for record in tx.run(cypher):
        rows.append((record["name"], record["id"], record["Community"]))
    return rows

In [106]:
def run_louvain_algorithm(tx):
    """Stream Louvain community ids from the 'myGraph' projection.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        list[tuple]: One ``(name, id, Community)`` tuple per record, in the
        order the query returns them (by Community, then name).
    """
    cypher = """
    CALL gds.louvain.stream('myGraph', { relationshipWeightProperty: 'DIST' })
    YIELD nodeId, communityId AS Community
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, Community
    ORDER BY Community, name
    """
    rows = []
    for record in tx.run(cypher):
        rows.append((record["name"], record["id"], record["Community"]))
    return rows

Execute the algorithms and store the results in a Dataframe.

In [107]:
# Run every algorithm query inside one session, as read transactions,
# and bind the results in the same order the queries are executed.
with driver.session() as session:
    (
        pagerank_results,
        degree_results,
        eigenvector_results,
        articlerank_results,
        label_propagation_results,
        leiden_results,
        louvain_results,
    ) = [
        session.execute_read(query_fn)
        for query_fn in (
            run_pagerank_centrality,
            run_degree_centrality,
            run_eigenvector_centrality,
            run_articlerank_centrality,
            run_label_propagation_algorithm,
            run_leiden_algorithm,
            run_louvain_algorithm,
        )
    ]

In [108]:
# Every result list holds (name, id, value) tuples; only the value column name differs.
key_columns = ['name', 'id']

pagerank_df = pd.DataFrame(pagerank_results, columns=[*key_columns, 'PageRank'])
degree_df = pd.DataFrame(degree_results, columns=[*key_columns, 'DegreeCentrality'])
eigenvector_df = pd.DataFrame(eigenvector_results, columns=[*key_columns, 'EigenvectorCentrality'])
articlerank_df = pd.DataFrame(articlerank_results, columns=[*key_columns, 'ArticleRank'])
# Label propagation results are currently not turned into a DataFrame:
#label_propagation_df = pd.DataFrame(label_propagation_results, columns=['name', 'id', 'LabelPropagation'])
leiden_df = pd.DataFrame(leiden_results, columns=[*key_columns, 'Leiden'])
louvain_df = pd.DataFrame(louvain_results, columns=[*key_columns, 'Louvain'])

In [56]:
# Collapse PageRank to one row per `name` (the SpecID) by averaging within each group.
# NOTE(review): mean() averages every remaining column, which here includes `id`;
# the later merge joins on ['id', 'name'] — confirm the ids still line up afterwards.
# NOTE(review): the tables displayed further down still show several ids per SpecID,
# so this cell may not have been applied to the displayed results — confirm.
pagerank_df = pagerank_df.groupby("name").mean().reset_index()

In [65]:
# Collapse each metric table to one row per `name` (the SpecID):
# centrality scores are averaged, community ids take the first row of each group.
# Each table is aggregated exactly once (a repeated eigenvector aggregation,
# which had no further effect, was removed).
# NOTE(review): mean() also averages the `id` column while first() keeps a single
# row's id; the later merge joins on ['id', 'name'] — confirm the ids still line up.
degree_df = degree_df.groupby("name").mean().reset_index()
eigenvector_df = eigenvector_df.groupby("name").mean().reset_index()
articlerank_df = articlerank_df.groupby("name").mean().reset_index()
leiden_df = leiden_df.groupby("name").first().reset_index()
louvain_df = louvain_df.groupby("name").first().reset_index()

In [119]:
# Left-join every other metric table onto the PageRank table, matching on (id, name).
# The loop variable is deliberately not called `df`: that name holds the final
# result table in this notebook and must not be overwritten when this cell is re-run.
merged_df = pagerank_df
for metric_df in [degree_df, eigenvector_df, articlerank_df, leiden_df, louvain_df]:
    merged_df = pd.merge(merged_df, metric_df, on=['id', 'name'], how='left')

In [120]:
# Expose the spectrum identifier under its original property name and show the table
df = merged_df.rename({'name': 'SpecID'}, axis='columns')
df

Unnamed: 0,SpecID,id,PageRank,DegreeCentrality,EigenvectorCentrality,ArticleRank,Leiden,Louvain
0,210504-1-17,6255593,7.809362,2.201659e+06,0.036183,7.713423,16,4957
1,210504-1-19,6260862,7.535027,1.919783e+06,0.033121,7.434956,108,4957
2,210504-1-14,6247705,7.204064,1.779399e+06,0.029518,7.108150,16,1129
3,210211-1-49,1180546,7.048049,2.684871e+06,0.030367,6.985697,182,14225
4,210504-3-28,6550730,7.022263,2.010274e+06,0.031021,6.939504,135,49909
...,...,...,...,...,...,...,...,...
51922,210414-3-28,5586351,0.215405,1.459729e+04,0.000263,0.214341,0,1877
51923,210524-1-48,7717638,0.213816,1.913730e+04,0.000300,0.212926,113,40585
51924,210114-2-01,395402,0.213385,5.190698e+03,0.000162,0.209961,114,40585
51925,210114-1-32,345354,0.207456,3.595212e+03,0.000117,0.203962,23,40585


Delete the projection

In [87]:
def delete_projection(tx):
    """Drop the in-memory GDS projection named 'myGraph'.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        None. The result of ``tx.run`` is not consumed or returned.
    """
    drop_cypher = """
    CALL gds.graph.drop('myGraph')
    """
    tx.run(drop_cypher)

# Use a session to drop the 'myGraph' projection
with driver.session() as session:
    session.execute_write(delete_projection)

In [88]:
# Close the Neo4j driver; no cell below this point uses `driver`.
close_driver()

In [121]:
# Show the merged metrics table again (same frame as displayed after the rename)
df

Unnamed: 0,SpecID,id,PageRank,DegreeCentrality,EigenvectorCentrality,ArticleRank,Leiden,Louvain
0,210504-1-17,6255593,7.809362,2.201659e+06,0.036183,7.713423,16,4957
1,210504-1-19,6260862,7.535027,1.919783e+06,0.033121,7.434956,108,4957
2,210504-1-14,6247705,7.204064,1.779399e+06,0.029518,7.108150,16,1129
3,210211-1-49,1180546,7.048049,2.684871e+06,0.030367,6.985697,182,14225
4,210504-3-28,6550730,7.022263,2.010274e+06,0.031021,6.939504,135,49909
...,...,...,...,...,...,...,...,...
51922,210414-3-28,5586351,0.215405,1.459729e+04,0.000263,0.214341,0,1877
51923,210524-1-48,7717638,0.213816,1.913730e+04,0.000300,0.212926,113,40585
51924,210114-2-01,395402,0.213385,5.190698e+03,0.000162,0.209961,114,40585
51925,210114-1-32,345354,0.207456,3.595212e+03,0.000117,0.203962,23,40585


In [122]:
# Load the node table; the 'SpecID:ID' and 'Status:LABEL' columns are used below.
# NOTE(review): this reads data/graph_1_6, while the files copied into the Neo4j
# import folder earlier come from data/graph_1_7 — confirm the versions match.
status_df = pd.read_csv('../../data/graph_1_6/nodes.csv')

In [123]:
# Remove the stray index column written by a previous to_csv call.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time no longer raises a KeyError.
status_df = status_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [124]:
# Attach the node table to the metrics by SpecID (inner join), then discard the
# now-redundant right-hand key column.
graph_df = (
    pd.merge(df, status_df, left_on="SpecID", right_on="SpecID:ID", how="inner")
    .drop(columns=["SpecID:ID"])
)

In [125]:
# Strip the Neo4j import header suffix from the label column
graph_df = graph_df.rename(columns={"Status:LABEL": "Status"})

In [126]:
# Show the metrics table with the Status label attached
graph_df

Unnamed: 0,SpecID,id,PageRank,DegreeCentrality,EigenvectorCentrality,ArticleRank,Leiden,Louvain,Status
0,210504-1-17,6255593,7.809362,2.201659e+06,0.036183,7.713423,16,4957,Hypoglycemia
1,210504-1-17,6258100,1.791832,4.732940e+05,0.007804,1.770915,16,4957,Hypoglycemia
2,210504-1-17,6258060,1.660951,4.432969e+05,0.007273,1.642099,16,4957,Hypoglycemia
3,210504-1-17,6255822,1.077741,2.760473e+05,0.004528,1.066175,16,4957,Hypoglycemia
4,210504-1-17,6256192,0.822288,2.012374e+05,0.003320,0.813848,16,4957,Hypoglycemia
...,...,...,...,...,...,...,...,...,...
51922,210419-1-22,4781060,0.413492,2.048538e+04,0.000508,0.401739,23,40585,Hypoglycemia
51923,210407-1-17,5159418,0.397417,1.080500e+04,0.000419,0.379487,23,40585,Hypoglycemia
51924,210407-1-16,5156791,0.395436,1.064650e+04,0.000414,0.377579,23,40585,Hypoglycemia
51925,210419-2-16,4893556,0.379015,1.418735e+04,0.000418,0.367111,23,40585,Hypoglycemia


In [136]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Feature matrix: every graph_df column except the identifiers and the label
X = graph_df.drop(columns=['SpecID', 'Status', 'id'])

# Label to predict
y = graph_df['Status']

# Row-level 80/20 split with a fixed seed.
# NOTE(review): the split is over rows, so rows sharing a SpecID can end up on
# both sides — confirm this is intended for this baseline.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# Fit both tree ensembles with identical settings (fit returns the estimator itself)
rf_model = RandomForestClassifier(n_estimators=100, random_state=1234).fit(X_train, y_train)
et_model = ExtraTreesClassifier(n_estimators=100, random_state=1234).fit(X_train, y_train)

# Predict on the held-out rows and score each model
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

et_predictions = et_model.predict(X_test)
et_accuracy = accuracy_score(y_test, et_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Extra Trees Accuracy:", et_accuracy)


Random Forest Accuracy: 0.5470826112073945
Extra Trees Accuracy: 0.5418833044482958


In [147]:
# Inspect the Random Forest predictions for the held-out rows
rf_predictions

array(['Hyperglycemia', 'Hyperglycemia', 'Normal', ..., 'Normal',
       'Normal', 'Hypoglycemia'], dtype=object)

In [137]:
def calculate_metrics(y_test, y_pred):
    """Print accuracy, the classification report and the confusion matrix.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. All results are written to stdout.
    """
    # Overall accuracy
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_accuracy}\n")

    # Per-class precision, recall and F1-score
    per_class_report = classification_report(y_test, y_pred)
    print("\nClassification Report:")
    print(per_class_report)

    # Confusion matrix
    confusion = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(confusion)

In [138]:
def get_feature_importances(model, X):
    """Return the ten most important features of a fitted model.

    Args:
        model: Object exposing ``feature_importances_``, one value per column of ``X``.
        X: DataFrame whose columns name the features.

    Returns:
        pandas.DataFrame: Columns ``Feature`` and ``Importance``, sorted by
        importance in descending order and truncated to at most ten rows.
    """
    ranked = pd.DataFrame(
        {'Feature': X.columns, 'Importance': model.feature_importances_}
    ).sort_values(by='Importance', ascending=False)

    # Keep only the ten highest-ranked features
    return ranked.head(10)

In [139]:
# Accuracy, classification report and confusion matrix for the Random Forest predictions
calculate_metrics(y_test, rf_predictions)

Overall Accuracy: 0.5470826112073945


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.54      0.52      0.53      3445
 Hypoglycemia       0.59      0.62      0.60      4075
       Normal       0.50      0.47      0.48      2866

     accuracy                           0.55     10386
    macro avg       0.54      0.54      0.54     10386
 weighted avg       0.55      0.55      0.55     10386


Confusion Matrix:
[[1808  960  677]
 [ 852 2528  695]
 [ 715  805 1346]]


In [140]:
# Accuracy, classification report and confusion matrix for the Extra Trees predictions
calculate_metrics(y_test, et_predictions)

Overall Accuracy: 0.5418833044482958


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.53      0.53      0.53      3445
 Hypoglycemia       0.59      0.62      0.60      4075
       Normal       0.49      0.45      0.47      2866

     accuracy                           0.54     10386
    macro avg       0.53      0.53      0.53     10386
 weighted avg       0.54      0.54      0.54     10386


Confusion Matrix:
[[1821  969  655]
 [ 847 2508  720]
 [ 770  797 1299]]


In [141]:
# Feature ranking of the Random Forest (at most the top ten features)
get_feature_importances(rf_model, X)

Unnamed: 0,Feature,Importance
1,DegreeCentrality,0.236116
2,EigenvectorCentrality,0.217989
0,PageRank,0.199219
3,ArticleRank,0.198369
4,Leiden,0.076681
5,Louvain,0.071626


In [142]:
# Feature ranking of the Extra Trees model (at most the top ten features)
get_feature_importances(et_model, X)

Unnamed: 0,Feature,Importance
1,DegreeCentrality,0.229732
2,EigenvectorCentrality,0.213488
0,PageRank,0.204956
3,ArticleRank,0.204462
4,Leiden,0.073796
5,Louvain,0.073566


In [143]:
# Distinct SpecIDs, in order of first appearance
unique_values = graph_df['SpecID'].drop_duplicates().tolist()

In [144]:
# Split the SpecIDs themselves (80/20, fixed seed); the row-level frames are
# derived from these two id lists in the next cell.
train, test = train_test_split(unique_values, test_size=0.2, random_state=1234)

In [145]:
# Route each row to the side its SpecID was assigned to
in_train = graph_df['SpecID'].isin(train)
in_test = graph_df['SpecID'].isin(test)

train_df = graph_df[in_train]
test_df = graph_df[in_test]

In [146]:
# Training features: everything except the identifiers and the label
X_train = train_df.drop(columns=['SpecID', 'Status', 'id'])

# Training labels
y_train = train_df['Status']

# Refit the Random Forest on the SpecID-level training split
rf_model = RandomForestClassifier(n_estimators=100, random_state=1234).fit(X_train, y_train)

# Refit the Extra Trees model on the same split
et_model = ExtraTreesClassifier(n_estimators=100, random_state=1234)
et_model.fit(X_train, y_train)

In [151]:
def ensemble_pred(model, X_test):
    """Return the test features together with the model's predictions.

    The input frame is left untouched. Writing the predictions into ``X_test``
    itself would turn them into an extra feature column on the next call and
    make the function fail (or silently change its input) when re-run.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_test: DataFrame of feature columns to predict on.

    Returns:
        pandas.DataFrame: A new frame with the rows, index and columns of
        ``X_test`` plus a ``predictions`` column.
    """
    predictions = model.predict(X_test)
    # assign() builds a new DataFrame instead of modifying X_test in place
    return X_test.assign(predictions=predictions)


In [153]:
# Metric columns that make up the model features
feature_columns = ['PageRank', 'DegreeCentrality', 'EigenvectorCentrality', 'ArticleRank', 'Leiden', 'Louvain']

# Test features: everything except the identifiers and the label
X_test = test_df.drop(columns=['SpecID', 'Status', 'id'])

# The remaining (non-feature) columns of the test rows, kept as a DataFrame
y_test = test_df.drop(columns=feature_columns)

In [154]:
# Predict with the Random Forest refitted on the SpecID-level training split
result = ensemble_pred(rf_model, X_test)

In [157]:
# Re-attach the non-feature columns of the test rows; join() aligns on the index
result_df = result.join(y_test)

In [158]:
# Show the row-level predictions next to the true Status
result_df

Unnamed: 0,PageRank,DegreeCentrality,EigenvectorCentrality,ArticleRank,Leiden,Louvain,predictions,SpecID,id,Status
323,6.793975,1.823863e+06,0.030026,6.708783,16,4957,Normal,210310-1-30,2711517,Normal
324,1.038554,2.496767e+05,0.004134,1.027140,16,4957,Hyperglycemia,210310-1-30,2713377,Normal
325,0.996186,2.418471e+05,0.003956,0.985449,16,4957,Hypoglycemia,210310-1-30,2711739,Normal
326,0.975149,2.344802e+05,0.003873,0.964610,16,4957,Hyperglycemia,210310-1-30,2713420,Normal
327,0.916040,2.163733e+05,0.003636,0.906085,16,4957,Hypoglycemia,210310-1-30,2712462,Normal
...,...,...,...,...,...,...,...,...,...,...
51840,0.547907,8.535565e+04,0.001639,0.541444,0,1877,Hypoglycemia,210225-2-05,1988721,Hypoglycemia
51841,0.370648,4.470480e+04,0.000848,0.366957,0,1877,Hypoglycemia,210225-2-05,1988854,Hypoglycemia
51851,0.745803,2.053807e+05,0.003040,0.739060,56,13670,Normal,210419-2-45,4978259,Hypoglycemia
51852,0.628424,1.134980e+05,0.001970,0.620749,55,40223,Hypoglycemia,210419-2-45,4977884,Hypoglycemia


In [162]:
# One row per SpecID: the most frequent predicted label and the most frequent
# true Status (mode() returns its values sorted, so ties resolve to the first one).
mode_df = (
    result_df
    .groupby("SpecID")[["predictions", "Status"]]
    .agg(lambda column: column.mode().iloc[0])
)


In [166]:
# Rows where the majority prediction agrees with the majority status
is_match = mode_df['predictions'].eq(mode_df['Status'])
matching_values = is_match.sum()

# Number of rows in mode_df (one per SpecID)
total_rows = mode_df.shape[0]

# Share of agreeing rows, as a percentage
percentage_matching = (matching_values / total_rows) * 100

In [167]:
# Percentage of SpecIDs whose majority-vote prediction matches their Status
percentage_matching

57.38255033557047