#### This Version Creates the Graph in Neo4J then runs the Queries.

In [1]:
from neo4j import GraphDatabase
import os
import pandas as pd
import shutil
import subprocess

This moves the nodes and relationships files from data/current_working_graph into the DBMS import folder.

At the moment this uses local paths, but the rest should be automated.

In [2]:
# Define the source directory
source_dir = os.path.abspath('../../data/graph_1_7')

# Define the destination directory
# This should be the import directory of your graph database
destination_dir = 'C:/Users/stang/.Neo4jDesktop/relate-data/dbmss/dbms-3712adfd-37a2-4dfd-bcf5-032f598d16d7/import'

# Specify the filenames to move
filenames = ['nodes.csv', 'relationships.csv']

for filename in filenames:
    source_path = os.path.join(source_dir, filename)
    destination_path = os.path.join(destination_dir, filename)
    
    # Move each file
    shutil.copy(source_path, destination_path)

Then run the admin import command.

In [3]:
# Define the working directory
working_dir = 'C:/Users/stang/.Neo4jDesktop/relate-data/dbmss/dbms-3712adfd-37a2-4dfd-bcf5-032f598d16d7/bin'

# Construct the command
#command = [
#    './bin/neo4j-admin', 'database', 'import', 'full',
#    '--nodes=import/nodes.csv',
#    '--relationships=import/relationships.csv', 'neo4j'
#]

command = 'neo4j-admin database import full --nodes=import/nodes.csv --relationships=import/relationships.csv neo4j'

# Execute the command


result = subprocess.run(command, shell=True, cwd=working_dir, capture_output=True, text=True)

# Check if the command was successful
if result.returncode == 0:
    print("Import successful")
    print(result.stdout)
else:
    print("Error during import:")
    print(result.stderr)

Import successful
Neo4j version: 5.12.0
Importing the contents of these files into C:\Users\stang\.Neo4jDesktop\relate-data\dbmss\dbms-3712adfd-37a2-4dfd-bcf5-032f598d16d7\data\databases\neo4j:
Nodes:
  C:\Users\stang\.Neo4jDesktop\relate-data\dbmss\dbms-3712adfd-37a2-4dfd-bcf5-032f598d16d7\import\nodes.csv

Relationships:
  C:\Users\stang\.Neo4jDesktop\relate-data\dbmss\dbms-3712adfd-37a2-4dfd-bcf5-032f598d16d7\import\relationships.csv


Available resources:
  Total machine memory: 31.86GiB
  Free machine memory: 17.81GiB
  Max heap memory : 910.5MiB
  Max worker threads: 8
  Configured max memory: 15.34GiB
  High parallel IO: true

Cypher type normalization is enabled (disable with --normalize-types=false):
  Property type of 'WaveNumber' normalized from 'float' --> 'double' in C:\Users\stang\.Neo4jDesktop\relate-data\dbmss\dbms-3712adfd-37a2-4dfd-bcf5-032f598d16d7\import\nodes.csv
  Property type of 'Absorbance' normalized from 'float' --> 'double' in C:\Users\stang\.Neo4jDesktop\re

Run and connect to the Neo4j Database

In [2]:
from neo4j import GraphDatabase

uri = "neo4j://localhost:7687"
username = "neo4j"              # Neo4J username
#password = os.environ['NEO4J_Password']           # Neo4J password
password = '26622002'

# Create a driver instance
driver = GraphDatabase.driver(uri, auth=(username, password))

# Ensure you close the driver connection when your program ends
def close_driver():
    driver.close()

**First create the gds Graph Projection**

In [3]:
def project_graph(tx):
    query = """
    CALL gds.graph.project(
      'myGraph', 
      ['Normal', 'Hyperglycemia', 'Hypoglycemia'],
      {
        LINK: {
          orientation: 'UNDIRECTED',
          properties: 'DIST'
        }
      }
    )
    """
    tx.run(query)

# Use a session to execute the graph projection
with driver.session() as session:
    session.execute_write(project_graph)

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `gds.graph.project`: Caused by: java.lang.IllegalArgumentException: A graph with name 'myGraph' already exists.}

Define the graph algorithms.

In [5]:
def run_pagerank_centrality(tx):
    query = """
    CALL gds.pageRank.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    results = tx.run(query)
    return [(record["name"], record["id"], record["score"]) for record in results]

In [6]:
def run_degree_centrality(tx):
    query = """
    CALL gds.degree.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    results = tx.run(query)
    return [(record["name"], record["id"], record["score"]) for record in results]

In [7]:
def run_eigenvector_centrality(tx):
    query = """
    CALL gds.eigenvector.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    results = tx.run(query)
    return [(record["name"], record["id"], record["score"]) for record in results]

In [8]:
def run_articlerank_centrality(tx):
    query = """
    CALL gds.articleRank.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    results = tx.run(query)
    return [(record["name"], record["id"], record["score"]) for record in results]

In [9]:
def run_label_propagation_algorithm(tx):
    query = """
    CALL gds.labelPropagation.stream('myGraph', { relationshipWeightProperty: 'DIST' })
    YIELD nodeId, communityId AS Community
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, Community
    ORDER BY Community, name
    """
    results = tx.run(query)
    return [(record["name"], record["id"], record["Community"]) for record in results]

In [10]:
def run_leiden_algorithm(tx):
    query = """
    CALL gds.leiden.stream('myGraph', { relationshipWeightProperty: 'DIST' })
    YIELD nodeId, communityId AS Community
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, Community
    ORDER BY Community, name
    """
    results = tx.run(query)
    return [(record["name"], record["id"], record["Community"]) for record in results]

In [11]:
def run_louvain_algorithm(tx):
    query = """
    CALL gds.louvain.stream('myGraph', { relationshipWeightProperty: 'DIST' })
    YIELD nodeId, communityId AS Community
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, Community
    ORDER BY Community, name
    """
    results = tx.run(query)
    return [(record["name"], record["id"], record["Community"]) for record in results]

In [13]:
def run_node2vec_algorithm(tx):
    query = """
    CALL gds.node2vec.stream('myGraph', { relationshipWeightProperty: 'Weight' })
    YIELD nodeId, embedding
    RETURN gds.util.asNode(nodeId).SpecID AS name, embedding
    """
    results = tx.run(query)
    return [(record["name"], record["embedding"]) for record in results]

In [86]:
def run_fastRP_algorithm(tx):
    query = """
    CALL gds.fastRP.stream('myGraph',
        { relationshipWeightProperty: 'DIST',
         randomSeed:1234,
         embeddingDimension: 128
        }
    )
    YIELD nodeId, embedding
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, embedding
    """
    results = tx.run(query)
    return [(record["name"], record["id"], record["embedding"]) for record in results]

Execute the algorithms and store the results in a Dataframe.

In [12]:
# Use a session to execute the queries and retrieve the results
with driver.session() as session:
    pagerank_results = session.execute_read(run_pagerank_centrality)
    degree_results = session.execute_read(run_degree_centrality)
    eigenvector_results = session.execute_read(run_eigenvector_centrality)
    articlerank_results = session.execute_read(run_articlerank_centrality)
    #label_propagation_results = session.execute_read(run_label_propagation_algorithm)
    leiden_results = session.execute_read(run_leiden_algorithm)
    #louvain_results = session.execute_read(run_louvain_algorithm)

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `gds.louvain.stream`: Caused by: java.lang.OutOfMemoryError: Java heap space}

In [13]:
pagerank_df = pd.DataFrame(pagerank_results, columns=['name', 'id', 'PageRank'])
degree_df = pd.DataFrame(degree_results, columns=['name', 'id', 'DegreeCentrality'])
eigenvector_df = pd.DataFrame(eigenvector_results, columns=['name', 'id', 'EigenvectorCentrality'])
articlerank_df = pd.DataFrame(articlerank_results, columns=['name', 'id', 'ArticleRank'])
#label_propagation_df = pd.DataFrame(label_propagation_results, columns=['name', 'id', 'LabelPropagation'])
leiden_df = pd.DataFrame(leiden_results, columns=['name', 'id', 'Leiden'])
#louvain_df = pd.DataFrame(louvain_results, columns=['name', 'id', 'Louvain'])

In [None]:
pagerank_df = pagerank_df.groupby("name").mean().reset_index()

In [65]:
degree_df = degree_df.groupby("name").mean().reset_index()
eigenvector_df = eigenvector_df.groupby("name").mean().reset_index()
articlerank_df = articlerank_df.groupby("name").mean().reset_index()
eigenvector_df = eigenvector_df.groupby("name").mean().reset_index()
leiden_df = leiden_df.groupby("name").first().reset_index()
louvain_df = louvain_df.groupby("name").first().reset_index()

In [14]:
#merged_df = pagerank_df
#for df in [degree_df, eigenvector_df, articlerank_df, leiden_df, louvain_df]:
#    merged_df = pd.merge(merged_df, df, on=['id', 'name'], how='left')

merged_df = pagerank_df
for df in [degree_df, eigenvector_df, articlerank_df, leiden_df]:
    merged_df = pd.merge(merged_df, df, on=['id', 'name'], how='left')

In [15]:
df = merged_df.rename(columns={'name' : 'SpecID'})
df

Unnamed: 0,SpecID,id,PageRank,DegreeCentrality,EigenvectorCentrality,ArticleRank,Leiden
0,210114-1-36,355802,4.021481,114.216307,0.018755,0.174456,2450
1,210414-1-17,5422904,4.014793,114.483616,0.018796,0.174505,2531
2,210303-1-38,2205573,3.818242,102.728080,0.016931,0.172002,1491
3,210114-1-35,353177,3.809055,106.912309,0.017554,0.172916,2449
4,210120-2-22,714166,3.791822,103.517365,0.017031,0.172177,2301
...,...,...,...,...,...,...,...
65218,210504-2-08,6386143,0.437653,6.960308,0.001152,0.151498,271
65219,210331-1-47,4338749,0.434173,6.907642,0.001142,0.151487,271
65220,210324-2-00,3818844,0.433897,6.998643,0.001156,0.151507,271
65221,210504-2-02,6370334,0.426986,6.850804,0.001134,0.151474,271


In [87]:
with driver.session() as session:
    fastRP_results = session.execute_read(run_fastRP_algorithm)

fastRP_df = pd.DataFrame(fastRP_results, columns=['SpecID', 'id', 'embeddings'])

# Expand the embeddings list into separate columns
embeddings_df = pd.DataFrame(fastRP_df['embeddings'].tolist(), index=fastRP_df.index)

# Optionally, rename the new columns
embeddings_df.columns = [f'embedding_{i}' for i in range(embeddings_df.shape[1])]

# Join the new embeddings columns to the original DataFrame
fastRP_df = pd.concat([fastRP_df.drop(['embeddings'], axis=1), embeddings_df], axis=1)
fastRP_df.to_csv('../../data/fastRP_embeddings.csv', index=False)
fastRP_df.head()

Unnamed: 0,SpecID,id,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,...,embedding_118,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127
0,210421-1-12,5015747,-0.083897,-0.088126,0.129636,-0.011458,0.012046,0.454714,0.077193,-0.334761,...,0.130383,0.109211,0.468703,0.036231,-0.154496,-0.278995,-0.121792,0.054464,0.032787,0.120078
1,210421-1-12,5015823,-0.029143,-0.071627,0.080513,-0.012518,0.015735,0.436146,0.04264,-0.319604,...,0.139672,0.073673,0.481912,0.017425,-0.146291,-0.332198,-0.125615,0.08513,0.013962,0.122058
2,210421-1-12,5015933,-0.065649,-0.076738,0.115101,0.002256,-0.012525,0.466665,0.065633,-0.347313,...,0.131187,0.092518,0.482005,0.030858,-0.157134,-0.297612,-0.115741,0.072406,0.032348,0.151767
3,210421-1-12,5016052,-0.068021,-0.082257,0.108146,0.034965,-0.018199,0.449365,0.067154,-0.335976,...,0.124901,0.103151,0.493451,0.081179,-0.147588,-0.300893,-0.079562,0.07862,0.037989,0.144762
4,210421-1-12,5016132,-0.072639,-0.068135,0.106439,0.012042,0.013381,0.447809,0.037788,-0.345055,...,0.132869,0.094144,0.476964,0.034397,-0.164034,-0.30594,-0.097593,0.076277,0.031379,0.142097


In [88]:
fastRP_df

Unnamed: 0,SpecID,id,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,...,embedding_118,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127
0,210421-1-12,5015747,-0.083897,-0.088126,0.129636,-0.011458,0.012046,0.454714,0.077193,-0.334761,...,0.130383,0.109211,0.468703,0.036231,-0.154496,-0.278995,-0.121792,0.054464,0.032787,0.120078
1,210421-1-12,5015823,-0.029143,-0.071627,0.080513,-0.012518,0.015735,0.436146,0.042640,-0.319604,...,0.139672,0.073673,0.481912,0.017425,-0.146291,-0.332198,-0.125615,0.085130,0.013962,0.122058
2,210421-1-12,5015933,-0.065649,-0.076738,0.115101,0.002256,-0.012525,0.466665,0.065633,-0.347313,...,0.131187,0.092518,0.482005,0.030858,-0.157134,-0.297612,-0.115741,0.072406,0.032348,0.151767
3,210421-1-12,5016052,-0.068021,-0.082257,0.108146,0.034965,-0.018199,0.449365,0.067154,-0.335976,...,0.124901,0.103151,0.493451,0.081179,-0.147588,-0.300893,-0.079562,0.078620,0.037989,0.144762
4,210421-1-12,5016132,-0.072639,-0.068135,0.106439,0.012042,0.013381,0.447809,0.037788,-0.345055,...,0.132869,0.094144,0.476964,0.034397,-0.164034,-0.305940,-0.097593,0.076277,0.031379,0.142097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65218,210421-1-12,5015144,-0.013738,-0.084442,0.112246,0.065560,0.024290,0.481016,0.050764,-0.305816,...,0.104606,0.153296,0.439762,0.060891,-0.089327,-0.310392,-0.174691,0.069514,0.016734,0.146203
65219,210421-1-12,5015202,0.008089,-0.078856,0.102037,0.073806,0.028963,0.487772,0.035117,-0.293706,...,0.093562,0.160191,0.437551,0.054878,-0.075073,-0.316523,-0.188039,0.071264,0.025373,0.145364
65220,210421-1-12,5015253,-0.108615,-0.072524,0.140635,-0.017185,-0.018298,0.459807,0.128565,-0.314512,...,0.138734,0.111135,0.400943,0.052551,-0.155923,-0.295256,-0.128258,0.063318,-0.008409,0.131710
65221,210421-1-12,5015372,-0.020248,-0.090490,0.106660,-0.033297,-0.046091,0.417534,0.048086,-0.315869,...,0.089134,0.114127,0.521180,0.090410,-0.072637,-0.320001,-0.020104,0.001187,0.079467,0.187885


Delete the projection

In [87]:
def delete_projection(tx):
    query = """
    CALL gds.graph.drop('myGraph')
    """
    tx.run(query)

# Use a session to execute the graph projection
with driver.session() as session:
    session.execute_write(delete_projection)

In [88]:
close_driver()

In [94]:
status_df

Unnamed: 0,SpecID,WaveNumber:float,Status:LABEL,Absorbance:float,GridSlot,Node_ID:ID,OriginalWaveNumber:float
0,201210-1-00,0.126141,Normal,0.852790,"(5, 34)",76,251.93622
1,201210-1-00,0.188070,Normal,0.018559,"(7, 0)",257,375.62643
2,201210-1-00,0.211679,Normal,-0.032742,"(8, -1)",326,422.77905
3,201210-1-00,0.232208,Normal,0.088690,"(9, 3)",386,463.78131
4,201210-1-00,0.266766,Normal,0.100782,"(10, 4)",487,532.80182
...,...,...,...,...,...,...,...
65218,210526-3-49,0.810447,Hyperglycemia,0.147085,"(32, 5)",8009841,1618.67880
65219,210526-3-49,0.842610,Hyperglycemia,0.074500,"(33, 2)",8009935,1682.91580
65220,210526-3-49,0.865192,Hyperglycemia,0.058155,"(34, 2)",8010001,1728.01820
65221,210526-3-49,0.930543,Hyperglycemia,0.111614,"(37, 4)",8010192,1858.54210


In [95]:
fastRP_df

Unnamed: 0,SpecID,id,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,...,embedding_118,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127
0,210421-1-12,5015747,-0.083897,-0.088126,0.129636,-0.011458,0.012046,0.454714,0.077193,-0.334761,...,0.130383,0.109211,0.468703,0.036231,-0.154496,-0.278995,-0.121792,0.054464,0.032787,0.120078
1,210421-1-12,5015823,-0.029143,-0.071627,0.080513,-0.012518,0.015735,0.436146,0.042640,-0.319604,...,0.139672,0.073673,0.481912,0.017425,-0.146291,-0.332198,-0.125615,0.085130,0.013962,0.122058
2,210421-1-12,5015933,-0.065649,-0.076738,0.115101,0.002256,-0.012525,0.466665,0.065633,-0.347313,...,0.131187,0.092518,0.482005,0.030858,-0.157134,-0.297612,-0.115741,0.072406,0.032348,0.151767
3,210421-1-12,5016052,-0.068021,-0.082257,0.108146,0.034965,-0.018199,0.449365,0.067154,-0.335976,...,0.124901,0.103151,0.493451,0.081179,-0.147588,-0.300893,-0.079562,0.078620,0.037989,0.144762
4,210421-1-12,5016132,-0.072639,-0.068135,0.106439,0.012042,0.013381,0.447809,0.037788,-0.345055,...,0.132869,0.094144,0.476964,0.034397,-0.164034,-0.305940,-0.097593,0.076277,0.031379,0.142097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65218,210421-1-12,5015144,-0.013738,-0.084442,0.112246,0.065560,0.024290,0.481016,0.050764,-0.305816,...,0.104606,0.153296,0.439762,0.060891,-0.089327,-0.310392,-0.174691,0.069514,0.016734,0.146203
65219,210421-1-12,5015202,0.008089,-0.078856,0.102037,0.073806,0.028963,0.487772,0.035117,-0.293706,...,0.093562,0.160191,0.437551,0.054878,-0.075073,-0.316523,-0.188039,0.071264,0.025373,0.145364
65220,210421-1-12,5015253,-0.108615,-0.072524,0.140635,-0.017185,-0.018298,0.459807,0.128565,-0.314512,...,0.138734,0.111135,0.400943,0.052551,-0.155923,-0.295256,-0.128258,0.063318,-0.008409,0.131710
65221,210421-1-12,5015372,-0.020248,-0.090490,0.106660,-0.033297,-0.046091,0.417534,0.048086,-0.315869,...,0.089134,0.114127,0.521180,0.090410,-0.072637,-0.320001,-0.020104,0.001187,0.079467,0.187885


In [46]:
status_df = pd.read_csv('../../data/graph_1_7/nodes.csv')

In [97]:
df['id'] = df['id'].astype(int)

In [99]:
fastRP_df['id'] = fastRP_df['id'].astype(int)

In [62]:
graph_df = pd.merge(df, status_df, left_on="id", right_on="Node_ID:ID", how="inner")
graph_df.drop(columns=["Node_ID:ID"], inplace=True)

In [49]:
graph_df = pd.merge(df, status_df, left_on="SpecID", right_on="SpecID", how="inner")
graph_df.drop(columns=["SpecID"], inplace=True)

In [176]:
graph_df = pd.merge(fastRP_df, status_df, left_on="id", right_on="Node_ID:ID", how="inner")
graph_df.drop(columns=["Node_ID:ID"], inplace=True)

In [177]:
graph_df.rename(columns={"Status:LABEL":"Status"}, inplace=True)
graph_df.drop(columns=['SpecID_y', 'WaveNumber:float'], inplace=True)
graph_df.rename(columns={'SpecID_x':'SpecID', 'OriginalWaveNumber:float':'WaveNumber'}, inplace=True)
graph_df.drop(columns=['Absorbance:float', 'GridSlot', 'WaveNumber'], inplace=True)

In [178]:
graph_df

Unnamed: 0,SpecID,id,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,...,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127,Status
0,210421-1-12,5015747,-0.083897,-0.088126,0.129636,-0.011458,0.012046,0.454714,0.077193,-0.334761,...,0.109211,0.468703,0.036231,-0.154496,-0.278995,-0.121792,0.054464,0.032787,0.120078,Normal
1,210421-1-12,5015823,-0.029143,-0.071627,0.080513,-0.012518,0.015735,0.436146,0.042640,-0.319604,...,0.073673,0.481912,0.017425,-0.146291,-0.332198,-0.125615,0.085130,0.013962,0.122058,Normal
2,210421-1-12,5015933,-0.065649,-0.076738,0.115101,0.002256,-0.012525,0.466665,0.065633,-0.347313,...,0.092518,0.482005,0.030858,-0.157134,-0.297612,-0.115741,0.072406,0.032348,0.151767,Normal
3,210421-1-12,5016052,-0.068021,-0.082257,0.108146,0.034965,-0.018199,0.449365,0.067154,-0.335976,...,0.103151,0.493451,0.081179,-0.147588,-0.300893,-0.079562,0.078620,0.037989,0.144762,Normal
4,210421-1-12,5016132,-0.072639,-0.068135,0.106439,0.012042,0.013381,0.447809,0.037788,-0.345055,...,0.094144,0.476964,0.034397,-0.164034,-0.305940,-0.097593,0.076277,0.031379,0.142097,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65218,210421-1-12,5015144,-0.013738,-0.084442,0.112246,0.065560,0.024290,0.481016,0.050764,-0.305816,...,0.153296,0.439762,0.060891,-0.089327,-0.310392,-0.174691,0.069514,0.016734,0.146203,Normal
65219,210421-1-12,5015202,0.008089,-0.078856,0.102037,0.073806,0.028963,0.487772,0.035117,-0.293706,...,0.160191,0.437551,0.054878,-0.075073,-0.316523,-0.188039,0.071264,0.025373,0.145364,Normal
65220,210421-1-12,5015253,-0.108615,-0.072524,0.140635,-0.017185,-0.018298,0.459807,0.128565,-0.314512,...,0.111135,0.400943,0.052551,-0.155923,-0.295256,-0.128258,0.063318,-0.008409,0.131710,Normal
65221,210421-1-12,5015372,-0.020248,-0.090490,0.106660,-0.033297,-0.046091,0.417534,0.048086,-0.315869,...,0.114127,0.521180,0.090410,-0.072637,-0.320001,-0.020104,0.001187,0.079467,0.187885,Normal


In [29]:
# Define a function to calculate the bin for a given wavenumber with a specified bin size
def calculate_bin_interval(wavenumber, bin_size):
    bin_start = int((wavenumber - 200) / bin_size) * bin_size + 200
    bin_end = bin_start + bin_size
    return f"{bin_start}-{bin_end}"

# Set the bin size
bin_size = 25

# Add a "Bin" column to the DataFrame
graph_df['Bin'] = graph_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))

In [30]:
# Pivot table with 'Absorbance', 'PeakWidths', and 'PeakProminences' as values
#peak_bins = graph_df.pivot_table(index='SpecID', columns='Bin', values=['PageRank', 'DegreeCentrality', 'EigenvectorCentrality', 'ArticleRank', 'Leiden', 'Louvain'], aggfunc='mean')
peak_bins = graph_df.pivot_table(index='SpecID', columns='Bin', values=['PageRank', 'DegreeCentrality', 'EigenvectorCentrality', 'ArticleRank', 'Leiden'], aggfunc='mean')
peak_bins.columns = [f"{col[0]}_{col[1]}" for col in peak_bins.columns]  # Combine column names
peak_bins.reset_index(inplace=True)

# Merge with 'Status' information
statuses = graph_df[['SpecID', 'Status']].drop_duplicates()
peak_bins = pd.merge(peak_bins, statuses, on='SpecID')

# Set 'SpecID' as the index
peak_bins.set_index('SpecID', inplace=True)

# Fill NaN values with False
peak_bins.fillna(False, inplace=True)

  peak_bins.fillna(False, inplace=True)


In [31]:
peak_bins

Unnamed: 0_level_0,ArticleRank_1000-1025,ArticleRank_1025-1050,ArticleRank_1050-1075,ArticleRank_1075-1100,ArticleRank_1100-1125,ArticleRank_1125-1150,ArticleRank_1150-1175,ArticleRank_1175-1200,ArticleRank_1200-1225,ArticleRank_1225-1250,...,PageRank_775-800,PageRank_800-825,PageRank_825-850,PageRank_850-875,PageRank_875-900,PageRank_900-925,PageRank_925-950,PageRank_950-975,PageRank_975-1000,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,0.153244,False,0.152988,False,False,0.154166,False,0.155169,False,0.154143,...,False,0.982512,False,0.791774,False,0.680668,False,False,False,Normal
201210-1-01,0.153407,False,0.153162,False,False,0.153907,False,0.154375,False,0.153797,...,False,False,False,0.771427,0.906656,False,0.677312,False,False,Normal
201210-1-02,0.153581,0.154186,False,0.154596,False,0.153665,False,False,False,0.154342,...,False,False,False,False,False,1.567248,False,False,False,Normal
201210-1-03,False,False,False,False,False,False,0.152369,False,False,False,...,False,False,False,False,False,0.509197,False,0.68257,False,Normal
201210-1-04,0.153863,False,0.153966,False,False,False,False,0.153768,False,False,...,0.855642,False,1.033631,0.850927,False,0.901168,False,False,False,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,0.15371,False,False,False,False,0.153264,False,False,0.153429,False,...,0.92356,False,False,0.800594,False,False,False,False,False,Hyperglycemia
210526-3-46,0.153813,False,False,0.156754,False,0.15398,0.154929,False,False,0.153903,...,False,False,0.746308,False,0.745403,0.771761,False,False,False,Hyperglycemia
210526-3-47,0.153917,False,0.155041,False,0.153946,False,False,False,0.156267,0.15376,...,1.22049,False,False,0.76691,False,0.691066,False,0.728195,False,Hyperglycemia
210526-3-48,0.153276,False,False,0.154361,False,0.153173,False,False,0.153889,False,...,0.913872,False,0.771541,False,0.814756,0.767688,False,0.838691,False,Hyperglycemia


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Splitting the dataframe into features (X) and target variable (y)
X = peak_bins.drop(['Status'], axis=1)
y = peak_bins['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:
    
    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    
    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.6614 +/- 0.0251
ExtraTreesClassifier Cross-Validation Accuracy: 0.6607 +/- 0.0312


In [42]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=1234)
rf_model.fit(X_train, y_train)

# Extra Trees model
et_model = ExtraTreesClassifier(n_estimators=100, random_state=1234)
et_model.fit(X_train, y_train)

# Predictions
rf_predictions = rf_model.predict(X_test)
et_predictions = et_model.predict(X_test)

# Evaluate accuracy
rf_accuracy = accuracy_score(y_test, rf_predictions)
et_accuracy = accuracy_score(y_test, et_predictions)

In [43]:
calculate_metrics(y_test, rf_predictions)

Overall Accuracy: 0.6486042692939245


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.67      0.60      0.63       203
 Hypoglycemia       0.64      0.67      0.65       200
       Normal       0.64      0.68      0.66       206

     accuracy                           0.65       609
    macro avg       0.65      0.65      0.65       609
 weighted avg       0.65      0.65      0.65       609


Confusion Matrix:
[[122  31  50]
 [ 39 133  28]
 [ 21  45 140]]


In [44]:
calculate_metrics(y_test, et_predictions)

Overall Accuracy: 0.6436781609195402


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.72      0.58      0.64       203
 Hypoglycemia       0.60      0.70      0.65       200
       Normal       0.63      0.65      0.64       206

     accuracy                           0.64       609
    macro avg       0.65      0.64      0.64       609
 weighted avg       0.65      0.64      0.64       609


Confusion Matrix:
[[118  39  46]
 [ 28 140  32]
 [ 18  54 134]]


In [181]:
graph_df.drop(columns=['id'], inplace=True)
graph_df = graph_df.groupby(['SpecID', 'Status']).max().reset_index()

In [182]:
graph_df

Unnamed: 0,SpecID,Status,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,...,embedding_118,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127
0,201210-1-00,Normal,0.409790,-0.095152,-0.258437,-0.090914,0.169929,-0.331700,-0.079243,-0.140690,...,0.241945,-0.003233,0.062672,-0.104219,0.075871,0.045382,0.042710,0.183797,-0.457323,-0.206323
1,201210-1-01,Normal,0.462761,-0.041611,-0.116416,-0.085614,0.094492,-0.084112,0.025095,0.353951,...,0.033884,0.014642,-0.294941,0.262950,0.174502,0.020558,-0.038236,-0.109635,0.188239,0.078873
2,201210-1-02,Normal,-0.126493,0.120699,-0.259624,0.118807,0.219271,-0.033150,0.006986,0.106648,...,0.131455,0.115707,0.306810,0.098764,0.179321,0.165603,0.204185,0.187192,0.301455,-0.050529
3,201210-1-03,Normal,-0.113795,-0.109296,0.190316,0.049774,-0.098691,-0.136490,0.026262,-0.272153,...,0.173264,0.024008,-0.142276,0.256540,-0.237659,0.071721,-0.206544,0.039240,0.192381,-0.006646
4,201210-1-04,Normal,0.089898,-0.198704,-0.078786,-0.123567,0.411126,0.217254,0.262497,0.138253,...,-0.085165,-0.322281,-0.062971,-0.014023,-0.045492,0.149446,0.078786,-0.028270,-0.003088,-0.277523
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3040,210526-3-45,Hyperglycemia,0.221217,0.225263,0.089135,-0.375069,0.208700,0.078060,-0.184773,-0.199067,...,0.089781,-0.043855,-0.269609,-0.288412,0.182395,-0.114441,-0.322940,0.073619,0.133897,0.048475
3041,210526-3-46,Hyperglycemia,0.043268,-0.172677,-0.119328,-0.196907,0.341100,-0.078425,-0.174734,0.304696,...,0.013648,0.075949,0.094076,0.284334,0.026128,-0.132205,0.127972,0.055021,-0.284999,-0.022013
3042,210526-3-47,Hyperglycemia,-0.003306,0.096393,0.097831,0.291358,-0.160119,0.123625,-0.008911,0.169901,...,-0.025432,0.240909,-0.026445,0.023795,0.178017,0.473302,0.071724,0.003903,0.154816,-0.003512
3043,210526-3-48,Hyperglycemia,-0.063648,-0.061418,-0.140955,0.312578,0.260706,-0.046204,0.059897,0.141903,...,-0.002642,-0.105517,0.078277,0.276146,0.174825,-0.071319,0.246788,0.107247,0.205819,0.014434


In [183]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming graph_df is your DataFrame

# Drop the SpecID column
X = graph_df.drop(columns=['SpecID', 'Status'])

# Target variable
y = graph_df['Status']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=1234)
rf_model.fit(X_train, y_train)

# Extra Trees model
et_model = ExtraTreesClassifier(n_estimators=100, random_state=1234)
et_model.fit(X_train, y_train)

# Predictions
rf_predictions = rf_model.predict(X_test)
et_predictions = et_model.predict(X_test)

# Evaluate accuracy
rf_accuracy = accuracy_score(y_test, rf_predictions)
et_accuracy = accuracy_score(y_test, et_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Extra Trees Accuracy:", et_accuracy)


Random Forest Accuracy: 0.3448275862068966
Extra Trees Accuracy: 0.35960591133004927


In [123]:
rf_predictions

array(['Hypoglycemia', 'Hyperglycemia', 'Hyperglycemia', ..., 'Normal',
       'Hyperglycemia', 'Hyperglycemia'], dtype=object)

In [132]:
def calculate_metrics(y_test, y_pred):

    # Calculate overall accuracy
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_accuracy}\n")

    # Calculate precision, recall, and F1-score for each class
    report = classification_report(y_test, y_pred)
    print("\nClassification Report:")
    print(report)

    # Show the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(conf_matrix)

In [133]:
def get_feature_importances(model, X):

    # Get feature importances
    feature_importances = model.feature_importances_

    # Creating a DataFrame to display feature importances
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

    # Sorting the DataFrame by importance in descending order
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    # Show the top 10 most important features
    top_10_features = feature_importance_df.head(25)

    return top_10_features

In [179]:
calculate_metrics(y_test, rf_predictions)

Overall Accuracy: 0.3399014778325123


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.37      0.20      0.26       194
 Hypoglycemia       0.31      0.49      0.38       193
       Normal       0.36      0.33      0.35       222

     accuracy                           0.34       609
    macro avg       0.35      0.34      0.33       609
 weighted avg       0.35      0.34      0.33       609


Confusion Matrix:
[[ 38  92  64]
 [ 31  95  67]
 [ 33 115  74]]


In [180]:
calculate_metrics(y_test, et_predictions)

Overall Accuracy: 0.3530377668308703


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.38      0.16      0.22       194
 Hypoglycemia       0.34      0.54      0.41       193
       Normal       0.37      0.36      0.36       222

     accuracy                           0.35       609
    macro avg       0.36      0.35      0.33       609
 weighted avg       0.36      0.35      0.34       609


Confusion Matrix:
[[ 31  90  73]
 [ 25 104  64]
 [ 26 116  80]]


In [136]:
get_feature_importances(rf_model, X)

Unnamed: 0,Feature,Importance
86,embedding_86,0.009065
63,embedding_63,0.008861
84,embedding_84,0.008758
125,embedding_125,0.008741
79,embedding_79,0.008651
26,embedding_26,0.00863
110,embedding_110,0.008596
97,embedding_97,0.008592
70,embedding_70,0.008515
111,embedding_111,0.008485


In [137]:
get_feature_importances(et_model, X)

Unnamed: 0,Feature,Importance
110,embedding_110,0.008557
20,embedding_20,0.008436
117,embedding_117,0.008277
43,embedding_43,0.008205
19,embedding_19,0.008194
111,embedding_111,0.008175
75,embedding_75,0.00817
34,embedding_34,0.008149
8,embedding_8,0.008142
50,embedding_50,0.008141


In [138]:
graph_df

Unnamed: 0,SpecID,id,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,...,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127,Status
0,210421-1-12,5015747,-0.083897,-0.088126,0.129636,-0.011458,0.012046,0.454714,0.077193,-0.334761,...,0.109211,0.468703,0.036231,-0.154496,-0.278995,-0.121792,0.054464,0.032787,0.120078,Normal
1,210421-1-12,5015823,-0.029143,-0.071627,0.080513,-0.012518,0.015735,0.436146,0.042640,-0.319604,...,0.073673,0.481912,0.017425,-0.146291,-0.332198,-0.125615,0.085130,0.013962,0.122058,Normal
2,210421-1-12,5015933,-0.065649,-0.076738,0.115101,0.002256,-0.012525,0.466665,0.065633,-0.347313,...,0.092518,0.482005,0.030858,-0.157134,-0.297612,-0.115741,0.072406,0.032348,0.151767,Normal
3,210421-1-12,5016052,-0.068021,-0.082257,0.108146,0.034965,-0.018199,0.449365,0.067154,-0.335976,...,0.103151,0.493451,0.081179,-0.147588,-0.300893,-0.079562,0.078620,0.037989,0.144762,Normal
4,210421-1-12,5016132,-0.072639,-0.068135,0.106439,0.012042,0.013381,0.447809,0.037788,-0.345055,...,0.094144,0.476964,0.034397,-0.164034,-0.305940,-0.097593,0.076277,0.031379,0.142097,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65218,210421-1-12,5015144,-0.013738,-0.084442,0.112246,0.065560,0.024290,0.481016,0.050764,-0.305816,...,0.153296,0.439762,0.060891,-0.089327,-0.310392,-0.174691,0.069514,0.016734,0.146203,Normal
65219,210421-1-12,5015202,0.008089,-0.078856,0.102037,0.073806,0.028963,0.487772,0.035117,-0.293706,...,0.160191,0.437551,0.054878,-0.075073,-0.316523,-0.188039,0.071264,0.025373,0.145364,Normal
65220,210421-1-12,5015253,-0.108615,-0.072524,0.140635,-0.017185,-0.018298,0.459807,0.128565,-0.314512,...,0.111135,0.400943,0.052551,-0.155923,-0.295256,-0.128258,0.063318,-0.008409,0.131710,Normal
65221,210421-1-12,5015372,-0.020248,-0.090490,0.106660,-0.033297,-0.046091,0.417534,0.048086,-0.315869,...,0.114127,0.521180,0.090410,-0.072637,-0.320001,-0.020104,0.001187,0.079467,0.187885,Normal


In [139]:
unique_values = graph_df['SpecID'].unique().tolist()

In [140]:
train, test = train_test_split(unique_values, test_size=0.2, random_state=1234)

In [141]:
train_df = graph_df[graph_df['SpecID'].isin(train)]
test_df = graph_df[graph_df['SpecID'].isin(test)]

In [143]:
value_counts = test_df['Status'].value_counts()

print(value_counts)

Status
Normal           4732
Hypoglycemia     4445
Hyperglycemia    3718
Name: count, dtype: int64


In [144]:
# Drop the SpecID column
X_train = train_df.drop(columns=['SpecID', 'Status', 'id'])

# Target variable
y_train= train_df['Status']

# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=1234)
rf_model.fit(X_train, y_train)

# Extra Trees model
et_model = ExtraTreesClassifier(n_estimators=100, random_state=1234)
et_model.fit(X_train, y_train)

In [154]:
def ensemble_pred(model, X_test):
    prediction = model.predict(X_test)
    return prediction


In [161]:
X_test = test_df.drop(columns=['SpecID', 'Status', 'id'])
y_test = test_df['Status']

In [165]:
result = ensemble_pred(et_model, X_test)

In [166]:
accuracy = accuracy_score(y_test, result)

In [167]:
accuracy

0.3433113609926328

In [148]:
result_df = result.join(y_test)

In [149]:
result_df

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127,predictions,SpecID,id,Status
187,0.074743,-0.415970,0.613765,0.201123,0.133562,-0.116933,0.028692,0.173704,0.211737,0.150171,...,-0.010628,0.047397,0.000789,0.114177,-0.126641,0.086094,Hypoglycemia,210421-1-21,5040987,Normal
188,0.076432,-0.396298,0.595790,0.222759,0.138462,-0.117403,0.002030,0.190719,0.212822,0.170548,...,-0.020865,0.050474,0.023122,0.085597,-0.134768,0.110571,Hypoglycemia,210421-1-21,5041132,Normal
189,0.055294,-0.402428,0.595596,0.226997,0.135008,-0.124002,0.006198,0.169250,0.211475,0.173995,...,-0.021711,0.040263,0.037192,0.109837,-0.129213,0.118357,Hypoglycemia,210421-1-21,5041203,Normal
190,0.074283,-0.398744,0.596125,0.215075,0.152756,-0.124699,0.001297,0.196083,0.200662,0.164035,...,-0.018757,0.042078,0.028636,0.097731,-0.130284,0.107098,Hypoglycemia,210421-1-21,5041296,Normal
191,0.100249,-0.390776,0.581041,0.232668,0.130775,-0.092609,-0.049520,0.194864,0.224587,0.157190,...,-0.030285,0.064996,0.093184,0.048962,-0.170189,0.085308,Hypoglycemia,210421-1-21,5041356,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65161,0.100057,0.040536,-0.255624,0.010198,-0.221306,-0.424600,-0.172966,0.014593,-0.000857,-0.292851,...,-0.161723,0.213540,0.258775,-0.075005,0.016291,0.066482,Normal,210421-1-09,5135055,Normal
65162,0.105455,0.042327,-0.253205,0.004443,-0.210196,-0.427927,-0.179131,0.008624,-0.001742,-0.297020,...,-0.162230,0.216889,0.259210,-0.083966,0.011858,0.077052,Normal,210421-1-09,5135269,Normal
65163,0.107495,0.036649,-0.251965,0.003877,-0.213531,-0.431670,-0.180962,0.011332,0.000449,-0.298128,...,-0.164517,0.212953,0.260288,-0.081891,0.013264,0.069571,Normal,210421-1-09,5135355,Normal
65164,0.104413,0.041694,-0.252355,0.005855,-0.213241,-0.431924,-0.177525,0.012054,-0.001264,-0.298398,...,-0.161550,0.217441,0.262780,-0.081620,0.012481,0.072131,Normal,210421-1-09,5135421,Normal


In [150]:
mode_df = result_df.groupby("SpecID")[["predictions", "Status"]].agg(lambda x: x.mode().iloc[0])


In [151]:
mode_df

Unnamed: 0_level_0,predictions,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1
201210-1-01,Hypoglycemia,Normal
201210-1-02,Normal,Normal
201210-1-12,Normal,Normal
201210-1-22,Normal,Normal
201210-1-34,Normal,Normal
...,...,...
210526-3-27,Hypoglycemia,Hyperglycemia
210526-3-34,Hypoglycemia,Hyperglycemia
210526-3-40,Normal,Hyperglycemia
210526-3-45,Hyperglycemia,Hyperglycemia


In [152]:
# Calculate the number of matching predictions and statuses
matching_values = (mode_df['predictions'] == mode_df['Status']).sum()

# Calculate the total number of rows
total_rows = len(mode_df)

# Calculate the percentage of matching predictions and statuses
percentage_matching = (matching_values / total_rows) * 100

In [153]:
percentage_matching

36.7816091954023