#### This Version Creates the Graph in Neo4J then runs the Queries.

In [1]:
import math
import os
import shutil
import subprocess

import pandas as pd
from neo4j import GraphDatabase

This copies the nodes and relationships files from data/graph_2_2 into the DBMS import folder.

At the moment this uses local paths, but the rest should be automated.

In [3]:
# Folder that holds the exported graph CSV files
source_dir = os.path.abspath('../../data/graph_2_2')

# Import folder of the target Neo4j DBMS (machine-specific local path)
destination_dir = 'C:/Users/stang/.Neo4jDesktop/relate-data/dbmss/dbms-099935b8-400b-4b6a-9a81-2c0bfbafffff/import'

# Files handed to the admin import
filenames = ['nodes.csv', 'relationships.csv']

# Copy (not move) every file into the import folder; the originals stay in place
for filename in filenames:
    source_path, destination_path = (
        os.path.join(folder, filename) for folder in (source_dir, destination_dir)
    )
    shutil.copy(source_path, destination_path)

Then run the admin import command.

In [5]:
# Directory the import command is launched from (machine-specific local path)
working_dir = 'C:/Users/stang/.Neo4jDesktop/relate-data/dbmss/dbms-099935b8-400b-4b6a-9a81-2c0bfbafffff/bin'

# Full (offline) import of the node and relationship CSV files into the `neo4j` database
command = 'neo4j-admin database import full --nodes=import/nodes.csv --relationships=import/relationships.csv neo4j'

# Run through the shell from working_dir and capture both output streams as text
result = subprocess.run(command, shell=True, cwd=working_dir, capture_output=True, text=True)

# Report the outcome
if result.returncode == 0:
    print("Import successful")
    print(result.stdout)
else:
    print("Error during import:")
    print(f"Exit code: {result.returncode}")
    print(result.stderr)
    # The tool's log is captured on stdout, so show it as well: printing only stderr
    # can leave the failure reason invisible.
    if result.stdout:
        print(result.stdout)

Import successful
Neo4j version: 5.12.0
Importing the contents of these files into C:\Users\stang\.Neo4jDesktop\relate-data\dbmss\dbms-099935b8-400b-4b6a-9a81-2c0bfbafffff\data\databases\neo4j:
Nodes:
  C:\Users\stang\.Neo4jDesktop\relate-data\dbmss\dbms-099935b8-400b-4b6a-9a81-2c0bfbafffff\import\nodes.csv

Relationships:
  C:\Users\stang\.Neo4jDesktop\relate-data\dbmss\dbms-099935b8-400b-4b6a-9a81-2c0bfbafffff\import\relationships.csv


Available resources:
  Total machine memory: 31.86GiB
  Free machine memory: 24.79GiB
  Max heap memory : 910.5MiB
  Max worker threads: 8
  Configured max memory: 21.58GiB
  High parallel IO: true

Cypher type normalization is enabled (disable with --normalize-types=false):
  Property type of 'WaveNumber' normalized from 'float' --> 'double' in C:\Users\stang\.Neo4jDesktop\relate-data\dbmss\dbms-099935b8-400b-4b6a-9a81-2c0bfbafffff\import\nodes.csv
  Property type of 'Absorbance' normalized from 'float' --> 'double' in C:\Users\stang\.Neo4jDesktop\re

Run and connect to the Neo4j Database

In [6]:
from neo4j import GraphDatabase

uri = "neo4j://localhost:7687"
username = "neo4j"              # Neo4J username

# Read the password from the environment so the credential is never stored in the notebook
password = os.environ.get('NEO4J_Password')
if not password:
    raise RuntimeError("Set the NEO4J_Password environment variable before connecting to Neo4j")

# Create a driver instance
driver = GraphDatabase.driver(uri, auth=(username, password))

# Ensure you close the driver connection when your program ends
def close_driver():
    """Close the module-level Neo4j driver."""
    driver.close()

**First create the gds Graph Projection**

In [7]:
def project_graph(tx):
    """(Re)create the GDS in-memory projection named 'myGraph'.

    The projection covers the node labels 'Normal', 'Hyperglycemia' and
    'Hypoglycemia' and the relationship types SAMPLE and GRID, both projected
    as UNDIRECTED and carrying the 'DIST' relationship property.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        None.
    """
    # Drop a leftover projection first so the cell can be re-run: projecting onto an
    # existing graph name raises. With failIfMissing=false the drop is a no-op when
    # no such projection exists.
    tx.run("CALL gds.graph.drop('myGraph', false)")

    query = """
    CALL gds.graph.project(
      'myGraph', 
      ['Normal', 'Hyperglycemia', 'Hypoglycemia'],
      {
        SAMPLE: {
          orientation: 'UNDIRECTED',
          properties: 'DIST'
        },
        GRID: {
          orientation: 'UNDIRECTED',
          properties: 'DIST'
        }
      }
    )
    """
    tx.run(query)

# Create the projection inside a managed write transaction
with driver.session() as session:
    session.execute_write(project_graph)

Define the graph algorithms.

In [8]:
def run_pagerank_centrality(tx):
    """Stream PageRank scores (weighted by 'DIST') from the 'myGraph' projection.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        List of ``(name, id, score)`` tuples, one per returned record.
    """
    cypher = """
    CALL gds.pageRank.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    rows = []
    for record in tx.run(cypher):
        rows.append((record["name"], record["id"], record["score"]))
    return rows

In [9]:
def run_degree_centrality(tx):
    """Stream degree centrality scores (weighted by 'DIST') from the 'myGraph' projection.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        List of ``(name, id, score)`` tuples, one per returned record.
    """
    cypher = """
    CALL gds.degree.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    rows = []
    for record in tx.run(cypher):
        rows.append((record["name"], record["id"], record["score"]))
    return rows

In [10]:
def run_eigenvector_centrality(tx):
    """Stream eigenvector centrality scores (weighted by 'DIST') from the 'myGraph' projection.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        List of ``(name, id, score)`` tuples, one per returned record.
    """
    cypher = """
    CALL gds.eigenvector.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    rows = []
    for record in tx.run(cypher):
        rows.append((record["name"], record["id"], record["score"]))
    return rows

In [11]:
def run_articlerank_centrality(tx):
    """Stream ArticleRank scores (weighted by 'DIST') from the 'myGraph' projection.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        List of ``(name, id, score)`` tuples, one per returned record.
    """
    cypher = """
    CALL gds.articleRank.stream('myGraph', {
        relationshipWeightProperty: 'DIST'
    })
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, score
    ORDER BY score DESC, name ASC
    """
    rows = []
    for record in tx.run(cypher):
        rows.append((record["name"], record["id"], record["score"]))
    return rows

In [12]:
def run_label_propagation_algorithm(tx):
    """Stream label-propagation community ids (weighted by 'DIST') from 'myGraph'.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        List of ``(name, id, Community)`` tuples, one per returned record.
    """
    cypher = """
    CALL gds.labelPropagation.stream('myGraph', { relationshipWeightProperty: 'DIST' })
    YIELD nodeId, communityId AS Community
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, Community
    ORDER BY Community, name
    """
    rows = []
    for record in tx.run(cypher):
        rows.append((record["name"], record["id"], record["Community"]))
    return rows

In [13]:
def run_leiden_algorithm(tx):
    """Stream Leiden community ids (weighted by 'DIST') from the 'myGraph' projection.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        List of ``(name, id, Community)`` tuples, one per returned record.
    """
    cypher = """
    CALL gds.leiden.stream('myGraph', { relationshipWeightProperty: 'DIST' })
    YIELD nodeId, communityId AS Community
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, Community
    ORDER BY Community, name
    """
    rows = []
    for record in tx.run(cypher):
        rows.append((record["name"], record["id"], record["Community"]))
    return rows

In [14]:
def run_louvain_algorithm(tx):
    """Stream Louvain community ids (weighted by 'DIST') from the 'myGraph' projection.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        List of ``(name, id, Community)`` tuples, one per returned record.
    """
    cypher = """
    CALL gds.louvain.stream('myGraph', { relationshipWeightProperty: 'DIST' })
    YIELD nodeId, communityId AS Community
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, Community
    ORDER BY Community, name
    """
    rows = []
    for record in tx.run(cypher):
        rows.append((record["name"], record["id"], record["Community"]))
    return rows

In [15]:
def run_node2vec_algorithm(tx):
    """Stream node2vec embeddings from the 'myGraph' projection.

    The walk is weighted by 'DIST', the only relationship property the
    projection loads; the previous 'Weight' property is not part of it.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        List of ``(name, embedding)`` tuples, one per returned record.
    """
    query = """
    CALL gds.node2vec.stream('myGraph', { relationshipWeightProperty: 'DIST' })
    YIELD nodeId, embedding
    RETURN gds.util.asNode(nodeId).SpecID AS name, embedding
    """
    results = tx.run(query)
    return [(record["name"], record["embedding"]) for record in results]

In [17]:
def run_fastRP_algorithm(tx):
    """Stream 128-dimensional FastRP embeddings (weighted by 'DIST', seed 1234) from 'myGraph'.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        List of ``(name, id, embedding)`` tuples, one per returned record.
    """
    cypher = """
    CALL gds.fastRP.stream('myGraph',
        { relationshipWeightProperty: 'DIST',
         randomSeed:1234,
         embeddingDimension: 128
        }
    )
    YIELD nodeId, embedding
    RETURN gds.util.asNode(nodeId).SpecID AS name, gds.util.asNode(nodeId).Node_ID AS id, embedding
    """
    rows = []
    for record in tx.run(cypher):
        rows.append((record["name"], record["id"], record["embedding"]))
    return rows

Execute the algorithms and store each result in its own DataFrame.

In [18]:
# Run every centrality / community query in its own managed read transaction
with driver.session() as session:
    read = session.execute_read

    # Centrality measures
    pagerank_results = read(run_pagerank_centrality)
    degree_results = read(run_degree_centrality)
    eigenvector_results = read(run_eigenvector_centrality)
    articlerank_results = read(run_articlerank_centrality)

    # Community detection
    label_propagation_results = read(run_label_propagation_algorithm)
    leiden_results = read(run_leiden_algorithm)
    louvain_results = read(run_louvain_algorithm)

In [19]:
# Every query returns (name, id, value) tuples; only the value column differs per algorithm
id_columns = ['name', 'id']

pagerank_df = pd.DataFrame(pagerank_results, columns=id_columns + ['PageRank'])
degree_df = pd.DataFrame(degree_results, columns=id_columns + ['DegreeCentrality'])
eigenvector_df = pd.DataFrame(eigenvector_results, columns=id_columns + ['EigenvectorCentrality'])
articlerank_df = pd.DataFrame(articlerank_results, columns=id_columns + ['ArticleRank'])
label_propagation_df = pd.DataFrame(label_propagation_results, columns=id_columns + ['LabelPropagation'])
leiden_df = pd.DataFrame(leiden_results, columns=id_columns + ['Leiden'])
louvain_df = pd.DataFrame(louvain_results, columns=id_columns + ['Louvain'])

In [None]:
#pagerank_df = pagerank_df.groupby("name").mean().reset_index()

In [65]:
#degree_df = degree_df.groupby("name").mean().reset_index()
#eigenvector_df = eigenvector_df.groupby("name").mean().reset_index()
#articlerank_df = articlerank_df.groupby("name").mean().reset_index()
#eigenvector_df = eigenvector_df.groupby("name").mean().reset_index()
#leiden_df = leiden_df.groupby("name").first().reset_index()
#louvain_df = louvain_df.groupby("name").first().reset_index()

In [66]:
# Left-join every other metric onto the PageRank rows, matching on node id and name
merged_df = pagerank_df
for metric_df in (degree_df, eigenvector_df, articlerank_df, leiden_df, louvain_df):
    merged_df = merged_df.merge(metric_df, on=['id', 'name'], how='left')

In [67]:
# Expose the node name under the column label 'SpecID'
df = merged_df.rename({'name': 'SpecID'}, axis='columns')
df

Unnamed: 0,SpecID,id,PageRank,DegreeCentrality,EigenvectorCentrality,ArticleRank,Leiden,Louvain
0,210304-2-31,2009001,1.853073,811.456929,0.008884,0.336744,95,49713
1,210504-1-45,4929244,1.811661,1017.602961,0.009206,0.372945,193,51944
2,210505-1-15,5185929,1.791882,624.369305,0.008927,0.300624,273,15943
3,210121-2-29,775933,1.775784,1038.387532,0.009158,0.377575,193,51944
4,210315-1-34,2527830,1.769859,1034.863133,0.009263,0.376764,193,51944
...,...,...,...,...,...,...,...,...
52104,210315-2-18,2596688,0.278659,25.511975,0.000420,0.156682,0,34090
52105,210211-1-01,820666,0.274955,5.746208,0.000248,0.151619,244,49531
52106,210510-1-10,5261432,0.274524,32.366027,0.000427,0.158173,0,11416
52107,201210-1-02,4853,0.260108,32.301397,0.000394,0.158190,0,34090


In [22]:
# Fetch the FastRP embeddings in a managed read transaction
with driver.session() as session:
    fastRP_results = session.execute_read(run_fastRP_algorithm)

fastRP_df = pd.DataFrame(fastRP_results, columns=['SpecID', 'id', 'embeddings'])

# Spread each embedding list over one column per dimension, aligned on the row index
embeddings_df = pd.DataFrame(fastRP_df['embeddings'].tolist(), index=fastRP_df.index)
embeddings_df.columns = ['embedding_' + str(i) for i in range(embeddings_df.shape[1])]

# Swap the list column for the expanded columns and persist the table
fastRP_df = pd.concat([fastRP_df.drop(columns=['embeddings']), embeddings_df], axis=1)
fastRP_df.to_csv('../../data/fastRP_embeddings.csv', index=False)
fastRP_df.head()

Unnamed: 0,SpecID,id,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,...,embedding_118,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127
0,201210-1-00,96,-0.034085,0.041641,-0.148699,0.064528,0.151397,0.138017,-0.155,0.068691,...,-0.219438,-0.270504,-0.079899,-0.11013,-0.218276,0.152772,0.186025,0.045926,0.097555,-0.133072
1,201210-1-00,182,0.019081,0.258899,0.305456,0.085388,0.076383,-0.157789,-0.034043,-0.049448,...,-0.152795,0.193432,0.188455,0.225509,-0.073644,0.096683,0.068912,0.075919,-0.07308,-0.005588
2,201210-1-00,252,-0.039509,-0.191453,-0.035222,0.048801,-0.141076,0.223371,-0.024153,0.224085,...,-0.021248,-0.263184,0.122908,0.107629,-0.181609,0.121673,0.053356,-0.108575,-0.279535,-0.214511
3,201210-1-00,317,-0.115161,-0.336973,0.250838,0.164044,0.192468,0.080238,-0.217521,-0.187406,...,-0.436042,0.085385,0.188318,0.244077,-0.064672,0.016553,-0.028742,0.111902,-0.025005,0.107116
4,201210-1-00,433,-0.082257,0.161392,0.084249,0.055595,-0.019202,0.0493,-0.206302,-0.284901,...,0.023703,0.261858,0.117705,-0.014765,0.10841,0.057037,-0.332421,-0.259595,0.236897,-0.05687


Delete the projection

In [87]:
def delete_projection(tx):
    """Drop the GDS in-memory projection named 'myGraph'.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        None.
    """
    cypher = """
    CALL gds.graph.drop('myGraph')
    """
    tx.run(cypher)

# Drop the projection inside a managed write transaction
with driver.session() as session:
    session.execute_write(delete_projection)

In [88]:
# Close the Neo4j driver; no later cell in this notebook queries the database
close_driver()

In [72]:
# Show the Python type of the first Node_ID value.
# NOTE(review): status_df is only created by the read_csv cell that follows, so this
# cell raises NameError on a fresh top-to-bottom run.
type(status_df['Node_ID'][0])

numpy.int64

In [58]:
# Load peaks.csv of graph_2_2; later cells join it to the graph results through its Node_ID column
status_df = pd.read_csv('../../data/graph_2_2/peaks.csv')

In [61]:
# Remove the two leftover index columns from the loaded table
status_df = status_df.drop(columns=['Unnamed: 0', 'index'])

In [73]:
# Cast the node id to int so it can be matched against status_df's Node_ID
fastRP_df = fastRP_df.assign(id=lambda frame: frame['id'].astype(int))

In [74]:
# Cast the node id to int so it can be matched against status_df's Node_ID
df = df.assign(id=lambda frame: frame['id'].astype(int))

In [75]:
# Inner-join the graph metrics with the peak table on the node id, then discard the duplicate key
graph_df = (
    pd.merge(df, status_df, left_on="id", right_on="Node_ID", how="inner")
    .drop(columns=["Node_ID"])
)

In [76]:
# Inner-join the FastRP embeddings with the peak table on the node id, then discard the duplicate key
fast_graph_df = (
    pd.merge(fastRP_df, status_df, left_on="id", right_on="Node_ID", how="inner")
    .drop(columns=["Node_ID"])
)

In [79]:
# Normalise the label column name, drop the peak columns not used downstream,
# and keep the left-hand SpecID under its plain name
graph_df = (
    graph_df
    .rename(columns={"Status:LABEL":"Status"})
    .drop(columns=['SpecID_y','Absorbance', 'Widths', 'Prominences', 'GridSlot'])
    .rename(columns={'SpecID_x':'SpecID'})
)

In [80]:
# Normalise the label column name
fast_graph_df = fast_graph_df.rename(columns={"Status:LABEL":"Status"})

In [81]:
# Drop the peak columns not used downstream and keep the left-hand SpecID under its plain name
fast_graph_df = (
    fast_graph_df
    .drop(columns=['SpecID_y', 'WaveNumber', 'Absorbance', 'Widths', 'Prominences', 'GridSlot'])
    .rename(columns={'SpecID_x':'SpecID'})
)

In [83]:
# Display the merged metrics / peak table
graph_df

Unnamed: 0,SpecID,id,PageRank,DegreeCentrality,EigenvectorCentrality,ArticleRank,Leiden,Louvain,Seq,WaveNumber,SurID,Status
0,210304-2-31,2009001,1.853073,811.456929,0.008884,0.336744,95,49713,1274,1070.61500,210304-2,Hypoglycemia
1,210504-1-45,4929244,1.811661,1017.602961,0.009206,0.372945,193,51944,1692,1356.26430,210504-1,Hypoglycemia
2,210505-1-15,5185929,1.791882,624.369305,0.008927,0.300624,273,15943,2252,1738.95210,210505-1,Hypoglycemia
3,210121-2-29,775933,1.775784,1038.387532,0.009158,0.377575,193,51944,1704,1364.46470,210121-2,Hyperglycemia
4,210315-1-34,2527830,1.769859,1034.863133,0.009263,0.376764,193,51944,1706,1365.83140,210315-1,Hypoglycemia
...,...,...,...,...,...,...,...,...,...,...,...,...
52104,210315-2-18,2596688,0.278659,25.511975,0.000420,0.156682,0,34090,898,813.66742,210315-2,Hypoglycemia
52105,210211-1-01,820666,0.274955,5.746208,0.000248,0.151619,244,49531,1359,1128.70150,210211-1,Hyperglycemia
52106,210510-1-10,5261432,0.274524,32.366027,0.000427,0.158173,0,11416,1942,1527.10710,210510-1,Hyperglycemia
52107,201210-1-02,4853,0.260108,32.301397,0.000394,0.158190,0,34090,1048,916.17310,201210-1,Normal


In [38]:
# Display graph_df again.
# NOTE(review): the output recorded under this cell has different columns from the display above — it looks stale; confirm.
graph_df

Unnamed: 0,SpecID,id,PageRank,DegreeCentrality,EigenvectorCentrality,ArticleRank,Leiden,Louvain,WaveNumber:float,Status
0,210304-2-31,2009001,1.853073,811.456929,0.008884,0.336744,95,49713,1070.61500,Hypoglycemia
1,210504-1-45,4929244,1.811661,1017.602961,0.009206,0.372945,193,51944,1356.26430,Hypoglycemia
2,210505-1-15,5185929,1.791882,624.369305,0.008927,0.300624,273,15943,1738.95210,Hypoglycemia
3,210121-2-29,775933,1.775784,1038.387532,0.009158,0.377575,193,51944,1364.46470,Hyperglycemia
4,210315-1-34,2527830,1.769859,1034.863133,0.009263,0.376764,193,51944,1365.83140,Hypoglycemia
...,...,...,...,...,...,...,...,...,...,...
52104,210315-2-18,2596688,0.278659,25.511975,0.000420,0.156682,0,34090,813.66742,Hypoglycemia
52105,210211-1-01,820666,0.274955,5.746208,0.000248,0.151619,244,49531,1128.70150,Hyperglycemia
52106,210510-1-10,5261432,0.274524,32.366027,0.000427,0.158173,0,11416,1527.10710,Hyperglycemia
52107,201210-1-02,4853,0.260108,32.301397,0.000394,0.158190,0,34090,916.17310,Normal


In [85]:
# Define a function to calculate the bin for a given wavenumber with a specified bin size
def calculate_bin_interval(wavenumber, bin_size, origin=200):
    """Return the label of the fixed-width bin that contains ``wavenumber``.

    Bins are aligned on ``origin``: [origin, origin + bin_size),
    [origin + bin_size, origin + 2 * bin_size), and so on in both directions.

    Args:
        wavenumber: Value to assign to a bin.
        bin_size: Width of every bin.
        origin: Lower edge on which the bins are aligned (default 200).

    Returns:
        The string ``"<bin_start>-<bin_end>"``.
    """
    # Floor, not int(): int() truncates toward zero and would place values below
    # `origin` into the bin above the one that actually contains them.
    bin_index = math.floor((wavenumber - origin) / bin_size)
    bin_start = bin_index * bin_size + origin
    bin_end = bin_start + bin_size
    return f"{bin_start}-{bin_end}"

# Width of each wavenumber bin
bin_size = 25

# Label every row with the bin its wavenumber falls into
graph_df['Bin'] = graph_df['WaveNumber'].apply(calculate_bin_interval, args=(bin_size,))

In [92]:
# Pivot to one row per SpecID and one column per (graph metric, wavenumber bin) pair,
# keeping the maximum metric value when several rows of a spectrum share a bin
peak_bins = graph_df.pivot_table(index='SpecID', columns='Bin', values=['PageRank', 'DegreeCentrality', 'EigenvectorCentrality', 'ArticleRank', 'Leiden', 'Louvain'], aggfunc='max')
peak_bins.columns = [f"{col[0]}_{col[1]}" for col in peak_bins.columns]  # Flatten to '<metric>_<bin>' names
peak_bins.reset_index(inplace=True)

# Attach the 'Status' and 'SurID' of each spectrum
statuses = graph_df[['SpecID', 'Status', 'SurID']].drop_duplicates()
peak_bins = pd.merge(peak_bins, statuses, on='SpecID')

# Set 'SpecID' as the index
peak_bins.set_index('SpecID', inplace=True)

# Fill NaN values (metric/bin pairs with no row for that spectrum) with False
peak_bins.fillna(False, inplace=True)

  peak_bins.fillna(False, inplace=True)


In [32]:
def replace_floats_with_one(value):
    """Map any float to 1 and pass every other value through unchanged.

    Args:
        value: Arbitrary cell value.

    Returns:
        ``1`` when ``value`` is a ``float`` instance, otherwise ``value`` itself.
    """
    return 1 if isinstance(value, float) else value
    
# Apply replace_floats_with_one to every cell of peak_bins (each float becomes 1).
# NOTE(review): the peak_bins output displayed after this cell still shows raw scores,
# so this step does not appear to have been part of the last run — confirm whether it is intended.
peak_bins = peak_bins.applymap(replace_floats_with_one)

In [93]:
# Display the per-spectrum metric/bin table
peak_bins

Unnamed: 0_level_0,ArticleRank_1000-1025,ArticleRank_1025-1050,ArticleRank_1050-1075,ArticleRank_1075-1100,ArticleRank_1100-1125,ArticleRank_1125-1150,ArticleRank_1150-1175,ArticleRank_1175-1200,ArticleRank_1200-1225,ArticleRank_1225-1250,...,PageRank_800-825,PageRank_825-850,PageRank_850-875,PageRank_875-900,PageRank_900-925,PageRank_925-950,PageRank_950-975,PageRank_975-1000,Status,SurID
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,0.377243,False,0.304774,False,False,0.360503,False,0.275656,False,0.326807,...,1.052032,False,1.15953,1.096286,False,False,False,False,Normal,201210-1
201210-1-01,0.36651,False,False,0.354269,False,False,0.28349,0.285225,False,False,...,False,False,1.217112,False,0.727182,0.841889,False,False,Normal,201210-1
201210-1-02,0.276308,0.194877,False,False,False,0.264689,False,False,False,False,...,False,0.859342,False,False,0.260108,False,False,False,Normal,201210-1
201210-1-03,False,False,0.332091,False,False,False,0.165042,False,False,0.325528,...,False,False,False,False,0.395384,False,0.567746,False,Normal,201210-1
201210-1-04,0.376716,False,0.330092,False,False,False,False,0.192462,False,False,...,False,False,1.204772,False,False,False,False,False,Normal,201210-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,0.372079,False,False,False,False,0.356112,False,False,0.283235,False,...,False,False,1.080167,False,False,False,False,False,Hyperglycemia,210526-3
210526-3-46,0.365058,False,False,False,False,0.348996,False,False,False,False,...,False,False,1.038994,False,False,False,False,False,Hyperglycemia,210526-3
210526-3-47,0.361417,False,False,False,0.342851,False,False,False,False,0.318186,...,False,False,1.022673,False,1.100306,False,False,False,Hyperglycemia,210526-3
210526-3-48,False,False,False,False,False,0.354387,False,False,False,False,...,False,False,1.030168,False,False,False,False,0.824658,Hyperglycemia,210526-3


In [88]:
# Copy of peak_bins with SpecID turned back into a regular column; merged with the FastRP table further down
graph_bins = peak_bins.reset_index()

In [89]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, GroupKFold
import numpy as np

def evaluate_extra_trees(df):
    """Print the grouped 10-fold cross-validation accuracy of an Extra Trees classifier.

    Rows sharing a 'SurID' always land in the same fold (GroupKFold), so every
    test fold contains only surfaces that were not used for training.

    Args:
        df: DataFrame with a 'Status' target column, a 'SurID' grouping column
            and feature columns for everything else.

    Returns:
        None. The mean and standard deviation of the fold accuracies are printed.
    """
    # Surface ids drive the fold assignment and are excluded from the features
    surface_groups = df['SurID']
    features = df.drop(['Status', 'SurID'], axis=1)
    labels = df['Status']

    et = ExtraTreesClassifier(random_state=1234)
    splitter = GroupKFold(n_splits=10)

    scores = []
    for train_rows, test_rows in splitter.split(features, labels, surface_groups):
        # Fit on the training surfaces, then score on the held-out surfaces
        et.fit(features.iloc[train_rows], labels.iloc[train_rows])
        fold_predictions = et.predict(features.iloc[test_rows])
        scores.append(accuracy_score(labels.iloc[test_rows], fold_predictions))

    # Displaying the results
    print(f'{et.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

In [94]:
# Grouped (per-SurID) cross-validation on the metric/bin features
evaluate_extra_trees(peak_bins)

ExtraTreesClassifier Cross-Validation Accuracy: 0.4911 +/- 0.0982


In [91]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, GroupKFold
import numpy as np

# Splitting the dataframe into features (X) and target variable (y).
# 'SurID' is a string identifier, not a measurement, so it is excluded from the
# features just as evaluate_extra_trees does; errors='ignore' keeps the cell
# working on a frame that has no 'SurID' column.
X = peak_bins.drop(columns=['Status', 'SurID'], errors='ignore')
y = peak_bins['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:

    # Using StratifiedKFold for classification tasks.
    # NOTE(review): these folds ignore SurID, so rows from one surface can appear in
    # both training and test folds; compare with the grouped evaluation before
    # relying on this score.
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')

    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.7287 +/- 0.0221
ExtraTreesClassifier Cross-Validation Accuracy: 0.7369 +/- 0.0203


In [44]:
# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234, test_size=0.2)

# Random Forest: fit, predict, score
rf_model = RandomForestClassifier(random_state=1234, n_estimators=100)
rf_predictions = rf_model.fit(X_train, y_train).predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

# Extra Trees: fit, predict, score
et_model = ExtraTreesClassifier(random_state=1234, n_estimators=100)
et_predictions = et_model.fit(X_train, y_train).predict(X_test)
et_accuracy = accuracy_score(y_test, et_predictions)

In [45]:
def calculate_metrics(y_test, y_pred):
    """Print accuracy, the per-class classification report and the confusion matrix.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. All three results are printed.
    """
    # Overall accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {accuracy}\n")

    # Precision, recall and F1-score per class
    per_class_report = classification_report(y_test, y_pred)
    print("\nClassification Report:")
    print(per_class_report)

    # Confusion matrix
    matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(matrix)

In [46]:
def get_feature_importances(model, X, top_n=10):
    """Return the most important features of a fitted model.

    Args:
        model: Object exposing ``feature_importances_``, aligned with ``X.columns``.
        X: Object exposing ``columns`` (the feature names).
        top_n: Number of rows to return (default 10, the previous fixed value).

    Returns:
        DataFrame with the columns 'Feature' and 'Importance', sorted by
        importance in descending order and limited to the first ``top_n`` rows.
    """
    # Pair every feature name with its importance
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': model.feature_importances_})

    # Highest importance first, then keep the requested number of rows
    return feature_importance_df.sort_values(by='Importance', ascending=False).head(top_n)

In [47]:
# Hold-out metrics of the Random Forest predictions
calculate_metrics(y_test, rf_predictions)

Overall Accuracy: 0.7175697865353038


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.78      0.65      0.71       203
 Hypoglycemia       0.68      0.79      0.73       200
       Normal       0.71      0.72      0.71       206

     accuracy                           0.72       609
    macro avg       0.72      0.72      0.72       609
 weighted avg       0.72      0.72      0.72       609


Confusion Matrix:
[[132  35  36]
 [ 19 157  24]
 [ 19  39 148]]


In [48]:
# Hold-out metrics of the Extra Trees predictions
calculate_metrics(y_test, et_predictions)

Overall Accuracy: 0.7257799671592775


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.79      0.66      0.72       203
 Hypoglycemia       0.71      0.77      0.74       200
       Normal       0.69      0.75      0.72       206

     accuracy                           0.73       609
    macro avg       0.73      0.73      0.73       609
 weighted avg       0.73      0.73      0.73       609


Confusion Matrix:
[[134  29  40]
 [ 18 154  28]
 [ 17  35 154]]


In [49]:
# Ten most important features of the fitted Random Forest
get_feature_importances(rf_model, X)

Unnamed: 0,Feature,Importance
17,ArticleRank_1425-1450,0.010179
36,ArticleRank_500-525,0.009755
129,EigenvectorCentrality_1425-1450,0.009561
324,PageRank_700-725,0.009462
73,DegreeCentrality_1425-1450,0.009326
100,DegreeCentrality_700-725,0.009167
92,DegreeCentrality_500-525,0.009157
204,Leiden_500-525,0.009136
224,Louvain_1000-1025,0.009001
148,EigenvectorCentrality_500-525,0.008771


In [50]:
# Ten most important features of the fitted Extra Trees model
get_feature_importances(et_model, X)

Unnamed: 0,Feature,Importance
92,DegreeCentrality_500-525,0.009592
204,Leiden_500-525,0.009293
224,Louvain_1000-1025,0.008664
36,ArticleRank_500-525,0.007942
44,ArticleRank_700-725,0.00771
73,DegreeCentrality_1425-1450,0.0071
156,EigenvectorCentrality_700-725,0.007028
228,Louvain_1100-1125,0.007003
275,Louvain_875-900,0.006969
129,EigenvectorCentrality_1425-1450,0.006693


## FastRP

In [51]:
# Discard the node id, then collapse to one row per (SpecID, Status) by taking the column-wise maximum
fast_graph_df = (
    fast_graph_df
    .drop(columns=['id'])
    .groupby(['SpecID', 'Status'])
    .max()
    .reset_index()
)

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Features are the FastRP embedding columns of fast_graph_df only. The frame also
# carries 'SpecID', 'Status' and whatever peak-table columns survived the earlier
# merge (e.g. the string 'SurID'); none of those are embedding dimensions.
X = fast_graph_df.filter(regex=r'^embedding_\d+$')

# Target variable
y = fast_graph_df['Status']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=1234)
rf_model.fit(X_train, y_train)

# Extra Trees model
et_model = ExtraTreesClassifier(n_estimators=100, random_state=1234)
et_model.fit(X_train, y_train)

# Predictions
rf_predictions = rf_model.predict(X_test)
et_predictions = et_model.predict(X_test)

# Evaluate accuracy
rf_accuracy = accuracy_score(y_test, rf_predictions)
et_accuracy = accuracy_score(y_test, et_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Extra Trees Accuracy:", et_accuracy)

Random Forest Accuracy: 0.5862068965517241
Extra Trees Accuracy: 0.6272577996715928


FastRP embeddings combined with the traditional graph metrics

In [53]:
# Inner-join the metric/bin table with the FastRP table, one row per (SpecID, Status).
# NOTE(review): a column present in both frames but not in the join keys (e.g. 'SurID')
# comes out suffixed as '<name>_x' / '<name>_y' — confirm which columns overlap.
joined_graph = pd.merge(graph_bins, fast_graph_df, on=['SpecID', 'Status'], how='inner')

In [54]:
# Display the combined metric + embedding table
joined_graph

Unnamed: 0,SpecID,ArticleRank_1000-1025,ArticleRank_1025-1050,ArticleRank_1050-1075,ArticleRank_1075-1100,ArticleRank_1100-1125,ArticleRank_1125-1150,ArticleRank_1150-1175,ArticleRank_1175-1200,ArticleRank_1200-1225,...,embedding_118,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127
0,201210-1-00,0.377243,False,0.304774,False,False,0.360503,False,0.275656,False,...,0.254272,0.261858,0.362218,0.404168,0.332189,0.180705,0.512382,0.402452,0.417022,0.393863
1,201210-1-01,0.36651,False,False,0.354269,False,False,0.28349,0.285225,False,...,0.254648,0.453185,0.289818,0.404793,0.321037,0.336126,0.295307,0.257868,0.420118,0.397554
2,201210-1-02,0.276308,0.194877,False,False,False,0.264689,False,False,False,...,0.291559,0.445567,0.376586,0.150653,0.104942,0.208960,0.422336,0.196503,0.420174,0.414566
3,201210-1-03,False,False,0.332091,False,False,False,0.165042,False,False,...,0.290457,0.260612,0.379054,0.181635,0.274770,0.247041,0.514742,0.398636,0.207893,0.159639
4,201210-1-04,0.376716,False,0.330092,False,False,False,False,0.192462,False,...,0.189807,0.119364,0.173903,0.403614,0.453226,0.330556,0.515092,0.331227,0.425598,0.395960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3040,210526-3-45,0.372079,False,False,False,False,0.356112,False,False,0.283235,...,0.253513,0.281850,0.288725,0.275310,0.332380,0.278387,0.298098,0.256754,0.302817,0.389422
3041,210526-3-46,0.365058,False,False,False,False,0.348996,False,False,False,...,0.252873,0.284895,0.304018,0.275977,0.331285,0.276398,0.298461,0.234096,0.301175,0.388441
3042,210526-3-47,0.361417,False,False,False,0.342851,False,False,False,False,...,0.252261,0.281995,0.306367,0.402505,0.207833,0.276781,0.345942,0.400106,0.197900,0.389009
3043,210526-3-48,False,False,False,False,False,0.354387,False,False,False,...,0.252712,0.285383,0.305059,0.402426,0.331710,0.333195,0.185470,0.234843,0.302757,0.224213


In [56]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Identifier / metadata columns that must not be used as features. 'SurID' may also
# appear with merge suffixes when both joined frames carried it; errors='ignore'
# skips any name that is absent from joined_graph.
metadata_columns = ['SpecID', 'Status', 'SurID', 'SurID_x', 'SurID_y', 'Seq']
X = joined_graph.drop(columns=metadata_columns, errors='ignore')

# Target variable
y = joined_graph['Status']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=1234)
rf_model.fit(X_train, y_train)

# Extra Trees model
et_model = ExtraTreesClassifier(n_estimators=100, random_state=1234)
et_model.fit(X_train, y_train)

# Predictions
rf_predictions = rf_model.predict(X_test)
et_predictions = et_model.predict(X_test)

# Evaluate accuracy
rf_accuracy = accuracy_score(y_test, rf_predictions)
et_accuracy = accuracy_score(y_test, et_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Extra Trees Accuracy:", et_accuracy)

Random Forest Accuracy: 0.6814449917898193
Extra Trees Accuracy: 0.7241379310344828


In [57]:
# Identifier / metadata columns that must not be used as features. 'SurID' may also
# appear with merge suffixes when both joined frames carried it; errors='ignore'
# skips any name that is absent from joined_graph.
metadata_columns = ['SpecID', 'Status', 'SurID', 'SurID_x', 'SurID_y', 'Seq']
X = joined_graph.drop(columns=metadata_columns, errors='ignore')

# Target variable
y = joined_graph['Status']

# Creating the classifiers
rf = RandomForestClassifier(random_state=1234)
et = ExtraTreesClassifier(random_state=1234)

# Combining the classifiers into a list
classifiers = [rf, et]

# Performing 10-fold cross-validation for each classifier
for clf in classifiers:

    # Using StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

    # Getting cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')

    # Displaying the results
    print(f'{clf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

RandomForestClassifier Cross-Validation Accuracy: 0.7077 +/- 0.0162
ExtraTreesClassifier Cross-Validation Accuracy: 0.7297 +/- 0.0221
