In [2]:
import requests
import json
import time
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import pandas as pd
from py2neo import Graph, Node, Relationship
import math
import os

In [None]:
###DISGENET######
# Insert graph to Neo4j
import os
import pandas as pd
import math
from neo4j import GraphDatabase

# Bolt endpoint and login for the Neo4j instance this cell writes to.
uri = "bolt://localhost:7687"
user, password = "neo4j", "12345678"
# Module-level driver; the merge_* functions in this cell open sessions on it.
graph = GraphDatabase.driver(uri, auth=(user, password))

def merge_node_disease(label, disease_name, disease_id, source):
    """Merge a node keyed on (name, id) and return the resulting record.

    ``label`` is interpolated into the Cypher text (labels cannot be query
    parameters); the remaining values are passed as parameters. ``source`` is
    written only when the node is newly created.
    """
    cypher = f"""
    MERGE (n:{label} {{name: $name, id: $id}})
    ON CREATE SET n.source = $source
    RETURN n
    """
    bindings = {"name": disease_name, "id": disease_id, "source": source}
    with graph.session() as db_session:
        result = db_session.run(cypher, **bindings)
        return result.single()

def merge_node_gene(label, gene_symbol, gene_ens_id, source):
    """Merge a node keyed on (symbol, id) and return the resulting record.

    ``label`` is interpolated into the Cypher text; ``source`` is only set
    when the MERGE creates the node.
    """
    cypher = f"""
    MERGE (n:{label} {{symbol: $symbol, id: $id}})
    ON CREATE SET n.source = $source
    RETURN n
    """
    bindings = {"symbol": gene_symbol, "id": gene_ens_id, "source": source}
    with graph.session() as db_session:
        result = db_session.run(cypher, **bindings)
        return result.single()

def merge_relationship(disease_id, gene_id, association_type, pmid, source, score):
    """Merge a Disease -> Gene relationship and return the resulting record.

    The two nodes are looked up by their ``id`` property. ``pmid``, ``source``
    and ``score`` become part of the merged relationship pattern only when
    they are not None. ``association_type`` is interpolated into the query as
    the relationship type.
    """
    params = {'disease_id': disease_id, 'gene_id': gene_id}
    fragments = []

    # Same order as the property map has always been written: pmid, source, score.
    for key, value in (("pmid", pmid), ("source", source), ("score", score)):
        if value is None:
            continue
        fragments.append(f"{key}: ${key}")
        params[key] = value

    properties_str = ", ".join(fragments)

    relationship_query = f"""
    MATCH (d:Disease {{id: $disease_id}})
    MATCH (g:Gene {{id: $gene_id}})
    MERGE (d)-[r:{association_type} {{{properties_str}}}]->(g)
    RETURN r
    """
    with graph.session() as db_session:
        outcome = db_session.run(relationship_query, **params)
        return outcome.single()

# Import every DisGeNET result file (*.csv) found in `directory` into Neo4j.


def _is_nan(value):
    """Tell whether *value* is a float NaN (how pandas represents an empty cell)."""
    return isinstance(value, float) and math.isnan(value)


def _clean_text(value):
    """Return *value* without surrounding whitespace, or "" when it is not a string.

    Empty CSV cells arrive as float NaN, which has no ``strip`` method.
    """
    return value.strip() if isinstance(value, str) else ""


def _parse_pmid(value):
    """Convert a CSV cell to an integer PubMed ID, or None when that is impossible.

    pandas loads an integer column that contains blanks as float64, so a valid
    PMID usually arrives as e.g. ``12345678.0``; such integer-valued floats are
    accepted rather than discarded.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    if not math.isfinite(number) or not number.is_integer():
        return None
    return int(number)


directory = "data/DISGENET/api-call-results"
for filename in os.listdir(directory):
    if not filename.endswith(".csv"):
        continue

    # Load the CSV file into a dataframe
    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    for idx, row in df.iterrows():
        try:
            gene_symbol = _clean_text(row.get("gene_symbol", ""))
            gene_id = row.get("gene_id", "")
            disease_name = _clean_text(row.get("disease_name", ""))
            disease_id = row.get("disease_id", "")

            # Concatenate DisGeNET with the source from the row (empty when absent)
            source_row = _clean_text(row.get("source", ""))
            source = f"DisGeNET + {source_row}"

            # A NaN score must not reach Neo4j: NaN never equals itself, so a
            # MERGE on it would create a new relationship on every run.
            score = row.get("score", None)
            if _is_nan(score):
                score = None

            pmid = _parse_pmid(row.get("pmid", None))

            # Skip rows whose identifiers are missing (NaN counts as missing)
            if (
                not gene_symbol
                or not disease_name
                or _is_nan(gene_id) or not gene_id
                or _is_nan(disease_id) or not disease_id
            ):
                print(f"Skipping row {idx} due to missing data")
                continue

            # Merge or create nodes with the source tag
            merge_node_disease("Disease", disease_name, disease_id, source)
            merge_node_gene("Gene", gene_symbol, gene_id, source)

            # Create relationship between disease and gene with the source tag
            merge_relationship(disease_id, gene_id, "ASSOCIATION", pmid, source, score)

        except Exception as e:
            # Best-effort import: report the failing row and carry on.
            print(f"Error processing row {idx}: {e}")
            print(filename)


In [None]:
# OpenTargets import to Neo4j
import os
import sys
import pandas as pd
import re
from neo4j import GraphDatabase

# Make modules located one level above the working directory importable.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
if all(entry != parent_dir for entry in sys.path):
    sys.path.append(parent_dir)

# Import custom module after adjusting sys.path
import opentarget_disease_filter
#import filtered_diseases

# Bolt endpoint and login for the Neo4j instance this cell writes to.
uri = "bolt://localhost:7687"
user, password = "neo4j", "12345678"
# Module-level driver; the merge_* functions in this cell open sessions on it.
graph = GraphDatabase.driver(uri, auth=(user, password))

def merge_node_disease(label, disease_name, disease_id, source):
    """Merge a node keyed on (name, id) and return the resulting record.

    ``label`` is interpolated into the Cypher text; ``source`` is written only
    when the MERGE creates the node.
    """
    cypher = f"""
    MERGE (n:{label} {{name: $name, id: $id}})
    ON CREATE SET n.source = $source
    RETURN n
    """
    bindings = {"name": disease_name, "id": disease_id, "source": source}
    with graph.session() as db_session:
        result = db_session.run(cypher, **bindings)
        return result.single()

def merge_node_target(label, gene_symbol, source):
    """Merge a node keyed on ``symbol`` only and return the resulting record.

    ``label`` is interpolated into the Cypher text; ``source`` is written only
    when the MERGE creates the node.
    """
    cypher = f"""
    MERGE (n:{label} {{symbol: $symbol}})
    ON CREATE SET n.source = $source
    RETURN n
    """
    bindings = {"symbol": gene_symbol, "source": source}
    with graph.session() as db_session:
        result = db_session.run(cypher, **bindings)
        return result.single()

def merge_relationship(disease_name, gene_symbol, association_type, score, source):
    """Merge a Disease -> Gene relationship and return the resulting record.

    The disease is matched by ``name`` and the gene by ``symbol``. ``source``
    is always part of the merged relationship pattern; ``score`` is added only
    when it is not None. ``association_type`` is interpolated as the
    relationship type.
    """
    params = {'disease_name': disease_name, 'gene_symbol': gene_symbol, 'source': source}

    if score is None:
        properties_str = "source: $source"
    else:
        properties_str = "source: $source, score: $score"
        params['score'] = score

    relationship_query = f"""
    MATCH (d:Disease {{name: $disease_name}})
    MATCH (g:Gene {{symbol: $gene_symbol}})
    MERGE (d)-[r:{association_type} {{{properties_str}}}]->(g)
    RETURN r
    """
    with graph.session() as db_session:
        outcome = db_session.run(relationship_query, **params)
        return outcome.single()

# Import every OpenTargets association file (*.tsv) found in `directory`.
directory = "data/OpenTargets/data"
disease_names = opentarget_disease_filter.disease_names_filter

for filename in os.listdir(directory):
    if not filename.endswith(".tsv"):
        continue

    # The disease id is encoded in the file name, so resolve it once per file.
    # Without a match the file is skipped; otherwise the id of the previously
    # processed file would be reused and genes attached to the wrong disease.
    match = re.search(r'OT-(.*?)-associated', filename)
    if match is None:
        print(f"Skipping {filename}: no disease id found in the file name")
        continue
    disease_id = match.group(1)
    disease_name = disease_names.get(disease_id, None)
    if not disease_name:
        # Disease is not in the filter list: nothing from this file is imported.
        continue

    file_path = os.path.join(directory, filename)
    # Load the TSV file into a dataframe
    df = pd.read_csv(file_path, sep="\t", header=0)

    for idx, row in df.iterrows():
        try:
            gene_symbol = row.get("symbol", "")
            # Empty cells arrive as float NaN; a gene without a symbol cannot
            # be merged or matched later, so skip the row.
            if not isinstance(gene_symbol, str) or not gene_symbol.strip():
                print(f"Skipping row {idx} in file {filename}: missing gene symbol")
                continue

            # NaN never equals itself, so it must not become a MERGE property.
            score = row.get("globalScore", None)
            if pd.isna(score):
                score = None

            print("imported", disease_name)
            # Combine OpenTargets with the source column (if available)
            raw_source = row.get("source", "")
            source = "OpenTargets + " + (raw_source.strip() if isinstance(raw_source, str) else "")

            # Merge disease node, gene node, and the relationship with source
            merge_node_disease("Disease", disease_name, disease_id, source)
            merge_node_target("Gene", gene_symbol, source)
            merge_relationship(disease_name, gene_symbol, "ASSOCIATION", score, source)
        except Exception as e:
            # Best-effort import: report the failing row and carry on.
            print(f"Error processing row {idx} in file {filename}: {e}")

print("Data import completed.")


In [None]:
###INDRA import to Neo4j
import pandas as pd
import ast
import os
from neo4j import GraphDatabase

# Map the namespaces of an entity to the Neo4j label used for its node.
def detect_neo4j_node_type(my_dict):
    """Return "Gene", "Chemical", "Disease" or the fallback "Entity".

    Membership is tested with ``in``, so ``my_dict`` may be a mapping or a
    string. When several namespaces are present HGNC wins over CHEBI, which
    wins over MESH.
    """
    precedence = (("HGNC", "Gene"), ("CHEBI", "Chemical"), ("MESH", "Disease"))
    for namespace, label in precedence:
        if namespace in my_dict:
            return label
    return "Entity"

# Bolt endpoint and login for the Neo4j instance this cell writes to.
uri = "bolt://localhost:7687"
user, password = "neo4j", "12345678"
# Module-level driver used by upload_triples_neo4j.
driver = GraphDatabase.driver(uri, auth=(user, password))

def _parse_pmids(raw):
    """Turn the ``pmids`` cell of a row into a list of integer PubMed IDs.

    Accepts a single number, a numeric string, or the string form of a list
    (e.g. ``"[123, '456']"``). Missing values (None/NaN) and unsupported types
    give an empty list. Inside a list, numeric strings are converted and
    non-numeric items are skipped. A bare non-numeric string raises
    ValueError.
    """
    import numbers

    if isinstance(raw, str):
        text = raw.strip()
        if not (text.startswith("[") and text.endswith("]")):
            return [int(float(text))]
        # literal_eval only evaluates Python literals, never arbitrary code.
        candidates = ast.literal_eval(text)
    elif isinstance(raw, numbers.Real):
        return [int(raw)] if pd.notna(raw) else []
    else:
        return []

    pmids = []
    for item in candidates:
        if isinstance(item, str):
            try:
                pmids.append(int(float(item.strip())))
            except ValueError:
                continue  # not a number, e.g. a PMC identifier
        elif isinstance(item, numbers.Real):
            pmids.append(int(float(item)))
    return pmids


# Function to upload triples to Neo4j with a source tag
def upload_triples_neo4j(triple, disease_name):
    """Merge one subject-relation-object triple into Neo4j.

    ``triple`` is read through item access and ``.get`` (the caller passes a
    DataFrame row); it must provide ``subj``, ``obj``, ``type``,
    ``subj_namespace``, ``obj_namespace``, ``pmids`` and ``evid_sentence``,
    and may provide ``source``. ``disease_name`` is stored on the
    relationship. Any failure is printed and swallowed so one bad row does not
    stop the import.
    """
    print("Uploading triples in Neo4j")
    with driver.session() as session:
        try:
            subj = triple['subj']
            obj = triple['obj']
            rel_type = triple['type']
            subj_type = detect_neo4j_node_type(triple["subj_namespace"])
            obj_type = detect_neo4j_node_type(triple["obj_namespace"])

            pmids = _parse_pmids(triple['pmids'])

            # An empty cell is NaN, which is truthy; test for it explicitly so
            # that NaN is never stored as the evidence text.
            evid = triple["evid_sentence"]
            if pd.isna(evid) or not evid:
                evid = "NoEvidence"

            # A NaN source cannot be concatenated to a string; fall back to 'Unknown'.
            raw_source = triple.get('source', 'Unknown')
            if not isinstance(raw_source, str):
                raw_source = 'Unknown'
            source = "INDRA + " + raw_source

            # Labels and relationship types cannot be query parameters, hence
            # the interpolation; all values are passed as parameters.
            cypher_query = f"""
            MERGE (a:{subj_type} {{name: $subj}})
            MERGE (b:{obj_type} {{name: $obj}})
            MERGE (a)-[r:{rel_type} {{pmids: $pmids, disease_name: $disease_name, evid_sentence: $evid_sentence, source: $source}}]->(b)
            """
            session.run(cypher_query, subj=subj, obj=obj, pmids=pmids, disease_name=disease_name, evid_sentence=evid, source=source)

        except Exception as e:
            print("Cannot import this row to Neo4j", e)

# The driver is closed only after all files have been processed. Closing it
# before this loop, as was done previously, leaves upload_triples_neo4j
# without a usable connection.
if __name__ == "__main__":
    directory = "data/INDRA/data"
    # NOTE(review): not used anywhere in this cell, and a hard-coded key should
    # not live in the notebook - confirm whether it can be removed.
    api_key = "f2be320e-22f7-471a-b457-326a3ebb5a84"

    try:
        for filename in os.listdir(directory):
            if not filename.endswith('.xlsx'):
                continue
            file_path = os.path.join(directory, filename)
            # The file name (without extension) is used as the disease name.
            disease_name = filename.split(".xlsx")[0]
            # Load the Excel file into a DataFrame
            df = pd.read_excel(file_path)

            # Data cleaning: drop rows where 'subj', 'obj', 'pmids' or the belief
            # score are missing, then keep only rows with belief > 0.85
            df = df.dropna(subset=['subj', 'obj', 'pmids', 'score (belief)'])
            df = df[df['pmids'].astype(bool)]  # Further filter out any rows where pmids is empty
            df = df[df['score (belief)'] > 0.85]       # Filter rows where belief is > 0.85

            # Process each row in the cleaned DataFrame
            for index, row in df.iterrows():
                upload_triples_neo4j(row, disease_name)
    finally:
        # Release the connection pool even if reading a file fails.
        driver.close()




In [None]:
###import DrugBank###
import pandas as pd
import os
from neo4j import GraphDatabase

# Map the namespaces of an entity to a node type name (not called by this cell's import code).
def detect_neo4j_node_type(my_dict):
    """Return "gene", "chemical", "disease" or the fallback "Entity".

    Membership is tested with ``in``, so ``my_dict`` may be a mapping or a
    string. When several namespaces are present HGNC wins over CHEBI, which
    wins over MESH.
    """
    precedence = (("HGNC", "gene"), ("CHEBI", "chemical"), ("MESH", "disease"))
    for namespace, label in precedence:
        if namespace in my_dict:
            return label
    return "Entity"

# Bolt endpoint and login for the Neo4j instance this cell writes to.
uri = "bolt://localhost:7687"
user, password = "neo4j", "12345678"
# Module-level driver used by upload_triples_neo4j.
driver = GraphDatabase.driver(uri, auth=(user, password))

# Upload one DrugBank row as a Disease -> Drug relationship tagged with its source
def upload_triples_neo4j(triple, disease_name):
    """Merge the disease, the drug of ``triple`` and their ASSOCIATION edge.

    ``triple`` must provide 'Drug Name', 'Primary ID' and 'PubMed ID' and may
    provide 'Source'. Any failure (including a PubMed ID that cannot be
    converted to int) is printed and the row is not imported.
    """
    print(f"Uploading triples for disease: {disease_name}")
    with driver.session() as session:
        try:
            # Node labels and relationship type are fixed for DrugBank data.
            subj_type, obj_type, rel_type = "Disease", "Drug", "ASSOCIATION"

            query_params = {
                "subj": disease_name,               # the disease is the subject
                "obj": triple['Drug Name'],         # the drug is the object
                "drug_id": triple["Primary ID"],    # drug id as given in the row
                "pmid": int(triple["PubMed ID"]),
                "source": "DrugBank + " + triple.get('Source', 'Unknown'),
            }

            # Create nodes and relationships, including the source and pmid
            cypher_query = f"""
            MERGE (a:{subj_type} {{name: $subj}})
            MERGE (b:{obj_type} {{name: $obj, id: $drug_id}})
            MERGE (a)-[r:{rel_type} {{pmid: $pmid, source: $source}}]->(b)
            """

            session.run(cypher_query, **query_params)

        except Exception as e:
            print(f"Cannot import row for disease {disease_name}: {e}")

# The driver is closed only after all files have been processed. Closing it
# before this loop, as was done previously, leaves upload_triples_neo4j
# without a usable connection.
if __name__ == "__main__":
    directory = "data/DRUGBANK/csv_output"

    try:
        for filename in os.listdir(directory):
            if not filename.endswith('.csv'):
                continue
            file_path = os.path.join(directory, filename)
            disease_name = filename.split(".csv")[0]  # Extract disease name from filename
            # Load the CSV file into a DataFrame
            df = pd.read_csv(file_path)
            # Process each row in the DataFrame
            for index, row in df.iterrows():
                upload_triples_neo4j(row, disease_name)
    finally:
        # Release the connection pool even if reading a file fails.
        driver.close()


In [None]:
## Import Pubtator results ###
import pandas as pd
from neo4j import GraphDatabase

# Connection settings - replace with the details of your own Neo4j instance
uri = "bolt://localhost:7687"
user, password = "neo4j", "12345678"

# Module-level driver used by upload_triples_neo4j.
driver = GraphDatabase.driver(uri, auth=(user, password))

def validate_type(node_type):
    """Return ``node_type`` if it is a non-blank string, otherwise "undefined".

    The result is interpolated into Cypher as a node label, so anything that
    is not text must be replaced. Besides ``None`` and plain ``int`` this
    covers what spreadsheet cells commonly produce: float NaN for an empty
    cell, numpy integers (which are not ``int`` instances) and empty strings.
    Non-blank strings are returned unchanged.
    """
    if isinstance(node_type, str) and node_type.strip():
        return node_type
    return "undefined"

def upload_triples_neo4j(triple):
    """Merge one PubTator triple (two nodes and their relationship) into Neo4j.

    ``triple`` must provide node1_text, role1_type, node2_text, role2_type,
    relation_type, pmid, score and evidence, and may provide source. Failures
    while preparing or running the query are printed and swallowed.
    """
    print(f"Uploading triple: {triple['node1_text']} -[{triple['relation_type']}]-> {triple['node2_text']}")
    with driver.session() as session:
        try:
            subj_type = validate_type(triple['role1_type'])
            obj_type = validate_type(triple['role2_type'])
            rel_type = triple['relation_type'].upper()

            query_params = {
                "subj": triple['node1_text'],
                "obj": triple['node2_text'],
                "pmid": int(triple['pmid']),
                "score": float(triple['score']),
                "evidence": triple['evidence'],
                # Combine PubTator with additional source information (if available)
                "source": "PubTator + " + triple.get('source', 'Unknown'),
            }

            # Labels and relationship type are interpolated; values are parameters.
            cypher_query = f"""
            MERGE (a:{subj_type} {{name: $subj}})
            MERGE (b:{obj_type} {{name: $obj}})
            MERGE (a)-[r:{rel_type} {{pmid: $pmid, score: $score, evidence: $evidence, source: $source}}]->(b)
            """

            session.run(cypher_query, **query_params)
        except Exception as e:
            print("Cannot import this row to Neo4j:", e)

# The driver is closed only after all rows have been processed. Closing it
# before this loop, as was done previously, leaves upload_triples_neo4j
# without a usable connection.
if __name__ == "__main__":
    # Path of the Excel file holding the PubTator triples (adjust as needed)
    excel_file_path = "data/PubTator/pmc_triples_100.xlsx"

    try:
        df = pd.read_excel(excel_file_path)

        # Process each row in the DataFrame and upload to Neo4j
        for index, row in df.iterrows():
            upload_triples_neo4j(row)
    finally:
        # Release the connection pool even if reading the file fails.
        driver.close()


In [None]:
#Sherpa upload
import os
from bel_json_importer.n4j_meta import Neo4jClient
from bel_json_importer.n4j_bel import Neo4jBel
# Collect every *.json file below the Sherpa folder, logging what is visited.
paths = []
for path, _, files in os.walk("data/Sherpa"):  # substitute with "data" to load the COVID, NDD and Sherpa triples only
    for file in files:
        print(file)
        if not file.endswith(".json"):
            continue
        print(path)
        paths.append(os.path.join(path, file))

neo = Neo4jClient(
    uri="bolt://localhost:7687",
    database="neo4j",
    user="neo4j",
    password="12345678",
)

# Import each collected BEL JSON file through the same client.
n4jbel = Neo4jBel(client=neo)
for path in paths:
    # NOTE(review): an earlier note mentions this flag being set to True; it is False here - confirm.
    n4jbel.import_json(input_path=path, update_from_protein2gene=False)

print("Done")

# Remember to run this query afterwards, for convenience:

'match(n)-[r]->(m) where "sherpa" in r.annotationDatasource set r.source = "sherpa"'

In [None]:
# CBM upload
import os
from bel_json_importer.n4j_meta import Neo4jClient
from bel_json_importer.n4j_bel import Neo4jBel
# Collect every *.json file below the CBM data folder, logging what is visited.
paths = []
for path, _, files in os.walk("data/CBM/data"):  # substitute with "data" to load the COVID, NDD and Sherpa triples only
    for file in files:
        print(file)
        if not file.endswith(".json"):
            continue
        print(path)
        paths.append(os.path.join(path, file))

neo = Neo4jClient(
    uri="bolt://localhost:7687",
    database="neo4j",
    user="neo4j",
    password="12345678",
)

# Import each collected BEL JSON file through the same client.
n4jbel = Neo4jBel(client=neo)
for path in paths:
    # NOTE(review): an earlier note mentions this flag being set to True; it is False here - confirm.
    n4jbel.import_json(input_path=path, update_from_protein2gene=False)

print("Done")

In [None]:
#UPLOAD SCAI AD PD NDD COVID graph
import os
from bel_json_importer.n4j_meta import Neo4jClient
from bel_json_importer.n4j_bel import Neo4jBel
# Collect every *.json file below the SCAI graphs folder, logging what is visited.
paths = []
for path, _, files in os.walk("data/SCAI-graphs"):  # substitute with "data" to load the COVID, NDD and Sherpa triples only
    for file in files:
        print(file)
        if not file.endswith(".json"):
            continue
        print(path)
        paths.append(os.path.join(path, file))

neo = Neo4jClient(
    uri="bolt://localhost:7687",
    database="neo4j",
    user="neo4j",
    password="12345678",
)

# Import each collected BEL JSON file through the same client.
n4jbel = Neo4jBel(client=neo)
for path in paths:
    # NOTE(review): an earlier note mentions this flag being set to True; it is False here - confirm.
    n4jbel.import_json(input_path=path, update_from_protein2gene=False)

print("Done")

In [None]:
#import KEGG
import pandas as pd
import re
import os
from neo4j import GraphDatabase

# Directory containing the KEGG CSV files to import.
# NOTE(review): absolute, machine-specific Windows path - adjust before running
# elsewhere. `file_path` is bound to the same string here and is reassigned
# per file by the import loop further down.
csv_dir = file_path = r'C:\Users\nbabaiha\Documents\GitHub\COMMUTE\commute\compare-curated-sources\kegg-api-responses-complete\updated_csv_files'


# Remove bracketed / parenthesised annotations from an entity name
def clean_name(name):
    """Return ``name`` without ``[...]`` and ``(...)`` parts, stripped at both ends.

    Whitespace directly in front of a ``[...]`` part is removed with it;
    whitespace in front of a ``(...)`` part is left in place.
    """
    annotation = re.compile(r'\s*\[.*?\]|\(.*?\)')
    without_annotations = annotation.sub('', name)
    return without_annotations.strip()

# Open a Neo4j driver for the given endpoint and login
def connect_to_neo4j(uri, user, password):
    """Return a ``GraphDatabase`` driver for ``uri`` authenticated as ``user``."""
    return GraphDatabase.driver(uri, auth=(user, password))

# (record column, plural used in the log, node label, relationship type)
_KEGG_LINKS = (
    ("GENE_SYMBOL", "Genes", "Gene", "ASSOCIATION"),
    ("DRUG", "Drugs", "Drug", "TREATED_BY"),
    ("PATHWAY", "Pathways", "Pathway", "INVOLVED_IN"),
    ("NETWORK", "Networks", "Network", "PART_OF_NETWORK"),
)


def _split_field(record, column):
    """Return the '; '-separated entries of ``record[column]``, or [] if it is not a string."""
    raw = record.get(column, '')
    return raw.split('; ') if isinstance(raw, str) else []


# Function to create nodes and relationships in Neo4j
def upload_nodes_and_relationships(driver, records):
    """Merge one Disease node per record plus its genes, drugs, pathways and networks.

    ``records`` is an iterable of dicts with a 'NAME' entry and optional
    'GENE_SYMBOL', 'DRUG', 'PATHWAY' and 'NETWORK' entries holding
    '; '-separated strings. Every related entity is cleaned with
    ``clean_name``, merged by name, and linked from the disease with a
    relationship whose ``source`` is set to 'KEGG' on creation.

    Records whose NAME is missing (NaN from an empty CSV cell), empty or not a
    string are skipped with a message instead of being merged as a Disease.
    """
    with driver.session() as session:
        for record in records:
            name = record.get('NAME', '')
            if not isinstance(name, str) or not name.strip():
                print(f"Skipping record without a usable NAME: {name!r}")
                continue

            # Non-string fields (e.g. NaN) yield an empty list.
            fields = [
                (plural, label, rel_type, _split_field(record, column))
                for column, plural, label, rel_type in _KEGG_LINKS
            ]

            # Log the processed data to confirm it is being read correctly
            print(f"Processing Disease Node: {name}")
            for plural, _, _, entries in fields:
                print(f"  {plural}: {entries}")

            # Create NAME node (Disease)
            session.run("""
            MERGE (n:Disease {name: $name})
            """, name=name)

            for _, label, rel_type, entries in fields:
                # Labels and relationship types cannot be query parameters; the
                # interpolated values come only from the constant table above.
                query = f"""
                MERGE (e:{label} {{name: $entity_name}})
                MERGE (n:Disease {{name: $name}})
                MERGE (n)-[r:{rel_type}]->(e)
                ON CREATE SET r.source = 'KEGG'
                """
                for entry in entries:
                    if not entry:
                        continue
                    cleaned = clean_name(entry)
                    print(f"  Creating {label} Node: {cleaned}")
                    session.run(query, entity_name=cleaned, name=name)

        print("Upload completed successfully!")

# Your Neo4j credentials and connection details
neo4j_uri = "bolt://localhost:7687"
neo4j_user = "neo4j"
neo4j_password = "12345678"

# Connect to the Neo4j instance
driver = connect_to_neo4j(neo4j_uri, neo4j_user, neo4j_password)

try:
    # Process each CSV file in the directory
    for filename in os.listdir(csv_dir):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(csv_dir, filename)
        data = pd.read_csv(file_path)

        if 'NAME' not in data.columns:
            raise ValueError(f"The file {filename} must contain a 'NAME' column.")

        # Create a list of dictionaries for each row in the file
        records = data.to_dict('records')

        # Upload nodes and relationships for each record
        print(f"Uploading records from {filename} to Neo4j...")
        upload_nodes_and_relationships(driver, records)
finally:
    # Close the driver even when a file is rejected or an upload fails, so the
    # connection is not leaked.
    driver.close()



