In [9]:
import neo4j
import json
import logging
import csv
from collections import defaultdict
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


In [10]:
NEO4J_URI = "bolt://localhost:7687" 
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "Reactomeisgood"  
NEO4J_DATABASE = "neo4j"  # Default database name

CURIE_PREFIX_MAPPING = {
    'UniProt': 'UniProtKB',
    'UCSC': 'UniProtKB',
    'Guide to Pharmacology': 'GTOPDB',
    'ChEBI': 'CHEBI',
    'REACT': 'REACTOME',
    'COMPOUND': 'KEGG.COMPOUND',
    'PubChem Compound': 'PUBCHEM.COMPOUND',
    'PubChem Substance': 'PUBCHEM.COMPOUND',
    'KEGG Glycan': 'KEGG.GLYCAN',
    'NCBI Entrez Gene': 'NCBIGene',
    'NCBI Gene': 'NCBIGene',
    'BioGPS Gene': 'NCBIGene',
    'ClinVar': 'CLINVAR'
}
"NCBI Gene"
"UCSC"
"OMIM"
"ENSEMBL"
"BioGPS Gene"
"COSMIC (genes)"
"CTD Gene"
"dbSNP Gene"
"HGNC"
"KEGG"
"Monarch"
"FlyBase"
"Wormbase"
"dictyBase"
" mitochondrial;GRP75;HSPA9B;mt-HSP70"

In [12]:
class BiolinkCompliantGeneRegulationParser:
    """
    Parser that generates biolink-compliant gene regulation output
    """
    
    def __init__(self, uri, user, password, database="neo4j"):
        self.driver = neo4j.GraphDatabase.driver(uri, auth=(user, password))
        self.database = database
        
        # Results storage for the three types
        self.direct_regulations = []      # Group Q: Regulator -> (±)regulates_expression_of -> Gene Q
        self.indirect_components = []     # Group X -> Q: Gene X -> is_part_of -> Gene Q  
        self.polarity_aware_regulations = [] # Group X -> Q: Gene X -> (±)regulates_expression_of -> Gene Q
        
        # Output nodes and edges
        self.nodes = {}
        self.edges = []
        
    def close(self):
        self.driver.close()
    
    def extract_direct_regulations(self):
        """
        Extract Group Q: Direct Regulation
        Regulator -> (±)regulates_expression_of -> Gene Q
        """
        logger.info("Extracting direct regulations (Group Q)...")
        
        query = """
        MATCH (r:ReactionLikeEvent)-[reg_rel:regulatedBy]->(reg:Regulation),
        (reg)-[regulator_rel:regulator]->(regulator),
        (r)-[output_rel]->(product:EntityWithAccessionedSequence),
        (product)-[ref_entity_rel:referenceEntity]->(rgp:ReferenceGeneProduct),
        (rgp)-[ref_gene_rel:referenceGene]->(gene:ReferenceDNASequence)
        WHERE r.displayName CONTAINS "Expression"
        
        RETURN DISTINCT
        regulator.displayName as regulator_name,
        regulator.stId as regulator_stId,
        labels(regulator) as regulator_labels,
        gene.geneName as gene_name,
        gene.identifier as gene_identifier,
        gene.databaseName as gene_database,
        CASE 
        WHEN toLower(reg.displayName) CONTAINS "positive" THEN "increases_expression_of"
        WHEN toLower(reg.displayName) CONTAINS "negative" THEN "decreases_expression_of"
        ELSE "regulates_expression_of"
        END AS predicate,
        reg.displayName as regulation_name,
        r.displayName as reaction_name
        """
        
        with self.driver.session(database=self.database) as session:
            result = session.run(query)
            
            for record in result:
                # Create regulator node ID
                regulator_id = f"REACTOME:{record['regulator_stId']}" if record['regulator_stId'] else f"REACTOME:{record['regulator_name']}"
                
                # Create gene node ID  
                gene_id = f"{record['gene_database']}:{record['gene_identifier']}" if record['gene_database'] and record['gene_identifier'] else f"REACTOME:{record['gene_name']}"
                
                # Add regulator node
                self.add_node(regulator_id, record['regulator_name'], ['biolink:NamedThing', 'biolink:MolecularEntity'], record['regulator_labels'])
                
                # Add gene node
                self.add_node(gene_id, record['gene_name'], ['biolink:NamedThing', 'biolink:Gene'])
                
                # Create edge
                edge = {
                    'subject': regulator_id,
                    'predicate': f"biolink:{record['predicate']}",
                    'object': gene_id,
                    'group_type': 'Direct Regulation (Group Q)',
                    'biological_logic': 'A regulator controls a gene expression event producing gene Q',
                    'edge_properties': {
                        'knowledge_level': 'knowledge_assertion',
                        'agent_type': 'manual_agent',
                        'regulation_name': record['regulation_name'],
                        'reaction_name': record['reaction_name']
                    }
                }
                
                self.direct_regulations.append(edge)
                self.edges.append(edge)
            
            logger.info(f"Found {len(self.direct_regulations)} direct regulations")
    
    def extract_indirect_component_relationships(self):
        """
        Extract Group X -> Q: Indirect Regulation via Complex Components
        Gene X -> is_part_of -> Gene Q
        """
        logger.info("Extracting indirect component relationships (Group X -> Q)...")
        
        query = """
        MATCH 
        (r)-[reg_rel:regulatedBy]->(reg),
        (reg)-[regulator_rel:regulator]->(regulator),
        (r)-[output_rel]->(product),
        (product)-[ref_entity_rel:referenceEntity]->(rgp),
        (rgp)-[ref_gene_rel:referenceGene]->(target_gene),
        path = (regulator)-[:hasComponent*]->(member),
        (member)-[sub_ref_entity_rel:referenceEntity]->(sub_rgp),
        (sub_rgp)-[sub_ref_gene_rel:referenceGene]->(component_gene)
        WHERE r.displayName CONTAINS "Expression"
        
        RETURN DISTINCT
        component_gene.geneName as component_gene_name,
        component_gene.identifier as component_gene_id,
        component_gene.databaseName as component_gene_db,
        target_gene.geneName as target_gene_name,
        target_gene.identifier as target_gene_id,  
        target_gene.databaseName as target_gene_db,
        regulator.displayName as complex_name,
        length(path) as component_depth,
        member.displayName as member_name
        """
        
        with self.driver.session(database=self.database) as session:
            result = session.run(query)
            
            for record in result:
                # Create component gene node ID
                comp_gene_id = f"{record['component_gene_db']}:{record['component_gene_id']}" if record['component_gene_db'] and record['component_gene_id'] else f"REACTOME:{record['component_gene_name']}"
                
                # Create target gene node ID
                target_gene_id = f"{record['target_gene_db']}:{record['target_gene_id']}" if record['target_gene_db'] and record['target_gene_id'] else f"REACTOME:{record['target_gene_name']}"
                
                # Add nodes
                self.add_node(comp_gene_id, record['component_gene_name'], ['biolink:NamedThing', 'biolink:Gene'])
                self.add_node(target_gene_id, record['target_gene_name'], ['biolink:NamedThing', 'biolink:Gene'])
                
                # Create edge
                edge = {
                    'subject': comp_gene_id,
                    'predicate': 'biolink:is_part_of',
                    'object': target_gene_id,
                    'group_type': 'Indirect Regulation via Complex Components (Group X -> Q)',
                    'biological_logic': 'Genes (X) encode components of a complex that regulates gene Q',
                    'edge_properties': {
                        'knowledge_level': 'knowledge_assertion',
                        'agent_type': 'manual_agent',
                        'complex_context': record['complex_name'],
                        'component_depth': record['component_depth'],
                        'member_name': record['member_name']
                    }
                }
                
                self.indirect_components.append(edge)
                self.edges.append(edge)
            
            logger.info(f"Found {len(self.indirect_components)} indirect component relationships")
    
    def extract_polarity_aware_regulations(self):
        """
        Extract Group X -> Q: Polarity-Aware Indirect Regulation
        Gene X -> (±)regulates_expression_of -> Gene Q
        """
        logger.info("Extracting polarity-aware indirect regulations (Group X -> Q with polarity)...")
        
        query = """
        MATCH 
        (r:ReactionLikeEvent)-[reg_rel:regulatedBy]->(reg:Regulation),
        (reg)-[regulator_rel:regulator]->(regulator:Complex),
        (r)-[output_rel]->(product:EntityWithAccessionedSequence),
        (product)-[ref_entity_rel:referenceEntity]->(rgp:ReferenceGeneProduct),
        (rgp)-[ref_gene_rel:referenceGene]->(target_gene:ReferenceDNASequence),
        path = (regulator)-[:hasComponent*]->(member),
        (member)-[sub_ref_entity_rel:referenceEntity]->(sub_rgp:ReferenceGeneProduct),
        (sub_rgp)-[sub_ref_gene_rel:referenceGene]->(component_gene:ReferenceDNASequence)
        WHERE r.displayName CONTAINS "Expression"
        
        RETURN DISTINCT
        component_gene.geneName as component_gene_name,
        component_gene.identifier as component_gene_id,
        component_gene.databaseName as component_gene_db,
        target_gene.geneName as target_gene_name,
        target_gene.identifier as target_gene_id,
        target_gene.databaseName as target_gene_db,
        CASE 
        WHEN toLower(reg.displayName) CONTAINS "positive" THEN "increases_expression_of"
        WHEN toLower(reg.displayName) CONTAINS "negative" THEN "decreases_expression_of"
        ELSE "regulates_expression_of"
        END AS predicate,
        regulator.displayName as complex_name,
        length(path) as component_depth,
        member.displayName as member_name,
        reg.displayName as regulation_name,
        r.displayName as reaction_name
        """
        
        with self.driver.session(database=self.database) as session:
            result = session.run(query)
            
            for record in result:
                # Create component gene node ID
                comp_gene_id = f"{record['component_gene_db']}:{record['component_gene_id']}" if record['component_gene_db'] and record['component_gene_id'] else f"REACTOME:{record['component_gene_name']}"
                
                # Create target gene node ID
                target_gene_id = f"{record['target_gene_db']}:{record['target_gene_id']}" if record['target_gene_db'] and record['target_gene_id'] else f"REACTOME:{record['target_gene_name']}"
                
                # Add nodes
                self.add_node(comp_gene_id, record['component_gene_name'], ['biolink:NamedThing', 'biolink:Gene'])
                self.add_node(target_gene_id, record['target_gene_name'], ['biolink:NamedThing', 'biolink:Gene'])
                
                # Create edge with biolink qualifiers for complex context
                edge = {
                    'subject': comp_gene_id,
                    'predicate': f"biolink:{record['predicate']}",
                    'object': target_gene_id,
                    'group_type': 'Polarity-Aware Indirect Regulation (X -> Q with polarity)',
                    'biological_logic': 'Same as (2), but preserving regulatory sign (positive/negative)',
                    'edge_properties': {
                        'knowledge_level': 'knowledge_assertion',
                        'agent_type': 'manual_agent',
                        'qualified_predicate': 'biolink:causes',
                        'object_aspect_qualifier': 'expression',
                        'object_direction_qualifier': 'increased' if 'increases' in record['predicate'] else 'decreased' if 'decreases' in record['predicate'] else 'changed',
                        'subject_context_qualifier': record['complex_name'],
                        'object_context_qualifier': 'Cytosol',  # Could be extracted from reaction compartment
                        'complex_context': record['complex_name'],
                        'component_depth': record['component_depth'],
                        'member_name': record['member_name'],
                        'regulation_name': record['regulation_name'],
                        'reaction_name': record['reaction_name']
                    }
                }
                
                self.polarity_aware_regulations.append(edge)
                self.edges.append(edge)
            
            logger.info(f"Found {len(self.polarity_aware_regulations)} polarity-aware regulations")
    
    def add_node(self, node_id, name, categories, labels=None):
        """
        Add a node to the nodes dictionary
        """
        if node_id not in self.nodes:
            node = {
                'id': node_id,
                'name': name,
                'category': categories
            }
            if labels:
                node['reactome_labels'] = labels
            self.nodes[node_id] = node
    
    def generate_biolink_output(self):
        """
        Generate biolink-compliant KGX output files
        """
        logger.info("Generating biolink-compliant output...")
        
        # Nodes file
        nodes_list = list(self.nodes.values())
        with open('biolink_nodes.json', 'w') as f:
            json.dump(nodes_list, f, indent=2)
        
        # Edges file  
        with open('biolink_edges.json', 'w') as f:
            json.dump(self.edges, f, indent=2)
        
        # TSV format for KGX compatibility
        self.generate_kgx_tsv()
        
        # Summary by group type
        self.generate_summary_report()
        
        logger.info(f"Generated biolink output:")
        logger.info(f"  - {len(nodes_list)} unique nodes")
        logger.info(f"  - {len(self.edges)} total edges")
        logger.info(f"    * {len(self.direct_regulations)} direct regulations")
        logger.info(f"    * {len(self.indirect_components)} indirect component relationships")
        logger.info(f"    * {len(self.polarity_aware_regulations)} polarity-aware regulations")
    
    def generate_kgx_tsv(self):
        """
        Generate TSV files compatible with KGX tools
        """
        # Nodes TSV
        with open('biolink_nodes.tsv', 'w', newline='') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerow(['id', 'name', 'category'])
            for node in self.nodes.values():
                writer.writerow([node['id'], node['name'], '|'.join(node['category'])])
        
        # Edges TSV
        with open('biolink_edges.tsv', 'w', newline='') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerow(['subject', 'predicate', 'object', 'group_type', 'knowledge_level', 'agent_type'])
            for edge in self.edges:
                writer.writerow([
                    edge['subject'],
                    edge['predicate'], 
                    edge['object'],
                    edge['group_type'],
                    edge['edge_properties'].get('knowledge_level', ''),
                    edge['edge_properties'].get('agent_type', '')
                ])
    
    def generate_summary_report(self):
        """
        Generate a summary report of the extraction
        """
        summary = {
            'extraction_summary': {
                'total_nodes': len(self.nodes),
                'total_edges': len(self.edges),
                'direct_regulations': len(self.direct_regulations),
                'indirect_components': len(self.indirect_components), 
                'polarity_aware_regulations': len(self.polarity_aware_regulations)
            },
            'sample_relationships': {
                'direct_regulation_sample': self.direct_regulations[:3] if self.direct_regulations else [],
                'indirect_component_sample': self.indirect_components[:3] if self.indirect_components else [],
                'polarity_aware_sample': self.polarity_aware_regulations[:3] if self.polarity_aware_regulations else []
            },
            'biolink_compliance': {
                'node_categories_used': list(set([cat for node in self.nodes.values() for cat in node['category']])),
                'predicates_used': list(set([edge['predicate'] for edge in self.edges])),
                'qualifiers_used': ['qualified_predicate', 'object_aspect_qualifier', 'object_direction_qualifier', 'subject_context_qualifier', 'object_context_qualifier']
            }
        }
        
        with open('biolink_summary_report.json', 'w') as f:
            json.dump(summary, f, indent=2)
        
        logger.info("Summary report generated: biolink_summary_report.json")
    
    def run_all_extractions(self):
        """
        Run all three extraction types
        """
        logger.info("=" * 60)
        logger.info("BIOLINK-COMPLIANT GENE REGULATION PARSER")
        logger.info("=" * 60)
        
        # Extract all three types
        self.extract_direct_regulations()
        self.extract_indirect_component_relationships()
        self.extract_polarity_aware_regulations()
        
        # Generate biolink output
        self.generate_biolink_output()
        
        logger.info("=" * 60)
        logger.info("BIOLINK-COMPLIANT OUTPUT GENERATED")
        logger.info("=" * 60)
        logger.info("Files created:")
        logger.info("  - biolink_nodes.json / biolink_nodes.tsv")
        logger.info("  - biolink_edges.json / biolink_edges.tsv")
        logger.info("  - biolink_summary_report.json")

def main():
    if NEO4J_PASSWORD == "your_password_here":
        print("Please update NEO4J_PASSWORD in the script!")
        return
    
    parser = BiolinkCompliantGeneRegulationParser(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD, NEO4J_DATABASE)
    
    try:
        parser.run_all_extractions()
    except Exception as e:
        logger.error(f"Error: {e}")
    finally:
        parser.close()

if __name__ == "__main__":
    main()

2025-07-18 11:40:36,013 - INFO - BIOLINK-COMPLIANT GENE REGULATION PARSER
2025-07-18 11:40:36,014 - INFO - Extracting direct regulations (Group Q)...
2025-07-18 11:40:37,392 - INFO - Found 8471 direct regulations
2025-07-18 11:40:37,396 - INFO - Extracting indirect component relationships (Group X -> Q)...
2025-07-18 11:40:44,121 - INFO - Found 146800 indirect component relationships
2025-07-18 11:40:44,122 - INFO - Extracting polarity-aware indirect regulations (Group X -> Q with polarity)...
2025-07-18 11:40:48,293 - INFO - Found 147034 polarity-aware regulations
2025-07-18 11:40:48,293 - INFO - Generating biolink-compliant output...
2025-07-18 11:40:50,875 - INFO - Summary report generated: biolink_summary_report.json
2025-07-18 11:40:50,876 - INFO - Generated biolink output:
2025-07-18 11:40:50,876 - INFO -   - 7449 unique nodes
2025-07-18 11:40:50,877 - INFO -   - 302305 total edges
2025-07-18 11:40:50,877 - INFO -     * 8471 direct regulations
2025-07-18 11:40:50,877 - INFO -    