In [10]:
import json

def _make_edge(source_id, target_id, rel_type, group):
    """Build one Cytoscape.js edge element linking source_id to target_id."""
    return {
        'data': {
            'id': f"{source_id}-{target_id}",
            'source': str(source_id),
            'target': str(target_id),
            'type': rel_type,
            'group': group
        }
    }


def format_for_cytoscape(neo4j_results):
    """
    Convert Neo4j query results into a Cytoscape.js compatible element dict.

    Nodes are tagged with a type and a per-type color so they can be grouped
    visually. In addition to the relationships found in the input, synthetic
    (randomly assigned) Genome-BGC and BGC-Protein 'CONTAINS' edges are added
    purely for visualization.

    Args:
        neo4j_results: Sequence whose first item is a dict holding the node
            collections ('proteins', 'genomes', 'samples', 'bgcs',
            'reactions', 'compounds') and the relationship collections
            ('genome_sample_rels', 'genome_protein_rels',
            'protein_reaction_rels', 'compound_reaction_rels'). Missing or
            null collections, and an empty sequence, are treated as empty.

    Returns:
        dict of the form {'elements': {'nodes': [...], 'edges': [...]}}.

    Side effects:
        Prints summary counts to stdout and consumes values from the global
        `random` generator (seed it beforehand for reproducible output).
    """
    import random

    # An empty result set yields an empty graph instead of an IndexError.
    data = neo4j_results[0] if neo4j_results else {}

    # Collection key -> (node type label, display color). Insertion order is
    # the order in which nodes are emitted.
    node_styles = {
        'proteins': ('Protein', '#66c2a5'),    # Soft green
        'genomes': ('Genome', '#fc8d62'),      # Coral
        'samples': ('Sample', '#8da0cb'),      # Light blue
        'bgcs': ('BGC', '#e78ac3'),            # Pink
        'reactions': ('Reaction', '#a6d854'),  # Lime green
        'compounds': ('Compound', '#ffd92f'),  # Yellow
    }

    nodes = []
    edges = []

    # IDs needed later to wire up the synthetic relationships.
    ids_by_collection = {'proteins': [], 'genomes': [], 'bgcs': []}

    for collection, (node_type, color) in node_styles.items():
        for node in data.get(collection) or []:
            node_id = node['id']
            if node_id is None:
                continue
            # Fall back to the id when the name is absent *or* null, so the
            # label never becomes the literal string "None".
            name = node.get('name')
            nodes.append({
                'data': {
                    'id': str(node_id),
                    'label': str(name if name is not None else node_id),
                    'type': node_type,
                    'color': color
                }
            })
            if collection in ids_by_collection:
                ids_by_collection[collection].append(node_id)

    all_proteins = ids_by_collection['proteins']
    all_genomes = ids_by_collection['genomes']
    all_bgcs = ids_by_collection['bgcs']

    # Relationships that actually exist in the query results.
    rel_collections = [
        ('genome_sample_rels', 'Genome-Sample'),
        ('genome_protein_rels', 'Genome-Protein'),
        ('protein_reaction_rels', 'Protein-Reaction'),
        ('compound_reaction_rels', 'Compound-Reaction')
    ]

    for rel_key, rel_group in rel_collections:
        for rel in data.get(rel_key) or []:
            if rel['source_id'] is not None and rel['target_id'] is not None:
                edges.append(_make_edge(
                    rel['source_id'], rel['target_id'], rel['type'], rel_group
                ))

    # Add synthetic BGC relationships
    print(f"\nCreating synthetic relationships:")
    print(f"Number of BGCs: {len(all_bgcs)}")
    print(f"Number of Proteins: {len(all_proteins)}")
    print(f"Number of Genomes: {len(all_genomes)}")

    synthetic_edges_count = 0

    # Assign every BGC to one randomly chosen genome. With no genomes there
    # is nothing to attach to (random.choice would raise on an empty list).
    if all_genomes:
        for bgc_id in all_bgcs:
            genome_id = random.choice(all_genomes)
            edges.append(_make_edge(genome_id, bgc_id, 'CONTAINS', 'Genome-BGC'))
            synthetic_edges_count += 1

    # Link each BGC to a random handful of proteins.
    min_links, max_links = 3, 5
    for bgc_id in all_bgcs:
        if len(all_proteins) < min_links:
            # Too few proteins to sample from: use all that are available.
            selected_proteins = all_proteins
        else:
            # Randomly select 3-5 distinct proteins (capped by availability).
            num_proteins = random.randint(
                min_links, min(max_links, len(all_proteins))
            )
            selected_proteins = random.sample(all_proteins, num_proteins)

        for protein_id in selected_proteins:
            edges.append(_make_edge(bgc_id, protein_id, 'CONTAINS', 'BGC-Protein'))
            synthetic_edges_count += 1

    print(f"Added {synthetic_edges_count} synthetic relationships")
    print(f"Total nodes: {len(nodes)}")
    print(f"Total edges: {len(edges)}")

    return {
        'elements': {
            'nodes': nodes,
            'edges': edges
        }
    }
def get_and_save_kg_visualization(input_json_path, output_json_path):
    """
    Load Neo4j query results from JSON, convert them with
    format_for_cytoscape, and write the Cytoscape.js graph back out as JSON.

    After writing, the output file is re-read and its node, edge and
    synthetic-edge counts are printed as a sanity check.

    Args:
        input_json_path: Path of the JSON file holding the Neo4j results.
        output_json_path: Path the Cytoscape.js JSON is written to.
    """
    # Load the raw query results
    with open(input_json_path, 'r') as source_file:
        raw_results = json.load(source_file)

    # Convert and persist the graph
    graph = format_for_cytoscape(raw_results)
    with open(output_json_path, 'w') as target_file:
        json.dump(graph, target_file, indent=2)

    print(f"\nCytoscape visualization data saved to {output_json_path}")

    # Debug: read the file back and report what actually landed on disk
    with open(output_json_path, 'r') as check_file:
        reloaded = json.load(check_file)

    print("\nVerifying saved data:")
    print(f"Number of nodes in saved file: {len(reloaded['elements']['nodes'])}")
    print(f"Number of edges in saved file: {len(reloaded['elements']['edges'])}")

    # Synthetic edges are the ones tagged with these two groups
    synthetic_groups = ['Genome-BGC', 'BGC-Protein']
    synthetic_total = 0
    for saved_edge in reloaded['elements']['edges']:
        if saved_edge['data']['group'] in synthetic_groups:
            synthetic_total += 1
    print(f"Number of synthetic relationships in saved file: {synthetic_total}")
    
# Example usage: convert the query-result JSON into the Cytoscape.js graph file.
# NOTE: this runs at module level, so it executes every time the cell/file runs.
input_file = "KG_sample_data.json"  # Your input file
output_file = "cytoscape_graph.json"  # The file that cytoscape.js will read

# Reads input_file, writes output_file, and prints verification counts.
get_and_save_kg_visualization(input_file, output_file)


Creating synthetic relationships:
Number of BGCs: 2717
Number of Proteins: 390
Number of Genomes: 383
Added 13634 synthetic relationships
Total nodes: 5963
Total edges: 19945

Cytoscape visualization data saved to cytoscape_graph.json

Verifying saved data:
Number of nodes in saved file: 5963
Number of edges in saved file: 19945
Number of synthetic relationships in saved file: 13634
