# In [None]:
# This code processes a JSON file containing Human Phenotype Ontology (HPO) data to extract information about HPO terms, their relationships, and lineages.
import json
import re
from collections import defaultdict

# Builds the {HP id: details} record (label, definition, synonyms) for one node of the JSON data.
def extract_info(node):
    """Return the HPO term details found in a single graph node.

    Args:
        node: A node mapping from the ontology JSON. ``node['id']`` is
            required; ``lbl`` and ``meta`` (holding ``definition`` and
            ``synonyms``) are optional.

    Returns:
        ``{hp_id: {'label': str, 'definition': str, 'synonyms': list}}``
        where ``hp_id`` is the first ``HP_<digits>`` match in the node id.
        An empty dict is returned when the id contains no such match or
        when the node has no label, no definition and no synonyms.
    """
    id_match = re.search(r'(HP_\d+)', node['id'])
    if not id_match:
        return {}

    meta = node['meta'] if 'meta' in node else {}

    # Falsy values (missing key, empty string) all collapse to ''.
    label = node.get('lbl', '') or ''

    definition = ''
    if 'definition' in meta:
        definition = meta['definition'].get('val', '') or ''

    synonyms = []
    if 'synonyms' in meta:
        values = (entry.get('val', '') for entry in meta['synonyms'])
        synonyms = [value for value in values if value]

    details = {'label': label, 'definition': definition, 'synonyms': synonyms}
    if not any(details.values()):
        # Nothing useful to record for this term.
        return {}
    return {id_match.group(1): details}

# Reads the JSON file, processes each node to extract HPO information, and returns a dictionary of all HPO terms.
def process_json_file(filename):
    """Load an ontology JSON file and collect the details of every HPO term.

    Args:
        filename: Path to a JSON document. Its optional top-level
            ``graphs`` list is scanned; each graph may carry a ``nodes``
            list.

    Returns:
        A dict mapping HP ids to the info dict produced by
        ``extract_info``. When the same id is produced by several nodes,
        the last one wins.
    """
    # JSON text is UTF-8 by specification, so do not depend on the
    # platform's locale encoding when decoding the file.
    with open(filename, 'r', encoding='utf-8') as file:
        data = json.load(file)
    all_hpo_info = {}
    for graph in data.get('graphs', []):
        for node in graph.get('nodes', []):
            # extract_info returns {} for nodes to skip; update({}) is a no-op.
            all_hpo_info.update(extract_info(node))
    return all_hpo_info

# Recursively finds all ancestral lineages for a given HPO term.
def find_lineages(term, lineage=None):
    """Return every path from *term* up through its recorded parents.

    Reads the module-level ``label_map`` (id -> label) and ``parent_map``
    (id -> list of parent ids).

    Args:
        term: Id of the term to start from.
        lineage: Path accumulated so far by the recursion; callers
            normally omit it.

    Returns:
        A list of lineages. Each lineage is a list of ``"label (id)"``
        strings that starts with the accumulated path, continues with
        *term*, and ends at an ancestor that has no entry in
        ``parent_map``. Ids missing from ``label_map`` are shown with the
        label ``"No label"``.
    """
    # A None default avoids sharing one list object between calls.
    prefix = [] if lineage is None else lineage
    current_lineage = prefix + [f'{label_map.get(term, "No label")} ({term})']
    if term not in parent_map:
        return [current_lineage]
    # NOTE(review): there is no cycle guard; this assumes parent_map is
    # acyclic, otherwise the recursion never terminates normally.
    lineages = []
    for parent in parent_map[term]:
        lineages.extend(find_lineages(parent, current_lineage))
    return lineages

# Load and process the JSON data
hpo_data = process_json_file('hp.json')

# Reads the JSON file again and builds the lookups used by find_lineages:
# label_map (id -> label) and parent_map (id -> list of parent ids).
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default.
with open('hp.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
# Only the first graph of the file is used here.
nodes = data['graphs'][0]['nodes']
edges = data['graphs'][0]['edges']
# Ids are the last path segment of the node URI; nodes without a label are skipped.
label_map = {node['id'].split('/')[-1]: node['lbl'] for node in nodes if 'lbl' in node}
parent_map = defaultdict(list)
# NOTE(review): every edge is treated as child ('sub') -> parent ('obj');
# the edge predicate is not inspected — confirm that is intended.
for edge in edges:
    sub_id = edge['sub'].split('/')[-1]
    obj_id = edge['obj'].split('/')[-1]
    parent_map[sub_id].append(obj_id)

# Maps each labelled, non-obsolete term id to its lineages, each rendered as
# one "label (id) -> label (id) -> ..." string.
hpo_lineage = {
    term: [' -> '.join(path) for path in find_lineages(term)]
    for term, label in label_map.items()
    if 'obsolete' not in label.lower()
}

# Initialize dictionaries for tracking relationships
immediate_parents = defaultdict(set)
immediate_descendants = defaultdict(set)
all_descendants = defaultdict(set)

# Walks every lineage string and records, per term id:
#   immediate_parents     - ids directly following the term in a lineage
#   immediate_descendants - ids directly preceding the term in a lineage
#   all_descendants       - every id appearing before the term in a lineage
# Only the lineage strings are needed, so the dict keys are not iterated
# (the original inner loop variable shadowed the outer `term`).
for lineages in hpo_lineage.values():
    for lineage in lineages:
        # Each element looks like "label (id)"; keep the id without parentheses.
        terms = [t.split(' ')[-1].strip('()\n') for t in lineage.split(' -> ')]
        for position, current in enumerate(terms):
            if position < len(terms) - 1:
                parent = terms[position + 1]
                immediate_parents[current].add(parent)
                immediate_descendants[parent].add(current)
            if position > 0:
                all_descendants[current].update(terms[:position])

# Adds relationship counts (unique parents, immediate descendants, total descendants) to each HPO term.
# The ids are visited in sorted order so that placeholder entries are added to
# hpo_data in the same order on every run; plain set iteration order for
# strings can change between interpreter runs.
for hpo_id in sorted(set(immediate_parents) | set(immediate_descendants) | set(all_descendants)):
    # Terms seen only in a lineage get a placeholder record.
    entry = hpo_data.setdefault(hpo_id, {"Description": "No description available"})
    # Indexing the defaultdicts yields an empty set (count 0) for ids that
    # are missing from a given map.
    entry["Unique_Parent_Count"] = len(immediate_parents[hpo_id])
    entry["Immediate_Descendant_Count"] = len(immediate_descendants[hpo_id])
    entry["Total_Descendant_Count"] = len(all_descendants[hpo_id])

# Attaches each term's lineages to its hpo_data record, shortest lineage first.
lineages_by_term = defaultdict(list)
for term, lineages in hpo_lineage.items():
    lineages_by_term[term] += lineages

for hpo_id, lineages in lineages_by_term.items():
    # Ordering by the number of ' -> ' separators is the same as ordering
    # by the number of elements in the lineage.
    sorted_lineages = sorted(lineages, key=lambda path: path.count(' -> '))
    if hpo_id not in hpo_data:
        # Term has lineages but no extracted details: create a placeholder.
        hpo_data[hpo_id] = {
            "Description": "No description available",
            "lineage": sorted_lineages
        }
        continue
    hpo_data[hpo_id]["lineage"] = sorted_lineages

# Serializes the enriched HPO data (4-space indented JSON) and writes it to a new file.
with open('hpo_data_with_lineage.json', 'w') as file:
    file.write(json.dumps(hpo_data, indent=4))
print("The final HPO data with lineage information has been saved to 'hpo_data_with_lineage.json'.")

# In [None]:
import json
import csv
import re
import sys
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from fastembed import TextEmbedding

# Increase the CSV field size limit to handle large fields.
# On platforms where a C long is 32 bits, csv.field_size_limit(sys.maxsize)
# raises OverflowError, so halve the requested limit until it is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 2

# Paths to the files
JSON_FILE_PATH = 'hpo_data_with_lineage.json'
CSV_FILE_PATH = 'HPO_addons.csv'
OUTPUT_FILE = 'G2GHPO_metadata_test.npy'
MODEL_NAME = "BAAI/bge-small-en-v1.5"
CSV_OUTPUT_FILE = 'HP_DB_test.csv'  # Output file for manual inspection

# Load the CSV data once, when this cell/module runs. main() passes it to
# process_json_file, which reads its 'HP_ID' and 'info' columns.
csv_data = pd.read_csv(CSV_FILE_PATH)

# Matches one parenthesised group, parentheses included: from an opening
# parenthesis to the nearest closing one on the same line.
PATTERN = re.compile(r'\(.*?\)')

def clean_text(text):
    """Return *text* without parenthesised groups, with underscores turned
    into spaces, in lower case.

    Whitespace around a removed group is left untouched.
    """
    without_groups = PATTERN.sub('', text)
    return without_groups.replace('_', ' ').lower()

def process_json_file(json_file_path, csv_data):
    """Processes the JSON file and integrates additional information from the CSV.

    Args:
        json_file_path: Path to a JSON file mapping HP ids (``HP_<digits>``)
            to detail dicts with optional ``label``, ``synonyms``,
            ``definition`` and ``lineage`` entries.
        csv_data: Table with ``HP_ID`` and ``info`` columns (ids written
            with a colon, e.g. ``HP:0000001``) holding extra text per term.

    Returns:
        ``(data, csv_rows)``. ``data`` is a list of
        ``(hp_id, info, lineage)`` tuples and ``csv_rows`` is the same
        content as dicts with keys ``HP_ID``, ``info`` and ``lineage``.
        There is one entry per distinct cleaned text of a term, in the
        order label, synonyms, definition, CSV add-ons; ``lineage`` is all
        of the term's lineages joined with ``', '``.
    """
    with open(json_file_path, 'r', encoding='utf-8') as file:
        hpo_data = json.load(file)

    # Group the CSV add-ons by id once, instead of filtering the whole
    # table again for every term.
    addons_by_id = {}
    for csv_hp_id, addon in zip(csv_data['HP_ID'], csv_data['info']):
        addons_by_id.setdefault(csv_hp_id, []).append(addon)

    data = []
    csv_rows = []  # Rows for the CSV output
    for hp_id, details in hpo_data.items():
        formatted_hp_id = hp_id.replace('_', ':')
        # Dict keys act as an insertion-ordered set, so duplicates are
        # dropped while the output order stays the same on every run.
        unique_info = {}

        # Clean and add label
        label = details.get('label')
        if label:
            unique_info[clean_text(label)] = None

        # Clean and add synonyms
        for synonym in details.get('synonyms', []):
            unique_info[clean_text(synonym)] = None

        # Clean and add definition
        definition = details.get('definition', '')
        if definition:
            unique_info[clean_text(definition)] = None

        # Add CSV info if available
        for addon in addons_by_id.get(formatted_hp_id, []):
            # Blank CSV cells are read as NaN (a float); skip anything
            # that is not text rather than failing in clean_text.
            if isinstance(addon, str):
                unique_info[clean_text(addon)] = None

        # Include lineage information
        joined_lineages = ', '.join(details.get('lineage', []))
        for info in unique_info:
            data.append((formatted_hp_id, info, joined_lineages))
            csv_rows.append({'HP_ID': formatted_hp_id, 'info': info, 'lineage': joined_lineages})
    return data, csv_rows

def save_to_csv(csv_rows, output_file):
    """Write the processed rows to a CSV file for manual inspection.

    Args:
        csv_rows: Iterable of dicts with the keys ``HP_ID``, ``info`` and
            ``lineage``.
        output_file: Destination path; an existing file is overwritten.
    """
    columns = ['HP_ID', 'info', 'lineage']
    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for row in csv_rows:
            writer.writerow(row)
    print(f"Processed data has been written to {output_file} for inspection.")

def calculate_depth(lineage):
    """Return the number of "->"-separated segments in *lineage*.

    A string without any "->" (including the empty string) gives 1.

    NOTE(review): every "->" in the string is counted, so a string that
    holds several lineages joined together gives the combined count
    rather than the depth of one path — confirm against the caller.
    """
    return len(lineage.split("->"))

def extract_organ_system(lineage):
    """Return the second "->"-separated segment of *lineage*, stripped.

    Returns "Unknown" when the string contains no "->".

    NOTE(review): for a lineage that starts at the term itself, the
    second segment is the term's immediate parent — confirm this is the
    intended "organ system".
    """
    _, arrow, remainder = lineage.partition("->")
    if not arrow:
        return "Unknown"
    second_segment = remainder.partition("->")[0]
    return second_segment.strip()

def create_vector_database(data, output_file, model_name):
    """Embed each text entry and save embeddings plus metadata to a .npy file.

    Args:
        data: Sequence of ``(hp_id, cleaned_info, lineage)`` tuples.
        output_file: Path of the NumPy file. If it already exists its
            records are loaded first and the new ones are appended to them.
        model_name: Model name passed to ``TextEmbedding``.

    Each saved record is a dict with the keys ``embedding``, ``metadata``
    (``info`` and ``hp_id``), ``lineage``, ``organ_system`` and
    ``depth_from_root``. Entries that raise during processing are
    reported on stdout and skipped.
    """
    print("Initializing embeddings model...")
    embedding_model = TextEmbedding(model_name=model_name)

    embedded_documents = []
    if os.path.exists(output_file):
        print("Loading existing embedded documents...")
        # NOTE: allow_pickle=True unpickles arbitrary objects; only load
        # files from a trusted source.
        embedded_documents = list(np.load(output_file, allow_pickle=True))
    else:
        print("Starting with new embedded documents list...")

    print(f"Data prepared with {len(data)} terms to embed.")
    batch_size = 100
    batch_starts = range(0, len(data), batch_size)
    print("Starting the embedding process...")
    for start in tqdm(batch_starts, total=len(batch_starts), desc="Embedding Texts"):
        # Batches only drive the progress bar; texts are embedded one at a time.
        for hp_id, cleaned_info, lineage in data[start:start + batch_size]:
            try:
                depth = calculate_depth(lineage)
                organ_system = extract_organ_system(lineage)
                vectors = list(embedding_model.embed([cleaned_info]))
                record = {
                    'embedding': np.array(vectors[0]),
                    'metadata': {'info': cleaned_info, 'hp_id': hp_id},
                    'lineage': lineage,
                    'organ_system': organ_system,
                    'depth_from_root': depth
                }
                embedded_documents.append(record)
            except Exception as e:
                print(f"Failed to embed text due to {e}")

    np.save(output_file, embedded_documents, allow_pickle=True)
    print(f"All embeddings and metadata are saved in: {output_file}")

def main():
    """Build the text entries, write the inspection CSV, then embed and save."""
    # Merge the HPO JSON with the CSV add-ons.
    records, inspection_rows = process_json_file(JSON_FILE_PATH, csv_data)

    # Dump the merged entries so they can be checked by hand.
    save_to_csv(inspection_rows, CSV_OUTPUT_FILE)
    print(f"The database contains {len(records)} entries ready for embedding.")

    # Compute embeddings and persist the vector database.
    create_vector_database(records, OUTPUT_FILE, MODEL_NAME)


if __name__ == "__main__":
    main()