In [1]:
from biochatter.llm_connect import GptConversation

conversation = GptConversation(
    model_name="gpt-4",
    prompts={},
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from biochatter.prompts import BioCypherPromptEngine
prompt_engine = BioCypherPromptEngine(
    schema_config_or_info_path="config/schema_config.yaml"
)

In [3]:
help(BioCypherPromptEngine.generate_query)

Help on function generate_query in module biochatter.prompts:

generate_query(self, question: str, query_language: str) -> str
    Wrap entity and property selection and query generation; return the
    generated query.
    
    Args:
        question: A user's question.
    
        query_language: The language of the query to generate.
    
    Returns:
        A database query that could answer the user's question.



In [3]:
query = prompt_engine.generate_query(
    question="Which genes are enriched in astrocytes?",
    ## database_language="Cypher", # an error in the example
    query_language='Cypher',
)

In [46]:
query

"MATCH (g:Gene)-[:IN_ORTHOLOGOUS_GROUP]->(og:OrthologousGroup)\nWHERE og.group_id = '8ZMG8'\nRETURN g.external_gene_name"

In [9]:
query = prompt_engine.generate_query(
    question="Which genes are in the orthologous group '8ZMG8'?",
    ## database_language="Cypher", # an error in the example
    query_language='Cypher',
)

In [16]:
import re

In [60]:
property_regex_where = r'[a-zA-Z]+\.\S+ |[a-zA-Z]+\..+$'
entity_regex = r'\([a-zA-Z]+:[a-zA-Z]+\)'
replace_regex = r' .*'

In [121]:
query

"MATCH (g:Gene)-[:IN_ORTHOLOGOUS_GROUP]->(og:OrthologousGroup)\nWHERE og.group_id = '8ZMG8'\nRETURN g.external_gene_name"

In [122]:
used_entities = re.findall(entity_regex, query)
used_entities

['(g:Gene)', '(og:OrthologousGroup)']

In [123]:
used_properties = re.findall(property_regex_where, query)
used_properties

['og.group_id ', 'g.external_gene_name']

In [124]:
def map_entities_to_labels(entity_list):
    """Map Cypher variable names to the node labels they are bound to.

    Args:
        entity_list: Iterable of node patterns such as ``"(g:Gene)"``.

    Returns:
        A dict from variable name to node label, e.g. ``{"g": "Gene"}``.
        Items that do not start with a complete ``(variable:Label)`` pattern
        are skipped; a repeated variable keeps the label seen last.
    """
    node_pattern = re.compile(r"\((\w+):(\w+)\)")
    parsed = (node_pattern.match(node) for node in entity_list)
    return {hit.group(1): hit.group(2) for hit in parsed if hit}

# Use the node patterns extracted from the query as the example input
entity_list = used_entities

# Map each query variable (e.g. "g") to its node label (e.g. "Gene")
entity_mapping = map_entities_to_labels(entity_list)

# Display the result, one "variable: label" pair per line
print("Entity Label to Type Mapping:")
for label, entity_type in entity_mapping.items():
    print(f"{label}: {entity_type}")


Entity Label to Type Mapping:
g: Gene
og: OrthologousGroup


In [125]:
used_properties

['og.group_id ', 'g.external_gene_name']

In [126]:
re.match(r"(\w+)\.(\w+)", 'og.group_id '.strip()).groups()

('og', 'group_id')

In [127]:
def map_properties_to_labels(property_list):
    """Map Cypher variable names to the property accessed on them.

    Args:
        property_list: Iterable of ``variable.property`` strings such as
            ``"og.group_id"``; trailing text after the property is ignored.

    Returns:
        A dict from variable name to property name. Strings that do not
        start with ``variable.property`` are skipped, and only the last
        property seen for a given variable is kept.
    """
    accessor = re.compile(r"(\w+)\.(\w+)")
    mapping = {}
    for text in property_list:
        found = accessor.match(text)
        if found is None:
            continue
        variable, prop_name = found.groups()
        mapping[variable] = prop_name
    return mapping

# Example list of properties
property_list = used_properties

# Map query variables to the properties accessed on them
property_mapping = map_properties_to_labels(property_list)

In [128]:
property_mapping

{'og': 'group_id', 'g': 'external_gene_name'}

In [129]:
entity_mapping

{'g': 'Gene', 'og': 'OrthologousGroup'}

In [130]:
def join_dictionaries(dict1, dict2):
    """Join two dicts on their shared keys.

    Args:
        dict1: Mapping whose values become the keys of the result.
        dict2: Mapping whose values become the values of the result.

    Returns:
        A dict ``{dict1[k]: dict2[k]}`` for every key ``k`` of ``dict1`` that
        is also present in ``dict2``, in ``dict1`` iteration order.
    """
    return {dict1[key]: dict2[key] for key in dict1 if key in dict2}


In [131]:
used_entity_property = join_dictionaries(entity_mapping, property_mapping)

In [132]:
used_entity_property

{'Gene': 'external_gene_name', 'OrthologousGroup': 'group_id'}

In [138]:
used_entity_property.items()

dict_items([('Gene', 'external_gene_name'), ('OrthologousGroup', 'group_id')])

In [140]:
entity='Gene'
property='external_gene_name'

In [141]:
avail_property_entity = list(prompt_engine.entities[entity]['properties'].keys())
avail_property_entity

['external_gene_name']

In [143]:
property in avail_property_entity

True

In [144]:
# For every (entity label, property) pair found in the query, show the
# properties the schema declares for that entity and whether the queried
# property is among them.
# NOTE(review): the loop variable `property` shadows the builtin of the same
# name for the rest of the kernel session; later cells read the leaked
# `entity`, `property` and `avail_property_entity`.
for entity, property in used_entity_property.items():
    print(entity)
    print(property)
    print(type(property))
    # property names declared for this entity in the prompt engine's schema
    avail_property_entity = list(prompt_engine.entities[entity]['properties'].keys())
    print(avail_property_entity)
    # True when the property used in the query is declared for the entity
    print(property in avail_property_entity)

Gene
external_gene_name
<class 'str'>
['external_gene_name']
True
OrthologousGroup
group_id
<class 'str'>
['eggnog_dataset_name', 'eggnog_dataset_id']
False


In [148]:
entity in prompt_engine.entities.keys()

True

In [145]:

list(prompt_engine.entities[entity]['properties'].keys())

['eggnog_dataset_name', 'eggnog_dataset_id']

In [119]:
avail_property_entity

['eggnog_dataset_name', 'eggnog_dataset_id']

In [None]:
avail_property_entity = list(prompt_engine.entities[entity]['properties'].keys())

In [36]:
all_used_prop = re.findall(get_propertis_regex, query)

In [42]:
all_used_prop

[".group_id = '8ZMG8'", '.external_gene_name']

In [43]:
def substract_property(str):
    """Reduce a matched property fragment to the bare property name.

    Args:
        str: A fragment taken from the query, e.g. ``".group_id = '8ZMG8'"``.

    Returns:
        The fragment with everything matched by the notebook-level
        ``replace_regex`` removed and all literal dots stripped.
    """
    # Raw string: '\.' in a normal literal is an invalid escape sequence that
    # newer Python versions warn about; the pattern itself is unchanged.
    return re.sub(r'\.', '', re.sub(replace_regex, "", str))

In [44]:
used_properties = [substract_property(i) for i in all_used_prop ]

In [45]:
used_properties

['group_id', 'external_gene_name']

In [41]:
prompt_engine.entities

{'Gene': {'represented_as': 'node',
  'preferred_id': 'ensembl_gene_id',
  'label_in_input': 'gene',
  'properties': {'external_gene_name': 'str'}},
 'CellType': {'represented_as': 'node',
  'preferred_id': 'cell_ontology_id',
  'label_in_input': 'cell_type',
  'is_a': 'cell',
  'properties': {'cell_type_name': 'str',
   'tissue_name': 'str',
   'uberon_tissue_id': 'str'}},
 'Species': {'represented_as': 'node',
  'preferred_id': 'ncbi_txid',
  'label_in_input': 'species',
  'is_a': 'cellular organism',
  'properties': {'species_scientific_name': 'str'}},
 'OrthologousGroup': {'represented_as': 'node',
  'preferred_id': 'eggnog_id',
  'label_in_input': 'orthologous_group',
  'is_a': 'gene family',
  'properties': {'eggnog_dataset_name': 'str', 'eggnog_dataset_id': 'str'}}}

In [7]:
success = prompt_engine._select_entities(
    question="Which genes are enriched in cell type named astrocytes?"
)

In [8]:
prompt_engine.selected_entities
## this needs to change (entities appear twice in the output below)

['Gene', 'CellType', 'Gene', 'CellType']

In [9]:
prompt_engine.entities['CellType']


{'represented_as': 'node',
 'preferred_id': 'cell_ontology_id',
 'label_in_input': 'cell_type',
 'is_a': 'cell',
 'properties': {'cell_type_name': 'str',
  'tissue_name': 'str',
  'uberon_tissue_id': 'str'}}

In [10]:
help(prompt_engine._select_relationships)

Help on method _select_relationships in module biochatter.prompts:

_select_relationships() -> bool method of biochatter.prompts.BioCypherPromptEngine instance
    Given a question and the preselected entities, select relationships for
    the query.
    
    Args:
        question: A user's question.
    
        entities: A list of entities that are relevant to the question.
    
    Returns:
        True if at least one relationship was selected, False otherwise.
    
    Todo:
        Now we have the problem that we discard all relationships that do
        not have a source and target, if at least one relationship has a
        source and target. At least communicate this all-or-nothing
        behaviour to the user.



In [11]:
prompt_engine.relationships

{'GeneInOrthologousGroup': {'represented_as': 'edge',
  'is_relationship': True,
  'present_in_knowledge_graph': True,
  'is_a': 'gene to gene family association',
  'label_in_input': 'gene_in_orthologous_group',
  'source': 'Gene',
  'target': 'OrthologousGroup'},
 'GeneFromSpecies': {'represented_as': 'edge',
  'is_relationship': True,
  'present_in_knowledge_graph': True,
  'is_a': 'association',
  'label_in_input': 'gene_from_species',
  'source': 'Gene',
  'target': 'Species'},
 'CellTypeFromSpecies': {'represented_as': 'edge',
  'is_relationship': True,
  'present_in_knowledge_graph': True,
  'is_a': 'association',
  'label_in_input': 'cell_type_from_species',
  'source': 'CellType',
  'target': 'Species'},
 'GeneEnrichedInCellType': {'represented_as': 'edge',
  'is_relationship': True,
  'present_in_knowledge_graph': True,
  'is_a': 'association',
  'label_in_input': 'gene_enriched_in_cell_type',
  'source': 'Gene',
  'target': 'CellType'},
 'GeneEnhancedInCellType': {'represent

In [12]:
success = prompt_engine._select_relationships()

In [13]:
prompt_engine.selected_relationships

['GeneEnrichedInCellType', 'GeneEnrichedInCellType']

In [14]:
prompt_engine.selected_relationship_labels

{'GeneEnrichedInCellType': {'source': 'Gene', 'target': 'CellType'}}

prompt_engine.selected_entities
# Drop the None placeholders from the selected entities, keeping their order.
prompt_engine.selected_entities = [
    selected
    for selected in prompt_engine.selected_entities
    if selected is not None
]

In [15]:
# Collect the property names of every selected entity that declares any.
e_props = {}
for entity in prompt_engine.selected_entities:
    # entities with a missing or empty "properties" entry are left out
    if not prompt_engine.entities[entity].get("properties"):
        continue
    e_props[entity] = list(prompt_engine.entities[entity]["properties"].keys())

In [16]:
e_props

{'Gene': ['external_gene_name'],
 'CellType': ['cell_type_name', 'tissue_name', 'uberon_tissue_id']}

In [17]:
success = prompt_engine._select_properties()

In [18]:
prompt_engine.selected_properties

## Even though the question mentions the cell type name, the cell_type_name property was not selected here

{'Gene': ['external_gene_name']}

In [19]:
prompt_engine.entities['CellType']['properties']

{'cell_type_name': 'str', 'tissue_name': 'str', 'uberon_tissue_id': 'str'}

In [64]:
prompt_engine.selected_properties = {
    'Gene': ['external_gene_name'],
    "CellType": ['cell_type_name'],
}

In [65]:
query = prompt_engine._generate_query(
    question="Which genes are enriched in astrocytes?",
    entities=prompt_engine.selected_entities,
    relationships=prompt_engine.selected_relationship_labels,
    properties=prompt_engine.selected_properties,
    query_language="Cypher",
)

In [66]:
query

"MATCH (g:Gene)-[:GeneEnrichedInCellType]->(c:CellType)\nWHERE c.cell_type_name = 'astrocytes'\nRETURN g.external_gene_name"

In [67]:
## after some manual work the query can be generated

In [68]:
[ entity for entity in prompt_engine.selected_properties.keys()]

['Gene', 'CellType']

In [69]:
prompt_engine.selected_properties['Gene']

['external_gene_name']

In [70]:
selected_property_entity = prompt_engine.selected_properties['Gene']

In [71]:
avail_property_entity = list(prompt_engine.entities['Gene']['properties'].keys())

In [75]:
all(i in avail_property_entity for i in selected_property_entity)

True

In [73]:
selected_property_entity

['external_gene_name']

In [74]:
avail_property_entity

['external_gene_name']

In [76]:
def check_property_exists(entity):
    """Tell whether every property selected for ``entity`` is in the schema.

    Reads the notebook-level ``prompt_engine``: the selected properties come
    from ``prompt_engine.selected_properties[entity]`` and the declared ones
    from ``prompt_engine.entities[entity]['properties']``.

    Args:
        entity: Entity label used as key in both mappings.

    Returns:
        True if all selected properties are declared (also when none are
        selected), False as soon as one is missing.
    """
    chosen = prompt_engine.selected_properties[entity]
    declared = list(prompt_engine.entities[entity]['properties'].keys())
    for name in chosen:
        if name not in declared:
            return False
    return True

In [78]:
passed_entities = [ entity for entity in prompt_engine.selected_properties.keys() if check_property_exists(entity)]

In [80]:
# what pct of entities passed
len(passed_entities) / len(prompt_engine.selected_properties.keys())

1.0

In [86]:
prompt_engine.selected_properties

{'Gene': ['external_gene_name'], 'CellType': ['cell_type_name']}

In [85]:
prompt_engine.relationships['GeneInOrthologousGroup']['property']

KeyError: 'property'

In [149]:
query

"MATCH (g:Gene)-[:IN_ORTHOLOGOUS_GROUP]->(og:OrthologousGroup)\nWHERE og.group_id = '8ZMG8'\nRETURN g.external_gene_name"

In [152]:
prompt_engine.relationships

{'GeneInOrthologousGroup': {'represented_as': 'edge',
  'is_relationship': True,
  'present_in_knowledge_graph': True,
  'is_a': 'gene to gene family association',
  'label_in_input': 'gene_in_orthologous_group',
  'source': 'Gene',
  'target': 'OrthologousGroup'},
 'GeneFromSpecies': {'represented_as': 'edge',
  'is_relationship': True,
  'present_in_knowledge_graph': True,
  'is_a': 'association',
  'label_in_input': 'gene_from_species',
  'source': 'Gene',
  'target': 'Species'},
 'CellTypeFromSpecies': {'represented_as': 'edge',
  'is_relationship': True,
  'present_in_knowledge_graph': True,
  'is_a': 'association',
  'label_in_input': 'cell_type_from_species',
  'source': 'CellType',
  'target': 'Species'},
 'GeneEnrichedInCellType': {'represented_as': 'edge',
  'is_relationship': True,
  'present_in_knowledge_graph': True,
  'is_a': 'association',
  'label_in_input': 'gene_enriched_in_cell_type',
  'source': 'Gene',
  'target': 'CellType'},
 'GeneEnhancedInCellType': {'represent

In [155]:
check_property_exists(prompt_engine, query)

[True, False]


In [164]:
query_alt = "MATCH (ct:CellType {cell_type_name: 'Astro'})<-[:GeneEnrichedInCellType]-(g:Gene)\nRETURN g)"

In [None]:
r"\((\w+):(\w+) \{(\w+).+\}\)"

In [214]:
query_alt

"MATCH (ct:CellType {cell_type_name: 'Astro'})<-[:GeneEnrichedInCellType]-(g:Gene)\nRETURN g)"

In [218]:
re.search(r"\((\w+):\w+ \{(\w+): ", query_alt).groups()

('ct', 'cell_type_name')

In [256]:
def substract_property(string, replace_regex):
    """Reduce a matched property fragment to the bare property name.

    Args:
        string: A fragment taken from the query, e.g.
            ``".group_id = '8ZMG8'"``.
        replace_regex: Pattern whose matches are deleted from ``string``
            before the dots are stripped.

    Returns:
        ``string`` with every ``replace_regex`` match removed and all literal
        dots stripped, e.g. ``"group_id"``.
    """
    trimmed = re.sub(replace_regex, "", string)
    # Raw string: '\.' in a normal literal is an invalid escape sequence that
    # newer Python versions warn about; the pattern itself is unchanged.
    return re.sub(r'\.', '', trimmed)

def map_entities_to_labels(entity_list):
    """Map Cypher variable names to the node labels they are bound to.

    Args:
        entity_list: Iterable of strings starting with ``(variable:Label``;
            anything after the label (closing parenthesis, inline property
            map, ...) is ignored.

    Returns:
        A dict from variable name to node label. Non-matching strings are
        skipped; a repeated variable keeps the label seen last.
    """
    matches = (re.match(r"\((\w+):(\w+)", node) for node in entity_list)
    return dict(hit.groups() for hit in matches if hit)

def map_where_properties_to_labels(property_list):
    """Map Cypher variable names to the property accessed via dot notation.

    Args:
        property_list: Iterable of ``variable.property`` strings such as
            ``"og.group_id"``.

    Returns:
        A dict from variable name to property name. Strings that do not
        start with ``variable.property`` are skipped, and only the last
        property seen for a given variable is kept.
    """
    accessor = re.compile(r"(\w+)\.(\w+)")
    hits = [accessor.match(expr) for expr in property_list]
    return {hit.group(1): hit.group(2) for hit in hits if hit is not None}

def map_bracket_properties_to_labels(property_list):
    """Map Cypher variable names to the property set in an inline ``{...}`` map.

    Args:
        property_list: Iterable of strings containing a fragment of the form
            ``(variable:Label {property:``.

    Returns:
        A dict from variable name to the first property name of the inline
        map. Strings without such a fragment are skipped; a repeated
        variable keeps the property seen last.
    """
    mapping = {}
    for fragment in property_list:
        found = re.search(r"\((\w+):\w+ \{(\w+):", fragment)
        if not found:
            continue
        mapping[found.group(1)] = found.group(2)
    return mapping

def join_dictionaries(dict1, dict2):
    """Join two dicts on their shared keys.

    Args:
        dict1: Mapping whose values become the keys of the result.
        dict2: Mapping whose values become the values of the result.

    Returns:
        A dict ``{dict1[k]: dict2[k]}`` for every key ``k`` of ``dict1`` that
        is also present in ``dict2``, in ``dict1`` iteration order.
    """
    shared_keys = [key for key in dict1 if key in dict2]
    return {dict1[key]: dict2[key] for key in shared_keys}

def get_used_property_from_query(query):
    """Extract the entities and properties a Cypher query refers to.

    Properties are recognised in two notations: dot access such as
    ``og.group_id`` (looked for when the query contains ``WHERE``) and inline
    property maps such as ``(ct:CellType {cell_type_name: ...})``.

    Args:
        query: The Cypher query string to inspect.

    Returns:
        A 3-tuple ``(entity_mapping, property_mapping, used_entity_property)``:
        variable name -> node label, variable name -> property name, and
        node label -> property name for variables present in both mappings.
        The property mappings are empty when no property is recognised.
    """
    # Start empty so a query with neither a WHERE clause nor an inline
    # property map returns empty mappings instead of raising
    # UnboundLocalError further down.
    property_mapping = {}

    # Inline maps are collected first and independently of WHERE, so a query
    # using both notations no longer loses its inline properties; on a clash
    # the dot-access entry added below takes precedence.
    if '{' in query:
        property_regex_bracket = r"\(\w+:\w+ \{\w+: "
        bracket_properties = [
            i.strip() for i in re.findall(property_regex_bracket, query)
        ]
        property_mapping.update(
            map_bracket_properties_to_labels(bracket_properties)
        )

    if "WHERE" in query:
        property_regex_where = r'[a-zA-Z]+\.\S+ |[a-zA-Z]+\..+$'
        where_properties = [
            i.strip() for i in re.findall(property_regex_where, query)
        ]
        property_mapping.update(
            map_where_properties_to_labels(where_properties)
        )

    # all entities involved in the query with the variable name
    entity_regex = r'\([a-zA-Z]+:[a-zA-Z]+'
    used_entities = [i.strip() for i in re.findall(entity_regex, query)]

    # map entity and property via the variable name
    entity_mapping = map_entities_to_labels(used_entities)

    # get all the entity and respective properties used in the cypher query
    used_entity_property = join_dictionaries(entity_mapping, property_mapping)

    return entity_mapping, property_mapping, used_entity_property




def check_property_exists(prompt_engine, query):
    """Check whether the properties used in a query are declared in the schema.

    Args:
        prompt_engine: Object exposing ``entities`` and ``relationships``
            dicts keyed by label, whose entries may carry a ``'properties'``
            dict.
        query: The Cypher query string to inspect.

    Returns:
        A list with one boolean per (label, property) pair found by
        ``get_used_property_from_query``: True if the property is declared
        for that label, False otherwise. The list is also printed, as
        before.
    """
    score = []

    used_entity_property = get_used_property_from_query(query)[2]

    for entity, property_name in used_entity_property.items():
        if entity in prompt_engine.entities:
            schema_entry = prompt_engine.entities[entity]
        elif entity in prompt_engine.relationships:
            schema_entry = prompt_engine.relationships[entity]
        else:
            # Unknown label: previously this reused the list left over from
            # the previous iteration (or raised a NameError on the first one).
            schema_entry = {}
        # Entries without a 'properties' key (e.g. the relationships shown in
        # this notebook) simply declare no properties instead of raising
        # KeyError.
        avail_property_entity = schema_entry.get('properties') or {}
        score.append(property_name in avail_property_entity)

    print(score)
    # Also hand the result back so callers can compute on it.
    return score


In [257]:
check_property_exists( prompt_engine, query)

[True, False]


In [252]:
check_property_exists( prompt_engine, query_alt)

[True]


In [253]:
get_used_property_from_query(query)

({'g': 'Gene', 'og': 'OrthologousGroup'},
 {'og': 'group_id', 'g': 'external_gene_name'},
 {'Gene': 'external_gene_name', 'OrthologousGroup': 'group_id'})

In [254]:
get_used_property_from_query(query_alt)

({'ct': 'CellType', 'g': 'Gene'},
 {'ct': 'cell_type_name'},
 {'CellType': 'cell_type_name'})

In [255]:
query_alt

"MATCH (ct:CellType {cell_type_name: 'Astro'})<-[:GeneEnrichedInCellType]-(g:Gene)\nRETURN g)"

In [236]:
property_regex_bracket = r"\(\w+:\w+ \{\w+: "
used_properties = re.findall(property_regex_bracket, query_alt)
used_properties = [ i.strip() for i in used_properties]
property_mapping = map_bracket_properties_to_labels(used_properties)

In [241]:
used_properties

['(ct:CellType {cell_type_name:']

In [239]:
map_bracket_properties_to_labels(used_properties)

{}

In [242]:
# Try the capturing pattern on the first extracted fragment.
# NOTE(review): the original cell had a stray `]` and no subject string;
# `used_properties[0]` is assumed to be the intended input -- confirm.
re.search(r"\((\w+):\w+ \{(\w+):", used_properties[0])

In [265]:
'random' in prompt_engine.entities['Gene']

False

In [267]:
re.match(r"(\w+)\.(\w+)", 'r.GENE_EXPRESSED_IN_CELL_TYPE').groups()

('r', 'GENE_EXPRESSED_IN_CELL_TYPE')

In [268]:
prompt_engine.selected_properties

{'Gene': {'external_gene_name': []}}