In [4]:
from biochatter.llm_connect import GptConversation

# Instantiate a GptConversation for the "gpt-3.5-turbo" model with an empty
# prompts mapping.
# NOTE(review): `conversation` is not referenced by any later cell visible in
# this notebook — confirm whether it is still needed.
conversation = GptConversation(
    model_name="gpt-3.5-turbo",
    prompts={},
)

In [55]:
from biochatter.prompts import BioCypherPromptEngine
# Build the prompt engine from the schema configuration file. The cells below
# all operate on this `prompt_engine` instance.
# NOTE(review): the path is relative — presumably resolved against the
# notebook's working directory; confirm before running elsewhere.
prompt_engine = BioCypherPromptEngine(
    schema_config_or_info_path="config/schema_config.yaml"
)

In [56]:
help(BioCypherPromptEngine.generate_query)

Help on function generate_query in module biochatter.prompts:

generate_query(self, question: str, query_language: str) -> str
    Wrap entity and property selection and query generation; return the
    generated query.
    
    Args:
        question: A user's question.
    
        query_language: The language of the query to generate.
    
    Returns:
        A database query that could answer the user's question.



# One-shot path: generate_query wraps entity/property selection and query
# generation in a single call and returns the generated query string.
query = prompt_engine.generate_query(
    question="Which genes are enriched in astrocytes?",
    # The keyword is `query_language` (see the signature printed by help());
    # the `database_language="Cypher"` keyword used in the example is an error.
    query_language='Cypher',
)

In [57]:
prompt_engine.entities

{'Gene': {'represented_as': 'node',
  'preferred_id': 'ensembl_gene_id',
  'label_in_input': 'gene',
  'properties': {'ncbi_gene_name': 'str'}},
 'CellType': {'represented_as': 'node',
  'preferred_id': 'cell_ontology_id',
  'label_in_input': 'cell_type',
  'is_a': 'anatomical entity',
  'properties': {'cell_type_name': 'str',
   'tissue_name': 'str',
   'uberon_tissue_id': 'str'}},
 'Species': {'represented_as': 'node',
  'preferred_id': 'ncbi_txid',
  'label_in_input': 'species',
  'is_a': 'cellular organism',
  'properties': {'species_scientific_name': 'str'}},
 'OrthologousGroup': {'represented_as': 'node',
  'preferred_id': 'eggnog_id',
  'label_in_input': 'orthologous_group',
  'is_a': 'gene family',
  'properties': {'eggnog_dataset_name': 'str', 'eggnog_dataset_id': 'str'}}}

In [58]:
# Run only the entity-selection step, with the question rephrased to say
# "cell type named astrocytes" explicitly. The selection is inspected via
# prompt_engine.selected_entities in the next cell.
# NOTE(review): `success` is presumably a boolean status flag, as documented
# for _select_relationships — confirm for _select_entities.
success = prompt_engine._select_entities(
    question="Which genes are enriched in cell type named astrocytes?"
)

In [59]:
prompt_engine.selected_entities
## this needs to change

['Gene', 'CellType']

In [60]:
prompt_engine.entities['CellType']


{'represented_as': 'node',
 'preferred_id': 'cell_ontology_id',
 'label_in_input': 'cell_type',
 'is_a': 'anatomical entity',
 'properties': {'cell_type_name': 'str',
  'tissue_name': 'str',
  'uberon_tissue_id': 'str'}}

In [61]:
help(prompt_engine._select_relationships)

Help on method _select_relationships in module biochatter.prompts:

_select_relationships() -> bool method of biochatter.prompts.BioCypherPromptEngine instance
    Given a question and the preselected entities, select relationships for
    the query.
    
    Args:
        question: A user's question.
    
        entities: A list of entities that are relevant to the question.
    
    Returns:
        True if at least one relationship was selected, False otherwise.
    
    Todo:
        Now we have the problem that we discard all relationships that do
        not have a source and target, if at least one relationship has a
        source and target. At least communicate this all-or-nothing
        behaviour to the user.



In [62]:
prompt_engine.relationships

{'GeneInOrthologousGroup': {'represented_as': 'edge',
  'is_a': 'gene to gene family association',
  'label_in_input': 'gene_in_orthologous_group',
  'source': 'Gene',
  'target': 'OrthologousGroup'},
 'GeneFromSpecies': {'represented_as': 'edge',
  'is_a': 'association',
  'label_in_input': 'gene_from_species',
  'source': 'Gene',
  'target': 'Species'},
 'CellTypeFromSpecies': {'represented_as': 'edge',
  'is_a': 'association',
  'label_in_input': 'cell_type_from_species',
  'source': 'CellType',
  'target': 'Species'},
 'GeneEnrichedInCellType': {'represented_as': 'edge',
  'is_a': 'association',
  'label_in_input': 'gene_enriched_in_cell_type',
  'source': 'Gene',
  'target': 'CellType'},
 'GeneEnhancedInCellType': {'represented_as': 'edge',
  'is_a': 'association',
  'label_in_input': 'gene_enhanced_in_cell_type',
  'source': 'Gene',
  'target': 'CellType'}}

In [63]:
# Select relationships for the question and the preselected entities. Per the
# help() output, this takes no arguments and returns True if at least one
# relationship was selected, False otherwise.
success = prompt_engine._select_relationships()

In [64]:
prompt_engine.selected_relationships

['GeneEnrichedInCellType']

In [65]:
prompt_engine.selected_relationship_labels

{'GeneEnrichedInCellType': {'source': 'Gene', 'target': 'CellType'}}

In [66]:
# Drop any None entries from the selected entities before the property
# selection step runs.
# The bare `prompt_engine.selected_entities` expression that used to precede
# this assignment was not the last statement of the cell, so it displayed
# nothing; it has been removed as dead code.
# NOTE(review): it is not visible here why None entries can appear in the
# selection — confirm against the entity-selection step.
prompt_engine.selected_entities = [
    entity for entity in prompt_engine.selected_entities if entity is not None
]

In [71]:
prompt_engine.selected_entities

['Gene', 'CellType']

In [72]:
# Run the property-selection step; the outcome is inspected via
# prompt_engine.selected_properties a few cells below.
# NOTE(review): `success` is overwritten by each selection step and is never
# checked in the visible cells — consider asserting on it.
success = prompt_engine._select_properties()

In [73]:
prompt_engine.entities['CellType']['properties']

{'cell_type_name': 'str', 'tissue_name': 'str', 'uberon_tissue_id': 'str'}

In [74]:
prompt_engine.selected_properties

## even though the question mentions the cell type name, the cell_type_name property was not selected

{'Gene': ['ncbi_gene_name']}

In [75]:
# Manual workaround: automatic property selection only returned the Gene
# property, so set the selection by hand and include CellType.cell_type_name,
# which the query needs in order to filter on the cell type.
prompt_engine.selected_properties = {
    "Gene": ["ncbi_gene_name"],
    "CellType": ["cell_type_name"],
}

In [77]:
# Call the lower-level _generate_query directly, passing in the entities,
# relationship labels and manually corrected properties currently stored on
# the prompt engine. This rebinds `query`, replacing the value assigned by the
# earlier generate_query call.
# NOTE(review): the question here omits the "cell type named" phrasing that
# was used for _select_entities — confirm that this is intentional.
query = prompt_engine._generate_query(
    question="Which genes are enriched in astrocytes?",
    entities=prompt_engine.selected_entities,
    relationships=prompt_engine.selected_relationship_labels,
    properties=prompt_engine.selected_properties,
    query_language="Cypher",
)

In [78]:
query

"MATCH (g:Gene)-[:GeneEnrichedInCellType]->(c:CellType)\nWHERE c.cell_type_name = 'astrocytes'\nRETURN g.ncbi_gene_name"

In [None]:
## after some manual work the query can be generated