# Sample Builder - Parametric Queries - Part II

**In this notebook:**

- process data extracted from KG,
- generate sample queries - part II,
- save the data to a file.

There are 31 generating functions in this collection.

## Workspace Setup

In [None]:
%pip install neo4j
%pip install python-levenshtein

In [None]:
# Load and mount the drive helper
from google.colab import drive

# This will prompt for authorization
drive.mount('/content/drive')

# Set the working directory
%cd '/content/drive/MyDrive/cypherGen/'

Mounted at /content/drive
/content/drive/MyDrive/cypherGen


In [None]:
import neo4j
import pandas as pd
import random
import itertools

# Import the local modules
from utils.utilities import *
from utils.graph_utils import *

## Data Preparation for Sample Building

In [None]:
# Create a path variable for the data folder
data_path = '/content/drive/MyDrive/cypherGen/datas/'

# File names
schema_file = 'schema_file.json'
#formatted_schema_file = 'formatted_schema.txt'
node_instances_file = 'node_instances_file.json'
rels_instances_file = 'rels_instances_file.json'

# Read data from JSON files
jschema = read_json(data_path+schema_file)
node_instances = read_json(data_path+node_instances_file)
rels_instances = read_json(data_path+rels_instances_file)

In [None]:
# List of node labesl
nodes = get_nodes_list(jschema)

# Read the nodes with their properties and datatypes
node_props_types = jschema['node_props']

# Read the relationship properties with their datatypes
rel_props_types = jschema['rel_props']

# Read the relationships as a list of triples
relationships = jschema['relationships']

# Extract node labels and properties with data type: STRING
string_properties = get_nodes_properties_of_datatype(jschema, nodes,'STRING')

# Extract node labels and properties with data type: INTEGER
integer_properties = get_nodes_properties_of_datatype(jschema, nodes, 'INTEGER')

# Extract node labels and properties with data type: DATE
date_properties = get_nodes_properties_of_datatype(jschema, nodes, 'DATE')

In [None]:
# Extract and parse n instances of specified datatype

string_parsed = parse_node_instances_datatype(jschema, node_instances, nodes,'STRING')
string_parsed = filter_empty_sublists(string_parsed)

integer_parsed =  parse_node_instances_datatype(jschema, node_instances, nodes,'INTEGER')
integer_parsed = filter_empty_sublists(integer_parsed)

date_parsed =  parse_node_instances_datatype(jschema, node_instances, nodes,'DATE')
date_parsed = filter_empty_sublists(date_parsed)

# All node instances parsed
dtypes_parsed = string_parsed+integer_parsed+date_parsed

In [None]:
# Parse relationships instances

string_string_rels = filter_relationships_instances(jschema, rels_instances, 'STRING', 'STRING')
string_integer_rels = filter_relationships_instances(jschema, rels_instances, 'STRING', 'INTEGER')
string_date_rels = filter_relationships_instances(jschema, rels_instances, 'STRING', 'DATE')
integer_integer_rels = filter_relationships_instances(jschema, rels_instances, 'INTEGER', 'INTEGER')
date_date_rels = filter_relationships_instances(jschema, rels_instances, 'DATE', 'DATE')
all_rels = string_string_rels+string_integer_rels+string_date_rels+integer_integer_rels+date_date_rels

In [None]:
# List to collect the samples
trainer=[]

# Function to select a number of samples of each type
def collect_samples(sampler, M=100):
    M = min(M, len(sampler))
    rsampler = random.sample(sampler, M)
    return rsampler

## Query the Nodes

In [None]:
# Count nodes of given label
def count_nodes_of_given_label():
    def prompter(label_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find the total number of {label_1} in the graph!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) RETURN count(n)"
                   }
        return message
    sampler = []
    for label in nodes:
        temp_dict = prompter(label)
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_1 = count_nodes_of_given_label()
# Print information about the sampler set
print(f"There are {len(sampler_1)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_1)
# Display an example for inspection
sampler_1[0]

There are 9 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find the total number of Article in the graph!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': 'MATCH (n:Article) RETURN count(n)'}

In [None]:
# Find node by property
def find_node_by_property():
    def prompter(label_1, prop_1, val_1):
        # Extract subschema for the variables of interest
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find the {label_1} for which {prop_1} is {val_1}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1} {{{prop_1}:'{val_1}'}}) RETURN n"
                   }
        return message

    sampler=[]
    for e in dtypes_parsed:
        temp_dict = prompter(e[0], e[1], e[2])
        sampler.append(temp_dict)

    return sampler


# Build the set
sampler_2 =  find_node_by_property()
# Print information about the sampler set
print(f"There are {len(sampler_2)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_2)
# Display an example for inspection
sampler_2[0]

There are 87 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find the Article for which abstract is   Using matrix inversion and determinant evaluation techniques we prove several\nsummation and transformation formulas for terminating, balanced,\nvery-well-poised, elliptic hypergeometric series.\n!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': "MATCH (n:Article {abstract:'  Using matrix inversion and determinant evaluation techniques we 

In [None]:
# Find node with property that starts with substring
def find_node_by_start_substring():
    def prompter(label_1, prop_1, val_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find the {label_1} for which {prop_1} starts with {val_1[:3]}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} STARTS WITH '{val_1[:3]}' RETURN n"
                   }
        return message

    sampler=[]
    for e in string_parsed:
        temp_dict = prompter(e[0], e[1], e[2])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_3 = find_node_by_start_substring()
# Print information about the sampler set
print(f"There are {len(sampler_3)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_3)
# Display an example for inspection
sampler_3[0]

There are 76 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find the Article for which abstract starts with   U!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': "MATCH (n:Article) WHERE n.abstract STARTS WITH '  U' RETURN n"}

In [None]:
# Find properties of nodes with property that starts with substring
def return_properties_for_prop_by_start_substring():
    def prompter(label_1, prop_1, val_1, prop_2, prop_3):
        # Extract subschema for the variables of interest
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find the {label_1} for which the {prop_1} starts with {val_1[:2]} and return the {prop_1}, {prop_2} and {prop_3}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} STARTS WITH '{val_1[:2]}' RETURN n.{prop_1} AS {prop_1}, n.{prop_2} AS {prop_2}, n.{prop_3} AS {prop_3}"
                   }
        return message

    sampler=[]

    for entry in string_parsed:
        label_1 = entry[0]
        prop_1 = entry[1]
        val_1 = entry[2]

        if len(node_props_types[label_1]) > 1:
                prop_2 = node_props_types[label_1][0]['property']
                prop_3 = node_props_types[label_1][1]['property']
        else:
                prop_2 = prop_3 = node_props_types[label_1][0]['property']

        temp_dict = prompter(label_1, prop_1, val_1, prop_2, prop_3)
        sampler.append(temp_dict)
    return sampler

# Build the set
sampler_4 = return_properties_for_prop_by_start_substring()
# Print information about the sampler set
print(f"There are {len(sampler_4)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_4)
# Display an example for inspection
sampler_4[20]

There are 76 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find the Topic for which the description starts with Th and return the description, cluster and description!',
 'Schema': 'Graph schema: Node properties are the following:\nTopic {cluster: INTEGER, description: STRING, label: STRING}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': "MATCH (n:Topic) WHERE n.description STARTS WITH 'Th' RETURN n.description AS description, n.cluster AS cluster, n.description AS description"}

## Query the Paths

In [None]:
# Find all one hoops from given node
def find_node_neighbours():
    def prompter(label_1, prop_1, val_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find all nodes directly connected to the {label_1} that has {prop_1} equal to {val_1}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH path=(:{label_1} {{{prop_1}:'{val_1}'}})-->() RETURN path"
                   }
        return message

    sampler=[]
    for e in dtypes_parsed:
        temp_dict = prompter(e[0], e[1], e[2])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_5 = find_node_neighbours()
# Print information about the sampler set
print(f"There are {len(sampler_5)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_5)
# Display an example for inspection
sampler_5[10]

There are 87 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find all nodes directly connected to the Article that has comments equal to 44 pages!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': "MATCH path=(:Article {comments:'44 pages'})-->() RETURN path"}

In [None]:
# Node count by property and relation
def find_node_relation_count():
    def prompter(label_1, prop_1,rel_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Fetch all the {label_1} and return the {prop_1} and the number of nodes connected to them via {rel_1}.""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) RETURN n.{prop_1} AS {prop_1}, size((n)<-[:{rel_1}]-()) AS count"
                   }
        return message

    sampler=[]
    for e in all_rels:
        for k, v in e[4].items():
            temp_dict = prompter(e[0], k, e[2])
            sampler.append(temp_dict)

    return sampler

# Build the set
sampler_6 = find_node_relation_count()
# Print information about the sampler set
print(f"There are {len(sampler_6)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_6)
# Display an example for inspection
sampler_6[0]

There are 144 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Fetch all the Article and return the name and the number of nodes connected to them via HAS_KEY.',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': 'MATCH (n:Article) RETURN n.name AS name, size((n)<-[:HAS_KEY]-()) AS count'}

In [None]:
# Node count by property and relation
def find_node_relation_node_count():
    def prompter(label_1, prop_1,rel_1, label_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find all the {label_1} and return their {prop_1} along with the count of {label_2} that are linked via {rel_1}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) -[:{rel_1}]->(m:{label_2}) RETURN n.{prop_1} AS {prop_1}, count(m) AS count"
                   }
        return message

    sampler=[]
    for e in all_rels:
        for k, v in e[1].items():
            temp_dict = prompter(e[0], k, e[2], e[3])
            sampler.append(temp_dict)

    return sampler

# Build the set
sampler_7 = find_node_relation_node_count()
# Print information about the sampler set
print(f"There are {len(sampler_7)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_7)
# Display an example for inspection
sampler_7[0]

There are 200 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find all the Article and return their abstract along with the count of Keyword that are linked via HAS_KEY!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING},Keyword {name: STRING, key_id: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report),(:Article)-[:HAS_KEY]->(:Keyword),(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': 'MATCH (n:Article) -[:HAS_KEY]->(m:Keyword) RETURN n.abstract AS abstract, count(m) AS count'}

## Filter with Nodes and Paths

In [None]:
# Count nodes of given label
def find_node_property_count():
    def prompter(label_1, prop_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find the total number of {label_1} that have the {prop_1} recorded!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE EXISTS(n.{prop_1}) RETURN count(n)"
                   }
        return message

    sampler = []

    for e in dtypes_parsed:
        temp_dict = prompter(e[0], e[1])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_8 = find_node_property_count()
# Print information about the sampler set
print(f"There are {len(sampler_8)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_8)
# Display an example for inspection
sampler_8[0]

There are 87 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find the total number of Article that have the abstract recorded!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': 'MATCH (n:Article) WHERE EXISTS(n.abstract) RETURN count(n)'}

In [None]:
# Count nodes of given label
def find_node_notproperty_count():
    def prompter(label_1, prop_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find the total number of {label_1} for which the {prop_1} is missing!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE NOT EXISTS(n.{prop_1}) RETURN count(n)"
                   }
        return message
    sampler = []

    for e in dtypes_parsed:
        temp_dict = prompter(e[0], e[1])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_9 = find_node_notproperty_count()
# Print information about the sampler set
print(f"There are {len(sampler_9)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_9)
# Display an example for inspection
sampler_9[0]

There are 87 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find the total number of Article for which the abstract is missing!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': 'MATCH (n:Article) WHERE NOT EXISTS(n.abstract) RETURN count(n)'}

In [None]:
# Pattern check
def find_notrelationships():
    def prompter(label_1, rel_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Fetch five {label_1} that are not linked through {rel_1} relationships!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (p:{label_1}) WHERE NOT EXISTS ((p)-[:{rel_1}]->()) RETURN p LIMIT 5"
        }
        return message

    sampler=[]
    for e in all_rels:
        temp_dict = prompter(e[0], e[2])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_10 = find_notrelationships()
# Print information about the sampler set
print(f"There are {len(sampler_10)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_10)
# Display an example for inspection
sampler_10[0]

There are 72 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Fetch five Article that are not linked through HAS_KEY relationships!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': 'MATCH (p:Article) WHERE NOT EXISTS ((p)-[:HAS_KEY]->()) RETURN p LIMIT 5'}

In [None]:
# Pattern check
def find_yesrelationships():
    def prompter(label_1, rel_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find four {label_1} that have {rel_1} links!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (p:{label_1}) WHERE EXISTS ((p)-[:{rel_1}]->()) RETURN p LIMIT 4"
        }
        return message

    sampler=[]
    for e in all_rels:
        temp_dict = prompter(e[0], e[2])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_11 = find_yesrelationships()
# Print information about the sampler set
print(f"There are {len(sampler_11)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_11)
# Display an example for inspection
sampler_11[0]

There are 72 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find four Article that have HAS_KEY links!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': 'MATCH (p:Article) WHERE EXISTS ((p)-[:HAS_KEY]->()) RETURN p LIMIT 4'}

In [None]:
# Node count by property and relation
def find_node_relation_ordered_count():
    def prompter(label_1, prop_1, rel_1, label_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""For each {label_1}, find the number of {label_2} linked via {rel_1} and retrieve the {prop_1} of the {label_1} and the {label_2} counts in ascending order!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) -[:{rel_1}]->(m:{label_2}) WITH DISTINCT n, m RETURN n.{prop_1} AS {prop_1}, count(m) AS {label_2.lower()}_count ORDER BY {label_2.lower()}_count"
        }
        return message
    sampler=[]
    for e in all_rels:
        for k, v in e[1].items():
            temp_dict = prompter(e[0], k, e[2], e[3])
            sampler.append(temp_dict)

    return sampler

# Build the set
sampler_12 = find_node_relation_ordered_count()
# Print information about the sampler set
print(f"There are {len(sampler_12)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_12)
# Display an example for inspection
sampler_12[0]

There are 200 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'For each Article, find the number of Keyword linked via HAS_KEY and retrieve the abstract of the Article and the Keyword counts in ascending order!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING},Keyword {name: STRING, key_id: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report),(:Article)-[:HAS_KEY]->(:Keyword),(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': 'MATCH (n:Article) -[:HAS_KEY]->(m:Keyword) WITH DISTINCT n, m RETURN n.abstrac

In [None]:
# Node count by property and relation
def find_node_relation_ordered_count_desc():
    def prompter(label_1, prop_1,rel_1, label_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""For each {label_1} find its {prop_1} and the count of {label_2} linked via {rel_1}, and retrieve seven results in desc order of the counts!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) -[:{rel_1}]->(m:{label_2}) WITH DISTINCT n, m RETURN n.{prop_1} AS {prop_1}, count(m) AS count ORDER BY count DESC LIMIT 7"
        }
        return message
    sampler=[]
    for e in all_rels:
        for k, v in e[1].items():
            temp_dict = prompter(e[0], k, e[2], e[3])
            sampler.append(temp_dict)

    return sampler

# Build the set
sampler_13 = find_node_relation_ordered_count_desc()
# Print information about the sampler set
print(f"There are {len(sampler_13)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_13)
# Display an example for inspection
sampler_13[0]

There are 200 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'For each Article find its abstract and the count of Keyword linked via HAS_KEY, and retrieve seven results in desc order of the counts!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING},Keyword {name: STRING, key_id: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report),(:Article)-[:HAS_KEY]->(:Keyword),(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': 'MATCH (n:Article) -[:HAS_KEY]->(m:Keyword) WITH DISTINCT n, m RETURN n.abstract AS abstrac

In [None]:
 # Node count by property and relation
def find_node_relation_ordered_count_filter():
    def prompter(label_1, prop_1, rel_1, label_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""For each {label_1} and its {prop_1}, count the {label_2} connected through {rel_1} and fetch the {prop_1} and the counts that are greater than 5, starting with the largest {prop_1} and count!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) -[:{rel_1}]->(m:{label_2}) WITH DISTINCT n, m WITH n.{prop_1} AS {prop_1}, count(m) AS count WHERE count > 5 RETURN {prop_1}, count ORDER BY {prop_1} DESC, count DESC"
        }
        return message

    sampler=[]
    for e in all_rels:
        for k, v in e[1].items():
            temp_dict = prompter(e[0], k, e[2], e[3])
            sampler.append(temp_dict)

    return sampler

# Build the set
sampler_14 = find_node_relation_ordered_count_filter()
# Print information about the sampler set
print(f"There are {len(sampler_14)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_14)
# Display an example for inspection
sampler_14[0]

There are 200 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'For each Article and its abstract, count the Keyword connected through HAS_KEY and fetch the abstract and the counts that are greater than 5, starting with the largest abstract and count!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING},Keyword {name: STRING, key_id: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report),(:Article)-[:HAS_KEY]->(:Keyword),(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': 'MATCH (n:Article) -[:HAS_KEY]->(m:Keyw

## Aggregations

In [None]:
# Node count by property and relation
def find_node_relation_ordered_count_collect():
    def prompter(label_1, prop_1, rel_1, label_2, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Fetch the {prop_1} of the {label_1} that are linked via {rel_1} to more than three {label_2}, and list {label_2} {prop_2} and {label_2} counts, ordering by {label_2} count and limiting to the top six results!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) -[:{rel_1}]->(m:{label_2}) WITH DISTINCT n, m WITH n.{prop_1} AS {prop_1}, count(m) AS count, COLLECT(m.{prop_2}) as {prop_2} WHERE count > 3 RETURN {prop_1}, count, {prop_2} ORDER BY count LIMIT 6"
        }
        return message
    sampler=[]
    for e in all_rels:
        for k, v in e[1].items():
            for kk, vv in e[4].items():
                temp_dict = prompter(e[0], k, e[2], e[3], kk)
                sampler.append(temp_dict)

    return sampler

# Build the set
sampler_15 = find_node_relation_ordered_count_collect()
# Print information about the sampler set
print(f"There are {len(sampler_15)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_15)
# Display an example for inspection
sampler_15[0]

There are 408 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Fetch the abstract of the Article that are linked via HAS_KEY to more than three Keyword, and list Keyword name and Keyword counts, ordering by Keyword count and limiting to the top six results!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING},Keyword {name: STRING, key_id: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report),(:Article)-[:HAS_KEY]->(:Keyword),(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': 'MATCH (n:Article) -[:HAS_KEY]->

In [None]:
# Node count by property and relation
def aggregate_integers_by_string():
    def prompter(label_1, prop_1, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""For each nonull {prop_1} of the {label_1}, how many times does it appear, and what are the minimum, maximum and average values of {prop_2} associated to it?""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} IS NOT NULL WITH DISTINCT n WITH n.{prop_1} as {prop_1}, COUNT(n) AS count, min(n.{prop_2}) AS min({prop_2}), max(n.{prop_2}) AS max({prop_2}), avg(n.{prop_2}) AS avg({prop_2}) RETURN {prop_1}, count, min({prop_2}), max({prop_2}), avg({prop_2})"
        }
        return message
    sampler=[]
    for e in integer_parsed:
        for ee in integer_parsed:
            if e[0] == ee[0]:
                temp_dict = prompter(e[0], e[1], ee[1])
                sampler.append(temp_dict)

    return sampler

# Creates a list of samples of the form
aggregate_integers_by_string()[1]

# Build the set
sampler_16 = aggregate_integers_by_string()
# Print information about the sampler set
print(f"There are {len(sampler_16)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_16)
# Display an example for inspection
sampler_16[0]

There are 25 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'For each nonull article_id of the Article, how many times does it appear, and what are the minimum, maximum and average values of article_id associated to it?',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': 'MATCH (n:Article) WHERE n.article_id IS NOT NULL WITH DISTINCT n WITH n.article_id as article_id, COUNT(n) AS count, min(n.article_id) AS min(article_id), max(n.article_id) A

In [None]:
# Node count by property and relation
def aggregate_numerical_by_integer():
    def prompter(label_1, prop_1, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find the {label_1} counts where {prop_1} is smaller than ten, and return the maximum, minimum and average values of the {prop_2}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} < 10 WITH DISTINCT n WITH n.{prop_1} as {prop_1}, COUNT(n) AS count, min(n.{prop_2}) AS min({prop_2}), max(n.{prop_2}) AS max({prop_2}), avg(n.{prop_2}) AS avg({prop_2}) RETURN {prop_1}, count, min({prop_2}), max({prop_2}), avg({prop_2})"
        }
        return message
    sampler=[]
    for e in integer_parsed:
        for ee in integer_parsed:
            if e[0] == ee[0]:
                temp_dict = prompter(e[0], e[1], ee[1])
                sampler.append(temp_dict)

    return sampler

# Build the set
sampler_17 = aggregate_numerical_by_integer()
# Print information about the sampler set
print(f"There are {len(sampler_17)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_17)
# Display an example for inspection
sampler_17[0]

There are 25 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find the Article counts where article_id is smaller than ten, and return the maximum, minimum and average values of the article_id!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': 'MATCH (n:Article) WHERE n.article_id < 10 WITH DISTINCT n WITH n.article_id as article_id, COUNT(n) AS count, min(n.article_id) AS min(article_id), max(n.article_id) AS max(article_id), avg(n.article_i

## Filter with WHERE and WITH

In [None]:
# Node count by property and relation
def find_node_property_by_condition_on_node():
    def prompter(label_1, prop_1, rel_1, label_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find the {prop_1} of {label_1} that each have more than one hundred {label_2} nodes connected via {rel_1}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) -[:{rel_1}]->(m:{label_2}) WITH DISTINCT n, m WITH n.{prop_1} AS {prop_1}, count(m) AS count WHERE {label_2} > 100 RETURN {prop_1}"
        }
        return message
    sampler=[]
    for e in all_rels:
        for k, v in e[1].items():
            temp_dict = prompter(e[0], k, e[2], e[3])
            sampler.append(temp_dict)

    return sampler

# Build the set
sampler_18 = find_node_property_by_condition_on_node()
# Print information about the sampler set
print(f"There are {len(sampler_18)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_18)
# Display an example for inspection
sampler_18[0]

There are 200 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find the abstract of Article that each have more than one hundred Keyword nodes connected via HAS_KEY!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING},Keyword {name: STRING, key_id: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report),(:Article)-[:HAS_KEY]->(:Keyword),(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': 'MATCH (n:Article) -[:HAS_KEY]->(m:Keyword) WITH DISTINCT n, m WITH n.abstract AS abstract, count(m) AS count WHERE Keyword 

In [None]:
# Node count by property and relation
def find_node_property_with_count_limit():
    def prompter(label_1, prop_1, rel_1, label_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Search for the {prop_1} values from 20 {label_1} that are linked to {label_2} via {rel_1} and return {prop_1} along with the respective {label_2} counts!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) -[:{rel_1}]->(m:{label_2}) WITH DISTINCT n, m RETURN n.{prop_1} AS {prop_1}, count(m) AS count LIMIT 20"
        }
        return message
    sampler=[]
    for e in all_rels:
        for k, v in e[1].items():
            temp_dict = prompter(e[0], k, e[2], e[3])
            sampler.append(temp_dict)

    return sampler

# Build the set
sampler_19 = find_node_property_with_count_limit()
# Print information about the sampler set
print(f"There are {len(sampler_19)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_19)
# Display an example for inspection
sampler_19[0]

There are 200 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Search for the abstract values from 20 Article that are linked to Keyword via HAS_KEY and return abstract along with the respective Keyword counts!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING},Keyword {name: STRING, key_id: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report),(:Article)-[:HAS_KEY]->(:Keyword),(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': 'MATCH (n:Article) -[:HAS_KEY]->(m:Keyword) WITH DISTINCT n, m RETURN n.abstrac

In [None]:
# Node count by property and relation
def find_node_property_with_rel():
    def prompter(label_1, prop_1, rel_1, label_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Fetch the {prop_1} from those {label_1} that are connected to {label_2} via a {rel_1}, and return the respective counts of {label_2} together with the {prop_1}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) -[:{rel_1}]->(m:{label_2}) WITH DISTINCT n, m RETURN n.{prop_1} AS {prop_1}, count(m) AS count"
        }
        return message
    sampler=[]
    for e in all_rels:
        for k, v in e[1].items():
            temp_dict = prompter(e[0], k, e[2], e[3])
            sampler.append(temp_dict)
    return sampler

# Build the set
sampler_20 = find_node_property_with_rel()
# Print information about the sampler set
print(f"There are {len(sampler_20)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_20)
# Display an example for inspection
sampler_20[0]

There are 200 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Fetch the abstract from those Article that are connected to Keyword via a HAS_KEY, and return the respective counts of Keyword together with the abstract!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING},Keyword {name: STRING, key_id: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report),(:Article)-[:HAS_KEY]->(:Keyword),(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': 'MATCH (n:Article) -[:HAS_KEY]->(m:Keyword) WITH DISTINCT n, m RETURN n.

## Temporal functions

In [None]:
# Node count by property and relation
def find_property_after_date():
    def prompter(label_1, prop_1, prop_2):

        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find all {prop_1} for {label_1} that have {prop_2} after January 1, 2020!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE date(n.{prop_2}) > date('2020-01-01') RETURN n.{prop_1}"
        }
        return message

    sampler=[]
    for e in dtypes_parsed:
        for ee in date_parsed:
            if e[0] == ee[0]:
                temp_dict = prompter(e[0], e[1], ee[1])
                sampler.append(temp_dict)

    return sampler

# Build the set
sampler_21 = find_property_after_date()
# Print information about the sampler set
print(f"There are {len(sampler_21)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_21)
# Display an example for inspection
sampler_21[0]

There are 16 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find all update_date for UpdateDate that have update_date after January 1, 2020!',
 'Schema': 'Graph schema: Node properties are the following:\nUpdateDate {update_date: DATE}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Article)-[:UPDATED]->(:UpdateDate)',
 'Cypher': "MATCH (n:UpdateDate) WHERE date(n.update_date) > date('2020-01-01') RETURN n.update_date"}

In [None]:
# Node count by property and relation
def find_property_in_year():
    def prompter(label_1, prop_1, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find all {prop_1} for {label_1} that have {prop_2} in 2020!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE date(n.{prop_2}).year = 2020 RETURN n.{prop_1}"
        }
        return message

    sampler=[]
    for e in dtypes_parsed:
        for ee in date_parsed:
            if e[0] == ee[0]:
                temp_dict = prompter(e[0], e[1], ee[1])
                sampler.append(temp_dict)

    return sampler

# Build the set
sampler_22 = find_property_in_year()
# Print information about the sampler set
print(f"There are {len(sampler_22)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_22)
# Display an example for inspection
sampler_22[0]

There are 16 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find all update_date for UpdateDate that have update_date in 2020!',
 'Schema': 'Graph schema: Node properties are the following:\nUpdateDate {update_date: DATE}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Article)-[:UPDATED]->(:UpdateDate)',
 'Cypher': 'MATCH (n:UpdateDate) WHERE date(n.update_date).year = 2020 RETURN n.update_date'}

In [None]:
# Node count by property and relation
def find_property_in_month():
    def prompter(label_1, prop_1, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1,], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find how many {prop_1} for {label_1} have {prop_2} in June!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE date(n.{prop_2}).month = 6 RETURN count(n.{prop_1})"
        }
        return message

    sampler=[]
    for e in dtypes_parsed:
        for ee in date_parsed:
            if e[0] == ee[0]:
                temp_dict = prompter(e[0], e[1], ee[1])
                sampler.append(temp_dict)

    return sampler

# Build the set
sampler_23 = find_property_in_month()
# Print information about the sampler set
print(f"There are {len(sampler_23)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_23)
# Display an example for inspection
sampler_23[12]

There are 16 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find how many update_date for UpdateDate have update_date in June!',
 'Schema': 'Graph schema: Node properties are the following:\nUpdateDate {update_date: DATE}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Article)-[:UPDATED]->(:UpdateDate)',
 'Cypher': 'MATCH (n:UpdateDate) WHERE date(n.update_date).month = 6 RETURN count(n.update_date)'}

In [None]:
# Node count for interval
def find_count_in_interval():
    def prompter(label_1, prop_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""How many {label_1} have {prop_1} between January 1, 2010 and January 1, 2015?!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} >= date('2010-01-01') AND n.{prop_1} <= date('2015-01-01') RETURN count(n) AS {label_1}s"
        }
        return message

    sampler=[]

    for ee in date_parsed:
        temp_dict = prompter(ee[0], ee[1])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_24 = find_count_in_interval()
# Print information about the sampler set
print(f"There are {len(sampler_24)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_24)
# Display an example for inspection
sampler_24[0]

There are 4 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'How many UpdateDate have update_date between January 1, 2010 and January 1, 2015?!',
 'Schema': 'Graph schema: Node properties are the following:\nUpdateDate {update_date: DATE}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Article)-[:UPDATED]->(:UpdateDate)',
 'Cypher': "MATCH (n:UpdateDate) WHERE n.update_date >= date('2010-01-01') AND n.update_date <= date('2015-01-01') RETURN count(n) AS UpdateDates"}

In [None]:
# Node count for interval
def find_property_after_hour():
    def prompter(label_1, prop_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find how many {label_1}s were {prop_1} after 6PM, January 1, 2020?""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} >= datetime('2010-01-01T18:00:00') RETURN count(n) AS {label_1}s"
        }
        return message

    sampler=[]

    for ee in date_parsed:
        temp_dict = prompter(ee[0], ee[1])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_25 = find_property_after_hour()
# Print information about the sampler set
print(f"There are {len(sampler_25)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_25)
# Display an example for inspection
sampler_25[0]

There are 4 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find how many UpdateDates were update_date after 6PM, January 1, 2020?',
 'Schema': 'Graph schema: Node properties are the following:\nUpdateDate {update_date: DATE}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Article)-[:UPDATED]->(:UpdateDate)',
 'Cypher': "MATCH (n:UpdateDate) WHERE n.update_date >= datetime('2010-01-01T18:00:00') RETURN count(n) AS UpdateDates"}

In [None]:
# Node count for interval
def find_nodes_today():
    def prompter(label_1, prop_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""List {label_1} that have {prop_1} in the last 24 hours!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} > datetime() - duration('P1D') RETURN n"
        }
        return message

    sampler=[]

    for ee in date_parsed:
        temp_dict = prompter(ee[0], ee[1])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_26 = find_nodes_today()
# Print information about the sampler set
print(f"There are {len(sampler_26)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_26)
# Display an example for inspection
sampler_26[0]

There are 4 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'List UpdateDate that have update_date in the last 24 hours!',
 'Schema': 'Graph schema: Node properties are the following:\nUpdateDate {update_date: DATE}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Article)-[:UPDATED]->(:UpdateDate)',
 'Cypher': "MATCH (n:UpdateDate) WHERE n.update_date > datetime() - duration('P1D') RETURN n"}

In [None]:
# Node count for interval
def find_nodes_monday():
    def prompter(label_1, prop_1):

        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""How many {label_1} have {prop_1} on a Monday?""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE date(n.{prop_1}).weekday = 1 RETURN count(n)"
        }
        return message

    sampler=[]
    datetime_parsed=[]
    for ee in date_parsed+datetime_parsed:
        temp_dict = prompter(ee[0], ee[1])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_27 = find_nodes_monday()
# Print information about the sampler set
print(f"There are {len(sampler_27)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_27)
# Display an example for inspection
sampler_27[0]

There are 4 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'How many UpdateDate have update_date on a Monday?',
 'Schema': 'Graph schema: Node properties are the following:\nUpdateDate {update_date: DATE}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Article)-[:UPDATED]->(:UpdateDate)',
 'Cypher': 'MATCH (n:UpdateDate) WHERE date(n.update_date).weekday = 1 RETURN count(n)'}

In [None]:
# Node count for interval
def find_nodes_midnight():
    def prompter(label_1, prop_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Which {label_1} have {prop_1} at exactly midnight?""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1}.hour = 0 AND n.{prop_1}.minute=0 RETURN DISTINCT n"
        }
        return message

    sampler=[]
    datetime_parsed=[]
    for ee in date_parsed+datetime_parsed:
        temp_dict = prompter(ee[0], ee[1])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_28 = find_nodes_midnight()
# Print information about the sampler set
print(f"There are {len(sampler_28)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_28)
# Display an example for inspection
sampler_28[0]

There are 4 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Which UpdateDate have update_date at exactly midnight?',
 'Schema': 'Graph schema: Node properties are the following:\nUpdateDate {update_date: DATE}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Article)-[:UPDATED]->(:UpdateDate)',
 'Cypher': 'MATCH (n:UpdateDate) WHERE n.update_date.hour = 0 AND n.update_date.minute=0 RETURN DISTINCT n'}

In [None]:
# Node count by property and relation
def find_node_aggregation_date():
    def prompter(label_1, prop_1, rel_1, label_2, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find {prop_1} values for {label_1} that are connected through {rel_1} to {label_2} with {prop_2} later than January 1, 2018 and return both the {prop_1} and the count of associated {label_2}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) -[:{rel}]->(m:{label_2}) WHERE m.{prop_2} > date('2018-01-01') WITH n, count(m) AS {label_2}_Count ORDER BY {label_2}_Count DESC LIMIT 1 RETURN n.{prop_1}, {label_2}_Count"
        }
        return message

    sampler=[]
    for e in date_parsed:
        for er in all_rels:
            if e[0] == er[4]:
                for k, v in er[1].items():
                    temp_dict = prompter(er[0], k, er[2], e[0], e[1])
                    sampler.append(temp_dict)

    return sampler

# Build the set
sampler_29 = find_node_aggregation_date()
# Print information about the sampler set
print(f"There are {len(sampler_29)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_29)
# Display an example for inspection
sampler_29

There are 0 queries in this subset.


[]

In [None]:
# Node count by property and relation
def find_node_aggregation_date_rels():
    def prompter(label_1, prop_1, rel_1, label_2, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Calculate the average {prop_2} for {label_2} that are linked to {label_1} via {rel_1} and have {prop_2} date falling between January 1, 2018 and December 31, 2020!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) -[:{rel_1}]->(m1:{label_2}) WHERE m1.{prop_2} < date('2020-12-31') WITH n MATCH (n) -[:{rel_1}]->(m2:{label_2}) WHERE m2.{prop_2} > date('2012-01-01') WITH n MATCH (n:{label_1}) -[:{rel_1}]->(m:{label_2}) RETURN n.{prop_1}, avg(m.{prop_2}.year) AS avg({prop_2})"
        }
        return message

    sampler=[]
    for e in date_parsed:
        for er in all_rels:
            if e[0] == er[4]:
                for k, v in er[1].items():
                    temp_dict = prompter(er[0], k, er[2], e[0], e[1])
                    sampler.append(temp_dict)

    return sampler

# Build the set
sampler_30 = find_node_aggregation_date_rels()
# Print information about the sampler set
print(f"There are {len(sampler_30)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_30)
# Display an example for inspection
sampler_30


There are 0 queries in this subset.


[]

## Data Saving

In [None]:
# The number of fine-tuning pairs collected
len(trainer)

1671

In [None]:
# Save data to a file
trainer_two = "trainer_two.json"
write_json(trainer, data_path+trainer_two)