# Data Building: Querying the Graph

## Workspace Setup

In [None]:
# Load credentials and other environment variables
import os
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())

# Graph database credentials
uri = os.getenv("NEO4J_URI")
user = os.getenv("NEO4J_USER")
pwd = os.getenv("NEO4J_PWD")

# Relevant files
schema_prompt_file = os.getenv("SCHEMA_PROMPT")
schema_full_file = os.getenv("SCHEMA_FULL")
schema_simplified_file = os.getenv("SCHEMA_SIMPLIFIED")
nodes_file = os.getenv("NODES_LIST")
node_props_file = os.getenv("NODES_PROPERTIES")
rels_props_file = os.getenv("RELATIONSHIPS_PROPERTIES")
rels_file = os.getenv("RELATIONSHIPS_LIST")
instances_nodes_file = os.getenv("INSTANCES_NODES")
instances_relationships_file = os.getenv("INSTANCES_RELATIONSHIPS")

In [None]:
# Set the working directory and import the local modules
import sys
from pathlib import Path
parent_dir = Path.cwd()
sys.path.append(str(parent_dir))

import random

from helpers.utilities import *
from helpers.graph_utils import *
from helpers.neo4j_schema import Neo4jSchema

In [None]:
# Instantiate the schema utilities module
sutils = Neo4jSchema(uri, user, pwd)

## Load Sample Data

In [None]:
#### If graph data is saved - read from files

# Read the nodes list
nodes = read_json(nodes_file)

# Read the nodes with their properties
node_props_types = read_json(node_props_file)

# Read the relationship properties
rel_props_types = read_json(rels_props_file)

# Read the relationships
rels = read_json(rels_file)

# Read the nodes instances
instances_nodes = read_json(instances_nodes_file)

# Read the relationships instances
instances_rels = read_json(instances_relationships_file)

In [None]:
# Parse the extracted node instances for a given data type
string_parsed = parse_instances(nodes, node_props_types, 'STRING', instances_nodes)
integer_parsed = parse_instances(nodes, node_props_types, 'INTEGER', instances_nodes)
float_parsed = parse_instances(nodes, node_props_types, 'FLOAT', instances_nodes)
boolean_parsed = parse_instances(nodes, node_props_types, 'BOOLEAN', instances_nodes)
date_parsed = parse_instances(nodes, node_props_types, 'DATE', instances_nodes)
datetime_parsed = parse_instances(nodes, node_props_types, 'DATE_TIME', instances_nodes)

# Combine all parsed node instances
all_parsed = string_parsed+integer_parsed+float_parsed+boolean_parsed+date_parsed+datetime_parsed

In [None]:
# Sample output
date_parsed[2]

In [None]:
# Extract nodes and properties with data type: STRING
string_properties = extract_nodes_with_properties_of_specified_type(node_props_types, nodes, 'STRING')
# Extract nodes and properties with data type: INTEGER
integer_properties = extract_nodes_with_properties_of_specified_type(node_props_types, nodes, 'INTEGER')
# Extract nodes and properties with data type: BOOLEAN
boolean_properties = extract_nodes_with_properties_of_specified_type(node_props_types, nodes, 'BOOLEAN')
# Extract nodes and properties with data type: DATE
date_properties = extract_nodes_with_properties_of_specified_type(node_props_types, nodes, 'DATE')
# Extract nodes and properties with data type: DATE_TIME
datetime_properties = extract_nodes_with_properties_of_specified_type(node_props_types, nodes, 'DATE_TIME')
# Extract nodes and properties with data type: FLOAT
float_properties = extract_nodes_with_properties_of_specified_type(node_props_types, nodes, 'FLOAT')


In [None]:
# Add parsed properties to node instances, one key per data type.
instances_nodes = add_selected_properties(instances_nodes, string_properties, 'string_properties')
instances_nodes = add_selected_properties(instances_nodes, integer_properties, 'integer_properties')
instances_nodes = add_selected_properties(instances_nodes, boolean_properties, 'boolean_properties')
instances_nodes = add_selected_properties(instances_nodes, date_properties, 'date_properties')
instances_nodes = add_selected_properties(instances_nodes, datetime_properties, 'datetime_properties')
# FLOAT properties are extracted above like every other type but were never attached.
# NOTE(review): confirm downstream consumers accept the extra 'float_properties' key.
instances_nodes = add_selected_properties(instances_nodes, float_properties, 'float_properties')

In [None]:
# Filter source and target nodes properties for relationships instances
string_to_string = filter_relationships_instances(node_props_types,
nodes, instances_rels, 'STRING', 'STRING')
# Sample output
string_to_string[0]

In [None]:
# Filter source and target nodes properties for relationships instances
date_to_string = filter_relationships_instances(node_props_types,
nodes, instances_rels, 'DATE', 'STRING')
# Sample output
date_to_string[0]

In [None]:
# Filter source and target nodes properties for relationships instances
integer_to_string = filter_relationships_instances(node_props_types,
nodes, instances_rels, 'INTEGER', 'STRING')
# Sample output
integer_to_string[0]

## Fine-Tuning Dataset - Cypher Book Based

### Conventions and Notations

- Node variables: var_i
- Node labels: Label_i
- Node properties: prop_i
- Node property values: val_i
- Relationship types: rtype_i
- Relationship properties: rprop_i
- Relationship properties values: rval_i
- Expression: expr_i
- Alias: alias_i

### Querying the Nodes

In [None]:
trainer = []

In [None]:
# Test the Neo4j connection
qc = "MATCH (p:NodeLabel) RETURN count(p)"
sutils.conn.query(qc)

In [None]:
# Count nodes of given label
def count_nodes_of_given_label():
    """Build one node-count training sample per label in the notebook-level ``nodes``.

    Returns:
        list[dict]: One dict per label, in ``nodes`` order, with the keys
        ``section``, ``question``, ``context`` and ``cypher``.
    """
    shared_context = "In Cypher, to find all the nodes with a specific label NodeLabel and return their number, use the query MATCH (n:NodeLabel) RETURN count(n)"

    def build_sample(node_label):
        # Pair the natural-language request with the Cypher that answers it.
        return {
            "section": "cypher_query_nodes",
            "question": f"Find the total number of {node_label} nodes in the graph!",
            "context": shared_context,
            "cypher": f"MATCH (n:{node_label}) RETURN count(n)",
        }

    return [build_sample(node_label) for node_label in nodes]

# Creates a list of samples of the form
count_nodes_of_given_label()[1]

In [None]:
# The number of available queries
len(count_nodes_of_given_label())

In [None]:
# Build sample
sample_1 = count_nodes_of_given_label()
trainer = trainer + sample_1

In [None]:
# Node distribution
def find_nodes_distribution():
    """Build one label-distribution training sample per label in ``nodes``.

    Returns:
        list[dict]: One dict per label, in ``nodes`` order, with the keys
        ``section``, ``question``, ``context`` and ``cypher``.
    """
    shared_context = "In Cypher, to find all the labels distribution of a given node with NodeLabel and return their corresponding counts, use the query MATCH (n:NodeLabel) RETURN labels(n) AS labels, count(n) AS counts"

    def build_sample(node_label):
        return {
            "section": "cypher_query_nodes",
            "question": f"Fetch the distribution of labels for the {node_label} nodes, in case they have multiple labels!",
            "context": shared_context,
            "cypher": f"MATCH (n:{node_label}) RETURN labels(n) AS labels, count(n) AS counts",
        }

    return [build_sample(node_label) for node_label in nodes]

# Creates a list of samples of the form
find_nodes_distribution()[1]

In [None]:
# The number of available queries
len(find_nodes_distribution())

In [None]:
# Build sample
sample_2 = find_nodes_distribution()
trainer = trainer + sample_2

In [None]:
# Find node by property
def find_node_by_property():
    """Build "find node by property value" samples from the notebook-level ``all_parsed``.

    Each ``all_parsed`` entry is read as ``[label, property, value]``.

    The value is rendered as a Cypher literal that matches its Python type:
    booleans become ``true``/``false``, ints and floats are emitted bare, and
    everything else is single-quoted with backslashes and single quotes
    escaped. Quoting every value unconditionally would turn numbers and
    booleans into strings and would break on values containing an apostrophe.

    Returns:
        list[dict]: One dict per entry with the keys ``section``,
        ``question``, ``context`` and ``cypher``.
    """
    def cypher_literal(value):
        # bool must be tested before int because bool is a subclass of int.
        if isinstance(value, bool):
            return "true" if value else "false"
        if isinstance(value, (int, float)):
            return str(value)
        escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    def prompter(label, prop, val):
        return {"section": "cypher_query_nodes",
        "question": f"""Find the {label} nodes for which {prop} is {val}. """ ,
        "context": " In Cypher, to find nodes where a given property takes a specific value, use the query MATCH (n:NodeLabel {prop: val}) RETURN n",
        "cypher": f"MATCH (n:{label} {{{prop}:{cypher_literal(val)}}}) RETURN n"}

    return [prompter(e[0], e[1], e[2]) for e in all_parsed]

# Creates a list of samples of the form
find_node_by_property()[0]

In [None]:
# The number of available queries
len(find_node_by_property())

In [None]:
# Build sample
sample_3 = random.sample(find_node_by_property(), 1000)
trainer = trainer + sample_3

In [None]:
# Find node with property that starts with substring
def find_node_by_start_substring():
    """Build "property starts with prefix" samples from the notebook-level ``string_parsed``.

    Each ``string_parsed`` entry is read as ``[label, property, value]``. The
    first two characters of the value are used as the prefix in both the
    question and the Cypher, so the two always describe the same filter.

    Returns:
        list[dict]: One dict per entry with the keys ``section``,
        ``question``, ``context`` and ``cypher``.
    """
    def prompter(label, prop, val):
        # One prefix for both texts; they previously used different lengths.
        prefix = val[:2]
        return {"section": "cypher_query_nodes",
        "question": f"""Find the {label} nodes for which {prop} starts with {prefix}. """ ,
        "context": " In Cypher, to find nodes where a given property starts with a 'substring', use the query MATCH (n:NodeLabel) WHERE n.prop STARTS WITH 'substring' RETURN n",
        "cypher": f"MATCH (n:{label}) WHERE n.{prop} STARTS WITH '{prefix}' RETURN n"}

    return [prompter(e[0], e[1], e[2]) for e in string_parsed]

# Creates a list of samples of the form
find_node_by_start_substring()[0]

In [None]:
# The number of available queries
len(find_node_by_start_substring())

In [None]:
# Build sample
sample_4 = random.sample(find_node_by_start_substring(), 1000)
trainer = trainer + sample_4

In [None]:
# Find properties of nodes with property that starts with substring

def return_properties_for_prop_by_start_substring():
    """Build "filter by prefix, return three properties" samples.

    For every ``[label, property, value]`` entry of the notebook-level
    ``string_parsed``, each ``node_props_types`` entry whose ``'labels'``
    equals the label contributes one sample. The sample returns the filtered
    property plus the first two properties listed for that label.

    Labels with fewer than two listed properties are skipped instead of
    raising ``IndexError``.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def prompter(label, prop_1, val_1, prop_2, prop_3):
        return {"section": "cypher_query_nodes",
        "question": f"""Find the {label} nodes for which {prop_1} starts with {val_1[:2]} and return the {prop_1}, {prop_2} and {prop_3}. """ ,
        "context": " ",
        "cypher": f"MATCH (n:{label}) WHERE n.{prop_1} STARTS WITH '{val_1[:2]}' RETURN n.{prop_1} AS {prop_1}, n.{prop_2} AS {prop_2}, n.{prop_3} AS {prop_3}"}

    sampler = []
    for parsed in string_parsed:
        label, prop_1, val_1 = parsed[0], parsed[1], parsed[2]
        # A distinct loop name avoids shadowing the outer iteration variable.
        for schema_entry in node_props_types:
            if schema_entry['labels'] != label:
                continue
            properties = schema_entry['properties']
            if len(properties) < 2:
                # Not enough properties to fill prop_2 and prop_3.
                continue
            prop_2 = properties[0]['property']
            prop_3 = properties[1]['property']
            sampler.append(prompter(label, prop_1, val_1, prop_2, prop_3))
    return sampler



# Creates a list of samples of the form
return_properties_for_prop_by_start_substring()[222]

In [None]:
# The number of available queries
len(return_properties_for_prop_by_start_substring())

In [None]:
# Build sample
sample_5 = random.sample(return_properties_for_prop_by_start_substring(), 1000)
trainer = trainer + sample_5

## Querying the Paths

In [None]:
# Find all one-hop connections from a given node
def find_node_neighbours():
    """Build "one-hop paths from a node" samples from the notebook-level ``all_parsed``.

    Each ``all_parsed`` entry is read as ``[label, property, value]``.

    The value is rendered as a Cypher literal that matches its Python type:
    booleans become ``true``/``false``, ints and floats are emitted bare, and
    everything else is single-quoted with backslashes and single quotes
    escaped, so the generated pattern stays valid and type-correct.

    Returns:
        list[dict]: One dict per entry with the keys ``section``,
        ``question``, ``context`` and ``cypher``.
    """
    def cypher_literal(value):
        # bool must be tested before int because bool is a subclass of int.
        if isinstance(value, bool):
            return "true" if value else "false"
        if isinstance(value, (int, float)):
            return str(value)
        escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    def prompter(label, prop, val):
        return {"section": "cypher_query_paths",
        "question": f"""Find all one hop connections of the {label} node for which {prop} is {val}. """ ,
        "context": "",
        "cypher": f"MATCH path=(:{label} {{{prop}:{cypher_literal(val)}}})-->() RETURN path"}

    return [prompter(e[0], e[1], e[2]) for e in all_parsed]

# Creates a list of samples of the form
find_node_neighbours()[200]

In [None]:
# The number of available queries
len(find_node_neighbours())

In [None]:
# Build sample
sample_6 = random.sample(find_node_neighbours(), 1000)
trainer = trainer + sample_6

In [None]:
# Node count by property and relation
def find_node_relation_count():
    """Build "property plus incoming-relationship count" samples.

    Each notebook-level ``instances_rels`` entry is indexed as: ``[2]`` the
    relationship type, ``[3]`` a dict of end-node properties and ``[4]`` the
    end-node label. One sample is produced per property key.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def build_sample(node_label, prop_name, rel_type):
        question = f"Find all the {node_label} nodes and return the {prop_name} property and the number of nodes connected to them via {rel_type}. "
        query = f"MATCH (n:{node_label}) RETURN n.{prop_name} AS {prop_name}, size((n)<-[:{rel_type}]-()) AS count"
        return {
            "section": "cypher_query_paths",
            "question": question,
            "context": "",
            "cypher": query,
        }

    return [
        build_sample(rel_instance[4], prop_name, rel_instance[2])
        for rel_instance in instances_rels
        for prop_name in rel_instance[3].keys()
    ]

# Creates a list of samples of the form
find_node_relation_count()[2]

In [None]:
# The number of available queries
len(find_node_relation_count())

In [None]:
# Build sample
sample_7 = random.sample(find_node_relation_count(), 2000)
trainer = trainer + sample_7

In [None]:
# Node count by property and relation
def find_node_relation_node_count():
    """Build "property plus count of related nodes" samples.

    Each notebook-level ``instances_rels`` entry is indexed as: ``[0]`` the
    start-node label, ``[1]`` a dict of start-node properties, ``[2]`` the
    relationship type and ``[4]`` the end-node label. One sample is produced
    per start-node property key.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def build_sample(src_label, src_prop, rel_type, dst_label):
        question = f"Find all nodes labelled {src_label} and return their {src_prop} property along with the count of {dst_label} that are connected to them through a  {rel_type} relationship. "
        query = f"MATCH (n:{src_label}) -[:{rel_type}]->(m:{dst_label}) RETURN n.{src_prop} AS {src_prop}, count(m) AS count"
        return {
            "section": "cypher_query_paths",
            "question": question,
            "context": "",
            "cypher": query,
        }

    return [
        build_sample(rel_instance[0], src_prop, rel_instance[2], rel_instance[4])
        for rel_instance in instances_rels
        for src_prop in rel_instance[1].keys()
    ]

# Creates a list of samples of the form
find_node_relation_node_count()[2]

In [None]:
# The number of available queries
len(find_node_relation_node_count())

In [None]:
# Build sample
sample_8 = random.sample(find_node_relation_node_count(), 2000)
trainer = trainer + sample_8

## Filter with Nodes and Paths

In [None]:
# Count nodes of a given label that have a given property
def find_node_property_count():
    """Build "count nodes that have a property" samples from ``all_parsed``.

    Each notebook-level ``all_parsed`` entry is read as ``[label, property, ...]``.

    The existence test is written as ``n.prop IS NOT NULL``. The older
    ``EXISTS(n.prop)`` form is deprecated and no longer accepted by recent
    Neo4j releases; ``IS NOT NULL`` is also the form already used by the
    aggregation samples in this notebook.

    Returns:
        list[dict]: One dict per entry with the keys ``section``,
        ``question``, ``context`` and ``cypher``.
    """
    def prompter(label, prop):
        return {"section": "filter_nodes_paths",
            "question": f"""Find the total number of {label} nodes that have a {prop}!""" ,
            "context": "",
            "cypher": f"MATCH (n:{label}) WHERE n.{prop} IS NOT NULL RETURN count(n)"}

    return [prompter(e[0], e[1]) for e in all_parsed]

# Creates a list of samples of the form
find_node_property_count()[1]

In [None]:
# The number of available queries
len(find_node_property_count())

In [None]:
# Build sample
sample_9 = random.sample(find_node_property_count(), 1000)
trainer = trainer + sample_9

In [None]:
# Count nodes of a given label that lack a given property
def find_node_notproperty_count():
    """Build "count nodes that lack a property" samples from ``all_parsed``.

    Each notebook-level ``all_parsed`` entry is read as ``[label, property, ...]``.

    The absence test is written as ``n.prop IS NULL``. The older
    ``NOT EXISTS(n.prop)`` form is deprecated and no longer accepted by
    recent Neo4j releases.

    Returns:
        list[dict]: One dict per entry with the keys ``section``,
        ``question``, ``context`` and ``cypher``.
    """
    def prompter(label, prop):
        return {"section": "filter_nodes_paths",
            "question": f"""Find the total number of {label} nodes that do not have a {prop}!""" ,
            "context": "",
            "cypher": f"MATCH (n:{label}) WHERE n.{prop} IS NULL RETURN count(n)"}

    return [prompter(e[0], e[1]) for e in all_parsed]

# Creates a list of samples of the form
find_node_notproperty_count()[1]

In [None]:
# The number of available queries
len(find_node_notproperty_count())

In [None]:
# Build sample
sample_10 = random.sample(find_node_notproperty_count(), 1000)
trainer = trainer + sample_10

In [None]:
# Pattern check
def find_notrelationships():
    """Build "nodes without an outgoing relationship" samples.

    Each notebook-level ``instances_rels`` entry contributes one sample that
    uses its ``[0]`` element as the node label and its ``[2]`` element as the
    relationship type.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def build_sample(node_label, rel_type):
        return {
            "section": "filter_nodes_paths",
            "question": f"Search for five {node_label} instances that are not linked through a {rel_type} relationship. ",
            "context": "",
            "cypher": f"MATCH (p:{node_label}) WHERE NOT EXISTS ((p)-[:{rel_type}]->()) RETURN p LIMIT 5",
        }

    return [build_sample(rel_instance[0], rel_instance[2]) for rel_instance in instances_rels]

# Creates a list of samples of the form
find_notrelationships()[2]

In [None]:
# The number of available queries
len(find_notrelationships())

In [None]:
# Build sample
sample_11 = find_notrelationships()
trainer = trainer + sample_11

In [None]:
# Pattern check
def find_yesrelationships():
    """Build "nodes with an outgoing relationship" samples.

    Each notebook-level ``instances_rels`` entry contributes one sample that
    uses its ``[0]`` element as the node label and its ``[2]`` element as the
    relationship type.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def build_sample(node_label, rel_type):
        return {
            "section": "filter_nodes_paths",
            "question": f"Find five {node_label} nodes that have {rel_type} relationships linked to them. ",
            "context": "",
            "cypher": f"MATCH (p:{node_label}) WHERE EXISTS ((p)-[:{rel_type}]->()) RETURN p LIMIT 5",
        }

    return [build_sample(rel_instance[0], rel_instance[2]) for rel_instance in instances_rels]

# Creates a list of samples of the form
find_yesrelationships()[200]

In [None]:
# The number of available queries
len(find_yesrelationships())

In [None]:
# Build sample
sample_12 = find_yesrelationships()
trainer = trainer + sample_12

In [None]:
# Node count by property and relation
def find_node_relation_ordered_count():
    """Build "related-node count, ascending" samples.

    Each notebook-level ``instances_rels`` entry is indexed as: ``[0]`` the
    start-node label, ``[1]`` a dict of start-node properties, ``[2]`` the
    relationship type and ``[4]`` the end-node label. One sample is produced
    per start-node property key.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def build_sample(src_label, src_prop, rel_type, dst_label):
        # The count column is named after the lower-cased end-node label.
        count_alias = f"{dst_label.lower()}_count"
        question = f"For each {src_label}, find the number of {dst_label} nodes connected to them via {rel_type}. Retrieve the {src_prop} of the {src_label} and the {dst_label} counts in ascending order. "
        query = f"MATCH (n:{src_label}) -[:{rel_type}]->(m:{dst_label}) WITH DISTINCT n, m RETURN n.{src_prop} AS {src_prop}, count(m) AS {count_alias} ORDER BY {count_alias}"
        return {
            "section": "filter_nodes_paths",
            "question": question,
            "context": "",
            "cypher": query,
        }

    return [
        build_sample(rel_instance[0], src_prop, rel_instance[2], rel_instance[4])
        for rel_instance in instances_rels
        for src_prop in rel_instance[1].keys()
    ]

# Creates a list of samples of the form
find_node_relation_ordered_count()[2]

In [None]:
# The number of available queries
len(find_node_relation_ordered_count())

In [None]:
# Build sample
sample_13 = random.sample(find_node_relation_ordered_count(), 2000)
trainer = trainer + sample_13

In [None]:
# Node count by property and relation
def find_node_relation_ordered_count_desc():
    """Build "related-node count, descending, top 10" samples.

    Each notebook-level ``instances_rels`` entry is indexed as: ``[0]`` the
    start-node label, ``[1]`` a dict of start-node properties, ``[2]`` the
    relationship type and ``[4]`` the end-node label. One sample is produced
    per start-node property key.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def build_sample(src_label, src_prop, rel_type, dst_label):
        question = f"For each {src_label} return its {src_prop} and the count of {dst_label} nodes connected via {rel_type}, in desc order based on this count. Limit the output to 10 results."
        query = f"MATCH (n:{src_label}) -[:{rel_type}]->(m:{dst_label}) WITH DISTINCT n, m RETURN n.{src_prop} AS {src_prop}, count(m) AS count ORDER BY count DESC LIMIT 10"
        return {
            "section": "filter_nodes_paths",
            "question": question,
            "context": "",
            "cypher": query,
        }

    return [
        build_sample(rel_instance[0], src_prop, rel_instance[2], rel_instance[4])
        for rel_instance in instances_rels
        for src_prop in rel_instance[1].keys()
    ]

# Creates a list of samples of the form
find_node_relation_ordered_count_desc()[1]

In [None]:
# The number of available queries
len(find_node_relation_ordered_count_desc())

In [None]:
# Build sample
sample_14 = random.sample(find_node_relation_ordered_count_desc(), 2000)
trainer = trainer + sample_14

In [None]:
# Node count by property and relation
def find_node_relation_ordered_count_filter():
    """Build "related-node count above 5, descending" samples.

    Each notebook-level ``instances_rels`` entry is indexed as: ``[0]`` the
    start-node label, ``[1]`` a dict of start-node properties, ``[2]`` the
    relationship type and ``[4]`` the end-node label. One sample is produced
    per start-node property key.

    The RETURN and ORDER BY clauses reference the ``count`` alias defined in
    the preceding WITH. They previously referenced ``<Label>s``, a variable
    that is never bound, which made every generated query invalid.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def prompter(label_1, prop_1, rel, label_2):
        return {"section": "filter_nodes_paths",
        "question": f"""For each {label_1} and its {prop_1}, count the {label_2} associated to them via {rel} and return the {prop_1} and the counts that are greater than 5, in descending order of {prop_1} and the counts. """ ,
        "context": "",
        "cypher": f"MATCH (n:{label_1}) -[:{rel}]->(m:{label_2}) WITH DISTINCT n, m WITH n.{prop_1} AS {prop_1}, count(m) AS count WHERE count > 5 RETURN {prop_1}, count ORDER BY {prop_1} DESC, count DESC"}

    return [
        prompter(e[0], k, e[2], e[4])
        for e in instances_rels
        for k in e[1]
    ]

# Creates a list of samples of the form
find_node_relation_ordered_count_filter()[1]

In [None]:
# The number of available queries
len(find_node_relation_ordered_count_filter())

In [None]:
# Build sample
sample_15 = random.sample(find_node_relation_ordered_count_filter(), 2000)
trainer = trainer + sample_15

## Aggregations

In [None]:
# Node count by property and relation
def find_node_relation_ordered_count_collect():
    """Build "count and collect related-node property" samples.

    Each notebook-level ``instances_rels`` entry is indexed as: ``[0]`` the
    start-node label, ``[1]`` a dict of start-node properties, ``[2]`` the
    relationship type, ``[3]`` a dict of end-node properties and ``[4]`` the
    end-node label. One sample is produced per (start property, end
    property) pair.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def prompter(label_1, prop_1, rel, label_2, prop_2):
        # NOTE(review): when prop_1 == prop_2 the two WITH aliases collide;
        # consider a distinct alias for the collected list.
        return {"section": "aggregations",
        "question": f"""Which {label_1} {prop_1} had more than 5 associated {label_2}, via {rel}, and can you list those {prop_2} property of {label_2} and their respective counts, ordering the results by the {label_2} count, limiting to the top 10 results? """ ,
        "context": "",
        # A space before ORDER BY keeps the alias from fusing with the keyword.
        "cypher": f"MATCH (n:{label_1}) -[:{rel}]->(m:{label_2}) WITH DISTINCT n, m WITH n.{prop_1} AS {prop_1}, count(m) AS count, COLLECT(m.{prop_2}) as {prop_2} WHERE count > 5 RETURN {prop_1}, count, {prop_2} ORDER BY count LIMIT 10"}

    return [
        prompter(e[0], k, e[2], e[4], kk)
        for e in instances_rels
        for k in e[1]
        for kk in e[3]
    ]

# Creates a list of samples of the form
find_node_relation_ordered_count_collect()[1]

In [None]:
# The number of available queries
len(find_node_relation_ordered_count_collect())

In [None]:
# Build sample
sample_16 = random.sample(find_node_relation_ordered_count_collect(), 2000)
trainer = trainer + sample_16

In [None]:
# Node count by property and relation
def aggregate_integers_by_string():
    """Build "min/max/avg of an integer property per string value" samples.

    Pairs every ``[label, property, ...]`` entry of the notebook-level
    ``string_parsed`` with every ``integer_parsed`` entry that has the same
    label (element ``[0]``).

    Aggregates are aliased ``min_<prop>``, ``max_<prop>`` and ``avg_<prop>``.
    The previous aliases had the form ``min(<prop>)``, which is not a valid
    Cypher identifier and made every generated query a syntax error.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def prompter(label_1, prop_1, prop_3):
        return {"section": "aggregations",
        "question": f"""For each {prop_1} of {label_1} that is not null, how many times does it appear, and what are the minimum, maximum and average values of {prop_3} associated to it? """ ,
        "context": "",
        "cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} IS NOT NULL WITH DISTINCT n WITH n.{prop_1} as {prop_1}, COUNT(n) AS count, min(n.{prop_3}) AS min_{prop_3}, max(n.{prop_3}) AS max_{prop_3}, avg(n.{prop_3}) AS avg_{prop_3} RETURN {prop_1}, count, min_{prop_3}, max_{prop_3}, avg_{prop_3}"}

    return [
        prompter(e[0], e[1], ee[1])
        for e in string_parsed
        for ee in integer_parsed
        if e[0] == ee[0]
    ]

# Creates a list of samples of the form
aggregate_integers_by_string()[1]

In [None]:
# The number of available queries
len(aggregate_integers_by_string())

In [None]:
# Build sample
sample_17 = random.sample(aggregate_integers_by_string(), 1000)
trainer = trainer + sample_17

In [None]:
# Node count by property and relation
def aggregate_floats_by_integer():
    """Build "min/max/avg of a float property for small integer values" samples.

    Pairs every ``[label, property, ...]`` entry of the notebook-level
    ``integer_parsed`` with every ``float_parsed`` entry that has the same
    label (element ``[0]``).

    Aggregates are aliased ``min_<prop>``, ``max_<prop>`` and ``avg_<prop>``.
    The previous aliases had the form ``min(<prop>)``, which is not a valid
    Cypher identifier and made every generated query a syntax error.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def prompter(label_1, prop_1, prop_3):
        return {"section": "aggregations",
        "question": f"""Find the {label_1}s counts where {prop_1} is smaller than 10, and return the maximum, minimum and average values of the {prop_3}. """ ,
        "context": "",
        "cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} < 10 WITH DISTINCT n WITH n.{prop_1} as {prop_1}, COUNT(n) AS count, min(n.{prop_3}) AS min_{prop_3}, max(n.{prop_3}) AS max_{prop_3}, avg(n.{prop_3}) AS avg_{prop_3} RETURN {prop_1}, count, min_{prop_3}, max_{prop_3}, avg_{prop_3}"}

    return [
        prompter(e[0], e[1], ee[1])
        for e in integer_parsed
        for ee in float_parsed
        if e[0] == ee[0]
    ]

# Creates a list of samples of the form
aggregate_floats_by_integer()[1]


In [None]:
# The number of available queries
len(aggregate_floats_by_integer())

In [None]:
# Build sample
sample_18 = aggregate_floats_by_integer()
trainer = trainer + sample_18

## Filter with WHERE and WITH

In [None]:
# Node count by property and relation
def find_node_property_by_condition_on_node():
    """Build "property values with more than 100 related nodes" samples.

    Each notebook-level ``instances_rels`` entry is indexed as: ``[0]`` the
    start-node label, ``[1]`` a dict of start-node properties, ``[2]`` the
    relationship type and ``[4]`` the end-node label. One sample is produced
    per start-node property key.

    The WHERE clause filters on the ``count`` alias defined in the WITH. It
    previously filtered on ``<Label>s``, a variable that is never bound.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def prompter(label_1, prop_1, rel, label_2):
        return {"section": "filter_where_with",
        "question": f"""Find the {prop_1}s values of {label_1} that each have more than 100 {label_2} nodes connected to them through a {rel} relationship. """ ,
        "context": "",
        "cypher": f"MATCH (n:{label_1}) -[:{rel}]->(m:{label_2}) WITH DISTINCT n, m WITH n.{prop_1} AS {prop_1}, count(m) AS count WHERE count > 100 RETURN {prop_1}, count"}

    return [
        prompter(e[0], k, e[2], e[4])
        for e in instances_rels
        for k in e[1]
    ]

# Creates a list of samples of the form
find_node_property_by_condition_on_node()[2]

In [None]:
# The number of available queries
len(find_node_property_by_condition_on_node())

In [None]:
# Build sample
sample_19 = random.sample(find_node_property_by_condition_on_node(), 2000)
trainer = trainer + sample_19

In [None]:
# Node count by property and relation
def find_node_property_with_count_limit():
    """Build "related-node count, limited to 20 rows" samples.

    Each notebook-level ``instances_rels`` entry is indexed as: ``[0]`` the
    start-node label, ``[1]`` a dict of start-node properties, ``[2]`` the
    relationship type and ``[4]`` the end-node label. One sample is produced
    per start-node property key.

    The question is emitted as a single line. It previously contained a raw
    newline followed by source-code indentation in the middle of a sentence.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def prompter(label_1, prop_1, rel, label_2):
        return {"section": "filter_where_with",
        "question": (
            f"Search for the {prop_1} values from 20 {label_1} instances that are linked to {label_2} nodes through a {rel} relationship. "
            f"Return the {prop_1} along with the respective count of {label_2} nodes."
        ),
        "context": "",
        "cypher": f"MATCH (n:{label_1}) -[:{rel}]->(m:{label_2}) WITH DISTINCT n, m RETURN n.{prop_1} AS {prop_1}, count(m) AS count LIMIT 20"}

    return [
        prompter(e[0], k, e[2], e[4])
        for e in instances_rels
        for k in e[1]
    ]

# Creates a list of samples of the form
find_node_property_with_count_limit()[2]

In [None]:
# The number of available queries
len(find_node_property_with_count_limit())

In [None]:
# Build sample
sample_20 = random.sample(find_node_property_with_count_limit(), 2000)
# Append the sample just drawn; sample_19 was already added in its own cell.
trainer = trainer + sample_20

In [None]:
# Node count by property and relation
def find_node_property_with_skip():
    """Build "related-node count, skipping the first 20 rows" samples.

    Each notebook-level ``instances_rels`` entry is indexed as: ``[0]`` the
    start-node label, ``[1]`` a dict of start-node properties, ``[2]`` the
    relationship type and ``[4]`` the end-node label. One sample is produced
    per start-node property key.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def build_sample(src_label, src_prop, rel_type, dst_label):
        question = f"Find the {src_prop} from {src_label} that are related to {dst_label} via a {rel_type} relationship, and also return the respective count of associated {dst_label} nodes. Skip the first 20 results."
        query = f"MATCH (n:{src_label}) -[:{rel_type}]->(m:{dst_label}) WITH DISTINCT n, m RETURN n.{src_prop} AS {src_prop}, count(m) AS count SKIP 20"
        return {
            "section": "filter_where_with",
            "question": question,
            "context": "",
            "cypher": query,
        }

    return [
        build_sample(rel_instance[0], src_prop, rel_instance[2], rel_instance[4])
        for rel_instance in instances_rels
        for src_prop in rel_instance[1].keys()
    ]

# Creates a list of samples of the form
find_node_property_with_skip()[2]

In [None]:
# The number of available queries
len(find_node_property_with_skip())

In [None]:
# Build sample
sample_20 = random.sample(find_node_property_with_skip(), 1000)
trainer = trainer + sample_20

## Temporal Functions

In [None]:
date_parsed[1], datetime_parsed[1]

In [None]:
# Node count by property and relation
def find_property_after_date():
    """Build "string property where a date property is after 2020-01-01" samples.

    Pairs every entry of the notebook-level ``string_parsed`` with every
    ``date_parsed`` entry that has the same label (element ``[0]``); element
    ``[1]`` of each entry is used as the property name.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def build_sample(node_label, text_prop, date_prop):
        return {
            "section": "temporal_data",
            "question": f"Find all {text_prop} for {node_label} that have {date_prop} after January 1, 2020.",
            "context": "",
            "cypher": f"MATCH (n:{node_label}) WHERE n.{date_prop} > date('2020-01-01') RETURN n.{text_prop}",
        }

    return [
        build_sample(text_entry[0], text_entry[1], date_entry[1])
        for text_entry in string_parsed
        for date_entry in date_parsed
        if text_entry[0] == date_entry[0]
    ]

# Creates a list of samples of the form
find_property_after_date()[2]

In [None]:
# The number of available queries
len(find_property_after_date())

In [None]:
# Build sample
sample_21 = random.sample(find_property_after_date(), 2000)
trainer = trainer + sample_21

In [None]:
# Node count by property and relation
def find_property_in_year():
    """Build "string property where a date property falls in 2020" samples.

    Pairs every entry of the notebook-level ``string_parsed`` with every
    ``date_parsed`` entry that has the same label (element ``[0]``); element
    ``[1]`` of each entry is used as the property name.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def build_sample(node_label, text_prop, date_prop):
        return {
            "section": "temporal_data",
            "question": f"Find all {text_prop} for {node_label} that have {date_prop} in 2020.",
            "context": "",
            "cypher": f"MATCH (n:{node_label}) WHERE n.{date_prop}.year = 2020 RETURN n.{text_prop}",
        }

    return [
        build_sample(text_entry[0], text_entry[1], date_entry[1])
        for text_entry in string_parsed
        for date_entry in date_parsed
        if text_entry[0] == date_entry[0]
    ]

# Creates a list of samples of the form
find_property_in_year()[2]

In [None]:
# The number of available queries
len(find_property_in_year())

In [None]:
# Build sample
sample_22 = random.sample(find_property_in_year(), 2000)
trainer = trainer + sample_22

In [None]:
# Node count by property and relation
def find_property_in_month():
    """Build "count where a date property falls in June" samples.

    Pairs every entry of the notebook-level ``string_parsed`` with every
    ``date_parsed`` entry that has the same label (element ``[0]``); element
    ``[1]`` of each entry is used as the property name.

    The month is compared with the plain integer ``6``. The previous literal
    ``06`` has a leading zero, which Cypher treats as legacy octal syntax and
    which recent versions no longer accept.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def prompter(label_1, prop_1, prop_2):
        return {"section": "temporal_data",
        "question": f"""Find how many {prop_1} for {label_1} have {prop_2} in June.""" ,
        "context": "",
        "cypher": f"MATCH (n:{label_1}) WHERE n.{prop_2}.month = 6 RETURN count(n.{prop_1}) "}

    return [
        prompter(e[0], e[1], ee[1])
        for e in string_parsed
        for ee in date_parsed
        if e[0] == ee[0]
    ]

# Creates a list of samples of the form
find_property_in_month()[2]

In [None]:
# The number of available queries
len(find_property_in_month())

In [None]:
# Build sample
sample_23 = random.sample(find_property_in_month(), 2000)
trainer = trainer + sample_23

In [None]:
# Node count for interval
def find_count_in_interval():
    """Build "count nodes whose date property lies in 2010-2015" samples.

    Each notebook-level ``date_parsed`` entry contributes one sample that
    uses its ``[0]`` element as the label and its ``[1]`` element as the
    date property.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def build_sample(node_label, date_prop):
        return {
            "section": "temporal_data",
            "question": f"Find how many {node_label} have {date_prop} between January 1, 2010 and January 1, 2015?",
            "context": "",
            "cypher": f"MATCH (n:{node_label}) WHERE n.{date_prop} >= date('2010-01-01') AND n.{date_prop} <= date('2015-01-01') RETURN count(n) AS {node_label}s",
        }

    return [build_sample(date_entry[0], date_entry[1]) for date_entry in date_parsed]

# Creates a list of samples of the form
find_count_in_interval()[20]

In [None]:
# The number of available queries
len(find_count_in_interval())

In [None]:
# Build sample
sample_24 = find_count_in_interval()
trainer = trainer + sample_24

In [None]:
# Node count for interval
def find_property_after_hour():
    """Build "count nodes with a datetime after 6PM on 2020-01-01" samples.

    Each notebook-level ``datetime_parsed`` entry contributes one sample that
    uses its ``[0]`` element as the label and its ``[1]`` element as the
    datetime property.

    The Cypher uses the same instant the question names (2020-01-01 18:00)
    with a strict ``>``, because the question asks for values *after* that
    time. The query previously compared against 2010-01-01 with ``>=``.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def prompter(label_1, prop_1):
        return {"section": "temporal_data",
        "question": f"""Find how many {label_1}s were {prop_1} after 6PM, January 1, 2020.""" ,
        "context": "",
        "cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} > datetime('2020-01-01T18:00:00') RETURN count(n) AS {label_1}s"}

    return [prompter(ee[0], ee[1]) for ee in datetime_parsed]

# Creates a list of 397 samples of the form
find_property_after_hour()[20]

In [None]:
# The number of available queries
len(find_property_after_hour())

In [None]:
# Build sample
sample_25 = find_property_after_hour()
trainer = trainer + sample_25

In [None]:
# Node count for interval
def find_nodes_today():
    """Build "nodes with a datetime in the last 24 hours" samples.

    Each notebook-level ``datetime_parsed`` entry contributes one sample that
    uses its ``[0]`` element as the label and its ``[1]`` element as the
    datetime property.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def build_sample(node_label, datetime_prop):
        return {
            "section": "temporal_data",
            "question": f"List {node_label}s that have {datetime_prop} in the last 24 hours!",
            "context": "",
            "cypher": f"MATCH (n:{node_label}) WHERE n.{datetime_prop} > datetime() - duration('P1D') RETURN n",
        }

    return [build_sample(dt_entry[0], dt_entry[1]) for dt_entry in datetime_parsed]

# Creates a list of 397 samples of the form
find_nodes_today()[20]

In [None]:
# The number of available queries
len(find_nodes_today())

In [None]:
# Build sample
sample_26 = find_nodes_today()
trainer = trainer + sample_26

In [None]:
# Node count for interval
def find_nodes_monday():
    """Build "count nodes whose datetime falls on a Monday" samples.

    Each notebook-level ``datetime_parsed`` entry contributes one sample that
    uses its ``[0]`` element as the label and its ``[1]`` element as the
    datetime property.

    The day is read through the ``dayOfWeek`` component, where Monday is 1.
    The previously used ``weekday`` is not a Cypher temporal component name
    (component names are case-sensitive).

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def prompter(label_1, prop_1):
        return {"section": "temporal_data",
        "question": f"""How many {label_1}s have {prop_1} on a Monday?""" ,
        "context": "",
        "cypher": f"MATCH (n:{label_1}) WHERE date(n.{prop_1}).dayOfWeek = 1 RETURN count(n)"}

    return [prompter(ee[0], ee[1]) for ee in datetime_parsed]

# Creates a list of 397 samples of the form
find_nodes_monday()[20]

In [None]:
# The number of available queries
len(find_nodes_monday())

In [None]:
# Build sample
sample_27 = find_nodes_monday()
trainer = trainer + sample_27

In [None]:
# Node count for interval
def find_nodes_midnight():
    """Build "nodes whose datetime is at midnight" samples.

    Each notebook-level ``datetime_parsed`` entry contributes one sample that
    uses its ``[0]`` element as the label and its ``[1]`` element as the
    datetime property. The generated query checks the hour and minute
    components only.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def build_sample(node_label, datetime_prop):
        return {
            "section": "temporal_data",
            "question": f"Which {node_label}s have {datetime_prop} at exactly midnight?",
            "context": "",
            "cypher": f"MATCH (n:{node_label}) WHERE n.{datetime_prop}.hour = 0 AND n.{datetime_prop}.minute=0 RETURN DISTINCT n",
        }

    return [build_sample(dt_entry[0], dt_entry[1]) for dt_entry in datetime_parsed]

# Creates a list of samples of the form
find_nodes_midnight()[20]

In [None]:
# The number of available queries
len(find_nodes_midnight())

In [None]:
# Build sample
sample_28 = find_nodes_midnight()
trainer = trainer + sample_28

In [None]:
# Node count by property and relation
def find_node_aggregation_date():
    """Build "count related nodes dated after 2018-01-01" samples.

    For every ``[label, property, ...]`` entry of the notebook-level
    ``date_parsed``, each ``instances_rels`` entry whose end-node label
    (``[4]``) equals that label contributes one sample per start-node
    property key (``[1]``), using ``[0]`` as the start label and ``[2]`` as
    the relationship type.

    The question is emitted as a single line. It previously contained a raw
    newline followed by source-code indentation between its two sentences.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def prompter(label_1, prop_1, rel, label_2, prop_2):
        # NOTE(review): the query keeps only the single highest count
        # (ORDER BY ... DESC LIMIT 1) while the question does not ask for
        # a top-1 result; confirm which of the two is intended.
        return {"section": "filter_where_with",
        "question": (
            f"Search for {prop_1} values in {label_1}s that are connected through a {rel} relationship to {label_2} nodes with a {prop_2} date later than January 1, 2018. "
            f"Return both the {prop_1} values and the count of associated {label_2} nodes."
        ),
        "context": "",
        "cypher": f"MATCH (n:{label_1}) -[:{rel}]->(m:{label_2}) WHERE m.{prop_2} > date('2018-01-01') WITH n, count(m) AS {label_2}_Count ORDER BY {label_2}_Count DESC LIMIT 1 RETURN n.{prop_1}, {label_2}_Count"}

    return [
        prompter(er[0], k, er[2], e[0], e[1])
        for e in date_parsed
        for er in instances_rels
        if e[0] == er[4]
        for k in er[1]
    ]

# Creates a list of samples of the form
find_node_aggregation_date()[439]

In [None]:
# The number of available queries
len(find_node_aggregation_date())

In [None]:
# Build sample
sample_29 = random.sample(find_node_aggregation_date(), 2000)
trainer = trainer + sample_29

In [None]:
# Node count by property and relation
def find_node_aggregation_date_rels():
    """Build "average date year of related nodes within 2018-2020" samples.

    For every ``[label, property, ...]`` entry of the notebook-level
    ``date_parsed``, each ``instances_rels`` entry whose end-node label
    (``[4]``) equals that label contributes one sample per start-node
    property key (``[1]``), using ``[0]`` as the start label and ``[2]`` as
    the relationship type.

    Two defects of the previous query are corrected:

    * the aggregate alias was ``avg(<prop>)``, which is not a valid Cypher
      identifier; it is now ``avg_<prop>``;
    * three chained MATCH clauses tested the two date bounds on different
      nodes and then averaged over every related node, in or out of range.
      A single MATCH now applies both bounds to the node being averaged,
      which is what the question asks for.

    Returns:
        list[dict]: Dicts with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def prompter(label_1, prop_1, rel, label_2, prop_2):
        return {"section": "filter_where_with",
        "question": f"""Calculate the average {prop_2} for {label_2}s that are linked to {label_1} via a {rel} relationship and have {prop_2} date falling between January 1, 2018 and December 31, 2020.""" ,
        "context": "",
        "cypher": f"MATCH (n:{label_1}) -[:{rel}]->(m:{label_2}) WHERE m.{prop_2} >= date('2018-01-01') AND m.{prop_2} <= date('2020-12-31') RETURN n.{prop_1}, avg(m.{prop_2}.year) AS avg_{prop_2}"}

    return [
        prompter(er[0], k, er[2], e[0], e[1])
        for e in date_parsed
        for er in instances_rels
        if e[0] == er[4]
        for k in er[1]
    ]

# Creates a list of 397 samples of the form
find_node_aggregation_date_rels()[439]

In [None]:
# The number of available queries
len(find_node_aggregation_date_rels())

In [None]:
# Build sample
sample_30 = random.sample(find_node_aggregation_date_rels(), 2000)
trainer = trainer + sample_30

In [None]:
len(trainer)

In [None]:
# Save the samples to a json file
trainer_file = "datas/trainer_two.json"
write_json(trainer, trainer_file)

In [None]:
# Combine the two datasets
trainer_file_one = "datas/trainer_one.json"
trainer_file_two = "datas/trainer_two.json"
trainer_file_full = "datas/trainer.json"

In [None]:
trainer_one = read_json(trainer_file_one)
trainer_two = read_json(trainer_file_two)

In [None]:
import random
random_samples = random.sample(trainer_two, 6)

In [None]:
new_samples = [{"question": item["question"], "cypher": item["cypher"]} for item in random_samples]
new_samples

In [None]:
trainer_full = trainer_one + trainer_two
len(trainer_full)

In [None]:
write_json(trainer_full, trainer_file_full)

In [None]:
# Cypher sanity test
qc = "MATCH (n:NodeLabel) WHERE n.property_1 STARTS WITH 'some expression' RETURN n.property_2 AS property_2, n.property_3 AS property_3, n.property_4 AS property_4"
sutils.conn.query(qc)