# Data Building: Basic Cypher Queries

## Workspace Setup

In [None]:
# Load credentials and other environment variables
import os
from dotenv import load_dotenv, find_dotenv

# Load the variables defined in the .env file located by find_dotenv()
_ = load_dotenv(find_dotenv())

# Graph database credentials (None when a variable is not set)
uri, user, pwd = (
    os.getenv(name) for name in ("NEO4J_URI", "NEO4J_USER", "NEO4J_PWD")
)

# Locations of the schema / node / relationship files, read from the
# environment in the same order as the names on the left-hand side
(
    schema_prompt_file,
    schema_full_file,
    schema_simplified_file,
    nodes_file,
    node_props_file,
    rels_props_file,
    rels_file,
    instances_nodes_file,
    instances_relationships_file,
) = (
    os.getenv(name)
    for name in (
        "SCHEMA_PROMPT",
        "SCHEMA_FULL",
        "SCHEMA_SIMPLIFIED",
        "NODES_LIST",
        "NODES_PROPERTIES",
        "RELATIONSHIPS_PROPERTIES",
        "RELATIONSHIPS_LIST",
        "INSTANCES_NODES",
        "INSTANCES_RELATIONSHIPS",
    )
)

In [None]:
# Set the working directory and import the local modules
import sys
from pathlib import Path
parent_dir = Path.cwd() #.parent
sys.path.append(str(parent_dir))

import itertools
from itertools import combinations
import random

from helpers.utilities import *
from helpers.graph_utils import *
from helpers.neo4j_schema import Neo4jSchema

In [None]:
# Instantiate the schema utilities module
sutils = Neo4jSchema(uri, user, pwd)

## Load Sample Data

In [None]:
#### If graph data is saved - read from files

# Load every saved graph artefact with read_json, in this order:
# nodes list, node properties, relationship properties, relationships,
# node instances, relationship instances.
(
    nodes,
    node_props_types,
    rel_props_types,
    rels,
    instances_nodes,
    instances_rels,
) = (
    read_json(source)
    for source in (
        nodes_file,
        node_props_file,
        rels_props_file,
        rels_file,
        instances_nodes_file,
        instances_relationships_file,
    )
)

In [None]:
# Run parse_instances once per data type over the extracted node instances
(
    string_parsed,
    integer_parsed,
    float_parsed,
    boolean_parsed,
    date_parsed,
    datetime_parsed,
) = (
    parse_instances(nodes, node_props_types, dtype, instances_nodes)
    for dtype in ('STRING', 'INTEGER', 'FLOAT', 'BOOLEAN', 'DATE', 'DATE_TIME')
)

In [None]:
# Extract nodes and properties for each data type, one helper call per type
# (STRING, INTEGER, BOOLEAN, DATE, DATE_TIME, FLOAT - in that order)
(
    string_properties,
    integer_properties,
    boolean_properties,
    date_properties,
    datetime_properties,
    float_properties,
) = (
    extract_nodes_with_properties_of_specified_type(node_props_types, nodes, dtype)
    for dtype in ('STRING', 'INTEGER', 'BOOLEAN', 'DATE', 'DATE_TIME', 'FLOAT')
)


In [None]:
# Filter source and target nodes properties for relationships instances
string_to_string = filter_relationships_instances(node_props_types,
nodes, instances_rels, 'STRING', 'STRING')
string_to_string[0]

In [None]:
# Filter source and target nodes properties for relationships instances
date_to_string = filter_relationships_instances(node_props_types,
nodes, instances_rels, 'DATE', 'STRING')
date_to_string[0]

## Fine-Tuning Dataset - Cypher Book Based

### Conventions and Notations

- Node variables: var_i
- Node labels: Label_i
- Node properties: prop_i
- Node property values: val_i
- Relationship types: rtype_i
- Relationship properties: rprop_i
- Relationship properties values: rval_i
- Expression: expr_i
- Alias: alias_i

### Using Cypher Syntax

In [None]:
# Collect pairs for fine-tuning
trainer = []

#### Using the nodes syntax

In [None]:
# Node notation via variable (a), (customer), (cust)
def create_node_notation_by_var():
    """Build samples for the bare ``(var)`` node notation.

    The variable names used are the letters a-z, every entry of the
    module-level ``nodes`` list lower-cased, and the first four characters
    of each lower-cased entry.

    Returns:
        list[dict]: one sample per variable name, each with the keys
        ``section``, ``question``, ``context`` and ``cypher``.
    """
    def make_sample(alias):
        return dict(
            section="cypher_syntax_nodes",
            question=f"""How do you represent a node identified with the {alias} variable/alias? It can be of any type.""",
            context="In Cypher, a node identified with variable n is represented in a query by enclosing it in parantheses, such as (n)",
            cypher=f"({alias})",
        )

    lowered = [name.lower() for name in nodes]
    shortened = [name[:4] for name in lowered]
    alphabet = list(map(chr, range(ord("a"), ord("z") + 1)))
    return [make_sample(alias) for alias in alphabet + lowered + shortened]

# Creates a list of samples of the form
create_node_notation_by_var()[0]

In [None]:
# How many samples are created
len(create_node_notation_by_var())

In [None]:
# Build samples: keep at most 50 of the generated examples.
# random.sample raises ValueError when asked for more items than the
# population holds, so the request is capped at the population size.
sample_1 = create_node_notation_by_var()
trainer = trainer + random.sample(sample_1, min(50, len(sample_1)))

In [None]:
# Node notation using label
def create_node_notation_by_label():
    """Build samples for the ``(:Label)`` node notation.

    Returns:
        list[dict]: one sample per entry of the module-level ``nodes``
        list, each with the keys ``section``, ``question``, ``context``
        and ``cypher``.
    """
    def make_sample(node_label):
        return dict(
            section="cypher_syntax_nodes",
            question=f"""How do you represent a node with the {node_label} label?""",
            context="In Cypher, a node identified with label NodeLabel is represented using parantheses and a colon, such as (:NodeLabel)",
            cypher=f"(:{node_label})",
        )

    return [make_sample(node_label) for node_label in nodes]

# Creates a list of samples of the form
create_node_notation_by_label()[0]

In [None]:
# How many samples are created
len(create_node_notation_by_label())

In [None]:
# Build samples
sample_2 = create_node_notation_by_label()
trainer = trainer + sample_2

In [None]:
# Unspecified node
def create_node_notation():
    """Build the single sample describing an anonymous, unlabeled node.

    Returns:
        list[dict]: a one-element list whose dict has the keys
        ``section``, ``question``, ``context`` and ``cypher``.
    """
    def prompter():
        return {"section": "cypher_syntax_nodes",
        "question": """How do you represent a node that is not assigned a variable or an alias, and the label is not specified?""" ,
        "context": "In Cypher, a node that is not assigned a variable or a label is represented using parantheses, such as ()",
        # The answer must match the form taught in the context string: "()"
        # with no inner blank, as in every other node sample.
        "cypher": "()"}

    return [prompter()]

# Creates a list of 1 sample of the form
create_node_notation()

In [None]:
# How many samples are created
len(create_node_notation())

In [None]:
# Build samples
sample_3 = create_node_notation()
trainer = trainer + sample_3

In [None]:
# Node notation using both a label and a variable
def create_node_notation_by_label_var():
    """Build samples for the ``(var:Label)`` node notation.

    Each entry of the module-level ``nodes`` list is used as the label and
    its lower-cased form as the variable.

    Returns:
        list[dict]: one sample per label, each with the keys ``section``,
        ``question``, ``context`` and ``cypher``.
    """
    def make_sample(node_label, alias):
        return dict(
            section="cypher_syntax_nodes",
            question=f"""How do you represent a node with {node_label} label, identified with the {alias} variable/alias?""",
            context="In Cypher, to represent a node with label NodeLabel and variable n, use the syntax (n:NodeLabel).",
            cypher=f"({alias}:{node_label})",
        )

    return [make_sample(node_label, node_label.lower()) for node_label in nodes]

# Creates a list of samples of the form
create_node_notation_by_label_var()[0]

In [None]:
# How many samples are created
len(create_node_notation_by_label_var())

In [None]:
# Build samples
sample_4 = create_node_notation_by_label_var()
trainer = trainer + sample_4

In [None]:
# Node notation using multiple labels: Customer, Corporation, Customer, Person
def create_multiple_labels_node():
    """Build samples for a node carrying two labels, ``(var:L1:L2)``.

    The first label is always ``NodeLabel_1``; the second is
    ``NodeLabel_2`` or ``NodeLabel_3``; the variable runs over a-z.

    Returns:
        list[dict]: 52 samples (26 letters x 2 second labels), ordered by
        letter and then by second label.
    """
    def make_sample(first, second, alias):
        return dict(
            section="cypher_syntax_nodes",
            question=f"""How can you depict a node that carries both {first} and {second} labels, using {alias} variable or alias for identification??""",
            context="In Cypher, you can represent a node with two labels NodeLabel1, NodeLabel2 and variable n, using the syntax (n:NodeLabel_1:NodeLabel_2).",
            cypher=f"({alias}:{first}:{second})",
        )

    primary = "NodeLabel_1"
    secondaries = ["NodeLabel_2", "NodeLabel_3"]
    alphabet = list(map(chr, range(ord("a"), ord("z") + 1)))
    return [
        make_sample(primary, secondary, alias)
        for alias in alphabet
        for secondary in secondaries
    ]

# Creates a list of samples of the form
create_multiple_labels_node()[0]

In [None]:
# How many samples are created
len(create_multiple_labels_node())

In [None]:
# Build samples
sample_5 = create_multiple_labels_node()
trainer = trainer + sample_5

#### Using the relationships syntax

In [None]:
# Relationship notation using type and labels
def create_relationship_with_type_and_labels():
    """Build samples for ``(:A)-[:REL]->(:B)`` patterns.

    One sample is produced per triple returned by
    ``extract_relationships_triples(rels)``; elements 0, 1 and 2 of each
    triple are used as source label, relationship type and target label.

    Returns:
        list[dict]: samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def make_sample(source, rel_type, target):
        return dict(
            section="cypher_syntax_relationships",
            question=f"""How do you indicate that the node labeled {source} is linked to the node labeled {target} through a {rel_type} relationship? The direction of the relationship is from {source} to {target}.""",
            context="In Cypher, represent a node with label NodeLabel1 connected to NodeLabel2 via REL relationship using the syntax (:NodeLabel1)-[:REL]->(:NodeLabel2).",
            cypher=f"(:{source})-[:{rel_type}]->(:{target})",
        )

    return [
        make_sample(item[0], item[1], item[2])
        for item in extract_relationships_triples(rels)
    ]

# Creates a list of samples of the form
create_relationship_with_type_and_labels()[0]

In [None]:
# How many samples are created
len(create_relationship_with_type_and_labels())

In [None]:
# Build samples
sample_6 = create_relationship_with_type_and_labels()
trainer = trainer + sample_6

In [None]:
# Relationship notation using type
def create_relationship_with_type():
    """Build samples for ``(n)-[:REL]->(p)`` patterns.

    One sample is produced per item returned by
    ``extract_relationships_list(rels)``.

    Returns:
        list[dict]: samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def make_sample(rel_type):
        return dict(
            section="cypher_syntax_relationships",
            question=f"""How do you represent that the node identified by n is connected to another node, p, through {rel_type} relationship type. The direction of the relationship is from n to p.""",
            context="In Cypher, represent a node with variable n connected to p via REL relationship and with direction from n to p, using the syntax (n)-[:REL]->(p).",
            cypher=f"(n)-[:{rel_type}]->(p)",
        )

    return [make_sample(rel_type) for rel_type in extract_relationships_list(rels)]

# Creates a list of samples of the form
create_relationship_with_type()[0]

In [None]:
# How many samples are created
len(create_relationship_with_type())

In [None]:
# Build samples
sample_7 = create_relationship_with_type()
trainer = trainer + sample_7

In [None]:
# Relationship notation using variables
def create_relationship_with_variables():
    """Build 20 samples for the directed ``(a)-[r]->(b)`` pattern.

    Each sample uses three distinct letters drawn with ``random.sample``
    as source variable, relationship variable and target variable.

    Returns:
        list[dict]: 20 samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def make_sample(source, rel_alias, target):
        return dict(
            section="cypher_syntax_relationships",
            question=f"""How do you represent a node, identified by {source} that has directional relationship leading another node, labeled {target}? The type of the relationship can be any, and it is represented by the variable or alias {rel_alias}.""",
            context="In Cypher, represent nodes n and p connected via a relationship from n to p of any type and with variable rel using the syntax (n)-[rel]->(p).",
            cypher=f"({source})-[{rel_alias}]->({target})",
        )

    alphabet = list(map(chr, range(ord("a"), ord("z") + 1)))
    # Draw all 20 letter triples first, then turn each into a sample
    draws = [random.sample(alphabet, 3) for _ in range(20)]
    return [make_sample(draw[0], draw[1], draw[2]) for draw in draws]

# Creates a list of samples of the form
create_relationship_with_variables()[0]

In [None]:
# How many samples are built
len(create_relationship_with_variables())

In [None]:
# Build sample
sample_8 = create_relationship_with_variables()
trainer = trainer + sample_8

In [None]:
# Relationship notation using variables -no direction provided
def create_relationship_with_variables_nodir():
    """Build 20 samples for the undirected ``(a)-[r]-(b)`` pattern.

    Each sample uses three distinct letters drawn with ``random.sample``
    as first variable, relationship variable and second variable.

    Returns:
        list[dict]: 20 samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def make_sample(var_1, rvar, var_2):
        # The question text spans two lines; the line break and the
        # indentation of the second line are part of the emitted string.
        question = f"""How do you represent a node identified by {var_1} that is connected to another node, {var_2}, with the direction of the relationship going either way.
            They can be connected via any relationship type, that is assigned variable/alias {rvar}."""
        return dict(
            section="cypher_syntax_relationships",
            question=question,
            context="In Cypher, represent nodes n and p connected via a relationship of any type, and having variable rel, using the syntax (n)-[rel]-(p).",
            cypher=f"({var_1})-[{rvar}]-({var_2})",
        )

    alphabet = list(map(chr, range(ord("a"), ord("z") + 1)))
    # Draw all 20 letter triples first, then turn each into a sample
    draws = [random.sample(alphabet, 3) for _ in range(20)]
    return [make_sample(draw[0], draw[1], draw[2]) for draw in draws]

# Creates a list of samples of the form
create_relationship_with_variables_nodir()[0]


In [None]:
# Build sample
sample_9 = create_relationship_with_variables_nodir()
trainer = trainer + sample_9

In [None]:
# Anonymous relationship using labels
def create_anonymous_relationship():
    """Build samples for the anonymous directed pattern ``(:A)-->(:B)``.

    One sample is produced per triple returned by
    ``extract_relationships_triples(rels)`` whose elements 0 and 2 (used
    as source and target labels) differ.

    Returns:
        list[dict]: samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def prompter(label_1, label_2):
        return {"section": "cypher_syntax_relationships",
        "question": f"""How do you depict the {label_1} node as being linked to the {label_2} node, via a relationship from {label_1} to {label_2}. The relationship can be of any type and it doesn't need to be assigned a specific variable or label.""" ,
        # The syntax shown in the context must agree with the generated
        # answer: two dashes before the arrow head and a colon before each
        # label, i.e. (:NodeLabel1)-->(:NodeLabel2).
        "context": "In Cypher, represent nodes with labels NodeLabel1 and NodeLabel2 connected via an anonymous relationship, using the syntax (:NodeLabel1)-->(:NodeLabel2).",
        "cypher": f"(:{label_1})-->(:{label_2})"}

    triples = extract_relationships_triples(rels)
    # Skip triples whose source and target labels are the same
    return [prompter(entry[0], entry[2]) for entry in triples if entry[0] != entry[2]]

# Creates a list of samples of the form
create_anonymous_relationship()[0]

In [None]:
# How many samples are created
len(create_anonymous_relationship())

In [None]:
# Build sample
sample_10 = create_anonymous_relationship()
trainer = trainer + sample_10

### Working with Cypher Keywords

#### Using MATCH and OPTIONAL MATCH

In [None]:
# Match return node properties
def match_return_property():
    """Build ``MATCH (n:Label) RETURN n.prop`` samples.

    For every entry of the module-level ``nodes`` list, one sample is
    produced per item returned by ``sutils.get_node_properties(label)``.

    Returns:
        list[dict]: samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def make_sample(node_label, prop_name):
        return dict(
            section="cypher_keywords_match",
            question=f"""Retrieve the {node_label} and return their {prop_name} property. If no nodes carrying both {node_label} label and {prop_name} property are found, then no results will be returned.""",
            context=" In Cypher, to find nodes with a specific label NodeLabel and return a property nodeProperty, use the query MATCH (n:NodeLabel) RETURN n.nodeProperty",
            cypher=f"MATCH (n:{node_label}) RETURN n.{prop_name}",
        )

    return [
        make_sample(node_label, prop_name)
        for node_label in nodes
        for prop_name in sutils.get_node_properties(node_label)
    ]

# Creates a list of samples of the form
match_return_property()[0]

In [None]:
# How many samples are created
len(match_return_property())

In [None]:
# Build sample
sample_11 = match_return_property()
trainer = trainer + sample_11

In [None]:
# Optional match return property
def optional_match_return_property():
    """Build ``OPTIONAL MATCH (n:Label) RETURN n.prop`` samples.

    For every entry of the module-level ``nodes`` list, one sample is
    produced per item returned by ``sutils.get_node_properties(label)``.

    Returns:
        list[dict]: samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def make_sample(node_label, prop_name):
        return dict(
            section="cypher_keywords_match",
            question=f"""For the {node_label} nodes, return their {prop_name}. If there are no nodes with both {node_label} label and {prop_name} property, null results are returned.""",
            context="In Cypher, to find nodes with a specific label NodeLabel and return a property nodeProperty, and to return null instead of an error, use the query OPTIONAL MATCH (n:NodeLabel) RETURN n.nodeProperty",
            cypher=f"OPTIONAL MATCH (n:{node_label}) RETURN n.{prop_name}",
        )

    return [
        make_sample(node_label, prop_name)
        for node_label in nodes
        for prop_name in sutils.get_node_properties(node_label)
    ]
# Creates a list of samples of the form
optional_match_return_property()[1]

In [None]:
# How many samples are created
len(optional_match_return_property())

In [None]:
# Build sample
sample_12 = optional_match_return_property()
trainer = trainer + sample_12

In [None]:
# Two match statements and two properties return
def match_and_match_return_properties():
    """Build samples that chain two MATCH clauses and return two properties.

    Fifty pairs of distinct labels are drawn from the module-level
    ``nodes`` list with ``random.sample``; for each pair, one sample is
    produced per combination of the items returned by
    ``sutils.get_node_properties`` for the first and the second label.

    Returns:
        list[dict]: samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def make_sample(label_1, prop_1, label_2, prop_2):
        return dict(
            section="cypher_keywords_match",
            question=f"""Find all the {label_1} nodes, and assigned variable n, and the {label_2} nodes and assigned variable m. Return {prop_1} property for {label_1} and {prop_2} property for {label_2}.""",
            context=" In Cypher, to find nodes with label NodeLabel1 and NodeLabel2 and return a property nodeProperty1 for the first label and nodeProperty2 for the second labe, use the query 'MATCH (n:NodeLabel1) MATCH (m:nodeLabel2) RETURN n.nodeProperty1, m.nodeProperty2'",
            cypher=f"MATCH (n:{label_1}) MATCH (m:{label_2}) RETURN n.{prop_1}, m.{prop_2}",
        )

    # Draw all 50 label pairs up front, then expand each pair
    label_pairs = [random.sample(nodes, 2) for _ in range(50)]

    samples = []
    for label_pair in label_pairs:
        first_props = sutils.get_node_properties(label_pair[0])
        second_props = sutils.get_node_properties(label_pair[1])
        samples.extend(
            make_sample(label_pair[0], combo[0], label_pair[1], combo[1])
            for combo in itertools.product(first_props, second_props)
        )
    return samples

# Creates a list of samples of the form
match_and_match_return_properties()[0]

In [None]:
# How many samples are created
len(match_and_match_return_properties())

In [None]:
# Build sample: keep at most 1000 of the generated examples.
# random.sample raises ValueError when asked for more items than the
# population holds, so the request is capped at the population size.
_candidates = match_and_match_return_properties()
sample_13 = random.sample(_candidates, min(1000, len(_candidates)))
trainer = trainer + sample_13

In [None]:
# One match, one optional match and two properties return
def match_and_optional_match_return_properties():
    """Build samples combining MATCH and OPTIONAL MATCH with two properties.

    Twenty pairs of distinct labels are drawn from the module-level
    ``nodes`` list with ``random.sample``; for each pair, one sample is
    produced per combination of the items returned by
    ``sutils.get_node_properties`` for the first and the second label.

    Returns:
        list[dict]: samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.

    Raises:
        ValueError: from ``random.sample`` if ``nodes`` holds fewer than
            two labels.
    """
    def prompter(label_1, prop_1, label_2, prop_2):
        return {"section": "cypher_keywords_match",
        "question": f"""Find all the {label_1} nodes, and assigned variable n, and the {label_2} nodes and assigned variable m. Return {prop_1} property for {label_1} and {prop_2} property {label_2}. Return null if {label_2} does not exist. """,
           "context": " In Cypher, to find nodes with labels NodeLabel1 and NodeLabel2 and to return a property for each node label, so that the null value is returned if NodeLabel2 is not found, use the query MATCH (n:NodeLabel1) OPTIONAL MATCH (m:NodeLabel2) RETURN n.nodeProperty1, m.nodeProperty2",
        "cypher": f"MATCH (n:{label_1}) OPTIONAL MATCH (m:{label_2}) RETURN n.{prop_1}, m.{prop_2}"}

    sampler = []
    # Extract 20 pairs of nodes. Only two labels are used per draw, so only
    # two are requested; asking for 20 failed on schemas with fewer labels.
    random_doubles = [random.sample(nodes, 2) for _ in range(20)]
    for double in random_doubles:
        node_props_1 = sutils.get_node_properties(double[0])
        node_props_2 = sutils.get_node_properties(double[1])
        for pair in itertools.product(node_props_1, node_props_2):
            sampler.append(prompter(double[0], pair[0], double[1], pair[1]))

    return sampler

# Creates a list of samples of the form
match_and_optional_match_return_properties()[0]

In [None]:
# How many samples are created
len(match_and_optional_match_return_properties())

In [None]:
# Build sample: keep at most 1000 of the generated examples.
# random.sample raises ValueError when asked for more items than the
# population holds, so the request is capped at the population size.
_candidates = match_and_optional_match_return_properties()
sample_14 = random.sample(_candidates, min(1000, len(_candidates)))
trainer = trainer + sample_14

#### Using WHERE, LIMIT, ORDER BY

In [None]:
# Extract nodes where a string property has a given value
def where_one_node_property_is_string():
    """Build ``WHERE n.prop = 'value'`` samples from ``string_parsed``.

    Elements 0, 1 and 2 of each ``string_parsed`` entry are used as label,
    property and value (presumably what ``parse_instances`` yields for
    STRING properties - verify against the helper).

    Returns:
        list[dict]: one sample per entry, with the keys ``section``,
        ``question``, ``context`` and ``cypher``.
    """
    def make_sample(node_label, prop_name, value):
        return dict(
            section="cypher_keywords_where",
            question=f"""Find the occurrences of {node_label} where {prop_name} is {value}.""",
            context="",
            cypher=f"MATCH (n:{node_label}) WHERE n.{prop_name} = '{value}' RETURN n",
        )

    return [make_sample(item[0], item[1], item[2]) for item in string_parsed]
# Creates a list of samples of the form
where_one_node_property_is_string()[0]



In [None]:
len(where_one_node_property_is_string())

In [None]:
# Build sample: keep at most 1000 of the generated examples.
# random.sample raises ValueError when asked for more items than the
# population holds, so the request is capped at the population size.
_candidates = where_one_node_property_is_string()
sample_15 = random.sample(_candidates, min(1000, len(_candidates)))
trainer = trainer + sample_15

In [None]:
# Extract nodes where a string property contains a given substring
def where_one_node_property_contains_substring():
    """Build ``WHERE n.prop CONTAINS '...'`` samples from ``string_parsed``.

    Elements 0, 1 and 2 of each ``string_parsed`` entry are used as label,
    property and value; the first five characters of the value serve as
    the substring.

    Returns:
        list[dict]: one sample per entry, with the keys ``section``,
        ``question``, ``context`` and ``cypher``.
    """
    def make_sample(node_label, prop_name, value):
        return dict(
            section="cypher_keywords_where",
            question=f"""Find the {node_label} where {prop_name} contains {value[:5]}.""",
            context="",
            cypher=f"MATCH (n:{node_label}) WHERE n.{prop_name} CONTAINS '{value[:5]}' RETURN n",
        )

    return [make_sample(item[0], item[1], item[2]) for item in string_parsed]
# Creates a list of samples of the form
where_one_node_property_contains_substring()[100]

In [None]:
# How many samples are built
len(where_one_node_property_contains_substring())

In [None]:
# Build sample: keep at most 1000 of the generated examples.
# random.sample raises ValueError when asked for more items than the
# population holds, so the request is capped at the population size.
_candidates = where_one_node_property_contains_substring()
sample_16 = random.sample(_candidates, min(1000, len(_candidates)))
trainer = trainer + sample_16

In [None]:
# Return nodes where a string property is not null
def where_one_node_property_string_exists():
    """Build ``WHERE n.prop IS NOT NULL ... LIMIT 10`` samples.

    Elements 0 and 1 of each ``string_parsed`` entry are used as label and
    property; one sample is emitted per entry.

    Returns:
        list[dict]: samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def make_sample(node_label, prop_name):
        return dict(
            section="cypher_keywords_where",
            question=f"""Find 10 of {node_label} that possess the specified {prop_name} and return these values.""",
            context="",
            cypher=f"MATCH (n:{node_label}) WHERE n.{prop_name} IS NOT NULL RETURN n.{prop_name} LIMIT 10",
        )

    return [make_sample(item[0], item[1]) for item in string_parsed]
# Creates a list of samples of the form
where_one_node_property_string_exists()[29]

In [None]:
# How many samples are created
len(where_one_node_property_string_exists())

In [None]:
# Build sample: keep at most 1000 of the generated examples.
# random.sample raises ValueError when asked for more items than the
# population holds, so the request is capped at the population size.
_candidates = where_one_node_property_string_exists()
sample_17 = random.sample(_candidates, min(1000, len(_candidates)))
trainer = trainer + sample_17

In [None]:
# Retrieve nodes where a string property is null
def where_one_node_property_string_dne():
    """Build ``WHERE n.prop IS NULL ... LIMIT 10`` samples.

    Elements 0 and 1 of each ``string_parsed`` entry are used as label and
    property; one sample is emitted per entry.

    Returns:
        list[dict]: samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def make_sample(node_label, prop_name):
        return dict(
            section="cypher_keywords_where",
            question=f"""Find at most 10 instances of {node_label} that are missing {prop_name}.""",
            context="",
            cypher=f"MATCH (n:{node_label}) WHERE n.{prop_name} IS NULL RETURN n LIMIT 10",
        )

    return [make_sample(item[0], item[1]) for item in string_parsed]
# Creates a list of samples of the form
where_one_node_property_string_dne()[29]

In [None]:
# How many samples are created
len(where_one_node_property_string_dne())

In [None]:
# Build sample: keep at most 1000 of the generated examples.
# random.sample raises ValueError when asked for more items than the
# population holds, so the request is capped at the population size.
_candidates = where_one_node_property_string_dne()
sample_18 = random.sample(_candidates, min(1000, len(_candidates)))
trainer = trainer + sample_18

In [None]:
# Retrieve nodes where a date property has a specific value
def where_one_node_property_date():
    """Build ``WHERE n.prop = date('value')`` samples from ``date_parsed``.

    Elements 0, 1 and 2 of each ``date_parsed`` entry are used as label,
    property and date value.

    Returns:
        list[dict]: one sample per entry, with the keys ``section``,
        ``question``, ``context`` and ``cypher``.
    """
    def make_sample(node_label, prop_name, value):
        return dict(
            section="cypher_keywords_where",
            question=f"""Find {node_label}s such that the {prop_name} is {value}.""",
            context="",
            cypher=f"MATCH (n:{node_label}) WHERE n.{prop_name} = date('{value}') RETURN n",
        )

    return [make_sample(item[0], item[1], item[2]) for item in date_parsed]
# Creates a list of samples of the form
where_one_node_property_date()[15]

In [None]:
# How many samples are created
len(where_one_node_property_date())

In [None]:
# Build sample
sample_19 =where_one_node_property_date()
trainer = trainer + sample_19

In [None]:
# Retrieve nodes where a date property has a specific year
def where_one_node_property_date_year():
    """Build ``WHERE n.prop.year = <year>`` samples from ``date_parsed``.

    Elements 0 and 1 of each ``date_parsed`` entry are used as label and
    property; every entry yields two samples, for the years 1978 and 2022.

    Returns:
        list[dict]: samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def make_sample(node_label, prop_name, year):
        return dict(
            section="cypher_keywords_where",
            question=f"""Find {node_label}s with {prop_name} in {year}.""",
            context="",
            cypher=f"MATCH (n:{node_label}) WHERE n.{prop_name}.year = {year} RETURN n",
        )

    return [
        make_sample(item[0], item[1], year)
        for item in date_parsed
        for year in (1978, 2022)
    ]
# Creates a list of samples of the form
where_one_node_property_date_year()[30]

In [None]:
# How many samples are created
len(where_one_node_property_date_year())

In [None]:
# Build sample: keep at most 1000 of the generated examples.
# random.sample raises ValueError when asked for more items than the
# population holds, so the request is capped at the population size.
_candidates = where_one_node_property_date_year()
sample_20 = random.sample(_candidates, min(1000, len(_candidates)))
trainer = trainer + sample_20

In [None]:
# Retrieve nodes where a string property XOR another string property
def match_with_where_and_xor_strings():
    """Build 500 samples combining two string conditions with XOR.

    Each sample comes from one ``random_properties(string_parsed,
    used_pairs=[])`` call; elements 0-3 of its result are used as label,
    first property, first value and second property (presumed meaning -
    verify against the helper). A fresh ``used_pairs`` list is passed on
    every call.

    Returns:
        list[dict]: 500 samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def make_sample(node_label, prop_a, value_a, prop_b):
        return dict(
            section="cypher_keywords_where",
            question=f"""Retrieve the unique values of {prop_b} from {node_label} nodes where either {prop_a} is {value_a} or {prop_b} is not null, but not both conditions simulatenously.""",
            context="",
            cypher=f"MATCH (n:{node_label}) WHERE n.{prop_a} = '{value_a}' XOR n.{prop_b} IS NOT NULL RETURN DISTINCT n.{prop_b} AS {prop_b}",
        )

    def draw():
        picked = random_properties(string_parsed, used_pairs=[])
        return make_sample(picked[0], picked[1], picked[2], picked[3])

    return [draw() for _ in range(500)]

In [None]:
# How many samples are created
len(match_with_where_and_xor_strings())

In [None]:
# Build sample
sample_21 = match_with_where_and_xor_strings()
trainer = trainer + sample_21

In [None]:
# Return nodes where a date property and a year for another date property
def match_with_where_and_date():
    """Build 1000 samples filtering on a date value AND on a year.

    Each sample comes from one ``random_properties(date_parsed,
    used_pairs=[])`` call; elements 0-4 of its result are used as label,
    first property, first value, second property and second value. The
    first four characters of the second value are used as the year
    (assumes values are ISO-like date strings - TODO confirm).

    Returns:
        list[dict]: 1000 samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def prompter(label_1, prop_1, val_1, prop_2, val_2):
        return {"section": "cypher_keywords_where",
        "question": f"""Find the {prop_2} for those {label_1}s where {prop_1} is {val_1} and the year of the {prop_2} is {val_2[:4]}.""" ,
        "context": "",
        # The date value has to be wrapped in date('...'), as in
        # where_one_node_property_date; interpolated bare, a value such as
        # 2020-01-01 is read by Cypher as integer subtraction.
        "cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} = date('{val_1}') AND n.{prop_2}.year = {val_2[:4]} RETURN n.{prop_2} AS {prop_2}"}

    sampler=[]
    for _ in range(1000):
        ninst = random_properties(date_parsed,used_pairs=[])
        sampler.append(prompter(ninst[0], ninst[1], ninst[2], ninst[3], ninst[4]))

    return sampler

In [None]:
# How many samples are built - determined by range function
len(match_with_where_and_date())

In [None]:
# Build sample
sample_22 = match_with_where_and_date()
trainer = trainer + sample_22

In [None]:
# Return nodes where a float property is greater, another float is less
def match_with_where_or_float():
    """Build samples combining two float comparisons with OR.

    ``random_properties(float_parsed, used_pairs)`` is called 200 times
    with one shared ``used_pairs`` list; elements 0-4 of each result are
    used as label, first property, first value, second property and second
    value. For every draw, one sample is emitted per item returned by
    ``sutils.get_node_properties(label)``, which becomes the returned
    property.

    Returns:
        list[dict]: samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def make_sample(node_label, prop_a, value_a, prop_b, value_b, returned_prop):
        return dict(
            section="cypher_keywords_where",
            question=f"""Find eight instances of {returned_prop} values from nodes labeled {node_label} where either {prop_a} exceeds {value_a} or {prop_b} is less than {value_b}.""",
            context="",
            cypher=f"MATCH (n:{node_label}) WHERE n.{prop_a} > {value_a} OR n.{prop_b} < {value_b} RETURN n.{returned_prop} AS {returned_prop} LIMIT 8",
        )

    seen_pairs = []
    samples = []
    # NOTE(review): the original comment warns that the repeat count can
    # trigger an error "because of pair uniqueness" - presumably raised by
    # random_properties once used_pairs is exhausted; confirm in the helper.
    for _ in range(200):
        picked = random_properties(float_parsed, seen_pairs)
        samples.extend(
            make_sample(picked[0], picked[1], picked[2], picked[3], picked[4], returned_prop)
            for returned_prop in sutils.get_node_properties(picked[0])
        )
    return samples

# Creates a list of samples of the form
match_with_where_or_float()[0]

In [None]:
# How many samples are built - determined by the range function
len(match_with_where_or_float())

In [None]:
# Build sample: keep at most 1000 of the generated examples.
# random.sample raises ValueError when asked for more items than the
# population holds, so the request is capped at the population size.
_candidates = match_with_where_or_float()
sample_23 = random.sample(_candidates, min(1000, len(_candidates)))
trainer = trainer + sample_23

In [None]:
# Retrieve nodes where a string property contains a given substring, return 2 properties
def match_with_where_contains_substring():
    """Build 500 ``CONTAINS`` samples that return two properties.

    Each sample comes from one ``random_properties(string_parsed,
    used_pairs)`` call with a shared ``used_pairs`` list; elements 0-3 of
    its result are used as label, first property, first value and second
    property. The first two characters of the value are the substring.

    Returns:
        list[dict]: 500 samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def prompter(label_1, prop_1, val_1, prop_2):
        # The query must search for the same substring the question names
        # (the first two characters), written as a quoted string literal.
        substring = val_1[:2]
        return {"section": "cypher_keywords_where",
        "question": f"""Find the {prop_1} and the {prop_2} for those {label_1}s where the {prop_1} contains the substring {substring}.""" ,
        "context": "",
        "cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} CONTAINS '{substring}' RETURN n.{prop_1} AS {prop_1}, n.{prop_2} AS {prop_2}"}

    sampler=[]
    used_pairs=[]
    for _ in range(500):
        ninst = random_properties(string_parsed, used_pairs)
        sampler.append(prompter(ninst[0], ninst[1], ninst[2], ninst[3]))

    return sampler

# The samples of this type - 17 min for 500
#sample_24 = match_with_where_contains_substring()
#sample_24[0]

In [None]:
# How many samples are built - determined by the range function
len(match_with_where_contains_substring())

In [None]:
# Build sample
sample_24 = match_with_where_contains_substring()
trainer = trainer + sample_24

In [None]:
# Retrieve nodes where a string property starts with, return 2 properties
def match_with_where_starts_with_substring():
    """Build 500 ``STARTS WITH`` samples that return two properties.

    Each sample comes from one ``random_properties(string_parsed,
    used_pairs)`` call with a shared ``used_pairs`` list; elements 0-3 of
    its result are used as label, first property, first value and second
    property. The first character of the value is the prefix.

    Returns:
        list[dict]: 500 samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def make_sample(node_label, prop_a, value_a, prop_b):
        return dict(
            section="cypher_keywords_where",
            question=f"""Find the {prop_a} and the {prop_b} for those {node_label}s where the {prop_a} starts with {value_a[0]}.""",
            context="",
            cypher=f"MATCH (n:{node_label}) WHERE n.{prop_a} STARTS WITH '{value_a[0]}' RETURN n.{prop_a} AS {prop_a}, n.{prop_b} AS {prop_b}",
        )

    seen_pairs = []

    def draw():
        picked = random_properties(string_parsed, seen_pairs)
        return make_sample(picked[0], picked[1], picked[2], picked[3])

    return [draw() for _ in range(500)]

# The samples of this type - 17 min for 500
#sample_25 = match_with_where_starts_with_substring()
#sample_25[0]

In [None]:
# How many samples are built - determined by the range function
len(match_with_where_starts_with_substring())

In [None]:
# Build sample
sample_25 = match_with_where_starts_with_substring()
trainer = trainer + sample_25

In [None]:
# Retrieve nodes where a string property ends with, return 2 properties
def match_with_where_ends_with_substring():
    """Build 500 ``ENDS WITH ... LIMIT 10`` samples returning two properties.

    Each sample comes from one ``random_properties(string_parsed,
    used_pairs)`` call with a shared ``used_pairs`` list; elements 0-3 of
    its result are used as label, first property, first value and second
    property. The last character of the value is the suffix.

    Returns:
        list[dict]: 500 samples with the keys ``section``, ``question``,
        ``context`` and ``cypher``.
    """
    def make_sample(node_label, prop_a, value_a, prop_b):
        return dict(
            section="cypher_keywords_where",
            question=f"""Search for ten instances of {prop_a} and {prop_b} among {node_label}s where the {prop_a} terminates with the letter {value_a[-1]}.""",
            context="",
            cypher=f"MATCH (n:{node_label}) WHERE n.{prop_a} ENDS WITH '{value_a[-1]}' RETURN n.{prop_a} AS {prop_a}, n.{prop_b} AS {prop_b} LIMIT 10",
        )

    seen_pairs = []

    def draw():
        picked = random_properties(string_parsed, seen_pairs)
        return make_sample(picked[0], picked[1], picked[2], picked[3])

    return [draw() for _ in range(500)]

# The samples of this type - takes long time for larger n
#sample_26=match_with_where_ends_with_substring()
#sample_26[0]

In [None]:
# How many samples are built - determined by the range function
len(match_with_where_ends_with_substring())

In [None]:
# Build sample
sample_26 = match_with_where_ends_with_substring()
trainer = trainer + sample_26

In [None]:
# Retrieve nodes where a string property is not a certain value
def match_with_where_not_value():
    """Build 500 samples that exclude one exact string value with ``<>``.

    Each query returns DISTINCT pairs of the two sampled properties.
    Returns a list of dicts with the keys "section", "question", "context"
    and "cypher".
    """
    def prompter(label_1, prop_1, val_1, prop_2):
        question = (
            f"Retrieve unique values of {prop_1} and {prop_2} from {label_1}s "
            f"where the {prop_1} is not {val_1}."
        )
        cypher = (
            f"MATCH (n:{label_1}) WHERE n.{prop_1} <> '{val_1}' "
            f"RETURN DISTINCT n.{prop_1} AS {prop_1}, n.{prop_2} AS {prop_2}"
        )
        return {
            "section": "cypher_keywords_where",
            "question": question,
            "context": "",
            "cypher": cypher,
        }

    history = []  # shared with random_properties across all draws
    # Lazy generator: each draw happens right before its sample is built.
    draws = (random_properties(string_parsed, history) for _ in range(500))
    return [prompter(d[0], d[1], d[2], d[3]) for d in draws]

# The samples of this type
#sample_27=match_with_where_not_value()
#sample_27[0]

In [None]:
# How many samples are built - determined by the range function
len(match_with_where_not_value())

In [None]:
# Build sample
sample_27 = match_with_where_not_value()
trainer = trainer + sample_27

In [None]:
# Return nodes where a string property does not start with a given character
def match_with_where_not_is_value():
    """Build 500 samples that negate a STARTS WITH filter.

    The excluded prefix is the first character of the sampled value, and
    each query returns DISTINCT pairs of the two sampled properties.
    Returns a list of dicts with the keys "section", "question", "context"
    and "cypher".
    """
    def prompter(label_1, prop_1, val_1, prop_2):
        first_char = val_1[0]
        return dict(
            section="cypher_keywords_where",
            question=f"Find unique values of {prop_1} and {prop_2} from {label_1}s where the {prop_1} does not start with {first_char}.",
            context="",
            cypher=f"MATCH (n:{label_1}) WHERE NOT n.{prop_1} STARTS WITH '{first_char}' RETURN DISTINCT n.{prop_1} AS {prop_1}, n.{prop_2} AS {prop_2}",
        )

    results = []
    consumed = []  # shared with random_properties across all draws
    for _ in range(500):
        row = random_properties(string_parsed, consumed)
        label, prop, value, other_prop = row[0], row[1], row[2], row[3]
        results.append(prompter(label, prop, value, other_prop))

    return results

# The samples of this type
#sample_28=match_with_where_not_is_value()
#sample_28[0]

In [None]:
# How many samples are built - determined by the range function
len(match_with_where_not_is_value())

In [None]:
# Build sample
sample_28 = match_with_where_not_is_value()
trainer = trainer + sample_28

In [None]:
# Return nodes where a property is not null, a second property takes specified values, order by the second property
def match_with_where_not_null():
    """Build IS NOT NULL samples combined with a numeric lower bound.

    Every record of ``date_parsed`` is paired with every record of
    ``integer_parsed`` that has the same label; the first supplies the
    not-null property, the second the integer property and its threshold.
    Results are ordered by the integer property, descending.
    """
    def prompter(label_1, prop_1, prop_2, val_2):
        question = (
            f"Search for the {prop_1} and {prop_2} attributes from nodes labeled {label_1}s "
            f"where {prop_1} is not null and {prop_2} exceeds {val_2}. "
            f"Sort the results by {prop_2}, beginning with the largest values."
        )
        cypher = (
            f"MATCH (n:{label_1}) WHERE n.{prop_1}  IS NOT NULL AND n.{prop_2} > {val_2} "
            f"RETURN n.{prop_1} AS {prop_1}, n.{prop_2} AS {prop_2} ORDER BY {prop_2} DESC"
        )
        return {
            "section": "cypher_keywords_where",
            "question": question,
            "context": "",
            "cypher": cypher,
        }

    return [
        prompter(date_row[0], date_row[1], int_row[1], int_row[2])
        for date_row in date_parsed
        for int_row in integer_parsed
        if date_row[0] == int_row[0]  # same node label on both records
    ]

# The samples of this type
match_with_where_not_null()[0]

In [None]:
# How many samples are built
len(match_with_where_not_null())

In [None]:
# Build sample
sample_29 = match_with_where_not_null()
trainer = trainer + sample_29

In [None]:
# Usage of regular expression with WHERE
def where_one_node_string_re(entries=None):
    """Build samples that filter a string property with a regular expression.

    The question asks for nodes whose property starts with the first two
    characters of a sampled value, and the Cypher query expresses the same
    filter as the regex ``'<prefix>.*'``.

    Args:
        entries: Optional iterable of records indexed as
            ``(label, property, value, ...)``. Defaults to the module-level
            ``string_parsed``.

    Returns:
        A list of dicts with the keys "section", "question", "context" and
        "cypher", one per record.
    """
    def prompter(label_1, prop_1, val_1):
        # The question and the pattern must describe the same filter. The
        # earlier version asked for "ends with val_1[:2]" while the regex
        # anchored val_1[1:-1] at the start of the string.
        prefix = val_1[:2]
        # NOTE(review): regex metacharacters in the prefix are not escaped.
        return {"section": "cypher_keywords_where",
        "question": f"""Find the {label_1}s where {prop_1} starts with {prefix}.""" ,
        "context": "",
        "cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} =~'{prefix}.*' RETURN n"}

    if entries is None:
        entries = string_parsed

    return [prompter(entry[0], entry[1], entry[2]) for entry in entries]

# Creates a list of samples of the form
where_one_node_string_re()[0]


In [None]:
# How many samples are built
len(where_one_node_string_re())

In [None]:
# Build sample; the draw is capped at the pool size because random.sample
# raises ValueError when asked for more items than the pool holds
pool_30 = where_one_node_string_re()
sample_30 = random.sample(pool_30, min(1000, len(pool_30)))
trainer = trainer + sample_30

##### Simple paths and WHERE, EXISTS

In [None]:
# Traverse a simple path
def match_a_simple_path(entries=None):
    """Build samples that traverse one relationship and return a property.

    One sample is produced for every combination of a property/value of the
    first node and a property of the second node.

    Args:
        entries: Optional iterable of records indexed as
            ``[label_1, props_1, rel_type, props_2, label_2]`` where the
            ``props_*`` items are dicts. Defaults to the module-level
            ``instances_rels``.

    Returns:
        A list of dicts with the keys "section", "question", "context" and
        "cypher".
    """
    def prompter(label_1, prop_1, val_1, prop_2, rtype_1, label_2):
        # The second node is bound to a variable AND its label. Using the
        # label alone as a variable name left the node unconstrained, so the
        # query did not match the "{label_2} nodes" named in the question.
        # NOTE(review): the arrow points from label_2 to label_1; confirm
        # this matches the direction stored in the relationship instances.
        return {"section": "cypher_keyword_match_path",
            "question": f"""Find the values of the {prop_2} for the {label_2} nodes that have a {rtype_1} relationship with {label_1} nodes where the {prop_1} is {val_1}. """,
            "context": "",
            "cypher": f"MATCH (:{label_1} {{{prop_1}:'{val_1}'}}) <- [:{rtype_1}]-(m:{label_2}) RETURN m.{prop_2}"}

    if entries is None:
        entries = instances_rels

    sampler = []
    for entry in entries:
        label_1, rtype_1, label_2 = entry[0], entry[2], entry[4]
        for prop_1, val_1 in entry[1].items():
            for prop_2 in entry[3]:
                sampler.append(prompter(label_1, prop_1, val_1, prop_2, rtype_1, label_2))

    return sampler

# The samples are of the form
match_a_simple_path()[0]

In [None]:
# How many samples are built
len(match_a_simple_path())

In [None]:
# Build sample; the draw is capped at the pool size because random.sample
# raises ValueError when asked for more items than the pool holds
pool_31 = match_a_simple_path()
sample_31 = random.sample(pool_31, min(2000, len(pool_31)))
trainer = trainer + sample_31

In [None]:
# Match with relations and property
def where_and_simple_path(entries=None):
    """Build samples that follow one relationship and filter with WHERE.

    One sample is produced for every combination of a property/value of the
    source node and a property of the target node.

    Args:
        entries: Optional iterable of records indexed as
            ``[label_1, props_1, rel_type, label_2, props_2]`` where the
            ``props_*`` items are dicts. Defaults to the module-level
            ``string_to_string``.

    Returns:
        A list of dicts with the keys "section", "question", "context" and
        "cypher".
    """
    def prompter(label_1, prop_1, val_1, label_2, prop_2, rtype_1):
        # Relationship variable: first two letters of the type, lowercased.
        rel_var = rtype_1[:2].lower()
        # The target node carries its label so the query is restricted to
        # the "{label_2} nodes" the question asks about; previously label_2
        # was accepted but never used in the pattern.
        return {"section": "cypher_keyword_match_path",
            "question": f"""Search for the {prop_2} values in {label_2} nodes, that are linked through a {rtype_1} relationship with the {label_1} nodes where {prop_1} property is {val_1}. """,
            "context": "",
            "cypher": f"MATCH (n:{label_1}) -[{rel_var}:{rtype_1}]->(m:{label_2}) WHERE n.{prop_1}='{val_1}' RETURN m.{prop_2} "}

    if entries is None:
        entries = string_to_string

    return [
        prompter(entry[0], prop_1, val_1, entry[3], prop_2, entry[2])
        for entry in entries
        for prop_1, val_1 in entry[1].items()
        for prop_2 in entry[4]
    ]

# The samples are of the form
where_and_simple_path()[0]

In [None]:
# How many samples are built
len(where_and_simple_path())

In [None]:
# Build sample; the draw is capped at the pool size because random.sample
# raises ValueError when asked for more items than the pool holds
pool_32 = where_and_simple_path()
sample_32 = random.sample(pool_32, min(2000, len(pool_32)))
trainer = trainer + sample_32

In [None]:
# Match with relations and exists
def where_and_exists_simple_path():
    """Build samples that test for a relationship with an EXISTS subquery.

    Iterates ``string_to_string`` and emits one sample per property of the
    source node, repeated once for every property of the target node.
    """
    def prompter(label_1, prop_1, rtype_1, label_2):
        question = (
            f"Retrieve the {prop_1} of the {label_1} that are connected to "
            f"{label_2} nodes through a {rtype_1} relationship. "
        )
        cypher = (
            f"MATCH (n:{label_1}) WHERE EXISTS {{ MATCH (n)-[:{rtype_1}]->(:{label_2}) }} "
            f"RETURN n.{prop_1} AS {prop_1}"
        )
        return {
            "section": "cypher_keyword_match_path",
            "question": question,
            "context": "",
            "cypher": cypher,
        }

    # The innermost loop ignores the target property itself; it only
    # repeats each sample once per property of the target node.
    return [
        prompter(entry[0], prop_1, entry[2], entry[3])
        for entry in string_to_string
        for prop_1 in entry[1].keys()
        for _ in entry[4].items()
    ]

# The samples are of the form
where_and_exists_simple_path()[0]

In [None]:
# The number of samples
len(where_and_exists_simple_path())

In [None]:
# Build sample; the draw is capped at the pool size because random.sample
# raises ValueError when asked for more items than the pool holds
pool_33 = where_and_exists_simple_path()
sample_33 = random.sample(pool_33, min(2000, len(pool_33)))
trainer = trainer + sample_33

In [None]:
# Match with relations, exists and property
def where_and_exists_simple_path_and_property():
    """Build EXISTS-subquery samples that also compare two properties.

    For every record of ``string_to_string``, each property of the source
    node is combined with each property of the target node; the target
    variable is the lowercased target label.
    """
    def prompter(label_1, prop_1, rtype_1, label_2, var_2, prop_2):
        question = f"Search for the {prop_1} of {label_1}s that have a {rtype_1} relationship with {label_2}s where the {prop_1} of the {label_1} matches the {prop_2} of the related  {label_2}. "
        cypher = f"MATCH (n:{label_1}) WHERE EXISTS {{ MATCH (n)-[:{rtype_1}]->({var_2}:{label_2}) WHERE n.{prop_1} = {var_2}.{prop_2} }} RETURN n.{prop_1} AS {prop_1}"
        return {
            "section": "cypher_keyword_match_path",
            "question": question,
            "context": "",
            "cypher": cypher,
        }

    return [
        prompter(entry[0], prop_1, entry[2], entry[3], entry[3].lower(), prop_2)
        for entry in string_to_string
        for prop_1 in entry[1].keys()
        for prop_2 in entry[4].keys()
    ]

# The samples are of the form
where_and_exists_simple_path_and_property()[0]

In [None]:
# How many samples are built
len(where_and_exists_simple_path_and_property())

In [None]:
# Build sample; the draw is capped at the pool size because random.sample
# raises ValueError when asked for more items than the pool holds
pool_34 = where_and_exists_simple_path_and_property()
sample_34 = random.sample(pool_34, min(2000, len(pool_34)))
trainer = trainer + sample_34

In [None]:
# Match with relations, not and property
def where_not_simple_path_and_property():
    """Build samples that negate a path pattern with WHERE NOT.

    For every record of ``string_to_string``, each property of the source
    node is combined with each property/value of the target node; the
    target variable is the lowercased target label.
    """
    def prompter(label_1, prop_1, rtype_1, label_2, var_2, prop_2, val_2):
        # rtype_1 is accepted but does not appear in the generated text.
        question = f"Look for the {prop_1} of the {label_1}s that are not connected to {label_2}s where the {prop_2} is {val_2}. "
        cypher = f"MATCH (n:{label_1}), ({var_2}:{label_2} {{{prop_2}: '{val_2}'}}) WHERE NOT (n) --> ({var_2}) RETURN n.{prop_1}"
        return {
            "section": "cypher_keyword_match_path",
            "question": question,
            "context": "",
            "cypher": cypher,
        }

    return [
        prompter(entry[0], prop_1, entry[2], entry[3], entry[3].lower(), prop_2, val_2)
        for entry in string_to_string
        for prop_1 in entry[1].keys()
        for prop_2, val_2 in entry[4].items()
    ]

# The samples are of the form
where_not_simple_path_and_property()[0]

In [None]:
# The number of samples built
len(where_not_simple_path_and_property())

In [None]:
# Build sample; the draw is capped at the pool size because random.sample
# raises ValueError when asked for more items than the pool holds
pool_35 = where_not_simple_path_and_property()
sample_35 = random.sample(pool_35, min(2000, len(pool_35)))
trainer = trainer + sample_35

#### Using WITH, SKIP

In [None]:
# Match relations, then filter with WITH and WHERE
def relation_with_and_where(entries=None):
    """Build samples that chain MATCH, WITH and a STARTS WITH filter.

    One sample is produced for every combination of a property/value of the
    first node and a property/value of the related node. The STARTS WITH
    prefix is the first character of the related node's sampled value;
    combinations whose value is empty are skipped.

    Args:
        entries: Optional iterable of records indexed as
            ``[label_1, props_1, rel_type, label_2, props_2]`` where the
            ``props_*`` items are dicts. Defaults to the module-level
            ``string_to_string``.

    Returns:
        A list of dicts with the keys "section", "question", "context" and
        "cypher".
    """
    def prompter(label_1, prop_1, val_1, rtype_1, var_2, prop_2, val_2):
        # Take the prefix from the sampled value. The earlier version used
        # the first letter of the variable name and never read val_2.
        prefix = str(val_2)[:1]
        var_1 = label_1.lower()
        if var_1 == var_2:
            # Same label on both ends: reusing one variable for both nodes
            # would restrict the match to self-loops.
            var_1 = f"{var_1}_1"
        return {"section": "cypher_keyword_match_path",
            "question": f"""Search for the {prop_2} values in nodes labeled with the variable {var_2}. These nodes should have a {prop_2} that begins with {prefix} and must be connected to {label_1} nodes through a {rtype_1} relationship, where the {label_1} nodes have a {prop_1} of {val_1}. """,
            "context": "",
            "cypher": f"MATCH ({var_1}:{label_1} {{{prop_1}: '{val_1}'}}) -[:{rtype_1}]- ({var_2}) WITH {var_2} WHERE {var_2}.{prop_2} STARTS WITH '{prefix}' RETURN {var_2}.{prop_2} LIMIT 4"}

    if entries is None:
        entries = string_to_string

    sampler = []
    for entry in entries:
        var_2 = entry[3].lower()
        for prop_1, val_1 in entry[1].items():
            for prop_2, val_2 in entry[4].items():
                if not str(val_2):
                    # An empty prefix would make STARTS WITH match everything.
                    continue
                sampler.append(prompter(entry[0], prop_1, val_1, entry[2], var_2, prop_2, val_2))

    return sampler

# The samples are of the form
relation_with_and_where()[0]

In [None]:
# The number of samples of this type
len(relation_with_and_where())

In [None]:
# Build sample; the draw is capped at the pool size because random.sample
# raises ValueError when asked for more items than the pool holds
pool_36 = relation_with_and_where()
sample_36 = random.sample(pool_36, min(2000, len(pool_36)))
trainer = trainer + sample_36

In [None]:
# Match, with, skip, limit combination
def match_where_skip_limit_return_property():
    """Build samples combining STARTS WITH, WITH, SKIP and LIMIT.

    One sample is produced per record of ``string_parsed``; the prefix is
    the first character of the record's value and both SKIP and LIMIT are
    fixed at 3.
    """
    def prompter(label_1, prop_1, val_1, nrecs):
        initial = val_1[0]
        question = (
            f"Find the {label_1} nodes for which {prop_1} starts with {initial}, "
            f"skip the first {nrecs} records and return the next {nrecs} records of {prop_1}."
        )
        cypher = (
            f"MATCH (n:{label_1}) WHERE n.{prop_1} STARTS WITH '{initial}' "
            f"WITH n.{prop_1} AS {prop_1} SKIP {nrecs} LIMIT {nrecs} RETURN {prop_1}"
        )
        return {
            "section": "cypher_keywords_match",
            "question": question,
            "context": "",
            "cypher": cypher,
        }

    return [prompter(row[0], row[1], row[2], 3) for row in string_parsed]

# Creates a list of samples of the form
match_where_skip_limit_return_property()[0]

In [None]:
# How many samples are built
len(match_where_skip_limit_return_property())

In [None]:
# Build sample; the draw is capped at the pool size because random.sample
# raises ValueError when asked for more items than the pool holds
pool_37 = match_where_skip_limit_return_property()
sample_37 = random.sample(pool_37, min(1000, len(pool_37)))
trainer = trainer + sample_37

In [None]:
# Match, skip, limit combination (no WITH or WHERE)
def match_skip_limit_return_property():
    """Build samples that page through one property with SKIP and LIMIT.

    One sample is produced per record of ``float_parsed``; both SKIP and
    LIMIT are fixed at 3.
    """
    def prompter(label_1, prop_1, nrecs):
        question = f"Return the {prop_1} of the {label_1} nodes, skip the first {nrecs} records and return {nrecs} records."
        cypher = f"MATCH (n:{label_1}) RETURN n.{prop_1}  SKIP {nrecs} LIMIT {nrecs}"
        return {
            "section": "cypher_keywords_match",
            "question": question,
            "context": "",
            "cypher": cypher,
        }

    return [prompter(row[0], row[1], 3) for row in float_parsed]

# Creates a list of samples of the form
match_skip_limit_return_property()[0]

In [None]:
# The number of samples built
len(match_skip_limit_return_property())

In [None]:
# Build sample
sample_38 = match_skip_limit_return_property()
trainer = trainer + sample_38

#### Using UNION and UNION ALL

In [None]:
# Union of two sets with filtering duplicates
def match_nodes_with_union():
    """Build 1000 samples that combine two property listings with UNION.

    Each sample draws two distinct records from ``string_parsed`` and
    unions their properties under the shared column name ``Records``.
    """
    def prompter(label_1, prop_1, label_2, prop_2):
        question = (
            f"Return the {prop_1} for the {label_1} nodes combined with {prop_2} "
            f"for the {label_2} nodes, and filter the duplicates if there are any."
        )
        cypher = (
            f"MATCH (n:{label_1}) RETURN n.{prop_1} AS Records "
            f"UNION MATCH (m:{label_2}) RETURN m.{prop_2} AS Records"
        )
        return {
            "section": "cypher_keywords_match",
            "question": question,
            "context": "",
            "cypher": cypher,
        }

    samples = []
    for _ in range(1000):
        first, second = random.sample(string_parsed, 2)
        samples.append(prompter(first[0], first[1], second[0], second[1]))

    return samples

# Creates a list of samples of the form
match_nodes_with_union()[0]

In [None]:
# How many samples are built - determined by the range function
len(match_nodes_with_union())

In [None]:
# Build sample
sample_39 = match_nodes_with_union()
trainer = trainer + sample_39

In [None]:
# Union of two sets without filtering duplicates
def match_nodes_with_union_all(entries=None, n_samples=1000):
    """Build samples that combine two property listings with UNION ALL.

    Each sample draws two distinct records and concatenates their
    properties under the shared column name ``Records``. UNION ALL keeps
    duplicate rows, and the question text says so.

    Args:
        entries: Optional sequence of records indexed as
            ``(label, property, ...)``. Defaults to the module-level
            ``string_parsed``. Needs at least two records.
        n_samples: Number of samples to generate. Defaults to 1000.

    Returns:
        A list of ``n_samples`` dicts with the keys "section", "question",
        "context" and "cypher".
    """
    def prompter(label_1, prop_1, label_2, prop_2):
        # The wording must match UNION ALL semantics. The earlier text asked
        # to "filter the duplicates", which describes plain UNION.
        return {"section": "cypher_keywords_match",
        "question": f"""Return records of {prop_1} for the {label_1} nodes combined with {prop_2} for the {label_2} nodes, and keep the duplicates if there are any.""" ,
        "context": "",
        "cypher": f"MATCH (n:{label_1}) RETURN n.{prop_1} AS Records UNION ALL MATCH (m:{label_2}) RETURN m.{prop_2} AS Records"}

    if entries is None:
        entries = string_parsed

    sampler = []
    for _ in range(n_samples):
        first, second = random.sample(entries, 2)
        sampler.append(prompter(first[0], first[1], second[0], second[1]))

    return sampler

# Creates a list of samples of the form
match_nodes_with_union_all()[0]

In [None]:
# How many samples are built - determined by the range function
len(match_nodes_with_union_all())

In [None]:
# Build sample
sample_40 = match_nodes_with_union_all()
trainer = trainer + sample_40

In [None]:
len(trainer)

In [None]:
# Save the samples to a json file
trainer_file = "datas/trainer_one.json"
write_json(trainer, trainer_file)

In [None]:
# Check saved trainer
#trainer_saved = read_json(trainer_file)