# Sample Builder - Parametric Queries - Part I

**In this notebook:**

- process data extracted from KG,
- generate sample queries - part I,
- save the generated data to a file.

There are 31 generating functions in this collection.

## Workspace Setup

In [None]:
%pip install neo4j
%pip install python-levenshtein

In [None]:
# Load and mount the drive helper
from google.colab import drive

# This will prompt for authorization
drive.mount('/content/drive')

# Set the working directory
%cd '/content/drive/MyDrive/cypherGen/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/cypherGen


In [None]:
import neo4j
import pandas as pd
import random
import itertools

# Import the local modules
from utils.utilities import *
from utils.graph_utils import *

## Data Preparation for Sample Building

In [None]:
# Create a path variable for the data folder
data_path = '/content/drive/MyDrive/cypherGen/datas/'

# Set file names
schema_file = 'schema_file.json'
#formatted_schema_file = 'formatted_schema.txt'
node_instances_file = 'node_instances_file.json'
rels_instances_file = 'rels_instances_file.json'

# Read the data from files
jschema = read_json(data_path+schema_file)
node_instances = read_json(data_path+node_instances_file) # these are serialized
rels_instances = read_json(data_path+rels_instances_file) # these are serialized

In [None]:
# List of node labels
nodes = get_nodes_list(jschema)

# Read the nodes with their properties and datatypes
node_props_types = jschema['node_props']

# Read the relationship properties with their datatypes
rel_props_types = jschema['rel_props']

# Read the relationships as a list of triples
relationships = jschema['relationships']

# Extract node labels and properties with data type: STRING
string_properties = get_nodes_properties_of_datatype(jschema, nodes,'STRING')

# Extract node labels and properties with data type: INTEGER
integer_properties = get_nodes_properties_of_datatype(jschema, nodes, 'INTEGER')

# Extract node labels and properties with data type: DATE
date_properties = get_nodes_properties_of_datatype(jschema, nodes, 'DATE')


In [None]:
# Extract and parse n instances of specified datatype

string_parsed = parse_node_instances_datatype(jschema, node_instances, nodes,'STRING')
string_parsed = filter_empty_sublists(string_parsed)

integer_parsed =  parse_node_instances_datatype(jschema, node_instances, nodes,'INTEGER')
integer_parsed = filter_empty_sublists(integer_parsed)

date_parsed =  parse_node_instances_datatype(jschema, node_instances, nodes,'DATE')
date_parsed = filter_empty_sublists(date_parsed)

# All node instances parsed
dtypes_parsed = string_parsed+integer_parsed+date_parsed

In [None]:
# Parse relationships instances

string_string_rels = filter_relationships_instances(jschema, rels_instances, 'STRING', 'STRING')
string_integer_rels = filter_relationships_instances(jschema, rels_instances, 'STRING', 'INTEGER')
string_date_rels = filter_relationships_instances(jschema, rels_instances, 'STRING', 'DATE')
integer_integer_rels = filter_relationships_instances(jschema, rels_instances, 'INTEGER', 'INTEGER')
date_date_rels = filter_relationships_instances(jschema, rels_instances, 'DATE', 'DATE')
all_rels = string_string_rels+string_integer_rels+string_date_rels+integer_integer_rels+date_date_rels

### Samples Collector

In [None]:
# List to collect the samples
trainer=[]
# Function to select a number of samples of each type
def collect_samples(sampler, M=100):
    M = min(M, len(sampler))
    rsampler = random.sample(sampler, M)
    return rsampler

## MATCH

In [None]:
## MATCH node, prop
def match_one_node_one_prop():
    def prompter(label_1, prop_1):
        # Extract subschema for the variables of interest
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Fetch the {label_1} nodes and extract their {prop_1} property!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) RETURN n.{prop_1}"
                   }
        return message

    # Build the samples of this type
    sampler=[]
    for node in nodes:
        node_props = get_node_properties(jschema, node)
        for prop in node_props:
            temp_dict = prompter(node,prop)
            sampler.append(temp_dict)
    return sampler

# Build the set
sampler_1 = match_one_node_one_prop()
# Print information about the sampler set
print(f"There are {len(sampler_1)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_1)
# Display an example for inspection
sampler_1[0]

There are 22 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Fetch the Article nodes and extract their abstract property!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': 'MATCH (n:Article) RETURN n.abstract'}

In [None]:
# MATCH two nodes and two properties - only exception with two nodes
def match_two_nodes_two_props(N):
    def prompter(label_1, prop_1, label_2, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Fetch the {prop_1} of the {label_1} and the {prop_2} for {label_2}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) MATCH (m:{label_2}) RETURN n.{prop_1}, m.{prop_2}"
                   }
        return message

    sampler = []
    # Create pairs of distinct nodes (i, j) where i!=j
    node_pairs = create_pairs(nodes)
    # Extract n pairs of nodes from all possible pairs
    random_doubles = get_random_elements(node_pairs, N)
    for double in random_doubles:
        # Get the list of properties for label_1
        node_props_1 = get_node_properties(jschema, double[0])
        # Get the list of propetrties for label_2
        node_props_2 = get_node_properties(jschema, double[1])
        # Create propery pairs via cartesian product
        prop_pairs = list(itertools.product(node_props_1, node_props_2))
        for pair in prop_pairs:
            temp_dict = prompter(double[0],pair[0], double[1], pair[1])
            sampler.append(temp_dict)

    return sampler

N=100
sampler_2 = match_two_nodes_two_props(N)
# Print information about the sampler set
print(f"There are {len(sampler_2)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_2)
# Display an example for inspection
sampler_2[0]

There are 422 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Fetch the name of the DOI and the abstract for Article!',
 'Schema': 'Graph schema: Node properties are the following:\nDOI {name: STRING, doi_id: STRING},Article {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': 'MATCH (n:DOI) MATCH (m:Article) RETURN n.name, m.abstract'}

## WHERE, LIMIT, ORDER BY

In [None]:
# Extract nodes where a string property has a given value
def where_one_node_one_prop_one_val():
    def prompter(label_1, prop_1, val_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find the {label_1} where {prop_1} is {val_1.strip()}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} = '{val_1}' RETURN n"
                   }
        return message

    sampler=[]
    for entry in string_parsed:
        temp_dict = prompter(entry[0], entry[1], entry[2])
        sampler.append(temp_dict)

    return sampler


# Build the set
sampler_3 = where_one_node_one_prop_one_val()
# Print information about the sampler set
print(f"There are {len(sampler_3)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_3)
# Display an example for inspection
sampler_3[22]

There are 76 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find the Topic where description is Focusing on techniques and concepts related to transformations, solutions, and properties of linear equations and matrices, including Jordan normal form, eigenvalues, eigenvectors, diagonalization, and eigenformulations.!',
 'Schema': 'Graph schema: Node properties are the following:\nTopic {cluster: INTEGER, description: STRING, label: STRING}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': "MATCH (n:Topic) WHERE n.description = 'Focusing on techniques and concepts related to transformations, solutions, and properties of linear equations and matrices, including Jordan normal form, eigenvalues, eigenvectors, diagonalization, and eigenformulations.' RETURN n"}

In [None]:

# Extract nodes where a string property contains a given substring
def where_one_node_one_string_contains():
    def prompter(label_1, prop_1, val_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find the {label_1} where {prop_1} contains {val_1[:5]}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} CONTAINS '{val_1[:5]}' RETURN n"
                   }
        return message

    sampler=[]
    for entry in string_parsed:
        temp_dict = prompter(entry[0], entry[1], entry[2])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_4 = where_one_node_one_string_contains()
# Print information about the sampler set
print(f"There are {len(sampler_4)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_4)
# Display an example for inspection
sampler_4[22]

There are 76 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find the Topic where description contains Focus!',
 'Schema': 'Graph schema: Node properties are the following:\nTopic {cluster: INTEGER, description: STRING, label: STRING}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': "MATCH (n:Topic) WHERE n.description CONTAINS 'Focus' RETURN n"}

In [None]:
# Return nodes where a property is not null - use numerals
def where_one_node_one_prop_notnull_numeral():
    def prompter(label_1, prop_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find 10 {label_1} that have the {prop_1} recorded and return these values!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} IS NOT NULL RETURN n.{prop_1} LIMIT 10"
                   }
        return message

    sampler=[]
    for entry in dtypes_parsed:
        temp_dict = prompter(entry[0], entry[1])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_5 = where_one_node_one_prop_notnull_numeral()
# Print information about the sampler set
print(f"There are {len(sampler_5)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_5)
# Display an example for inspection
sampler_5[22]

There are 87 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find 10 Topic that have the description recorded and return these values!',
 'Schema': 'Graph schema: Node properties are the following:\nTopic {cluster: INTEGER, description: STRING, label: STRING}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': 'MATCH (n:Topic) WHERE n.description IS NOT NULL RETURN n.description LIMIT 10'}

In [None]:
# Return nodes where a string property is not null - use words
def where_one_node_one_prop_notnull_literal():
    def prompter(label_1, prop_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find ten {label_1} that have {prop_1} and return their records!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} IS NOT NULL RETURN n.{prop_1} LIMIT 10"
                   }
        return message

    sampler=[]
    for entry in dtypes_parsed:
        temp_dict = prompter(entry[0], entry[1])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_6 = where_one_node_one_prop_notnull_literal()
# Print information about the sampler set
print(f"There are {len(sampler_6)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_6)
# Display an example for inspection
sampler_6[22]

There are 87 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find ten Topic that have description and return their records!',
 'Schema': 'Graph schema: Node properties are the following:\nTopic {cluster: INTEGER, description: STRING, label: STRING}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': 'MATCH (n:Topic) WHERE n.description IS NOT NULL RETURN n.description LIMIT 10'}

In [None]:
# Retrieve nodes where a property is null
def where_one_node_one_prop_null_numeral():
    def prompter(label_1, prop_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find 8 {label_1} that are missing the {prop_1}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} IS NULL RETURN n LIMIT 8"
                   }
        return message

    sampler=[]
    for entry in dtypes_parsed:
        temp_dict = prompter(entry[0], entry[1])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_7 = where_one_node_one_prop_null_numeral()
# Print information about the sampler set
print(f"There are {len(sampler_7)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_7)
# Display an example for inspection
sampler_7[22]

There are 87 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find 8 Topic that are missing the description!',
 'Schema': 'Graph schema: Node properties are the following:\nTopic {cluster: INTEGER, description: STRING, label: STRING}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': 'MATCH (n:Topic) WHERE n.description IS NULL RETURN n LIMIT 8'}

In [None]:
# Retrieve nodes where a date property has a specific value
def where_one_node_one_prop_equals_date():
    def prompter(label_1, prop_1, val_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find {label_1} such that {prop_1} is {val_1}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} = date('{val_1}') RETURN n"
                   }
        return message

    sampler=[]

    for entry in date_parsed:
        temp_dict = prompter(entry[0], entry[1], entry[2])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_8 = where_one_node_one_prop_equals_date()
# Print information about the sampler set
print(f"There are {len(sampler_8)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_8)
# Display an example for inspection
sampler_8[0]

There are 4 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find UpdateDate such that update_date is 2013-12-02!',
 'Schema': 'Graph schema: Node properties are the following:\nUpdateDate {update_date: DATE}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Article)-[:UPDATED]->(:UpdateDate)',
 'Cypher': "MATCH (n:UpdateDate) WHERE n.update_date = date('2013-12-02') RETURN n"}

In [None]:
# Retrieve nodes where a date property has a specific year
def where_one_node_one_prop_equals_year():
    def prompter(label_1, prop_1, date_year):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Fetch {label_1} where {prop_1} is in {date_year}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE date(n.{prop_1}).year = {date_year} RETURN n"
                   }
        return message

    sampler=[]

    for entry in date_parsed:
        for date_year in [2008, 2016, 2020, 2021]:
            temp_dict = prompter(entry[0], entry[1], date_year)
            sampler.append(temp_dict)

    return sampler

# Build the set
sampler_9 = where_one_node_one_prop_equals_year()
# Print information about the sampler set
print(f"There are {len(sampler_9)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_9)
# Display an example for inspection
sampler_9[0]

There are 16 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Fetch UpdateDate where update_date is in 2008!',
 'Schema': 'Graph schema: Node properties are the following:\nUpdateDate {update_date: DATE}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Article)-[:UPDATED]->(:UpdateDate)',
 'Cypher': 'MATCH (n:UpdateDate) WHERE date(n.update_date).year = 2008 RETURN n'}

In [None]:
# Nodes where a string property OR another string property
def where_one_node_two_props_notnull_or(N):  # choose how many samples to generate
    def prompter(label_1, prop_1, val_1, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Fetch the distinct values of the {prop_2} from {label_1} where either {prop_1} is {val_1} or {prop_2} is not null!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} = '{val_1}' OR n.{prop_2} IS NOT NULL RETURN DISTINCT n.{prop_2} AS {prop_2}"
                   }
        return message

    sampler=[]
    for n in range(N):
        ninst = get_distinct_random_pairs(string_parsed, used_pairs=[])
        temp_dict = prompter(ninst[0], ninst[1], ninst[2], ninst[3])
        sampler.append(temp_dict)

    return sampler

N=100
# Build the set
sampler_10 = where_one_node_two_props_notnull_or(100)
# Print information about the sampler set
print(f"There are {len(sampler_10)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_10)
# Display an example for inspection
sampler_10[90]

There are 100 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Fetch the distinct values of the key_id from Keyword where either name is exponentially growing or key_id is not null!',
 'Schema': 'Graph schema: Node properties are the following:\nKeyword {name: STRING, key_id: STRING}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': "MATCH (n:Keyword) WHERE n.name = 'exponentially growing' OR n.key_id IS NOT NULL RETURN DISTINCT n.key_id AS key_id"}

In [None]:
# Return nodes where a date property and a year for another date property
def where_one_node_two_props_two_vals_or_notnull_date(N):
    def prompter(label_1, prop_1, val_1, prop_2, val_2):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find the {prop_2} for those {label_1}s where {prop_1} is {val_1} and the year of the {prop_2} is {val_2[:4]}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} = {val_1} AND n.{prop_2}.year = {val_2[:4]} RETURN n.{prop_2} AS {prop_2}"
                   }
        return message

    sampler=[]
    for _ in range(N):
        ninst = get_distinct_random_pairs(date_parsed,used_pairs=[])
        temp_dict = prompter(ninst[0], ninst[1], ninst[2], ninst[3], ninst[4])
        sampler.append(temp_dict)

    return sampler

# Build the set
N=100 # choose how many samples to generate
sampler_11 = where_one_node_two_props_two_vals_or_notnull_date(N)
# Print information about the sampler set
print(f"There are {len(sampler_11)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_11)
# Display an example for inspection
sampler_11[0]

There are 100 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find the update_date for those UpdateDates where update_date is 2008-01-29 and the year of the update_date is 2008!',
 'Schema': 'Graph schema: Node properties are the following:\nUpdateDate {update_date: DATE}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Article)-[:UPDATED]->(:UpdateDate)',
 'Cypher': 'MATCH (n:UpdateDate) WHERE n.update_date = 2008-01-29 AND n.update_date.year = 2008 RETURN n.update_date AS update_date'}

In [None]:
# Return nodes where a numerical (FLOAT, INTEGER) property is greater, another numerical is less
def match_with_where_or_numerical_literal():
    def prompter(label_1, prop_1, val_1, prop_2, val_2, prop_3):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find eight instances of the {prop_3} for {label_1} where either {prop_1} exceeds {val_1} or {prop_2} is less than {val_2}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} > {val_1} OR n.{prop_2} < {val_2} RETURN n.{prop_3} AS {prop_3} LIMIT 8"
                   }
        return message

    sampler=[]
    used_pairs=[]
    N = 4# choose how many samples to generate, depends how many are available in data, more give error
    for _ in range(N):
        ninst = get_distinct_random_pairs(integer_parsed, used_pairs)
        node_props = get_node_properties(jschema, ninst[0])
        for prop_3 in node_props:
            temp_dict = prompter(ninst[0], ninst[1], ninst[2], ninst[3], ninst[4], prop_3)
            sampler.append(temp_dict)

    return sampler

# Build the set
N=4 # choose how many samples to generate
sampler_12 = match_with_where_or_numerical_literal()
# Print information about the sampler set
print(f"There are {len(sampler_12)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_12)
# Display an example for inspection
sampler_12[0]

There are 16 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find eight instances of the abstract for Article where either article_id exceeds 1008 or article_id is less than 1009!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': 'MATCH (n:Article) WHERE n.article_id > 1008 OR n.article_id < 1009 RETURN n.abstract AS abstract LIMIT 8'}

In [None]:
# Retrieve nodes where a string property contains a given substring, return 2 properties
def match_with_where_contains_substring(N):
    def prompter(label_1, prop_1, val_1, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find the {prop_1} and the {prop_2} for those {label_1} where {prop_1} contains the substring {val_1[:2]}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} CONTAINS {val_1[2:]} RETURN n.{prop_1} AS {prop_1}, n.{prop_2} AS {prop_2}"
        }
        return message

    sampler=[]
    used_pairs=[]
    for _ in range(N):
        ninst = get_distinct_random_pairs(string_parsed, used_pairs)
        temp_dict = prompter(ninst[0], ninst[1], ninst[2], ninst[3])
        sampler.append(temp_dict)

    return sampler

# Build the set
N=100 # choose the number of samples
sampler_13 = match_with_where_contains_substring(N)
# Print information about the sampler set
print(f"There are {len(sampler_13)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_13)
# Display an example for inspection
sampler_13[0]

There are 100 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find the comments and the title for those Article where comments contains the substring 21!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': 'MATCH (n:Article) WHERE n.comments CONTAINS  pages, AMS-LaTeX RETURN n.comments AS comments, n.title AS title'}

In [None]:
# Retrieve nodes where a string property starts with, return 2 properties
def match_with_where_starts_with_substring(N):
    def prompter(label_1, prop_1, val_1, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find the {prop_1} and the {prop_2} for those {label_1} where {prop_1} starts with {val_1[0]}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} STARTS WITH '{val_1[0]}' RETURN n.{prop_1} AS {prop_1}, n.{prop_2} AS {prop_2}"
                   }
        return message

    sampler=[]
    used_pairs=[]
    for _ in range(N):
        ninst = get_distinct_random_pairs(string_parsed, used_pairs)
        temp_dict = prompter(ninst[0], ninst[1], ninst[2], ninst[3])
        sampler.append(temp_dict)

    return sampler

# Build the set
N=100 # choose how many samples to generate
sampler_14 =match_with_where_starts_with_substring(N)
# Print information about the sampler set
print(f"There are {len(sampler_14)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_14)
# Display an example for inspection
sampler_14[0]

There are 100 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find the title and the title for those Article where title starts with S!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': "MATCH (n:Article) WHERE n.title STARTS WITH 'S' RETURN n.title AS title, n.title AS title"}

In [None]:
# Retrieve nodes where a string property ends with, return 2 properties
def match_with_where_ends_with_substring(N):
    def prompter(label_1, prop_1, val_1, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Search for 23 instances of the {prop_1} and the {prop_2} among {label_1}s where the {prop_1} ends with {val_1[-1]}.""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} ENDS WITH '{val_1[-1]}' RETURN n.{prop_1} AS {prop_1}, n.{prop_2} AS {prop_2} LIMIT 23"
                   }
        return message

    sampler=[]
    used_pairs=[]
    for _ in range(N):
        ninst = get_distinct_random_pairs(string_parsed, used_pairs)
        temp_dict = prompter(ninst[0], ninst[1], ninst[2], ninst[3])
        sampler.append(temp_dict)

    return sampler

# Build the set
N=100 # choose how many samples to generate
sampler_15 = match_with_where_ends_with_substring(N)
# Print information about the sampler set
print(f"There are {len(sampler_15)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_15)
# Display an example for inspection
sampler_15[0]

There are 100 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Search for 23 instances of the journal_id and the journal_id among Journals where the journal_id ends with 7.',
 'Schema': 'Graph schema: Node properties are the following:\nJournal {name: STRING, journal_id: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:PUBLISHED_IN]->(:Journal)',
 'Cypher': "MATCH (n:Journal) WHERE n.journal_id ENDS WITH '7' RETURN n.journal_id AS journal_id, n.journal_id AS journal_id LIMIT 23"}

In [None]:
# Retrieve nodes where a string property is not a certain value
def match_with_where_not_value(N):
    def prompter(label_1, prop_1, val_1, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Retrieve distinct values of the {prop_1} and the {prop_2} from {label_1} where {prop_1} is not {val_1}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} <> '{val_1}' RETURN DISTINCT n.{prop_1} AS {prop_1}, n.{prop_2} AS {prop_2}"
                   }
        return message

    sampler=[]
    used_pairs=[]
    for _ in range(N): # N depends on the size of KG
        ninst = get_distinct_random_pairs(string_parsed, used_pairs)
        temp_dict = prompter(ninst[0], ninst[1], ninst[2], ninst[3])
        sampler.append(temp_dict)

    return sampler


# Build the set
N=100 # choose how many samples to generate
sampler_16 = match_with_where_not_value(N)
# Print information about the sampler set
print(f"There are {len(sampler_16)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_16)
# Display an example for inspection
sampler_16[0]

There are 100 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Retrieve distinct values of the author_id and the affiliation from Author where author_id is not fd6fcf9374c7e5fd0377800da5e8f846!',
 'Schema': 'Graph schema: Node properties are the following:\nAuthor {author_id: STRING, affiliation: STRING, first_name: STRING, last_name: STRING}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Article)-[:WRITTEN_BY]->(:Author)',
 'Cypher': "MATCH (n:Author) WHERE n.author_id <> 'fd6fcf9374c7e5fd0377800da5e8f846' RETURN DISTINCT n.author_id AS author_id, n.affiliation AS affiliation"}

In [None]:
# Return nodes where a string property is not a certain value
def match_with_where_not_is_value(N):
    def prompter(label_1, prop_1, val_1, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Fetch unique values of {prop_1} and {prop_2} from {label_1} where {prop_1} does not start with {val_1[0]}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE NOT n.{prop_1} STARTS WITH '{val_1[0]}' RETURN DISTINCT n.{prop_1} AS {prop_1}, n.{prop_2} AS {prop_2}"
                   }
        return message

    sampler=[]
    used_pairs=[]
    for _ in range(N):
        ninst = get_distinct_random_pairs(string_parsed, used_pairs)
        temp_dict = prompter(ninst[0], ninst[1], ninst[2], ninst[3])
        sampler.append(temp_dict)

    return sampler

# Build the set
N=100 # choose how many samples to generate
sampler_17 = match_with_where_not_is_value(N)
# Print information about the sampler set
print(f"There are {len(sampler_17)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_17)
# Display an example for inspection
sampler_17[17]


There are 100 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Fetch unique values of key_id and key_id from Keyword where key_id does not start with 7!',
 'Schema': 'Graph schema: Node properties are the following:\nKeyword {name: STRING, key_id: STRING}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': "MATCH (n:Keyword) WHERE NOT n.key_id STARTS WITH '7' RETURN DISTINCT n.key_id AS key_id, n.key_id AS key_id"}

In [None]:
# Return nodes where a property is not null, a second property takes specified values, order by the second property
def match_with_where_not_null():
    def prompter(label_1, prop_1, prop_2, val_2):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Search for {prop_1} and {prop_2} from {label_1} where {prop_1} is not null and {prop_2} exceeds {val_2} and sort the results by {prop_2}, beginning with the largest!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1}  IS NOT NULL AND n.{prop_2} > {val_2} RETURN n.{prop_1} AS {prop_1}, n.{prop_2} AS {prop_2} ORDER BY {prop_2} DESC"
                   }
        return message

    sampler=[]
    for dinstance in integer_parsed:
        for ninstance in integer_parsed:
            if dinstance[0] == ninstance[0]:
                temp_dict = prompter(dinstance[0], dinstance[1], ninstance[1], ninstance[2])
                sampler.append(temp_dict)

    return sampler

# Build the set
sampler_18 = match_with_where_not_null()
# Print information about the sampler set
print(f"There are {len(sampler_18)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_18)
# Display an example for inspection
sampler_18[0]

There are 25 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Search for article_id and article_id from Article where article_id is not null and article_id exceeds 1006 and sort the results by article_id, beginning with the largest!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': 'MATCH (n:Article) WHERE n.article_id  IS NOT NULL AND n.article_id > 1006 RETURN n.article_id AS article_id, n.article_id AS article_id ORDER BY article_id DESC'}

In [None]:
# Usage of regular expression with WHERE
def where_one_node_string_re():
    def prompter(label_1, prop_1, val_1):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Fetch the {label_1} where {prop_1} ends with {val_1[:2]}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} =~'{val_1[1:-1]}.*' RETURN n"
                   }
        return message

    sampler=[]
    for entry in string_parsed:
        temp_dict = prompter(entry[0], entry[1], entry[2])
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_19 = where_one_node_string_re()
# Print information about the sampler set
print(f"There are {len(sampler_19)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_19)
# Display an example for inspection
sampler_19[0]

There are 76 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Fetch the Article where abstract ends with   !',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': "MATCH (n:Article) WHERE n.abstract =~' Using matrix inversion and determinant evaluation techniques we prove several\nsummation and transformation formulas for terminating, balanced,\nvery-well-poised, elliptic hypergeometric series..*' RETURN n"}

## Simple Paths, WHERE, EXISTS

In [None]:
# Traverse a simple path
def match_a_simple_path():
    def prompter(label_1, prop_1, val_1, prop_2, rtype_1, label_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Retrieve the {prop_2} of {label_2} which are connected to the {label_1} where {prop_1} is {val_1} through the {rtype_1} relationship!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": "MATCH (:{label_1} {{{prop_1}:'{val_1}'}}) <- [:{rtype_1}]-({label_2}) RETURN {label_2}.{prop_2}".format(label_1=label_1,
                                                                                                                                      prop_1=prop_1,
                                                                                                                                      val_1=val_1,
                                                                                                                                      rtype_1=rtype_1,
                                                                                                                                      label_2=label_2,
                                                                                                                                      prop_2=prop_2)}
        return message

    sampler = []

    for entry in string_string_rels:
        label_1 = entry[0]
        label_2 = entry[3]
        rtype_1 = entry[2]
        for prop_1, val_1 in entry[1].items():
            for prop_2, _ in entry[4].items():
                temp_dict =prompter(label_1, prop_1, val_1, prop_2, rtype_1, label_2)
                sampler.append(temp_dict)

    return sampler

# Build the set
sampler_20 = match_a_simple_path()
# Print information about the sampler set
print(f"There are {len(sampler_20)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_20)
# Display an example for inspection
sampler_20[100]

There are 368 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Retrieve the category_id of Categories which are connected to the Article where title is Applications of another characterization of betaN\\N through the HAS_CATEGORY relationship!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING},Categories {category_id: STRING, specifications: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report),(:Article)-[:HAS_CATEGORY]->(:Categories)',
 'Cypher': "MATCH (:Article {title:'Applications of another chara

In [None]:
# Match with relations and property
def where_and_simple_path():
    def prompter(label_1, prop_1, val_1, label_2, prop_2, rtype_1):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Search for the {prop_2} in {label_2} that is linked through a {rtype_1} relationship with the {label_1} where {prop_1} property is {val_1}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) -[{rtype_1[:2].lower()}:{rtype_1}]->(m) WHERE n.{prop_1}='{val_1}' RETURN m.{prop_2}"
                   }
        return message

    sampler = []

    for entry in string_string_rels:
        for prop_1, val_1 in entry[1].items():
            for prop_2, _ in entry[4].items():
                temp_dict =prompter(entry[0], prop_1, val_1, entry[3], prop_2, entry[2])
                sampler.append(temp_dict)

    return sampler

# Build the set
sampler_21 = where_and_simple_path()
# Print information about the sampler set
print(f"There are {len(sampler_21)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_21)
# Display an example for inspection
sampler_21[2]

There are 368 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Search for the name in Keyword that is linked through a HAS_KEY relationship with the Article where comments property is 21 pages, AMS-LaTeX!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING},Keyword {name: STRING, key_id: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report),(:Article)-[:HAS_KEY]->(:Keyword),(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': "MATCH (n:Article) -[ha:HAS_KEY]->(m) WHERE n.comments='21 pages, AMS-LaTeX' RETURN m

In [None]:
# Match with relations and exists
def where_and_exists_simple_path():
    def prompter(label_1, prop_1, rtype_1, label_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Fetch {prop_1} of the {label_1} that are connected to {label_2} via {rtype_1}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": "MATCH (n:{label_1}) WHERE EXISTS {{ MATCH (n)-[:{rtype_1}]->(:{label_2}) }} RETURN n.{prop_1} AS {prop_1}".format(label_1=label_1,
                                                                                                                                                prop_1=prop_1,
                                                                                                                                                rtype_1=rtype_1,
                                                                                                                                                label_2=label_2)}

        return message

    sampler = []

    for entry in string_string_rels:
        for prop_1, _ in entry[1].items():
            for prop_2, _ in entry[4].items():
                temp_dict =prompter(entry[0], prop_1, entry[2], entry[3])
                sampler.append(temp_dict)

    return sampler

# Build the set
sampler_22 = where_and_exists_simple_path()
# Print information about the sampler set
print(f"There are {len(sampler_22)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_22)
# Display an example for inspection
sampler_22[0]


There are 368 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Fetch abstract of the Article that are connected to Keyword via HAS_KEY!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING},Keyword {name: STRING, key_id: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report),(:Article)-[:HAS_KEY]->(:Keyword),(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': 'MATCH (n:Article) WHERE EXISTS { MATCH (n)-[:HAS_KEY]->(:Keyword) } RETURN n.abstract AS abstract'}

In [None]:
# Match with relations, exists and property
def where_and_exists_simple_path_and_property():
    def prompter(label_1, prop_1, rtype_1, label_2, var_2, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Search for the {prop_1} of {label_1} that have a {rtype_1} relationship with {label_2}s where the {prop_1} of the {label_1} matches the {prop_2} of the related  {label_2}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f""
                   }
        return message

    sampler = []

    for entry in string_string_rels:
        for prop_1, _ in entry[1].items():
            for prop_2, _ in entry[4].items():
                var_2 = entry[3].lower()
                temp_dict =prompter(entry[0], prop_1, entry[2], entry[3], var_2, prop_2)
                sampler.append(temp_dict)

    return sampler


# Build the set
sampler_23 = where_and_exists_simple_path_and_property()
# Print information about the sampler set
print(f"There are {len(sampler_23)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_23)
# Display an example for inspection
sampler_23[0]

There are 368 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Search for the abstract of Article that have a HAS_KEY relationship with Keywords where the abstract of the Article matches the name of the related  Keyword!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING},Keyword {name: STRING, key_id: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report),(:Article)-[:HAS_KEY]->(:Keyword),(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': ''}

In [None]:
# Match with relations, not and property
def where_not_simple_path_and_property():
    def prompter(label_1, prop_1, label_2, prop_2, val_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Look for the {prop_1} of the {label_1} that is not related  to the {label_2} with the  {prop_2}  {val_2}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": "MATCH (n:{label_1}), (:{label_2} {{{prop_2}: '{val_2}'}}) WHERE NOT (n) --> (:{label_2}) RETURN n.{prop_1}".format(label_1=label_1,
                                                                                                                                                     prop_1=prop_1,
                                                                                                                                                     label_2=label_2,
                                                                                                                                                     prop_2=prop_2,
                                                                                                                                                     val_2=val_2)}

        return message

    sampler = []

    for entry in string_string_rels:
        for prop_1, _ in entry[1].items():
            for prop_2, val_2 in entry[4].items():
                #var_2 = entry[3].lower()
                temp_dict =prompter(entry[0], prop_1, entry[3], prop_2, val_2)
                sampler.append(temp_dict)

    return sampler

# Build the set
N=100 # choose how many samples to generate
sampler_24 = where_not_simple_path_and_property()
# Print information about the sampler set
print(f"There are {len(sampler_24)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_24)
# Display an example for inspection
sampler_24[0]

There are 368 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Look for the abstract of the Article that is not related  to the Keyword with the  name  summation!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING},Keyword {name: STRING, key_id: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report),(:Article)-[:HAS_KEY]->(:Keyword),(:Keyword)-[:HAS_TOPIC]->(:Topic)',
 'Cypher': "MATCH (n:Article), (:Keyword {name: 'summation'}) WHERE NOT (n) --> (:Keyword) RETURN n.abstract"}

In [None]:
# Match relations, with,
def relation_with_and_where():
    def prompter(label_1, prop_1, val_1, rtype_1, label_2, prop_2, val_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find {label_2} that has a {prop_2} which begins with {label_2[0].lower()}, and is linked to {label_1} via {rtype_1} relationship, where {label_1} has {prop_1} {val_1}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": "MATCH (n:{label_1} {{{prop_1}: '{val_1}'}}) -[:{rtype_1}]- (m:{label_2}) WITH {label_2} WHERE m.{prop_2} STARTS WITH '{label_2[0]}' RETURN n.{prop_2}".format(label_1=label_1,
                                                                                                                                                                                               prop_1=prop_1,
                                                                                                                                                                                               val_1=val_1,
                                                                                                                                                                                               rtype_1=rtype_1,
                                                                                                                                                                                               label_2 = label_2,
                                                                                                                                                                                               prop_2=prop_2,
                                                                                                                                                                                               val_2=val_2)}
        return message

    sampler = []

    for entry in string_string_rels:
        for prop_1, val_1 in entry[1].items():
            for prop_2, val_2 in entry[4].items():
                temp_dict =prompter(entry[0], prop_1, val_1, entry[2], entry[3], prop_2, val_2)
                sampler.append(temp_dict)

    return sampler

# Build the set
sampler_25 = relation_with_and_where()
# Print information about the sampler set
print(f"There are {len(sampler_25)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_25)
# Display an example for inspection
sampler_25[0]

There are 368 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find Keyword that has a name which begins with k, and is linked to Article via HAS_KEY relationship, where Article has abstract   Using matrix inversion and determinant evaluation techniques we prove several\nsummation and transformation formulas for terminating, balanced,\nvery-well-poised, elliptic hypergeometric series.\n!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING},Keyword {name: STRING, key_id: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS

## MATCH, SKIP, LIMIT, UNION

In [None]:
# Match, with, skip, limit combination
def match_where_skip_limit_return_property():
    def prompter(label_1, prop_1, val_1, nrecs):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Find the {label_1} for which {prop_1} starts with {val_1[0]}, skip the first {nrecs} records and return the next {nrecs} records of {prop_1}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} STARTS WITH '{val_1[0]}' WITH n.{prop_1} AS {prop_1} SKIP {nrecs} LIMIT {nrecs} RETURN {prop_1}"
                   }
        return message

    sampler=[]
    for entry in string_parsed:
        temp_dict = prompter(entry[0], entry[1], entry[2], 3)
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_26 = match_where_skip_limit_return_property()
# Print information about the sampler set
print(f"There are {len(sampler_26)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_26)
# Display an example for inspection
sampler_26[0]

There are 76 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Find the Article for which abstract starts with  , skip the first 3 records and return the next 3 records of abstract!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': "MATCH (n:Article) WHERE n.abstract STARTS WITH ' ' WITH n.abstract AS abstract SKIP 3 LIMIT 3 RETURN abstract"}

In [None]:
# Match, with, skip, limit combination
def match_skip_limit_return_property():
    def prompter(label_1, prop_1, nrecs):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Return the {prop_1} of the {label_1}, skip the first {nrecs} records and return {nrecs} records!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) RETURN n.{prop_1}  SKIP {nrecs} LIMIT {nrecs}"
                   }
        return message

    sampler=[]
    for entry in dtypes_parsed:
        temp_dict = prompter(entry[0], entry[1], 2)
        sampler.append(temp_dict)

    return sampler

# Build the set
sampler_27 = match_skip_limit_return_property()
# Print information about the sampler set
print(f"There are {len(sampler_27)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_27)
# Display an example for inspection
sampler_27[1]

There are 87 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Return the comments of the Article, skip the first 2 records and return 2 records!',
 'Schema': 'Graph schema: Node properties are the following:\nArticle {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': 'MATCH (n:Article) RETURN n.comments  SKIP 2 LIMIT 2'}

In [None]:
# Union of two property selections
# Union of two sets with filtering duplicates
def match_properties_with_union(N):
    def prompter(label_1, prop_1, val_1, label_2, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Retrieve the {label_1} where {prop_1} or {prop_2} contain {val_1}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) WHERE n.{prop_1} CONTAINS '{val_1}' UNION ALL MATCH (m:{label_1}) WHERE m.{prop_2} CONTAINS '{val_1}' RETURN n.{prop_1}, m.{prop_2}"
                   }
        return message

    sampler=[]
    for _ in range(N):
        #ninst = random_properties(string_instances)
        pair = random.sample(string_parsed, 2)
        temp_dict = prompter(pair[0][0], pair[0][1], pair[0][2],
        pair[1][0], pair[1][1])
        sampler.append(temp_dict)

    return sampler

# Build the set
N=100 # choose how many samples to generate
sampler_28 = match_properties_with_union(N)
# Print information about the sampler set
print(f"There are {len(sampler_28)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_28)
# Display an example for inspection
sampler_28[0]

There are 100 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Retrieve the Journal where name or specifications contain Rev. Mat.Iberoamericana!',
 'Schema': 'Graph schema: Node properties are the following:\nJournal {name: STRING, journal_id: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:PUBLISHED_IN]->(:Journal)',
 'Cypher': "MATCH (n:Journal) WHERE n.name CONTAINS 'Rev. Mat.Iberoamericana' UNION ALL MATCH (m:Journal) WHERE m.specifications CONTAINS 'Rev. Mat.Iberoamericana' RETURN n.name, m.specifications"}

In [None]:
# Union of two sets with filtering duplicates
def match_nodes_with_union(N):
    def prompter(label_1, prop_1, label_2, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Return the {prop_1} for {label_1} combined with the {prop_2} for {label_2}, filter the duplicates if any!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) RETURN n.{prop_1} AS Records UNION MATCH (m:{label_2}) RETURN m.{prop_2} AS Records"
                   }
        return message

    sampler=[]
    for _ in range(N):
        #ninst = random_properties(string_instances)
        pair = random.sample(string_parsed, 2)
        temp_dict = prompter(pair[0][0], pair[0][1],
        pair[1][0], pair[1][1])
        sampler.append(temp_dict)

    return sampler

# Build the set
N=100 # choose how many samples to generate
sampler_29 = match_nodes_with_union(N)
# Print information about the sampler set
print(f"There are {len(sampler_29)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_29)
# Display an example for inspection
sampler_29[0]

There are 100 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Return the specifications for Categories combined with the name for DOI, filter the duplicates if any!',
 'Schema': 'Graph schema: Node properties are the following:\nCategories {category_id: STRING, specifications: STRING},DOI {name: STRING, doi_id: STRING}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:HAS_DOI]->(:DOI)',
 'Cypher': 'MATCH (n:Categories) RETURN n.specifications AS Records UNION MATCH (m:DOI) RETURN m.name AS Records'}

In [None]:
# Union of two sets without filtering duplicates
def match_nodes_with_union_all(N):
    def prompter(label_1, prop_1, label_2, prop_2):
        subschema = get_subgraph_schema(jschema, [label_1, label_2], 2, True)
        message = {"Prompt": "Convert the following question into a Cypher query using the provided graph schema!",
                   "Question": f"""Return the {prop_1} for {label_1} combined with the {prop_2} for {label_2}!""",
                   "Schema": f"Graph schema: {subschema}",
                   "Cypher": f"MATCH (n:{label_1}) RETURN n.{prop_1} AS Records UNION ALL MATCH (m:{label_2}) RETURN m.{prop_2} AS Records"
                   }
        return message

    sampler=[]
    for _ in range(N):
        #ninst = random_properties(string_instances)
        pair = random.sample(string_parsed, 2)
        temp_dict = prompter(pair[0][0], pair[0][1],
        pair[1][0], pair[1][1])
        sampler.append(temp_dict)

    return sampler

# Build the set
N=100 # choose how many samples to generate
sampler_30 = match_nodes_with_union_all(N)
# Print information about the sampler set
print(f"There are {len(sampler_30)} queries in this subset.")
# Add to trainer dataset
trainer += collect_samples(sampler_30)
# Display an example for inspection
sampler_30[0]

There are 100 queries in this subset.


{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Return the category_id for Categories combined with the abstract for Article!',
 'Schema': 'Graph schema: Node properties are the following:\nCategories {category_id: STRING, specifications: STRING},Article {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING}\nRelationship properties are the following:\nPUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\nThe relationships are the following:\n(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)',
 'Cypher': 'MATCH (n:Categories) RETURN n.category_id AS Records UNION ALL MATCH (m:Article) RETURN m.abstract AS Records'}

## Data Saving

In [None]:
# Display the number of samples created
len(trainer)

2435

In [None]:
# Save the samples to a json file

trainer_one = 'trainer_one.json'
write_json(trainer, data_path+trainer_one)