# Synthetic Data Generation

## Workspace Setup

In [None]:
!pip install openai -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/222.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m215.0/222.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m222.3/222.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires

In [None]:
# Load the Google Colab helper used to mount Google Drive
from google.colab import drive

# Mount the drive at /content/drive (this will prompt for authorization)
drive.mount('/content/drive')

# Set the working directory to the project folder
# (presumably so that the local `utils` package imported later resolves - TODO confirm)
%cd '/content/drive/MyDrive/cypherGen/'

# Path of the data folder; it ends with a slash because file names are
# appended to it with plain string concatenation
data_path = '/content/drive/MyDrive/cypherGen/datas/'

# Name of the file holding the schema in string form
formatted_schema_file = 'formatted_schema.txt'

# Name of the json file for the query categories
query_category_file = 'qcategories.json'

# Name of the file that stores the generated synthetic data (inside data_path)
synthetic_data_file = 'synthetic_data_4turbo.json'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/arxivKG


In [None]:
import pandas as pd
import random
import itertools
import os
import re
import json

from openai import OpenAI

# Import the local modules
from utils.utilities import *

In [None]:
# Credentials for OpenAI: reuse a key that is already configured in the
# environment; otherwise ask for it interactively, so that the secret is never
# stored in the notebook and an existing key is never overwritten.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Instantiate the OpenAI client (it picks up OPENAI_API_KEY from the environment)
client = OpenAI()

In [None]:
# The graph schema in string format: node properties, relationship properties
# and the relationship patterns of the graph.
# The literal "\n" escapes add a newline on top of the physical line break, so
# each of those lines is followed by an empty line in the resulting string.
# NOTE(review): the DOI and Categories node types share a single line, unlike
# the other node types - confirm whether this is intended.

schema = """Node properties are the following:\n
Article {abstract: STRING, article_id: INTEGER, comments: STRING, title: STRING},
Keyword {name: STRING, key_id: STRING},
Topic {cluster: INTEGER, description: STRING, label: STRING},
Author {author_id: STRING, affiliation: STRING, first_name: STRING, last_name: STRING},
DOI {name: STRING, doi_id: STRING},Categories {category_id: STRING, specifications: STRING},
Report {report_id: STRING, report_no: STRING},
UpdateDate {update_date: DATE},
Journal {name: STRING, journal_id: STRING}\n
Relationship properties are the following:\n
PUBLISHED_IN {meta: STRING, pages: STRING, year: INTEGER}\n
The relationships are the following:\n
(:Article)-[:HAS_KEY]->(:Keyword),
(:Article)-[:HAS_DOI]->(:DOI),
(:Article)-[:HAS_CATEGORY]->(:Categories),
(:Article)-[:WRITTEN_BY]->(:Author),
(:Article)-[:UPDATED]->(:UpdateDate),
(:Article)-[:PUBLISHED_IN]->(:Journal),
(:Article)-[:HAS_REPORT]->(:Report),
(:Keyword)-[:HAS_TOPIC]->(:Topic)
"""

## Categories of Question:Cypher Pairs

In [None]:
# Query categories were extracted with ChatGPT over several tries, then combined and edited.
# Notice the consistency of the format: Description: Questions about... For example, ...
# The prompt below is the one used to ask for category suggestions; it embeds
# the graph schema defined earlier.
# NOTE(review): the prompt asks for 12 categories, while the edited list that
# follows contains 13 entries.

chatgpt_categories_prompt = f"""
You are an experienced and useful Python and Neo4j/Cypher developer.

I have a knowledge graph for which I would like to generate interesting questions
which span 12 categories (or types) about the graph. They should cover single nodes questions,
two or three more nodes, relationships and paths. Please suggest 12
categories together with their short descriptions. Here is the graph schema:
 {schema}
 """

# Edited, enhanced, combined ChatGPT categories.
# Every entry follows the same pattern: "<Name>: Questions about ... For example, "<question>"".
# Each entry is inserted verbatim into the generation prompt, so wording
# mistakes in this list end up in front of the model.

categories = [
'''Article Inquiry: Questions about specific articles, focusing on their attributes, such as abstracts, titles, or comments.
For example, "What is the abstract of the article with the most comments?"''',
'''Keyword Analysis: Questions that delve into keywords, exploring their names, relationships with articles,
and associated topics.
For example, "Which keyword is most common across all articles?"''',
'''Topic Exploration: Questions about topics, their descriptions, associations with keywords and clusters.
For example, "What are the main topics associated with a given keyword?"''',
'''Author Profile: Questions about authors, their affiliations, names, and articles they have written.
For example, "Who are the authors affiliated with 'X University' and what articles have they written?"''',
'''DOI Details: Questions about the Digital Object Identifier (DOI) of articles, including their names and specific IDs.
For example, "List all articles with a DOI containing a specific string or pattern."''',
'''Category Classification: Questions about the categories of articles, their specifications and how articles are classified into these categories.
For example, "What are the different categories of articles published in the last year?"''',
'''Report Details: Questions about reports linked to articles, including report IDs and numbers.
For example, "Which articles are linked to report 'XYZ'?"''',
'''Update Date Timeline: Questions about when articles were updated, looking for patterns of recent updates.
For example, "Which articles were updated in the last month?"''',
'''Journal Publication: Questions about the journals in which articles are published, including journal names and the relationships between journals and articles.
For example, "What are the most common journals for articles on a specific topic?"''',
'''Authorship and Collaboration: Questions about co-authorship and collaboration patterns.
For example, "Which authors have co-authored articles the most?"''',
'''Article-Author Connections: Questions about the relationships between articles and authors,
such as finding articles written by a specific author or authors of a particular article.
For example, "Find all the authors of the article with title 'Explorations of manifolds'"''',
'''Pathfinding and Connectivity: Questions that involve paths between multiple nodes,
such as tracing the relationship path from an article to a topic through keywords, or from an author to a journal through their articles.
For example, "How is the author 'John Doe' connected to the journal 'Nature'?"''',
'''Graph Structure and Patterns: Questions that analyze the overall structure of the graph,
like finding the most connected nodes, or identifying patterns in relationships and node attributes, centrality measures, cluster analysis.
For example, "Which keyword is at the center of the largest cluster of articles?"'''
]

## Helpers and Wrappers

In [None]:
def create_prompt(schema, category, num_questions=40):
    """Build the chat messages that request question/Cypher pairs.

    Args:
        schema: Graph schema in string form; inserted into the user message.
        category: Description of the question category the pairs should cover.
        num_questions: Number of question/Cypher pairs to request. Defaults
            to 40, the value that used to be hard-coded in the prompt.

    Returns:
        A list with two message dicts, ``system`` first and ``user`` second,
        each holding the keys ``role`` and ``content``.
    """
    # The system message explicitly mentions JSON: the request is sent with
    # response_format={"type": "json_object"}, and JSON mode expects the
    # messages to instruct the model to produce JSON.
    system_message = {
        "role": "system",
        "content": "You are an experienced Cypher developer and a helpful assistant designed to output JSON!",
    }
    user_message = {
        "role": "user",
        "content": f"""Generate {num_questions} questions and their corresponding Cypher statements about the Neo4j graph database with the following schema:
        {schema}
        The questions should cover {category} and should be phrased in a natural conversational manner. Make the questions diverse and interesting.
        Make sure to use the latest Cypher version and that all the queries are working Cypher queries for the provided graph. You may add
        values for the node attributes as needed. Do not add any comments, do not label or number the questions.
        """,
    }
    return [system_message, user_message]

In [None]:
def prompt_model(messages, model="gpt-4-1106-preview"):
    """Send the chat messages to the model and return the generated text.

    Args:
        messages: Chat messages (list of ``role``/``content`` dicts).
        model: Name of the chat model to call. Defaults to the GPT-4 Turbo
            preview model that was previously hard-coded.

    Returns:
        The content string of the first choice. JSON mode is requested, but
        the text is only guaranteed to be complete JSON when the generation
        was not cut off; a warning is issued in that case.
    """
    import warnings

    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=messages)
    choice = response.choices[0]
    # A finish_reason of "length" means the token limit was reached, so the
    # returned JSON is most likely incomplete and will not parse.
    if choice.finish_reason == "length":
        warnings.warn(
            "The generation stopped at the token limit; the returned JSON is probably incomplete.",
            stacklevel=2,
        )
    return choice.message.content

In [None]:
def build_synthetic_data(schema, categories):
    """Generate one raw model output per category.

    For every category a prompt is built from the schema and the category
    description, the model is queried once, and its answer is collected.

    Returns:
        A list with the model outputs, in the same order as ``categories``.
    """
    return [
        prompt_model(create_prompt(schema, category))
        for category in categories
    ]

## Collect the Data

In [None]:
# Generate 40 pairs for each of the categories
full_output = build_synthetic_data(schema, categories)

In [None]:
# Save the outputs to a file
write_json(full_output, data_path + synthetic_data_file)

### Clean and Format the Data

In [None]:
# Read the generated data back, using the same file name variable that the
# save step uses so that the two cannot drift apart
synthetic = read_json(data_path + synthetic_data_file)

In [None]:
# Replace each raw JSON string with the dictionary it encodes.
# Entries that are not valid JSON (for example a generation that was cut off)
# are left untouched and reported, so that they can be repaired by hand.
unparsed_indices = []
for i, entry in enumerate(synthetic):
    if not isinstance(entry, str):
        # Already parsed: this keeps the cell safe to run more than once.
        continue
    try:
        synthetic[i] = json.loads(entry)
    except json.JSONDecodeError:
        unparsed_indices.append(i)

print(f"Entries left unparsed: {unparsed_indices}")


#### Comments:

- Some individual corrections might be needed. For example, one of my entries, `synthetic[9]`, ended in an unfinished Cypher statement, so it was not a valid JSON object and I had to complete it manually.
- Not all the entries are dictionaries with the keys `question` and `cypher`, so some keys have to be renamed.
- In a couple of cases the questions were grouped together in one list and the Cypher statements in another, so they have to be paired up.

### Correct the remaining entries

In [None]:
# List to collect the entries
parsed = []

In [None]:
# Some categories need extra data cleaning: the model used a 'query' key,
# but with a different meaning in each of them. The cleaned lists are built
# without modifying `synthetic`, so this cell can be run more than once.

# Category 7: 'query' holds the Cypher statement
syn7 = [{'question': d['question'], 'cypher': d['query']}
        for d in synthetic[7]['questions']]

# Category 3: 'query' holds the natural-language question
syn3 = [{'question': d['query'], 'cypher': d['cypher']}
        for d in synthetic[3]['questions']]


In [None]:
# In these two categories the questions and the Cypher statements came back as
# two parallel lists, so they are paired up position by position.
syn5 = [
    dict(question=question, cypher=statement)
    for question, statement in zip(synthetic[5]['questions'], synthetic[5]['cypher'])
]
syn8 = [
    dict(question=question, cypher=statement)
    for question, statement in zip(synthetic[8]['questions'], synthetic[8]['cypher_statements'])
]

In [None]:
# Add the categories that required extra data cleaning
# NOTE(review): `syn9` is not defined in any of the visible cells. Presumably it
# is the manually completed and parsed version of `synthetic[9]` - it has to be
# defined before this cell runs, otherwise a NameError is raised.
parsed+=syn3+syn5+syn7+syn8+syn9['questions']

In [None]:
# Category is split into several keys
parsed+=synthetic[0]['questions']+synthetic[0]['additional_questions']+synthetic[0]['more_questions']

In [None]:
# Add the generated categories with key 'questions_and_cypher'
parsed+=synthetic[1]['questions_and_cypher']+synthetic[6]['questions_and_cypher']

In [None]:
# Add the generated categories with key 'questions'
for i in [2, 4, 10, 11, 12]:
    parsed+=synthetic[i]['questions']

In [None]:
# Check how many samples are collected
len(parsed)

494

In [None]:
# Change all keys to conform with the rest of the training data:
# every sample gets the fixed instruction, the question, the schema and the Cypher query.
subschema = schema
new_parsed = [
    {
        'Prompt': "Convert the following question into a Cypher query using the provided graph schema!",
        'Question': d['question'],
        'Schema': f"Graph schema: {subschema}",
        'Cypher': d['cypher'],
    }
    for d in parsed
]

In [None]:
# Save the data to a file
write_json(new_parsed, data_path+"parsed_synthetic.json")

# Extra Adjustments and Data Cleaning

This section can be processed independently from the previous cells.

At this point the data is in a format that could be used for fine-tuning. However, if we want to be thorough, we should also check the quality of the Cypher queries. We can easily do this by running each of these queries against the Neo4j graph database.

In [None]:
%pip install neo4j
%pip install python-levenshtein

In [None]:
# Load and mount the drive helper
from google.colab import drive

# This will prompt for authorization
drive.mount('/content/drive')

# Set the working directory
%cd '/content/drive/MyDrive/cypherGen/'

Mounted at /content/drive
/content/drive/MyDrive/cypherGen


In [None]:
# Necessary imports
import neo4j
import pandas as pd

# Import the local modules
from utils.utilities import *
from utils.neo4j_conn import *

# Neo4j graph database credentials
# The password is read from the NEO4J_PASSWORD environment variable when it is
# set; otherwise it is requested interactively, so that the secret is never
# stored in the notebook.
import os
from getpass import getpass

URI = 'neo4j+s://xxxxxxxx.databases.neo4j.io'
USER = 'neo4j'
PWD = os.environ.get('NEO4J_PASSWORD') or getpass('Neo4j password: ')

# Initialize the Neo4j connector and utilities modules
graph=Neo4jGraph(url=URI, username=USER, password=PWD)

In [None]:
# Create a path variable for the data folder
data_path = '/content/drive/MyDrive/cypherGen/datas/'

# Read the data from the file
data = read_json(data_path+"parsed_synthetic.json")

In [None]:
# Check the data
pd_data = pd.DataFrame(data)
pd_data.head(4)

Unnamed: 0,Prompt,Question,Schema,Cypher
0,Convert the following question into a Cypher q...,Who are the authors affiliated with 'X Univers...,Graph schema: Node properties are the followin...,MATCH (a:Author {affiliation:'X University'}) ...
1,Convert the following question into a Cypher q...,What articles have been written by 'John Doe'?,Graph schema: Node properties are the followin...,"MATCH (a:Author {first_name:'John', last_name:..."
2,Convert the following question into a Cypher q...,Can you list the affiliations of 'Jane Smith'?,Graph schema: Node properties are the followin...,"MATCH (a:Author {first_name:'Jane', last_name:..."
3,Convert the following question into a Cypher q...,Which authors from 'Y Institute' have publishe...,Graph schema: Node properties are the followin...,MATCH (a:Author {affiliation:'Y Institute'})-[...


In [None]:
# Create a list of Cypher queries
cypher_list = pd_data.Cypher.tolist()
# Check for success
cypher_list[1]

"MATCH (a:Author {first_name:'John', last_name:'Doe'})-[:WRITTEN_BY]-(article:Article) RETURN article.title, article.article_id;"

In [None]:
# Required Neo4j exceptions imports
from neo4j.exceptions import Neo4jError, CypherTypeError

def execute_cypher_queries(queries):
    """Run every query against the graph and collect one outcome per query.

    An outcome is either the value returned by ``graph.query`` or a string
    describing the failure: 'type error', 'faulty query' (syntax error),
    'other Neo4j error: ...' or 'other error: ...'.

    Returns:
        A list of outcomes in the same order as ``queries``.
    """

    def _outcome(statement):
        # Run a single statement; translate any failure into its label.
        try:
            return graph.query(statement)
        except CypherTypeError:
            # Must be handled before the more general Neo4jError
            return 'type error'
        except Neo4jError as err:
            # Syntax errors are recognised by their text
            if 'SyntaxError' in str(err):
                return 'faulty query'
            return f'other Neo4j error: {str(err)}'
        except Exception as err:
            # Anything that is not a Neo4j error
            return f'other error: {str(err)}'

    return [_outcome(statement) for statement in queries]


In [None]:
# It is advisable to process the queries in batches

def process_in_batches(queries, batch_size, executor=None):
    """Run the queries batch by batch and pair each result with its index.

    Args:
        queries: Sequence of Cypher query strings.
        batch_size: Number of queries handed to the executor per call; must
            be at least 1.
        executor: Optional callable that takes a list of queries and returns
            one result per query, in order. Defaults to
            ``execute_cypher_queries``.

    Returns:
        A list of ``(index, result)`` tuples, where ``index`` is the position
        of the query in ``queries`` and ``result`` is the outcome of that
        single query.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if executor is None:
        executor = execute_cypher_queries

    results = []

    # Process the queries in batches.
    for start in range(0, len(queries), batch_size):
        batch = queries[start:start + batch_size]

        # Execute the batch exactly once. Running it once per query would
        # repeat every query len(batch) times and attach the results of the
        # whole batch to each index.
        batch_results = executor(batch)

        # Record each result with its original index (start + offset).
        results.extend(enumerate(batch_results, start=start))

    return results

In [None]:
# Check the Neo4j connection
graph.query("MATCH (n) RETURN count(n)")

[{'count(n)': 38650}]

In [None]:
# Sample output
output = execute_cypher_queries(cypher_list[:10])
output

[[], [], [], [], [], [{'count(article)': 0}], [], [], 'type error', []]

In [None]:
cypher_list[:10]

["MATCH (a:Author {affiliation:'X University'}) RETURN a.first_name, a.last_name, a.author_id;",
 "MATCH (a:Author {first_name:'John', last_name:'Doe'})-[:WRITTEN_BY]-(article:Article) RETURN article.title, article.article_id;",
 "MATCH (a:Author {first_name:'Jane', last_name:'Smith'}) RETURN a.affiliation;",
 "MATCH (a:Author {affiliation:'Y Institute'})-[:WRITTEN_BY]-(article:Article)-[:PUBLISHED_IN]-(j:Journal {name: 'Journal of Science'}) RETURN a.first_name, a.last_name, article.title;",
 "MATCH (a:Author {first_name:'Alice', last_name:'Johnson'})-[:WRITTEN_BY]-(article:Article)-[:PUBLISHED_IN]-(journal:Journal) RETURN article.title, article.article_id, journal.year;",
 "MATCH (a:Author {affiliation:'Z College'})-[:WRITTEN_BY]-(article:Article) RETURN count(article);",
 "MATCH (a:Author {first_name:'Emma', last_name:'White'})-[:WRITTEN_BY]-(article:Article)-[:HAS_KEY]-(keyword:Keyword) RETURN article.title, keyword.name;",
 "MATCH (topic:Topic {label: 'Artificial Intelligence'})<-

In [None]:
cypher_list[9]

'MATCH (author:Author)-[:WRITTEN_BY]-(article:Article)-[:UPDATED]-(updateDate:UpdateDate) WHERE article.creation_date = updateDate.update_date RETURN DISTINCT author.first_name, author.last_name;'