# Preparation -- Library Import & API

In [1]:
from langchain.llms import HuggingFaceHub
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.chains import RetrievalQA, LLMChain
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig, pipeline
from langchain.prompts import ChatPromptTemplate
from langchain import embeddings
from langchain import text_splitter as ts
from langchain import vectorstores as vs
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.runnable import RunnableParallel
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from langchain_openai import OpenAIEmbeddings,ChatOpenAI
from operator import itemgetter
from requests.exceptions import HTTPError
import torch
from transformers import pipeline
from langchain import document_loaders as dl
import re
import time
import json
import nest_asyncio
nest_asyncio.apply()
import datetime
import pytz
import string
import os
from getpass import getpass
import ast




Free Open Source LLM

In [32]:
# Ask for the Hugging Face token interactively so it is never stored in the notebook,
# then expose it through the environment variable that is set for the Hub client.
HF_TOKEN = getpass("HF Token:")
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_TOKEN
# SECURITY: a literal token used to be pasted in a comment on this line. Any token that
# was ever committed or shared that way must be treated as leaked and revoked.

HF Token:········


In [29]:
#Define embedding models
embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5"
)

#Large Language Model Setup
llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-alpha",
    model_kwargs={"temperature": 0.5, "max_length": 8192,"max_new_tokens":2048}
)

llm = HuggingFaceHub(
    repo_id="huggingfaceh4/zephyr-7b-beta",
    model_kwargs={"temperature": 0.5, "max_length": 8192,"max_new_tokens":2048}
)

OpenAI

In [2]:
# Read the OpenAI key from a hidden prompt and publish it as OPENAI_API_KEY
# in this process's environment.
os.environ.update({'OPENAI_API_KEY': getpass("Enter the key:")})

Enter the key:········


In [3]:
#Define embedding models
embeddings=OpenAIEmbeddings()
#Large Language Model Setup 
llm = ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0.5)

# Step 1 Document input

In [4]:
# Location of the input PDF. The original machine-specific path stays the default so
# existing runs are unaffected; set the THESIS_INPUT_DOC environment variable to point
# the notebook at the file on another machine without editing this cell.
document_path = os.environ.get(
    "THESIS_INPUT_DOC",
    r"C:\Users\y1kel\Documents\Master IS\Thesis\Dataset\Thesis_input_doc.pdf",
)

# Load the PDF; `content` is whatever PyPDFLoader.load() returns and is what the
# splitting step consumes.
loader = dl.PyPDFLoader(document_path)
content = loader.load()

# Step 2 Split & Step 3 Embeddings 

In [5]:
# Step 2: cut the loaded document into overlapping chunks
# (1024 characters per chunk, 128 characters shared between neighbours).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=128,
    chunk_size=1024,
)
chunks = text_splitter.split_documents(content)

# Step 3: embed every chunk and index it in a Chroma vector store.
vectorstore = Chroma.from_documents(chunks, embeddings)

# Retriever over the store: MMR search ("similarity" is the alternative),
# returning 10 chunks per query.
retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=10),
    search_type="mmr",
)

# Step 4 RAG Chain

In [7]:
# System prompt.
# The chain below supplies two values, "context" (retrieved chunks) and "query".
# The template previously had no {context} placeholder, so the retrieved text was
# dropped and the model answered from the question alone.
# The final "Assistant" line is kept verbatim: the pipeline cells cut the model's echoed
# output at the first occurrence of that word. Retrieved context that itself contains
# the capitalised word "Assistant" would therefore move that cut point.
template = """
 <|system|>
You are an AI assistant that follows instruction extremely well. Please be truthful, give direct and complete answers instead of examples. If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use the following context retrieved from the source document when answering:
{context}
</s>
 <|user|>
 {query}
 </s>
Assistant
"""


def format_docs(docs):
    """Join the `page_content` of the retrieved documents into one text block.

    Args:
        docs: iterable of retrieved documents exposing a `page_content` attribute.

    Returns:
        The page contents separated by blank lines ("" for an empty iterable).
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Prompt template setup
prompt = ChatPromptTemplate.from_template(template)

# RAG chain setup: retrieve -> flatten chunks to text -> fill prompt -> generate -> plain string
rag_chain = (
    {"context": retriever | format_docs, "query": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Query prompt

NB: This cell is not meant to be run on its own, because its prompts depend on outputs produced by other steps (for example `ResultQ1`, `ResultQ5`, `ResultQ6` and `term_answers`).

In [None]:
# Reference copy of the five prompts used by the pipeline. The f-strings below read
# `term`, `definition`, `ResultQ1`, `ResultQ5`, `ResultQ6` and `term_answers`, none of
# which is defined in this cell, so evaluating it in isolation raises NameError.

# Query1 (Step 5.1 Domain glossary). Usage: ResultQ1 = rag_chain.invoke(QueryQ1)
# Plain (non-f) string, so `{Entity: Definition}` is sent literally.
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."

# Query234 (Step 5.2 Synonymy, Taxonomy, and Predication), built once per lexicon entry.
# Usage: ResultQ234 = rag_chain.invoke(QueryQ234)
# NOTE(review): the `""""` before `{{Synonyms` closes the f-string after its first three
# quotes. The example dictionary is therefore a separate plain string literal (its doubled
# braces are NOT collapsed to single braces), and the text from the next `""""` onward is
# a third, non-f literal that starts with a stray `"`. The three literals are joined by
# implicit concatenation.
QueryQ234 = f"""We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity."""

# Query5 (Step 5.3 Parthood). Usage: ResultQ5 = rag_chain.invoke(QueryQ5)
# Embeds the whole Step 5.1 result (`ResultQ1`) as the list of input entities.
QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please mapping the relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """ 
        
# Query6 (Step 5.4 Ontology Schema). Usage: ResultQ6 = rag_chain.invoke(QueryQ6)
# Embeds the parthood answer (`ResultQ5`) and asks for it in Turtle syntax.
QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
<http://example.org/person/Mark_Twain>
<http://example.org/relation/author>
<http://example.org/books/Huckleberry_Finn> . """

# Query7 (Step 5.7 Finalize Ontology). Usage: ResultQ7 = rag_chain.invoke(QueryQ7)
# Embeds the per-term answers (`term_answers`) and the Turtle draft (`ResultQ6`).
QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
<http://example.org/person/Mark_Twain>
<http://example.org/relation/author>
<http://example.org/books/Huckleberry_Finn>."""

# Step 5 Pipeline

## Final Pipeline zephyr-7b-alpha with Open AI Embeddings 
llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-alpha",
    model_kwargs={"temperature": 0.5, "max_length": 100000,"max_new_tokens":2048}
)

In [95]:
# ---------------------------------------------------------------------------
# Step 5 pipeline (zephyr-7b-alpha).
# Stages: 5.1 lexicon -> 5.2 synonyms/taxonomy/predication per term -> 5.3 parthood
#         -> 5.4 Turtle draft -> 5.5 final Turtle. Each stage consumes the previous
#         stage's answer, so a stage is skipped when its input is missing.
# ---------------------------------------------------------------------------

MAX_ATTEMPTS = 3           # tries per query before giving up on that stage
RETRY_DELAY_SECONDS = 2    # base pause between failed attempts (multiplied by attempt no.)
keyword = "Assistant"      # the prompt template ends with this word; the answer follows it

# --- Run timestamp (Amsterdam local time) -----------------------------------
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Use an *aware* UTC timestamp. The naive value returned by datetime.utcnow() would be
# interpreted as local time by astimezone(), giving a wrong result on any machine whose
# clock is not set to UTC.
utc_now = datetime.datetime.now(datetime.timezone.utc)
amsterdam_now = utc_now.astimezone(amsterdam_tz)
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

# --- Model, retriever and chain ----------------------------------------------
llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-alpha",
    model_kwargs={"temperature": 0.5, "max_length":8192,"max_new_tokens":2048}
)

retriever = vectorstore.as_retriever(
    search_type="mmr", #similarity
    search_kwargs={'k': 10} #Retrieved documents
)


def _chunks_to_text(documents):
    """Join the `page_content` of retrieved documents, separated by blank lines."""
    return "\n\n".join(doc.page_content for doc in documents)


# Retrieved chunks are flattened to text so that a prompt template with a {context}
# slot receives readable passages instead of a list repr. NOTE(review): a template
# without that slot is expected to ignore the extra value, as it did before this change.
rag_chain = (
    {"context": retriever | _chunks_to_text, "query": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


# --- Helpers -------------------------------------------------------------------
def extract_answer(raw_output, marker=keyword):
    """Return the text that follows the first `marker` in `raw_output`, stripped.

    The model output echoes the prompt; everything up to and including the marker is
    discarded. Returns "" when the marker does not occur.
    """
    _, found, tail = raw_output.partition(marker)
    return tail.strip() if found else ""


def parse_lexicon(answer_text):
    """Parse the Step 5.1 answer as a Python literal.

    Returns the parsed object, or None (after printing the problem and the raw text)
    when the answer is empty or not a valid literal.
    """
    try:
        return ast.literal_eval(answer_text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
        print(f"Query1 answer could not be parsed as a Python literal: {e}")
        print(f"Raw Query1 answer:{answer_text}")
        return None


def ask_with_retry(query, label, hint=""):
    """Run `query` through `rag_chain`, retrying on any exception.

    Args:
        query: prompt text passed to `rag_chain.invoke`.
        label: name of the stage, used in the failure message.
        hint: optional text appended to the failure message.

    Returns:
        The extracted answer, or None when all MAX_ATTEMPTS attempts raised.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            return extract_answer(rag_chain.invoke(query))
        except Exception as e:  # best effort: remote inference can fail transiently
            if attempt == MAX_ATTEMPTS:
                print(f"Error occurred in {label} after {MAX_ATTEMPTS} attempts: {e}{hint}")
            else:
                time.sleep(RETRY_DELAY_SECONDS * attempt)
    return None


def build_query_q234(term, definition):
    """Prompt for Step 5.2 (synonyms, taxonomy, predication) of one lexicon entry.

    The whole prompt is a single f-string. The earlier version closed the f-string with
    a run of four quotes, which left doubled braces and a stray quote in the text sent
    to the model.
    """
    return f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: {{Synonyms:[];Taxonomy:[];Predication:[]}}.
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """


def build_query_q5(lexicon):
    """Prompt for Step 5.3 (parthood) over the whole Step 5.1 lexicon."""
    return f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{lexicon}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """


def build_query_q6(parthood_answer):
    """Prompt for Step 5.4: express the parthood answer in Turtle syntax."""
    return f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{parthood_answer}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """


def build_query_q7(answers_by_term, turtle_draft):
    """Prompt for Step 5.5: merge the per-term answers into the Turtle draft."""
    return f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{answers_by_term}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{turtle_draft}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""


# --- Step 5.1 Domain terminology ---------------------------------------------
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
# A failure of this first call is not retried: nothing downstream can run without it.
ResultQ1 = parse_lexicon(extract_answer(rag_chain.invoke(QueryQ1)))
print(f"ResultQ1:{ResultQ1}")

# Start every downstream result from a known state so a failed stage can neither raise
# NameError later nor silently reuse a value left over from a previous run of this cell.
term_answers = {}
ResultQ5 = ResultQ6 = ResultQ7 = None

# --- Step 5.2 Synonymy, Taxonomy and Predication, one query per term ---------
if isinstance(ResultQ1, dict):
    for term, definition in ResultQ1.items():
        QueryQ234 = build_query_q234(term, definition)
        ResultQ234 = ask_with_retry(QueryQ234, f"Query 2-4 for term '{term}'")
        if ResultQ234 is not None:  # terms whose query kept failing are left out
            term_answers[term] = ResultQ234
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# --- Step 5.3 Parthood, based on the Step 5.1 result -------------------------
if ResultQ1 is not None:
    QueryQ5 = build_query_q5(ResultQ1)
    ResultQ5 = ask_with_retry(QueryQ5, "Query 5")
    if ResultQ5 is not None:
        print(f"ResultQ5:{ResultQ5}")
else:
    print("Skipping Query 5: ResultQ1 is not available")

# --- Step 5.4 Turtle draft, based on the Step 5.3 result ---------------------
if ResultQ5 is not None:
    QueryQ6 = build_query_q6(ResultQ5)
    ResultQ6 = ask_with_retry(
        QueryQ6,
        "Query 6",
        hint=", please pass the QueryQ6 to the ChatGPT for further prompting",
    )
    if ResultQ6 is not None:
        print(f"ResultQ6:{ResultQ6}")
else:
    print("Skipping Query 6: ResultQ5 is not available")

# --- Step 5.5 Final Turtle, based on the Step 5.2 and Step 5.4 results -------
if ResultQ6 is not None:
    QueryQ7 = build_query_q7(term_answers, ResultQ6)
    ResultQ7 = ask_with_retry(
        QueryQ7,
        "Query 7",
        hint=f", please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}",
    )
    if ResultQ7 is not None:
        print(f"ResultQ7:{ResultQ7}")
else:
    print("Error occurred: ResultQ6 is undefinied")

# --- Summary and persisted outputs -------------------------------------------
print(f"ResultQ234:{term_answers}")
# NOTE(review): this prints the repr of the client objects; confirm that it does not
# include API credentials before sharing the notebook output.
print(f"LLM:{llm} and embedding model:{embeddings}")

for result_path, result_text in (
    ("zephyr-7b-alpha_result_Q6.txt", ResultQ6),
    ("zephyr-7b-alpha_result_Q7.txt", ResultQ7),
):
    if result_text is None:
        print(f"Skipping {result_path}: no result was produced")
        continue
    # Explicit UTF-8 so the platform default encoding cannot reject non-ASCII
    # characters in the generated text.
    with open(result_path, "w", encoding="utf-8") as file:
        file.write(str(result_text))

2024-03-31 19:53:32
ResultQ1:{'Pipe': 'A closed conduit for conveying fluids, typically made of metal, plastic, or concrete.', 'Reservoir': 'A large container for storing water, typically used to regulate water supply and pressure.', 'Pump': 'A mechanical device that increases the pressure and flow rate of water by using rotary or reciprocating motion.', 'Valve': 'A device used to control the flow of water by opening, closing, or regulating it.', 'Tank': 'A container for storing water, typically used for distribution purposes or as a backup for emergencies.', 'Pressure Reducing Valve (PRV)': 'A valve used to reduce the pressure of water for residential or commercial purposes.', 'Meter': 'A device used to measure the amount of water consumed by a customer.', 'Fire Hydrant': 'A device used to provide water for firefighting purposes, typically installed at strategic locations in urban areas.', 'Manhole': 'A structure used to provide access to the water distribution network for maintenance

In [104]:
# Format the Amsterdam time as a string
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Get the current time in UTC
utc_now = datetime.datetime.utcnow()
# Convert UTC time to Amsterdam time
amsterdam_now = utc_now.astimezone(amsterdam_tz)
# Format the Amsterdam time as a string
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-alpha",
    model_kwargs={"temperature": 0.5, "max_length":8192,"max_new_tokens":2048}
)

retriever = vectorstore.as_retriever(
    search_type="mmr", #similarity
    search_kwargs={'k': 10} #Retrived documents
)

rag_chain = (
    {"context": retriever,  "query": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
# Step 5.1 Domain terminology
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
ResultQ1 = rag_chain.invoke(QueryQ1)

# Excluding the prompt part from the answer. If the keyword is found, return the text after the keyword, otherwise return an empty string. 
keyword = "Assistant"
ResultQ1 = ResultQ1.split(keyword, 1)  # Split only once
ResultQ1 = ResultQ1[1].strip() if len(ResultQ1) > 1 else ""
ResultQ1 = ast.literal_eval(json.dumps(ResultQ1))
ResultQ1= ast.literal_eval(ResultQ1)
print(f"ResultQ1:{ResultQ1}")
        
# Condition Check if the Result 5.1 is a dictionary
if isinstance(ResultQ1, dict):
    # Create the dictionary for the answers from Query234
    term_answers = {}
    for term, definition in ResultQ1.items():
        retry_count_Q234 = 0
        while retry_count_Q234 < 3:
            # Step 5.2 Retrieve answers of Synonymy, Taxonomy, and Predication for each term
            QueryQ234 = f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """
            try:
                ResultQ234 = rag_chain.invoke(QueryQ234)
                ResultQ234 = ResultQ234.split(keyword, 1)
                ResultQ234 = ResultQ234[1].strip() if len(ResultQ234) > 1 else ""
                term_answers[term] = ResultQ234
                break  # Exit the loop if the query succeeds
            except Exception as e: # Except of Step 5.2: Retrieve fails three times
                retry_count_Q234+=1
                if retry_count_Q234 == 3:
                    print(f"Error occurred in Query 6 after 3 attempts: {e}")
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# Step 5.3: Retrieve answers based on the Result 5.1
retry_count_Q5 = 0
while retry_count_Q5 < 3:
    try:
        QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """ 
        ResultQ5 = rag_chain.invoke(QueryQ5)   
        # Excluding the prompt part from the answer. If the keyword is found, return the text after the keyword, otherwise return an empty string. 
        ResultQ5 = ResultQ5.split(keyword, 1)
        ResultQ5 = ResultQ5[1].strip() if len(ResultQ5) > 1 else ""
        print(f"ResultQ5:{ResultQ5}")
        break
# Except of Step 5.3: Retrieve fails three times
    except Exception as e:
        retry_count_Q5+=1
        if retry_count_Q5 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}")

retry_count_Q6 = 0
while retry_count_Q6 < 3:
# Step 5.4: Retrieve answers based on the Result 5.3 
    try:
        QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """
        ResultQ6 = rag_chain.invoke(QueryQ6)
        ResultQ6 = ResultQ6.split(keyword, 1)
        ResultQ6 = ResultQ6[1].strip() if len(ResultQ6) > 1 else ""
        print(f"ResultQ6:{ResultQ6}")
        break
    # Except of Step 5.4: Retrieve fails 
    except Exception as e:
        retry_count_Q6 +=1
        if retry_count_Q6 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}, please pass the QueryQ6 to the ChatGPT for further prompting")
    
    
# Step 5.5 is only attempted when Step 5.4 produced a result (ResultQ6 is not None).
# ResultQ7 is reset first so that later code never sees an undefined name or a value
# left over from a previous run of this cell.
ResultQ7 = None
if ResultQ6 is not None:
    retry_count_Q7 = 0
    while retry_count_Q7 < 3:
        # Step 5.5: Retrieve answers based on the Result 5.4
        try:
            QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""
            # Keep only the text after the keyword (the model's answer).
            answer_parts = rag_chain.invoke(QueryQ7).split(keyword, 1)
            ResultQ7 = answer_parts[1].strip() if len(answer_parts) > 1 else ""
            print(f"ResultQ7:{ResultQ7}")
            break
        # Except of Step 5.5: retry, and report once the third attempt has failed
        except Exception as e:
            retry_count_Q7 += 1
            if retry_count_Q7 == 3:
                # The message used to contain `{term:answers}`, which Python reads as
                # "format `term` with spec 'answers'" and which raised ValueError from
                # inside this handler. Query 7 is also not tied to a single term, so
                # the leftover loop variable `term` is no longer reported.
                print(f"Error occurred in Query 7 after 3 attempts: {e}, please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}")
else:
    print("Error occurred: ResultQ6 is undefinied")
        
# Report the per-term answers and the models used for this run.
print(f"ResultQ234:{term_answers}")
# NOTE(review): this prints the full repr of the LLM and embedding objects; confirm
# that the repr does not include the API token before sharing the notebook output.
print(f"LLM:{llm} and embedding model:{embeddings}")

# Persist both Turtle drafts. The encoding is pinned to UTF-8: with the platform
# default, model output containing non-ASCII characters can raise
# UnicodeEncodeError or produce a file that Turtle tooling cannot read.
with open("zephyr-7b-alpha_result_Q6.ttl", "w", encoding="utf-8") as file:
    file.write(str(ResultQ6))

with open("zephyr-7b-alpha_result_Q7.ttl", "w", encoding="utf-8") as file:
    file.write(str(ResultQ7))

2024-03-31 20:18:34
ResultQ1:{'Pipe': 'A closed conduit for conveying fluids, typically made of metal, plastic, or concrete.', 'Reservoir': 'A large container for storing water, typically used to regulate water supply and pressure.', 'Pump': 'A mechanical device that increases the pressure and flow rate of water by using rotary or reciprocating motion.', 'Valve': 'A device used to control the flow of water by opening, closing, or regulating it.', 'Tank': 'A container for storing water, typically used for distribution purposes or as a backup for emergencies.', 'Pressure Reducing Valve (PRV)': 'A valve used to reduce the pressure of water for residential or commercial purposes.', 'Meter': 'A device used to measure the amount of water consumed by a customer.', 'Fire Hydrant': 'A device used to provide water for firefighting purposes, typically installed at strategic locations in urban areas.', 'Manhole': 'A structure used to provide access to the water distribution network for maintenance

Test Results

In [92]:
# Timestamp this run in Amsterdam local time so the printed results can be dated.
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Get the current time as a timezone-AWARE UTC datetime. utcnow() returns a naive
# datetime, and astimezone() treats a naive datetime as system-local time, so the
# conversion below was only correct on machines whose clock runs in UTC.
utc_now = datetime.datetime.now(datetime.timezone.utc)
# Convert UTC time to Amsterdam time
amsterdam_now = utc_now.astimezone(amsterdam_tz)
# Format the Amsterdam time as a string
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

# Hosted LLM used for generation in this experiment.
llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-alpha",
    model_kwargs=dict(temperature=0.5, max_length=8192, max_new_tokens=2048),
)

# Retriever over the existing vector store. "mmr" is the active search type
# ("similarity" is the alternative the author noted); k is set to 10.
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs=dict(k=10))

# RAG pipeline: the retriever fills "context", the raw input is passed through as
# "query", and the result flows through the prompt, the LLM and a string parser.
rag_chain = {"context": retriever, "query": RunnablePassthrough()} | prompt | llm | StrOutputParser()
# Step 5.1 Domain terminology
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
ResultQ1 = rag_chain.invoke(QueryQ1)

# Excluding the prompt part from the answer. If the keyword is found, return the text after the keyword, otherwise return an empty string.
keyword = "Assistant"
ResultQ1 = ResultQ1.split(keyword, 1)  # Split only once
ResultQ1 = ResultQ1[1].strip() if len(ResultQ1) > 1 else ""
# Parse the answer as a Python literal (the expected form is a dict). The former
# json.dumps -> literal_eval round trip only reproduced the same string (and could
# mangle characters outside the Basic Multilingual Plane), so it is dropped.
# A parse failure (empty or non-literal answer) no longer aborts the cell: the raw
# string is kept so that the dictionary check that follows can report it.
try:
    ResultQ1 = ast.literal_eval(ResultQ1)
except (ValueError, TypeError, SyntaxError) as e:
    print(f"Could not parse the answer of Query 1 as a Python literal: {e}")
print(f"ResultQ1:{ResultQ1}")
        
# Create the dictionary for the answers from Query234 up front, so that it is always
# defined for the later steps and never carries answers over from an earlier run.
term_answers = {}
# Condition Check if the Result 5.1 is a dictionary
if isinstance(ResultQ1, dict):
    for term, definition in ResultQ1.items():
        # Step 5.2 Retrieve answers of Synonymy, Taxonomy, and Predication for each term.
        # The prompt does not change between retries, so it is built once per term.
        # NOTE(review): the four quotes `""""` below end this f-string after the first three;
        # the format example and the step descriptions are then separate plain literals
        # joined by implicit concatenation, so the example appears to reach the model
        # with doubled braces `{{...}}` and without surrounding quotes — confirm intent.
        QueryQ234 = f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """
        retry_count_Q234 = 0
        while retry_count_Q234 < 3:
            try:
                # Keep only the text after the keyword (the model's answer).
                answer_parts = rag_chain.invoke(QueryQ234).split(keyword, 1)
                term_answers[term] = answer_parts[1].strip() if len(answer_parts) > 1 else ""
                break  # Exit the loop if the query succeeds
            except Exception as e:  # Except of Step 5.2: retry up to three times
                retry_count_Q234 += 1
                if retry_count_Q234 == 3:
                    # Name the query and term that failed (this used to say "Query 6").
                    print(f"Error occurred in Query 2-4 for term '{term}' after 3 attempts: {e}")
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# Step 5.3: Retrieve answers based on the Result 5.1
# Start from the "no answer" value (an empty string, the same value used when the
# keyword is missing), so that a run in which every attempt fails does not leave
# ResultQ5 undefined or holding the answer of an earlier run.
ResultQ5 = ""
retry_count_Q5 = 0
while retry_count_Q5 < 3:
    try:
        QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """
        # Excluding the prompt part from the answer. If the keyword is found, keep the
        # text after the keyword, otherwise keep an empty string.
        answer_parts = rag_chain.invoke(QueryQ5).split(keyword, 1)
        ResultQ5 = answer_parts[1].strip() if len(answer_parts) > 1 else ""
        print(f"ResultQ5:{ResultQ5}")
        break
    # Except of Step 5.3: retry, and report once the third attempt has failed
    except Exception as e:
        retry_count_Q5 += 1
        if retry_count_Q5 == 3:
            # Report the query that actually failed (this used to say "Query 6").
            print(f"Error occurred in Query 5 after 3 attempts: {e}")

# Step 5.4: Retrieve answers based on the Result 5.3
# Reset the result before trying: the next step tests `ResultQ6 is not None`, which
# would raise NameError on a fresh kernel (or silently reuse the value left behind
# by an earlier run of this cell) if all three attempts below failed.
ResultQ6 = None
retry_count_Q6 = 0
while retry_count_Q6 < 3:
    try:
        QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """
        # Keep only the text after the keyword (the model's answer); an answer
        # without the keyword is recorded as an empty string.
        answer_parts = rag_chain.invoke(QueryQ6).split(keyword, 1)
        ResultQ6 = answer_parts[1].strip() if len(answer_parts) > 1 else ""
        print(f"ResultQ6:{ResultQ6}")
        break
    # Except of Step 5.4: retry, and report once the third attempt has failed
    except Exception as e:
        retry_count_Q6 += 1
        if retry_count_Q6 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}, please pass the QueryQ6 to the ChatGPT for further prompting")
    
    
# Step 5.5 is only attempted when Step 5.4 produced a result (ResultQ6 is not None).
# ResultQ7 is reset first so that later code never sees an undefined name or a value
# left over from a previous run of this cell.
ResultQ7 = None
if ResultQ6 is not None:
    retry_count_Q7 = 0
    while retry_count_Q7 < 3:
        # Step 5.5: Retrieve answers based on the Result 5.4
        try:
            QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""
            # Keep only the text after the keyword (the model's answer).
            answer_parts = rag_chain.invoke(QueryQ7).split(keyword, 1)
            ResultQ7 = answer_parts[1].strip() if len(answer_parts) > 1 else ""
            print(f"ResultQ7:{ResultQ7}")
            break
        # Except of Step 5.5: retry, and report once the third attempt has failed
        except Exception as e:
            retry_count_Q7 += 1
            if retry_count_Q7 == 3:
                # The message used to contain `{term:answers}`, which Python reads as
                # "format `term` with spec 'answers'" and which raised ValueError from
                # inside this handler. Query 7 is also not tied to a single term, so
                # the leftover loop variable `term` is no longer reported.
                print(f"Error occurred in Query 7 after 3 attempts: {e}, please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}")
else:
    print("Error occurred: ResultQ6 is undefinied")

# Report the per-term answers and the models used for this run.
print(f"ResultQ234:{term_answers}")
# NOTE(review): this prints the full repr of the LLM and embedding objects; confirm
# that the repr does not include the API token before sharing the notebook output.
print(f"LLM:{llm} and embedding model:{embeddings}")

2024-03-31 19:19:14
ResultQ1:{'Pipe': 'A closed conduit for conveying fluids, typically made of metal, plastic, or concrete.', 'Reservoir': 'A large container for storing water, typically used to regulate water supply and pressure.', 'Pump': 'A mechanical device that increases the pressure and flow rate of water by using rotary or reciprocating motion.', 'Valve': 'A device used to control the flow of water by opening, closing, or regulating it.', 'Tank': 'A container for storing water, typically used for distribution purposes or as a backup for emergencies.', 'Pressure Reducing Valve (PRV)': 'A valve used to reduce the pressure of water for residential or commercial purposes.', 'Meter': 'A device used to measure the amount of water consumed by a customer.', 'Fire Hydrant': 'A device used to provide water for firefighting purposes, typically installed at strategic locations in urban areas.', 'Manhole': 'A structure used to provide access to the water distribution network for maintenance

In [77]:
# Timestamp this run in Amsterdam local time so the printed results can be dated.
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Get the current time as a timezone-AWARE UTC datetime. utcnow() returns a naive
# datetime, and astimezone() treats a naive datetime as system-local time, so the
# conversion below was only correct on machines whose clock runs in UTC.
utc_now = datetime.datetime.now(datetime.timezone.utc)
# Convert UTC time to Amsterdam time
amsterdam_now = utc_now.astimezone(amsterdam_tz)
# Format the Amsterdam time as a string
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

# Step 5.1 Domain terminology
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
ResultQ1 = rag_chain.invoke(QueryQ1)

# Excluding the prompt part from the answer. If the keyword is found, return the text after the keyword, otherwise return an empty string.
keyword = "Assistant"
ResultQ1 = ResultQ1.split(keyword, 1)  # Split only once
ResultQ1 = ResultQ1[1].strip() if len(ResultQ1) > 1 else ""
# Parse the answer as a Python literal (the expected form is a dict). The former
# json.dumps -> literal_eval round trip only reproduced the same string (and could
# mangle characters outside the Basic Multilingual Plane), so it is dropped.
# A parse failure (empty or non-literal answer) no longer aborts the cell: the raw
# string is kept so that the dictionary check that follows can report it.
try:
    ResultQ1 = ast.literal_eval(ResultQ1)
except (ValueError, TypeError, SyntaxError) as e:
    print(f"Could not parse the answer of Query 1 as a Python literal: {e}")
print(f"ResultQ1:{ResultQ1}")
        
# Create the dictionary for the answers from Query234 up front, so that it is always
# defined for the later steps and never carries answers over from an earlier run.
term_answers = {}
# Condition Check if the Result 5.1 is a dictionary
if isinstance(ResultQ1, dict):
    for term, definition in ResultQ1.items():
        # Step 5.2 Retrieve answers of Synonymy, Taxonomy, and Predication for each term.
        # The prompt does not change between retries, so it is built once per term.
        # NOTE(review): the four quotes `""""` below end this f-string after the first three;
        # the format example and the step descriptions are then separate plain literals
        # joined by implicit concatenation, so the example appears to reach the model
        # with doubled braces `{{...}}` and without surrounding quotes — confirm intent.
        QueryQ234 = f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """
        retry_count_Q234 = 0
        while retry_count_Q234 < 3:
            try:
                # Keep only the text after the keyword (the model's answer).
                answer_parts = rag_chain.invoke(QueryQ234).split(keyword, 1)
                term_answers[term] = answer_parts[1].strip() if len(answer_parts) > 1 else ""
                break  # Exit the loop if the query succeeds
            except Exception as e:  # Except of Step 5.2: retry up to three times
                retry_count_Q234 += 1
                if retry_count_Q234 == 3:
                    # Name the query and term that failed (this used to say "Query 6").
                    print(f"Error occurred in Query 2-4 for term '{term}' after 3 attempts: {e}")
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# Step 5.3: Retrieve answers based on the Result 5.1
# Start from the "no answer" value (an empty string, the same value used when the
# keyword is missing), so that a run in which every attempt fails does not leave
# ResultQ5 undefined or holding the answer of an earlier run.
ResultQ5 = ""
retry_count_Q5 = 0
while retry_count_Q5 < 3:
    try:
        QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """
        # Excluding the prompt part from the answer. If the keyword is found, keep the
        # text after the keyword, otherwise keep an empty string.
        answer_parts = rag_chain.invoke(QueryQ5).split(keyword, 1)
        ResultQ5 = answer_parts[1].strip() if len(answer_parts) > 1 else ""
        print(f"ResultQ5:{ResultQ5}")
        break
    # Except of Step 5.3: retry, and report once the third attempt has failed
    except Exception as e:
        retry_count_Q5 += 1
        if retry_count_Q5 == 3:
            # Report the query that actually failed (this used to say "Query 6").
            print(f"Error occurred in Query 5 after 3 attempts: {e}")

# Step 5.4: Retrieve answers based on the Result 5.3
# Reset the result before trying: the next step tests `ResultQ6 is not None`, which
# would raise NameError on a fresh kernel (or silently reuse the value left behind
# by an earlier run of this cell) if all three attempts below failed.
ResultQ6 = None
retry_count_Q6 = 0
while retry_count_Q6 < 3:
    try:
        QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """
        # Keep only the text after the keyword (the model's answer); an answer
        # without the keyword is recorded as an empty string.
        answer_parts = rag_chain.invoke(QueryQ6).split(keyword, 1)
        ResultQ6 = answer_parts[1].strip() if len(answer_parts) > 1 else ""
        print(f"ResultQ6:{ResultQ6}")
        break
    # Except of Step 5.4: retry, and report once the third attempt has failed
    except Exception as e:
        retry_count_Q6 += 1
        if retry_count_Q6 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}, please pass the QueryQ6 to the ChatGPT for further prompting")
    
    
# Step 5.5 is only attempted when Step 5.4 produced a result (ResultQ6 is not None).
# ResultQ7 is reset first so that later code never sees an undefined name or a value
# left over from a previous run of this cell.
ResultQ7 = None
if ResultQ6 is not None:
    retry_count_Q7 = 0
    while retry_count_Q7 < 3:
        # Step 5.5: Retrieve answers based on the Result 5.4
        try:
            QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""
            # Keep only the text after the keyword (the model's answer).
            answer_parts = rag_chain.invoke(QueryQ7).split(keyword, 1)
            ResultQ7 = answer_parts[1].strip() if len(answer_parts) > 1 else ""
            print(f"ResultQ7:{ResultQ7}")
            break
        # Except of Step 5.5: retry, and report once the third attempt has failed
        except Exception as e:
            retry_count_Q7 += 1
            if retry_count_Q7 == 3:
                # The message used to contain `{term:answers}`, which Python reads as
                # "format `term` with spec 'answers'" and which raised ValueError from
                # inside this handler. Query 7 is also not tied to a single term, so
                # the leftover loop variable `term` is no longer reported.
                print(f"Error occurred in Query 7 after 3 attempts: {e}, please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}")
else:
    print("Error occurred: ResultQ6 is undefinied")

# Report the per-term answers collected in Step 5.2.
print(f"ResultQ234:{term_answers}")

2024-03-31 00:24:48
ResultQ1:{'Pipe': 'A long, cylindrical tube used to transport water from one location to another in a water distribution network.', 'Valve': 'A device used to regulate or control the flow of water in a water distribution network.', 'Pump': 'A machine that increases the pressure of water in a water distribution network by using a rotating shaft and blades.', 'Tank': 'A large container used to store water in a water distribution network.', 'Reservoir': 'A large body of water used to store water in a water distribution network.', 'Hydrant': 'A device used to access water from a water distribution network for firefighting or other purposes.', 'Service connection': "A pipe that connects a water distribution network to a customer's property.", 'Fire hydrant': 'A device used to access water from a water distribution network for firefighting purposes.', 'Water meter': 'A device used to measure the amount of water used by a customer in a water distribution network.', 'Water 

In [80]:
# Timestamp this run in Amsterdam local time so the printed results can be dated.
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Get the current time as a timezone-AWARE UTC datetime. utcnow() returns a naive
# datetime, and astimezone() treats a naive datetime as system-local time, so the
# conversion below was only correct on machines whose clock runs in UTC.
utc_now = datetime.datetime.now(datetime.timezone.utc)
# Convert UTC time to Amsterdam time
amsterdam_now = utc_now.astimezone(amsterdam_tz)
# Format the Amsterdam time as a string
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

# Hosted LLM used for generation in this experiment.
llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-alpha",
    model_kwargs=dict(temperature=0.5, max_length=100000, max_new_tokens=2048),
)

# Retriever over the existing vector store. "mmr" is the active search type
# ("similarity" is the alternative the author noted); k is set to 10.
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs=dict(k=10))

# RAG pipeline: the retriever fills "context", the raw input is passed through as
# "query", and the result flows through the prompt, the LLM and a string parser.
rag_chain = {"context": retriever, "query": RunnablePassthrough()} | prompt | llm | StrOutputParser()
# Step 5.1 Domain terminology
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
ResultQ1 = rag_chain.invoke(QueryQ1)

# Excluding the prompt part from the answer. If the keyword is found, return the text after the keyword, otherwise return an empty string.
keyword = "Assistant"
ResultQ1 = ResultQ1.split(keyword, 1)  # Split only once
ResultQ1 = ResultQ1[1].strip() if len(ResultQ1) > 1 else ""
# Parse the answer as a Python literal (the expected form is a dict). The former
# json.dumps -> literal_eval round trip only reproduced the same string (and could
# mangle characters outside the Basic Multilingual Plane), so it is dropped.
# A parse failure (empty or non-literal answer) no longer aborts the cell: the raw
# string is kept so that the dictionary check that follows can report it.
try:
    ResultQ1 = ast.literal_eval(ResultQ1)
except (ValueError, TypeError, SyntaxError) as e:
    print(f"Could not parse the answer of Query 1 as a Python literal: {e}")
print(f"ResultQ1:{ResultQ1}")
        
# Step 5.2: for every lexicon entry from Step 5.1, retrieve Synonymy, Taxonomy
# and Predication. term_answers maps each term to the text returned for it.
# It is created before the type check so that the later steps that read it
# always find a dictionary (empty when Query 1 did not return a dict).
term_answers = {}
if isinstance(ResultQ1, dict):
    for term, definition in ResultQ1.items():
        # The prompt depends only on the current term, so it is built once per
        # term instead of once per retry.
        # NOTE: the run of four double quotes closes the f-string early. The
        # format example after it is an ordinary string literal joined by
        # implicit concatenation, so its doubled braces reach the model as
        # written. The text is left untouched to keep the prompt identical.
        QueryQ234 = f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """
        # Up to three attempts per term; any exception counts as a failed attempt.
        retry_count_Q234 = 0
        while retry_count_Q234 < 3:
            try:
                ResultQ234 = rag_chain.invoke(QueryQ234)
                # Keep only the text after the first keyword ("" when absent).
                ResultQ234 = ResultQ234.split(keyword, 1)
                ResultQ234 = ResultQ234[1].strip() if len(ResultQ234) > 1 else ""
                term_answers[term] = ResultQ234
                break  # Exit the loop if the query succeeds
            except Exception as e:
                retry_count_Q234 += 1
                if retry_count_Q234 == 3:
                    # The term gets no entry in term_answers; name it so the
                    # gap can be traced.
                    print(f"Error occurred in Query 2-4 for term '{term}' after 3 attempts: {e}")
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# Step 5.3: ask for the parthood (part-whole) relations between the entities
# returned by Step 5.1. Up to three attempts; any exception raised inside the
# try block counts as a failed attempt.
retry_count_Q5 = 0
while retry_count_Q5 < 3:
    try:
        QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """
        ResultQ5 = rag_chain.invoke(QueryQ5)
        # Excluding the prompt part from the answer. If the keyword is found, keep the text after it, otherwise use an empty string.
        ResultQ5 = ResultQ5.split(keyword, 1)
        ResultQ5 = ResultQ5[1].strip() if len(ResultQ5) > 1 else ""
        print(f"ResultQ5:{ResultQ5}")
        break
    except Exception as e:
        retry_count_Q5 += 1
        if retry_count_Q5 == 3:
            # Name the step that failed so the log points at the right query.
            print(f"Error occurred in Query 5 after 3 attempts: {e}")

# Step 5.4: turn the parthood result of Step 5.3 into Turtle syntax.
# ResultQ6 starts as None so that a run in which all three attempts fail leaves
# an explicit "no result" marker for the `ResultQ6 is not None` check that
# follows, rather than an undefined name or a value left over from an earlier
# execution of a cell.
ResultQ6 = None
retry_count_Q6 = 0
while retry_count_Q6 < 3:
    try:
        QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """
        ResultQ6 = rag_chain.invoke(QueryQ6)
        # Keep only the text after the first keyword ("" when absent).
        ResultQ6 = ResultQ6.split(keyword, 1)
        ResultQ6 = ResultQ6[1].strip() if len(ResultQ6) > 1 else ""
        print(f"ResultQ6:{ResultQ6}")
        break
    # Any exception counts as a failed attempt; report after the third one.
    except Exception as e:
        retry_count_Q6 += 1
        if retry_count_Q6 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}, please pass the QueryQ6 to the ChatGPT for further prompting")
    
    
# Step 5.5: merge the per-term answers (term_answers) into the Turtle produced
# by Step 5.4. Only attempted when ResultQ6 is not None.
if ResultQ6 is not None:
    retry_count_Q7 = 0
    while retry_count_Q7 < 3:
        try:
            QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""
            ResultQ7 = rag_chain.invoke(QueryQ7)
            # Keep only the text after the first keyword ("" when absent).
            ResultQ7 = ResultQ7.split(keyword, 1)
            ResultQ7 = ResultQ7[1].strip() if len(ResultQ7) > 1 else ""
            print(f"ResultQ7:{ResultQ7}")
            break
        # Any exception counts as a failed attempt; report after the third one.
        except Exception as e:
            retry_count_Q7 += 1
            if retry_count_Q7 == 3:
                # Interpolate the term_answers dictionary itself. Writing
                # `{term:answers}` would be read as a format spec applied to
                # `term` and raise inside this handler, hiding the real error.
                # This query covers all terms, so no single term is cited.
                print(f"Error occurred in Query 7 after 3 attempts: {e}，please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}")
else:
    print("Error occurred: ResultQ6 is undefinied")

# Dump the per-term answers collected in Step 5.2.
print(f"ResultQ234:{term_answers}")

2024-03-31 07:53:45
ResultQ1:{'Pipe': 'A long, cylindrical structure that carries water from one point to another in a water distribution network. It is typically made of materials such as steel, concrete, or plastic.', 'Valve': 'A mechanical device that controls the flow of water in a water distribution network. It can be used to close, open, or regulate the flow of water.', 'Reservoir': 'A large container that stores water for use in a water distribution network. It is typically built above or below ground.', 'Pump': 'A mechanical device that moves water from one location to another in a water distribution network. It is typically used to lift water to higher elevations or to move water through long distances.', 'Tank': 'A container that stores water for use in a water distribution network. It is typically built above ground.', 'Hydrant': 'A device that allows firefighters to access the water distribution network for firefighting purposes. It typically includes a valve, a hose connec

## Final Pipeline: GPT-4-0125-preview with OpenAI Embeddings
llm = ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0.5)

In [105]:
# Print a timestamp for this run in Amsterdam local time.
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Take the current instant as a timezone-aware UTC datetime. A naive value
# (such as the one utcnow() returns) is treated by astimezone() as system-local
# time, which shifts the result on any machine whose local zone is not UTC.
utc_now = datetime.datetime.now(datetime.timezone.utc)
# Convert UTC time to Amsterdam time
amsterdam_now = utc_now.astimezone(amsterdam_tz)
# Format the Amsterdam time as a string
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

# Chat model used by every step of this run.
llm = ChatOpenAI(temperature=0.5, model_name="gpt-4-0125-preview")

# Retriever over the vector store: "mmr" search (instead of "similarity"),
# with k set to 10 retrieved documents.
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs=dict(k=10))

# Chain: the retriever fills "context", the input is passed through as "query";
# the result goes to the prompt, then the model, then is parsed to a string.
rag_chain = {"context": retriever, "query": RunnablePassthrough()} | prompt | llm | StrOutputParser()
# Step 5.1 Domain terminology
# Marker that precedes the payload in the reply; everything up to and including
# its first occurrence is discarded below.
keyword = "python"
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
ResultQ1 = rag_chain.invoke(QueryQ1)

# partition() yields an empty tail when the keyword is absent, so a reply
# without the marker becomes the empty string.
ResultQ1 = ResultQ1.partition(keyword)[2].strip()
# Remove any backticks left at either end, then parse the remainder as JSON.
ResultQ1 = json.loads(ResultQ1.strip("`"))
print(f"ResultQ1:{ResultQ1}")
        
# Step 5.2: for every lexicon entry from Step 5.1, retrieve Synonymy, Taxonomy
# and Predication. term_answers maps each term to the text returned for it.
# It is created before the type check so that the later steps that read it
# always find a dictionary (empty when Query 1 did not return a dict).
term_answers = {}
if isinstance(ResultQ1, dict):
    for term, definition in ResultQ1.items():
        # The prompt depends only on the current term, so it is built once per
        # term instead of once per retry.
        # NOTE: the run of four double quotes closes the f-string early. The
        # format example after it is an ordinary string literal joined by
        # implicit concatenation, so its doubled braces reach the model as
        # written. The text is left untouched to keep the prompt identical.
        QueryQ234 = f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """
        # Up to three attempts per term; any exception counts as a failed attempt.
        retry_count_Q234 = 0
        while retry_count_Q234 < 3:
            try:
                # The reply is stored exactly as returned (no keyword stripping
                # in this pipeline).
                ResultQ234 = rag_chain.invoke(QueryQ234)
                term_answers[term] = ResultQ234
                break  # Exit the loop if the query succeeds
            except Exception as e:
                retry_count_Q234 += 1
                if retry_count_Q234 == 3:
                    # The term gets no entry in term_answers; name it so the
                    # gap can be traced.
                    print(f"Error occurred in Query 2-4 for term '{term}' after 3 attempts: {e}")
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# Step 5.3: ask for the parthood (part-whole) relations between the entities
# returned by Step 5.1. Up to three attempts; any exception raised inside the
# try block counts as a failed attempt.
retry_count_Q5 = 0
while retry_count_Q5 < 3:
    try:
        QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """
        # The reply is used exactly as returned (no keyword stripping in this pipeline).
        ResultQ5 = rag_chain.invoke(QueryQ5)
        print(f"ResultQ5:{ResultQ5}")
        break
    except Exception as e:
        retry_count_Q5 += 1
        if retry_count_Q5 == 3:
            # Name the step that failed so the log points at the right query.
            print(f"Error occurred in Query 5 after 3 attempts: {e}")

# Step 5.4: turn the parthood result of Step 5.3 into Turtle syntax.
# ResultQ6 starts as None so that a run in which all three attempts fail leaves
# an explicit "no result" marker for the `ResultQ6 is not None` check that
# follows, rather than an undefined name or a value left over from an earlier
# execution of a cell.
ResultQ6 = None
retry_count_Q6 = 0
while retry_count_Q6 < 3:
    try:
        QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """
        # The reply is used exactly as returned (no keyword stripping in this pipeline).
        ResultQ6 = rag_chain.invoke(QueryQ6)
        print(f"ResultQ6:{ResultQ6}")
        break
    # Any exception counts as a failed attempt; report after the third one.
    except Exception as e:
        retry_count_Q6 += 1
        if retry_count_Q6 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}, please pass the QueryQ6 to the ChatGPT for further prompting")
    
    
# Step 5.5: merge the per-term answers (term_answers) into the Turtle produced
# by Step 5.4. Only attempted when ResultQ6 is not None.
if ResultQ6 is not None:
    retry_count_Q7 = 0
    while retry_count_Q7 < 3:
        try:
            QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""
            # The reply is used exactly as returned (no keyword stripping in this pipeline).
            ResultQ7 = rag_chain.invoke(QueryQ7)
            print(f"ResultQ7:{ResultQ7}")
            break
        # Any exception counts as a failed attempt; report after the third one.
        except Exception as e:
            retry_count_Q7 += 1
            if retry_count_Q7 == 3:
                # Interpolate the term_answers dictionary itself. Writing
                # `{term:answers}` would be read as a format spec applied to
                # `term` and raise inside this handler, hiding the real error.
                # This query covers all terms, so no single term is cited.
                print(f"Error occurred in Query 7 after 3 attempts: {e}，please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}")
else:
    print("Error occurred: ResultQ6 is undefinied")

# Dump the per-term answers and record which model objects produced this run.
print(f"ResultQ234{term_answers}")
print(f"LLM:{llm} and embedding model:{embeddings}")

# Save the Turtle text from Step 5.4 (ResultQ6) and Step 5.5 (ResultQ7).
# The encoding is given explicitly: without it open() falls back to the
# platform's locale codec, which can reject non-ASCII characters in the model
# output, whereas Turtle documents are defined as UTF-8.
with open("gpt-4-0125-preview_result_Q6.ttl", "w", encoding="utf-8") as file:
    file.write(str(ResultQ6))

with open("gpt-4-0125-preview_result_Q7.ttl", "w", encoding="utf-8") as file:
    file.write(str(ResultQ7))

2024-03-31 20:19:15
ResultQ1:{'Reservoir': 'A large natural or artificial lake used as a source of water supply.', 'Pump Station': 'A facility that houses pumps and equipment for pumping fluids from one place to another.', 'Water Tower': 'A tall structure that stores water at a height sufficient to pressurize a water supply system for the distribution of potable water.', 'Pipes': 'Cylindrical tubes made of various materials used to transport water from one location to another.', 'Valves': 'Devices that regulate, direct, or control the flow of water by opening, closing, or partially obstructing various passageways.', 'Hydrants': 'An outlet from a fluid main often consisting of an upright pipe with a valve attached from which water can be drawn for fighting fires.', 'Meters': 'Devices used to measure the volume of water usage.', 'Fittings': 'Pieces used to connect pipes in a water distribution network, including elbows, tees, couplings, and flanges.', 'Water Main': 'A principal pipe in a

Test Results

In [90]:
# Print a timestamp for this run in Amsterdam local time.
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Take the current instant as a timezone-aware UTC datetime. A naive value
# (such as the one utcnow() returns) is treated by astimezone() as system-local
# time, which shifts the result on any machine whose local zone is not UTC.
utc_now = datetime.datetime.now(datetime.timezone.utc)
# Convert UTC time to Amsterdam time
amsterdam_now = utc_now.astimezone(amsterdam_tz)
# Format the Amsterdam time as a string
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

# Chat model used by every step of this run.
llm = ChatOpenAI(temperature=0.5, model_name="gpt-4-0125-preview")

# Retriever over the vector store: "mmr" search (instead of "similarity"),
# with k set to 10 retrieved documents.
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs=dict(k=10))

# Chain: the retriever fills "context", the input is passed through as "query";
# the result goes to the prompt, then the model, then is parsed to a string.
rag_chain = {"context": retriever, "query": RunnablePassthrough()} | prompt | llm | StrOutputParser()
# Step 5.1 Domain terminology
# Marker that precedes the payload in the reply; everything up to and including
# its first occurrence is discarded below.
keyword = "python"
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
ResultQ1 = rag_chain.invoke(QueryQ1)

# partition() yields an empty tail when the keyword is absent, so a reply
# without the marker becomes the empty string.
ResultQ1 = ResultQ1.partition(keyword)[2].strip()
# Remove any backticks left at either end, then parse the remainder as JSON.
ResultQ1 = json.loads(ResultQ1.strip("`"))
print(f"ResultQ1:{ResultQ1}")
        
# Step 5.2: for every lexicon entry from Step 5.1, retrieve Synonymy, Taxonomy
# and Predication. term_answers maps each term to the text returned for it.
# It is created before the type check so that the later steps that read it
# always find a dictionary (empty when Query 1 did not return a dict).
term_answers = {}
if isinstance(ResultQ1, dict):
    for term, definition in ResultQ1.items():
        # The prompt depends only on the current term, so it is built once per
        # term instead of once per retry.
        # NOTE: the run of four double quotes closes the f-string early. The
        # format example after it is an ordinary string literal joined by
        # implicit concatenation, so its doubled braces reach the model as
        # written. The text is left untouched to keep the prompt identical.
        QueryQ234 = f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """
        # Up to three attempts per term; any exception counts as a failed attempt.
        retry_count_Q234 = 0
        while retry_count_Q234 < 3:
            try:
                # The reply is stored exactly as returned (no keyword stripping
                # in this pipeline).
                ResultQ234 = rag_chain.invoke(QueryQ234)
                term_answers[term] = ResultQ234
                break  # Exit the loop if the query succeeds
            except Exception as e:
                retry_count_Q234 += 1
                if retry_count_Q234 == 3:
                    # The term gets no entry in term_answers; name it so the
                    # gap can be traced.
                    print(f"Error occurred in Query 2-4 for term '{term}' after 3 attempts: {e}")
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# Step 5.3: ask for the parthood (part-whole) relations between the entities
# returned by Step 5.1. Up to three attempts; any exception raised inside the
# try block counts as a failed attempt.
retry_count_Q5 = 0
while retry_count_Q5 < 3:
    try:
        QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """
        # The reply is used exactly as returned (no keyword stripping in this pipeline).
        ResultQ5 = rag_chain.invoke(QueryQ5)
        print(f"ResultQ5:{ResultQ5}")
        break
    except Exception as e:
        retry_count_Q5 += 1
        if retry_count_Q5 == 3:
            # Name the step that failed so the log points at the right query.
            print(f"Error occurred in Query 5 after 3 attempts: {e}")

# Step 5.4: turn the parthood result of Step 5.3 into Turtle syntax.
# ResultQ6 starts as None so that a run in which all three attempts fail leaves
# an explicit "no result" marker for the `ResultQ6 is not None` check that
# follows, rather than an undefined name or a value left over from an earlier
# execution of a cell.
ResultQ6 = None
retry_count_Q6 = 0
while retry_count_Q6 < 3:
    try:
        QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """
        # The reply is used exactly as returned (no keyword stripping in this pipeline).
        ResultQ6 = rag_chain.invoke(QueryQ6)
        print(f"ResultQ6:{ResultQ6}")
        break
    # Any exception counts as a failed attempt; report after the third one.
    except Exception as e:
        retry_count_Q6 += 1
        if retry_count_Q6 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}, please pass the QueryQ6 to the ChatGPT for further prompting")
    
    
# Step 5.5: Merge the per-term answers (term_answers) into the Turtle produced by Step 5.4.
# Runs only when Step 5.4 produced a result. ResultQ7 is reset first so the
# save step never writes a value left over from an earlier run.
ResultQ7 = None
if ResultQ6 is not None:
    retry_count_Q7 = 0
    while retry_count_Q7 < 3:
        try:
            QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""
            ResultQ7 = rag_chain.invoke(QueryQ7)
            print(f"ResultQ7:{ResultQ7}")
            break
        except Exception as e:
            retry_count_Q7 += 1
            if retry_count_Q7 == 3:
                # The previous message used "{term:answers}", which Python parses as the
                # format spec "answers" applied to `term` and rejects with ValueError,
                # hiding the real error. It also named `term`, a leftover loop variable
                # unrelated to this query.
                print(f"Error occurred in Query 7 after 3 attempts: {e}，please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}")
else:
    print("Error occurred: ResultQ6 is undefinied")

# Show the per-term answers and the models that produced this run.
print(f"ResultQ234{term_answers}")
print(f"LLM:{llm} and embedding model:{embeddings}")

# Persist both Turtle outputs. UTF-8 is set explicitly so that non-ASCII
# characters in the model output cannot fail on a platform whose default
# text encoding is not UTF-8.
with open("gpt-4-0125-preview_result_Q6.txt", "w", encoding="utf-8") as q6_file:
    q6_file.write(str(ResultQ6))

with open("gpt-4-0125-preview_result_Q7.txt", "w", encoding="utf-8") as q7_file:
    q7_file.write(str(ResultQ7))

2024-03-31 19:05:49
ResultQ1:{'Reservoir': 'A natural or artificial lake used as a source of water supply.', 'Pump Station': 'A facility that houses pumps and equipment for pumping fluids from one place to another.', 'Water Tower': 'A large elevated water storage container constructed to hold a water supply at a height sufficient to pressurize a water distribution system.', 'Distribution Mains': 'Large pipes responsible for carrying water from the treatment plant or pumping station to various parts of the service area.', 'Service Reservoir': 'An artificial reservoir or tank that stores treated water close to the point of distribution.', 'Pipes': 'Cylindrical tubes made of various materials used to transport water from one location to another within the distribution network.', 'Valves': 'Mechanical devices installed within the water distribution system to control the flow and pressure of water.', 'Hydrants': 'An outlet from a fluid main often consisting of an upright pipe with a valve a

In [96]:
# Resolve the Amsterdam time zone
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Get the current time in UTC as a timezone-aware datetime. A naive utcnow()
# value is treated by astimezone() as local machine time, which shifts the
# result whenever the machine is not running in UTC.
utc_now = datetime.datetime.now(datetime.timezone.utc)
# Convert UTC time to Amsterdam time
amsterdam_now = utc_now.astimezone(amsterdam_tz)
# Format the Amsterdam time as a string
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

# Chat model used for every ontology query issued in this cell.
llm = ChatOpenAI(temperature=0.5, model_name="gpt-4-0125-preview")

# Retriever over the existing vector store: "mmr" search type with k=10
# (number of retrieved documents, per the original inline note).
retriever = vectorstore.as_retriever(search_kwargs={'k': 10}, search_type="mmr")

# RAG pipeline: the retriever fills "context" while the incoming query is passed
# through unchanged as "query"; the filled prompt goes to the LLM and the reply
# is reduced to a plain string.
rag_chain = {"context": retriever, "query": RunnablePassthrough()} | prompt | llm | StrOutputParser()
# Step 5.1 Domain terminology: ask for the domain lexicon as {Entity: Definition}.
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
raw_answer_Q1 = rag_chain.invoke(QueryQ1)

# Parse the reply into a Python object.
# First attempt (unchanged behaviour): take the text after the first "python"
# keyword, drop surrounding whitespace and code-fence backticks, parse as JSON.
keyword = "python"
_, keyword_found, after_keyword = raw_answer_Q1.partition(keyword)
fenced_text = after_keyword.strip().strip("```") if keyword_found else ""
try:
    ResultQ1 = json.loads(fenced_text)
except json.JSONDecodeError:
    # Fallback for replies that previously crashed here: no "python" keyword,
    # extra prose around the dictionary, or a Python literal with single quotes.
    candidate_text = after_keyword if keyword_found else raw_answer_Q1
    brace_start = candidate_text.find("{")
    brace_end = candidate_text.rfind("}")
    if brace_start == -1 or brace_end <= brace_start:
        raise ValueError(f"Query 1 reply contains no dictionary literal: {raw_answer_Q1!r}")
    dict_text = candidate_text[brace_start:brace_end + 1]
    try:
        ResultQ1 = json.loads(dict_text)
    except json.JSONDecodeError:
        # literal_eval only evaluates literals, so it is safe on model output;
        # it raises ValueError/SyntaxError if the text is not a valid literal.
        ResultQ1 = ast.literal_eval(dict_text)
print(f"ResultQ1:{ResultQ1}")
        
# Step 5.2: for every term from Step 5.1, retrieve Synonyms, Taxonomy and Predication.
# term_answers is created unconditionally so that later steps (Query 7 and the
# final print) never hit a NameError or a stale dictionary from an earlier run
# when Query 1 did not return a dictionary.
term_answers = {}
if isinstance(ResultQ1, dict):
    for term, definition in ResultQ1.items():
        # The prompt is the same for every retry, so it is built once per term.
        # NOTE: the run of four double quotes below ends the f-string early. Python
        # then concatenates it with a plain "{{Synonyms...}}" literal (its braces stay
        # doubled) and a plain triple-quoted string holding the step descriptions.
        # Kept verbatim so the prompt sent to the model is unchanged.
        QueryQ234 = f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """
        retry_count_Q234 = 0
        while retry_count_Q234 < 3:
            try:
                ResultQ234 = rag_chain.invoke(QueryQ234)
                term_answers[term] = ResultQ234
                break  # Exit the loop if the query succeeds
            except Exception as e:
                # Retry on any failure; after the third one the term is left out
                # of term_answers and the failure is reported.
                retry_count_Q234 += 1
                if retry_count_Q234 == 3:
                    # The message previously said "Query 6", which pointed at the wrong step.
                    print(f"Error occurred in Query 2-4 for term '{term}' after 3 attempts: {e}")
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# Step 5.3: Map the parthood (part-whole) relationships between the entities of Step 5.1.
# Up to three attempts are made; the error is reported only after the last one fails.
retry_count_Q5 = 0
while retry_count_Q5 < 3:
    try:
        QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """
        ResultQ5 = rag_chain.invoke(QueryQ5)
        print(f"ResultQ5:{ResultQ5}")
        break
    except Exception as e:
        retry_count_Q5 += 1
        if retry_count_Q5 == 3:
            # The message previously said "Query 6", which pointed at the wrong step.
            print(f"Error occurred in Query 5 after 3 attempts: {e}")

# Step 5.4: Generate the base Turtle ontology from the parthood result of Step 5.3.
# ResultQ6 is reset first so that, if every attempt fails, the later
# "ResultQ6 is not None" check sees None instead of raising NameError or
# silently reusing a value left over from an earlier run of this cell.
ResultQ6 = None
retry_count_Q6 = 0
while retry_count_Q6 < 3:
    try:
        # Built inside the try so that a missing ResultQ5 is reported through the same path.
        QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """
        ResultQ6 = rag_chain.invoke(QueryQ6)
        print(f"ResultQ6:{ResultQ6}")
        break
    except Exception as e:
        # Retry on any failure; report only once the last attempt has failed.
        retry_count_Q6 += 1
        if retry_count_Q6 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}, please pass the QueryQ6 to the ChatGPT for further prompting")
    
    
# Step 5.5: Merge the per-term answers (term_answers) into the Turtle produced by Step 5.4.
# Runs only when Step 5.4 produced a result. ResultQ7 is reset first so the
# save step never writes a value left over from an earlier run.
ResultQ7 = None
if ResultQ6 is not None:
    retry_count_Q7 = 0
    while retry_count_Q7 < 3:
        try:
            QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""
            ResultQ7 = rag_chain.invoke(QueryQ7)
            print(f"ResultQ7:{ResultQ7}")
            break
        except Exception as e:
            retry_count_Q7 += 1
            if retry_count_Q7 == 3:
                # The previous message used "{term:answers}", which Python parses as the
                # format spec "answers" applied to `term` and rejects with ValueError,
                # hiding the real error. It also named `term`, a leftover loop variable
                # unrelated to this query.
                print(f"Error occurred in Query 7 after 3 attempts: {e}，please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}")
else:
    print("Error occurred: ResultQ6 is undefinied")

# Show the per-term answers and the models that produced this run.
print(f"ResultQ234{term_answers}")
print(f"LLM:{llm} and embedding model:{embeddings}")

# Persist both Turtle outputs. UTF-8 is set explicitly so that non-ASCII
# characters in the model output cannot fail on a platform whose default
# text encoding is not UTF-8.
with open("gpt-4-0125-preview_result_Q6.txt", "w", encoding="utf-8") as q6_file:
    q6_file.write(str(ResultQ6))

with open("gpt-4-0125-preview_result_Q7.txt", "w", encoding="utf-8") as q7_file:
    q7_file.write(str(ResultQ7))

2024-03-31 19:57:13
ResultQ1:{'Water Source': 'The starting point of the water in the distribution system, such as a reservoir, lake, or aquifer.', 'Pump Station': 'Facilities including pumps and equipment for pumping fluids from one place to another.', 'Water Tower': 'A large elevated water storage container constructed to hold a water supply at a height sufficient to pressurize a water distribution system.', 'Pipes': 'Cylindrical conduits made of various materials such as PVC, ductile iron, or steel used to transport water from the source to consumers and between different parts of the distribution network.', 'Valves': 'Devices used to control the flow of water by opening, closing, or partially obstructing various passageways.', 'Fire Hydrant': 'A connection point by which firefighters can tap into a water supply, characterized by a standpipe with valves.', 'Water Meters': 'Devices that measure the volume of water used by residential and commercial buildings connected to the municipa

In [73]:
# Resolve the Amsterdam time zone
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Get the current time in UTC as a timezone-aware datetime. A naive utcnow()
# value is treated by astimezone() as local machine time, which shifts the
# result whenever the machine is not running in UTC.
utc_now = datetime.datetime.now(datetime.timezone.utc)
# Convert UTC time to Amsterdam time
amsterdam_now = utc_now.astimezone(amsterdam_tz)
# Format the Amsterdam time as a string
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

# Step 5.1 Domain terminology: ask for the domain lexicon as {Entity: Definition}.
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
raw_answer_Q1 = rag_chain.invoke(QueryQ1)

# Parse the reply into a Python object.
# First attempt (unchanged behaviour): take the text after the first "python"
# keyword, drop surrounding whitespace and code-fence backticks, parse as JSON.
keyword = "python"
_, keyword_found, after_keyword = raw_answer_Q1.partition(keyword)
fenced_text = after_keyword.strip().strip("```") if keyword_found else ""
try:
    ResultQ1 = json.loads(fenced_text)
except json.JSONDecodeError:
    # Fallback for replies that previously crashed here: no "python" keyword,
    # extra prose around the dictionary, or a Python literal with single quotes.
    candidate_text = after_keyword if keyword_found else raw_answer_Q1
    brace_start = candidate_text.find("{")
    brace_end = candidate_text.rfind("}")
    if brace_start == -1 or brace_end <= brace_start:
        raise ValueError(f"Query 1 reply contains no dictionary literal: {raw_answer_Q1!r}")
    dict_text = candidate_text[brace_start:brace_end + 1]
    try:
        ResultQ1 = json.loads(dict_text)
    except json.JSONDecodeError:
        # literal_eval only evaluates literals, so it is safe on model output;
        # it raises ValueError/SyntaxError if the text is not a valid literal.
        ResultQ1 = ast.literal_eval(dict_text)
print(f"ResultQ1:{ResultQ1}")
        
# Step 5.2: for every term from Step 5.1, retrieve Synonyms, Taxonomy and Predication.
# term_answers is created unconditionally so that later steps (Query 7 and the
# final print) never hit a NameError or a stale dictionary from an earlier run
# when Query 1 did not return a dictionary.
term_answers = {}
if isinstance(ResultQ1, dict):
    for term, definition in ResultQ1.items():
        # The prompt is the same for every retry, so it is built once per term.
        # NOTE: the run of four double quotes below ends the f-string early. Python
        # then concatenates it with a plain "{{Synonyms...}}" literal (its braces stay
        # doubled) and a plain triple-quoted string holding the step descriptions.
        # Kept verbatim so the prompt sent to the model is unchanged.
        QueryQ234 = f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """
        retry_count_Q234 = 0
        while retry_count_Q234 < 3:
            try:
                ResultQ234 = rag_chain.invoke(QueryQ234)
                term_answers[term] = ResultQ234
                break  # Exit the loop if the query succeeds
            except Exception as e:
                # Retry on any failure; after the third one the term is left out
                # of term_answers and the failure is reported.
                retry_count_Q234 += 1
                if retry_count_Q234 == 3:
                    # The message previously said "Query 6", which pointed at the wrong step.
                    print(f"Error occurred in Query 2-4 for term '{term}' after 3 attempts: {e}")
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# Step 5.3: Map the parthood (part-whole) relationships between the entities of Step 5.1.
# Up to three attempts are made; the error is reported only after the last one fails.
retry_count_Q5 = 0
while retry_count_Q5 < 3:
    try:
        QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """
        ResultQ5 = rag_chain.invoke(QueryQ5)
        print(f"ResultQ5:{ResultQ5}")
        break
    except Exception as e:
        retry_count_Q5 += 1
        if retry_count_Q5 == 3:
            # The message previously said "Query 6", which pointed at the wrong step.
            print(f"Error occurred in Query 5 after 3 attempts: {e}")

# Step 5.4: Generate the base Turtle ontology from the parthood result of Step 5.3.
# ResultQ6 is reset first so that, if every attempt fails, the later
# "ResultQ6 is not None" check sees None instead of raising NameError or
# silently reusing a value left over from an earlier run of this cell.
ResultQ6 = None
retry_count_Q6 = 0
while retry_count_Q6 < 3:
    try:
        # Built inside the try so that a missing ResultQ5 is reported through the same path.
        QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """
        ResultQ6 = rag_chain.invoke(QueryQ6)
        print(f"ResultQ6:{ResultQ6}")
        break
    except Exception as e:
        # Retry on any failure; report only once the last attempt has failed.
        retry_count_Q6 += 1
        if retry_count_Q6 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}, please pass the QueryQ6 to the ChatGPT for further prompting")
    
    
# Step 5.5: Merge the per-term answers (term_answers) into the Turtle produced by Step 5.4.
# Runs only when Step 5.4 produced a result. ResultQ7 is reset first so that a
# value left over from an earlier run is never mistaken for this run's output.
ResultQ7 = None
if ResultQ6 is not None:
    retry_count_Q7 = 0
    while retry_count_Q7 < 3:
        try:
            QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""
            ResultQ7 = rag_chain.invoke(QueryQ7)
            print(f"ResultQ7:{ResultQ7}")
            break
        except Exception as e:
            retry_count_Q7 += 1
            if retry_count_Q7 == 3:
                # The previous message used "{term:answers}", which Python parses as the
                # format spec "answers" applied to `term` and rejects with ValueError,
                # hiding the real error. It also named `term`, a leftover loop variable
                # unrelated to this query.
                print(f"Error occurred in Query 7 after 3 attempts: {e}，please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}")
else:
    print("Error occurred: ResultQ6 is undefinied")

# Dump the per-term answers collected in Step 5.2.
print("ResultQ234{}".format(term_answers))

2024-03-30 23:58:08
ResultQ1:{'Water Source': 'The initial source of water, such as a river, lake, reservoir, or aquifer, from which water is collected for treatment and distribution.', 'Treatment Plant': 'A facility that treats water to meet drinking water standards before it is distributed to consumers. This may involve filtration, disinfection, and removal of contaminants.', 'Pump Station': 'A facility that uses pumps to transport water through the distribution system, especially where gravity flow is insufficient.', 'Water Tower': 'A tall structure that stores treated water and uses gravity to maintain pressure and deliver water to the distribution network.', 'Distribution Reservoir': 'A storage facility within the distribution system that holds treated water ready for distribution to meet demand fluctuations.', 'Pipes': 'Conduits made of various materials such as PVC, ductile iron, or steel that transport water within the distribution network.', 'Valves': 'Devices that control the

In [79]:
# Resolve the Amsterdam time zone
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Get the current time in UTC as a timezone-aware datetime. A naive utcnow()
# value is treated by astimezone() as local machine time, which shifts the
# result whenever the machine is not running in UTC.
utc_now = datetime.datetime.now(datetime.timezone.utc)
# Convert UTC time to Amsterdam time
amsterdam_now = utc_now.astimezone(amsterdam_tz)
# Format the Amsterdam time as a string
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

# Chat model used for every ontology query issued in this cell.
llm = ChatOpenAI(temperature=0.5, model_name="gpt-4-0125-preview")

# Retriever over the existing vector store: "mmr" search type with k=10
# (number of retrieved documents, per the original inline note).
retriever = vectorstore.as_retriever(search_kwargs={'k': 10}, search_type="mmr")

# RAG pipeline: the retriever fills "context" while the incoming query is passed
# through unchanged as "query"; the filled prompt goes to the LLM and the reply
# is reduced to a plain string.
rag_chain = {"context": retriever, "query": RunnablePassthrough()} | prompt | llm | StrOutputParser()
# Step 5.1 Domain terminology: ask for the domain lexicon as {Entity: Definition}.
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
raw_answer_Q1 = rag_chain.invoke(QueryQ1)

# Parse the reply into a Python object.
# First attempt (unchanged behaviour): take the text after the first "python"
# keyword, drop surrounding whitespace and code-fence backticks, parse as JSON.
keyword = "python"
_, keyword_found, after_keyword = raw_answer_Q1.partition(keyword)
fenced_text = after_keyword.strip().strip("```") if keyword_found else ""
try:
    ResultQ1 = json.loads(fenced_text)
except json.JSONDecodeError:
    # Fallback for replies that previously crashed here: no "python" keyword,
    # extra prose around the dictionary, or a Python literal with single quotes.
    candidate_text = after_keyword if keyword_found else raw_answer_Q1
    brace_start = candidate_text.find("{")
    brace_end = candidate_text.rfind("}")
    if brace_start == -1 or brace_end <= brace_start:
        raise ValueError(f"Query 1 reply contains no dictionary literal: {raw_answer_Q1!r}")
    dict_text = candidate_text[brace_start:brace_end + 1]
    try:
        ResultQ1 = json.loads(dict_text)
    except json.JSONDecodeError:
        # literal_eval only evaluates literals, so it is safe on model output;
        # it raises ValueError/SyntaxError if the text is not a valid literal.
        ResultQ1 = ast.literal_eval(dict_text)
print(f"ResultQ1:{ResultQ1}")
        
# Step 5.2: for every term from Step 5.1, retrieve Synonyms, Taxonomy and Predication.
# term_answers is created unconditionally so that later steps (Query 7 and the
# final print) never hit a NameError or a stale dictionary from an earlier run
# when Query 1 did not return a dictionary.
term_answers = {}
if isinstance(ResultQ1, dict):
    for term, definition in ResultQ1.items():
        # The prompt is the same for every retry, so it is built once per term.
        # NOTE: the run of four double quotes below ends the f-string early. Python
        # then concatenates it with a plain "{{Synonyms...}}" literal (its braces stay
        # doubled) and a plain triple-quoted string holding the step descriptions.
        # Kept verbatim so the prompt sent to the model is unchanged.
        QueryQ234 = f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """
        retry_count_Q234 = 0
        while retry_count_Q234 < 3:
            try:
                ResultQ234 = rag_chain.invoke(QueryQ234)
                term_answers[term] = ResultQ234
                break  # Exit the loop if the query succeeds
            except Exception as e:
                # Retry on any failure; after the third one the term is left out
                # of term_answers and the failure is reported.
                retry_count_Q234 += 1
                if retry_count_Q234 == 3:
                    # The message previously said "Query 6", which pointed at the wrong step.
                    print(f"Error occurred in Query 2-4 for term '{term}' after 3 attempts: {e}")
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# Step 5.3: Parthood (meronymy) -- ask the chain for part-of relationships between the
# entities of the Step 5.1 lexicon (ResultQ1).
# The prompt is identical for every attempt, so it is built once, before the retry loop.
QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """

# Up to three attempts. In this cell the prompt-echo stripping (split on `keyword`) is
# switched off, so the raw chain output is stored in ResultQ5 as-is.
# NOTE: when every attempt fails, ResultQ5 is not assigned by this step.
retry_count_Q5 = 0
while retry_count_Q5 < 3:
    try:
        ResultQ5 = rag_chain.invoke(QueryQ5)
        print(f"ResultQ5:{ResultQ5}")
        break
    except Exception as e:
        # Every failure counts as one attempt; only the last one is reported.
        retry_count_Q5 += 1
        if retry_count_Q5 == 3:
            # Name the step that actually failed (Query 5, not Query 6).
            print(f"Error occurred in Query 5 after 3 attempts: {e}")

# Step 5.4: turn the parthood relationships of Step 5.3 (ResultQ5) into Turtle syntax.
# ResultQ6 starts as None so that the `ResultQ6 is not None` check performed after this
# loop can detect a failed step instead of raising NameError or reading a value left
# over from an earlier run of the notebook.
ResultQ6 = None
retry_count_Q6 = 0
while retry_count_Q6 < 3:
    try:
        # The prompt is built inside the try block on purpose: if ResultQ5 is missing,
        # the resulting NameError is handled like any other failed attempt.
        QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """
        # Prompt-echo stripping (split on `keyword`) is switched off in this cell;
        # the raw chain output is kept.
        ResultQ6 = rag_chain.invoke(QueryQ6)
        print(f"ResultQ6:{ResultQ6}")
        break
    except Exception as e:
        # Every failure counts as one attempt; only the last one is reported.
        retry_count_Q6 += 1
        if retry_count_Q6 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}, please pass the QueryQ6 to the ChatGPT for further prompting")
    
    
# Step 5.5: merge the per-term answers of Steps 2-4 (term_answers) into the Turtle
# produced by Step 5.4 (ResultQ6).
# ResultQ7 starts as None so later code can tell that this step was skipped or failed.
ResultQ7 = None
# Run the step only when Step 5.4 left a result in ResultQ6.
if ResultQ6 is not None:
    retry_count_Q7 = 0
    while retry_count_Q7 < 3:
        try:
            QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""
            # Prompt-echo stripping (split on `keyword`) is switched off in this cell;
            # the raw chain output is kept.
            ResultQ7 = rag_chain.invoke(QueryQ7)
            print(f"ResultQ7:{ResultQ7}")
            break
        except Exception as e:
            # Every failure counts as one attempt; only the last one is reported.
            retry_count_Q7 += 1
            if retry_count_Q7 == 3:
                # The collected answers are interpolated as {term_answers}. Writing
                # `{term:answers}` applies the format spec "answers" to `term` and raises
                # ValueError from inside this handler. The step is not per-term, so the
                # leftover loop variable `term` is not mentioned either.
                print(f"Error occurred in Query 7 after 3 attempts: {e}，please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}")
else:
    print("Error occurred: ResultQ6 is undefinied")

# Dump the per-term answers collected in Step 5.2.
print(f"ResultQ234{term_answers}")

2024-03-31 07:46:56
ResultQ1:{'Reservoir': 'A storage space for fluids, commonly used to store water in a water distribution system.', 'Pump Station': 'A facility that houses pumps and equipment for pumping fluids from one place to another.', 'Water Tower': 'A tall structure that stores water at a height to provide a gravity-fed water supply with sufficient pressure.', 'Pipes': 'Tubular sections or hollow cylinders used to transport water from one location to another within the distribution network.', 'Valves': 'Devices used to control the flow of water by opening, closing, or partially obstructing various passageways.', 'Fire Hydrant': 'A connection point by which firefighters can tap into a water supply, typically a prominent component of urban water distribution networks.', 'Meter': 'A device used to measure the volume of water flowing through the distribution system to a specific point or customer.', 'Backflow Preventer': 'A device used to prevent the reverse flow of water from a p

## Final pipeline zephyr-7b-beta with Open AI Embeddings
# HuggingFaceHub LLM for the zephyr-7b-beta repo; this variant requests max_length=100000.
llm = HuggingFaceHub(
    model_kwargs=dict(temperature=0.5, max_length=100000, max_new_tokens=2048),
    repo_id="huggingfaceh4/zephyr-7b-beta",
)

In [98]:
# Print the current Amsterdam wall-clock time as a timestamp for this run.
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Take "now" as a timezone-aware UTC datetime. A naive datetime (as returned by utcnow())
# would be interpreted by astimezone() as system-local time, which shifts the printed
# value on every machine whose local zone is not UTC.
utc_now = datetime.datetime.now(datetime.timezone.utc)
# Convert the aware UTC time to Amsterdam time.
amsterdam_now = utc_now.astimezone(amsterdam_tz)
# Format the Amsterdam time as a string.
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

# HuggingFaceHub LLM for the zephyr-7b-beta repo with the generation settings of this run.
llm = HuggingFaceHub(
    model_kwargs=dict(temperature=0.5, max_length=8192, max_new_tokens=2048),
    repo_id="huggingfaceh4/zephyr-7b-beta",
)

# Retriever over the existing vector store: "mmr" search type ("similarity" is the
# alternative noted by the author), k=10 retrieved documents.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=10), search_type="mmr")

# RAG chain: the retriever fills "context", the raw input is passed through as "query",
# then prompt -> llm -> plain-string output.
rag_chain = {"context": retriever, "query": RunnablePassthrough()} | prompt | llm | StrOutputParser()
# Step 5.1 Domain terminology: ask the chain for the domain lexicon, formatted as a
# Python dictionary {Entity: Definition}.
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
raw_answer_Q1 = rag_chain.invoke(QueryQ1)

# Exclude the prompt part from the answer: keep the text after the first occurrence of
# the keyword, or an empty string when the keyword is not found.
keyword = "Assistant"
answer_parts_Q1 = raw_answer_Q1.split(keyword, 1)  # split only once
ResultQ1 = answer_parts_Q1[1].strip() if len(answer_parts_Q1) > 1 else ""

# Parse the answer text as a Python literal. The text is parsed directly: a
# json.dumps -> literal_eval round trip only reproduces the same string for ordinary
# text and turns characters outside the BMP into surrogate pairs.
try:
    ResultQ1 = ast.literal_eval(ResultQ1)
except (ValueError, SyntaxError, TypeError) as parse_error:
    # Empty or malformed answers land here. ResultQ1 keeps the unparsed text, so the
    # isinstance(ResultQ1, dict) check of Step 5.2 can report it instead of the cell
    # aborting at this line.
    print(f"ResultQ1 could not be parsed as a Python literal: {parse_error}")
print(f"ResultQ1:{ResultQ1}")
        
# Step 5.2: for every term of the Step 5.1 lexicon, retrieve Synonymy, Taxonomy and
# Predication (Steps 2-4) and store the answer text under that term.
# term_answers is created before the type check so that code reading it afterwards
# still finds a (possibly empty) dictionary when ResultQ1 is not a dict.
term_answers = {}
if isinstance(ResultQ1, dict):
    for term, definition in ResultQ1.items():
        # The prompt depends only on the term and its definition, so it is built once
        # per term rather than once per attempt.
        # NOTE(review): the four-quote sequences below end the f-string early; the
        # format example is a separate plain string literal, so the model receives it
        # with doubled braces and without quotes. Left unchanged to keep the prompt
        # text identical -- confirm whether single braces were intended.
        QueryQ234 = f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """
        # Up to three attempts per term; a term whose attempts all fail gets no entry.
        retry_count_Q234 = 0
        while retry_count_Q234 < 3:
            try:
                answer_Q234 = rag_chain.invoke(QueryQ234)
                # Keep the text after the first keyword occurrence, or "" if it is absent.
                answer_parts_Q234 = answer_Q234.split(keyword, 1)
                ResultQ234 = answer_parts_Q234[1].strip() if len(answer_parts_Q234) > 1 else ""
                term_answers[term] = ResultQ234
                break  # Exit the loop if the query succeeds
            except Exception as e:
                # Every failure counts as one attempt; only the last one is reported.
                retry_count_Q234 += 1
                if retry_count_Q234 == 3:
                    # Name the failing step and term (this is Query 234, not Query 6).
                    print(f"Error occurred in Query 234 for term '{term}' after 3 attempts: {e}")
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# Step 5.3: Parthood (meronymy) -- ask the chain for part-of relationships between the
# entities of the Step 5.1 lexicon (ResultQ1).
# The prompt is identical for every attempt, so it is built once, before the retry loop.
QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """

# Up to three attempts.
# NOTE: when every attempt fails, ResultQ5 is not assigned by this step.
retry_count_Q5 = 0
while retry_count_Q5 < 3:
    try:
        answer_Q5 = rag_chain.invoke(QueryQ5)
        # Exclude the prompt part from the answer: keep the text after the first keyword
        # occurrence, or an empty string when the keyword is not found. Temporaries are
        # used so ResultQ5 never holds the intermediate list.
        answer_parts_Q5 = answer_Q5.split(keyword, 1)
        ResultQ5 = answer_parts_Q5[1].strip() if len(answer_parts_Q5) > 1 else ""
        print(f"ResultQ5:{ResultQ5}")
        break
    except Exception as e:
        # Every failure counts as one attempt; only the last one is reported.
        retry_count_Q5 += 1
        if retry_count_Q5 == 3:
            # Name the step that actually failed (Query 5, not Query 6).
            print(f"Error occurred in Query 5 after 3 attempts: {e}")

# Step 5.4: turn the parthood relationships of Step 5.3 (ResultQ5) into Turtle syntax.
# ResultQ6 starts as None so that the `ResultQ6 is not None` check performed after this
# loop can detect a failed step instead of raising NameError or reading a value left
# over from an earlier run of the notebook.
ResultQ6 = None
retry_count_Q6 = 0
while retry_count_Q6 < 3:
    try:
        # The prompt is built inside the try block on purpose: if ResultQ5 is missing,
        # the resulting NameError is handled like any other failed attempt.
        QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """
        answer_Q6 = rag_chain.invoke(QueryQ6)
        # Keep the text after the first keyword occurrence, or "" if it is absent.
        # Temporaries are used so ResultQ6 stays None unless the attempt fully succeeds.
        answer_parts_Q6 = answer_Q6.split(keyword, 1)
        ResultQ6 = answer_parts_Q6[1].strip() if len(answer_parts_Q6) > 1 else ""
        print(f"ResultQ6:{ResultQ6}")
        break
    except Exception as e:
        # Every failure counts as one attempt; only the last one is reported.
        retry_count_Q6 += 1
        if retry_count_Q6 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}, please pass the QueryQ6 to the ChatGPT for further prompting")
    
    
# Step 5.5: merge the per-term answers of Steps 2-4 (term_answers) into the Turtle
# produced by Step 5.4 (ResultQ6).
# ResultQ7 starts as None so later code (for example the result-file export) can tell
# that this step was skipped or failed instead of hitting an undefined or stale name.
ResultQ7 = None
# Run the step only when Step 5.4 left a result in ResultQ6.
if ResultQ6 is not None:
    retry_count_Q7 = 0
    while retry_count_Q7 < 3:
        try:
            QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""
            answer_Q7 = rag_chain.invoke(QueryQ7)
            # Keep the text after the first keyword occurrence, or "" if it is absent.
            # Temporaries are used so ResultQ7 stays None unless the attempt fully succeeds.
            answer_parts_Q7 = answer_Q7.split(keyword, 1)
            ResultQ7 = answer_parts_Q7[1].strip() if len(answer_parts_Q7) > 1 else ""
            print(f"ResultQ7:{ResultQ7}")
            break
        except Exception as e:
            # Every failure counts as one attempt; only the last one is reported.
            retry_count_Q7 += 1
            if retry_count_Q7 == 3:
                # The collected answers are interpolated as {term_answers}. Writing
                # `{term:answers}` applies the format spec "answers" to `term` and raises
                # ValueError from inside this handler. The step is not per-term, so the
                # leftover loop variable `term` is not mentioned either.
                print(f"Error occurred in Query 7 after 3 attempts: {e}，please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}")
else:
    print("Error occurred: ResultQ6 is undefinied")

# Log the per-term answers of Step 5.2 and the model objects used for this run.
# NOTE(review): the printed repr of `llm` / `embeddings` may contain configuration
# details -- check the output before sharing the notebook.
print(f"ResultQ234{term_answers}")
print(f"LLM:{llm} and embedding model:{embeddings}")

# Save the Turtle outputs of Steps 5.4 and 5.5 as text files. The encoding is fixed to
# UTF-8 so that non-ASCII characters in the model output are written the same way on
# every platform instead of depending on the locale's default encoding.
with open("zephyr-7b-beta_result_Q6.txt", "w", encoding="utf-8") as file:
    file.write(str(ResultQ6))

with open("zephyr-7b-beta_result_Q7.txt", "w", encoding="utf-8") as file:
    file.write(str(ResultQ7))

2024-03-31 20:01:15
ResultQ1:{'pipe': 'A long, narrow tube used to transport water through a water distribution network.', 'reservoir': 'A large storage facility used to store water for distribution to consumers.', 'pumping station': 'A facility used to increase the pressure of water in a water distribution network.', 'valve': 'A device used to control the flow of water in a water distribution network.', 'manhole': 'A structure used to provide access to the underground water distribution network for inspection and maintenance.', 'hydrant': 'A device used to provide a connection point for firefighting purposes in a water distribution network.', 'water treatment plant': 'A facility used to treat raw water and convert it into safe drinking water for distribution to consumers.'}
ResultQ5:1. Pipe is a part of the water distribution network.
2. Reservoir is a part of the water distribution network (as it stores water for distribution).
3. Pumping station is a part of the water distribution n

Test Results

In [113]:
# Print the current Amsterdam wall-clock time as a timestamp for this run.
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Take "now" as a timezone-aware UTC datetime. A naive datetime (as returned by utcnow())
# would be interpreted by astimezone() as system-local time, which shifts the printed
# value on every machine whose local zone is not UTC.
utc_now = datetime.datetime.now(datetime.timezone.utc)
# Convert the aware UTC time to Amsterdam time.
amsterdam_now = utc_now.astimezone(amsterdam_tz)
# Format the Amsterdam time as a string.
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

# HuggingFaceHub LLM for the zephyr-7b-beta repo with the generation settings of this run.
llm = HuggingFaceHub(
    model_kwargs=dict(temperature=0.5, max_length=8192, max_new_tokens=2048),
    repo_id="huggingfaceh4/zephyr-7b-beta",
)

# Retriever over the existing vector store: "mmr" search type ("similarity" is the
# alternative noted by the author), k=10 retrieved documents.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=10), search_type="mmr")

# RAG chain: the retriever fills "context", the raw input is passed through as "query",
# then prompt -> llm -> plain-string output.
rag_chain = {"context": retriever, "query": RunnablePassthrough()} | prompt | llm | StrOutputParser()
# Step 5.1 Domain terminology: ask the chain for the domain lexicon, formatted as a
# Python dictionary {Entity: Definition}.
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
raw_answer_Q1 = rag_chain.invoke(QueryQ1)

# Exclude the prompt part from the answer: keep the text after the first occurrence of
# the keyword, or an empty string when the keyword is not found.
keyword = "Assistant"
answer_parts_Q1 = raw_answer_Q1.split(keyword, 1)  # split only once
ResultQ1 = answer_parts_Q1[1].strip() if len(answer_parts_Q1) > 1 else ""

# Parse the answer text as a Python literal. The text is parsed directly: a
# json.dumps -> literal_eval round trip only reproduces the same string for ordinary
# text and turns characters outside the BMP into surrogate pairs.
try:
    ResultQ1 = ast.literal_eval(ResultQ1)
except (ValueError, SyntaxError, TypeError) as parse_error:
    # Empty or malformed answers land here. ResultQ1 keeps the unparsed text, so the
    # isinstance(ResultQ1, dict) check of Step 5.2 can report it instead of the cell
    # aborting at this line.
    print(f"ResultQ1 could not be parsed as a Python literal: {parse_error}")
print(f"ResultQ1:{ResultQ1}")
        
# Step 5.2: for every term of the Step 5.1 lexicon, retrieve Synonymy, Taxonomy and
# Predication (Steps 2-4) and store the answer text under that term.
# term_answers is created before the type check so that code reading it afterwards
# still finds a (possibly empty) dictionary when ResultQ1 is not a dict.
term_answers = {}
if isinstance(ResultQ1, dict):
    for term, definition in ResultQ1.items():
        # The prompt depends only on the term and its definition, so it is built once
        # per term rather than once per attempt.
        # NOTE(review): the four-quote sequences below end the f-string early; the
        # format example is a separate plain string literal, so the model receives it
        # with doubled braces and without quotes. Left unchanged to keep the prompt
        # text identical -- confirm whether single braces were intended.
        QueryQ234 = f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """
        # Up to three attempts per term; a term whose attempts all fail gets no entry.
        retry_count_Q234 = 0
        while retry_count_Q234 < 3:
            try:
                answer_Q234 = rag_chain.invoke(QueryQ234)
                # Keep the text after the first keyword occurrence, or "" if it is absent.
                answer_parts_Q234 = answer_Q234.split(keyword, 1)
                ResultQ234 = answer_parts_Q234[1].strip() if len(answer_parts_Q234) > 1 else ""
                term_answers[term] = ResultQ234
                break  # Exit the loop if the query succeeds
            except Exception as e:
                # Every failure counts as one attempt; only the last one is reported.
                retry_count_Q234 += 1
                if retry_count_Q234 == 3:
                    # Name the failing step and term (this is Query 234, not Query 6).
                    print(f"Error occurred in Query 234 for term '{term}' after 3 attempts: {e}")
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# Step 5.3: Parthood (meronymy) -- ask the chain for part-of relationships between the
# entities of the Step 5.1 lexicon (ResultQ1).
# The prompt is identical for every attempt, so it is built once, before the retry loop.
QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """

# Up to three attempts.
# NOTE: when every attempt fails, ResultQ5 is not assigned by this step.
retry_count_Q5 = 0
while retry_count_Q5 < 3:
    try:
        answer_Q5 = rag_chain.invoke(QueryQ5)
        # Exclude the prompt part from the answer: keep the text after the first keyword
        # occurrence, or an empty string when the keyword is not found. Temporaries are
        # used so ResultQ5 never holds the intermediate list.
        answer_parts_Q5 = answer_Q5.split(keyword, 1)
        ResultQ5 = answer_parts_Q5[1].strip() if len(answer_parts_Q5) > 1 else ""
        print(f"ResultQ5:{ResultQ5}")
        break
    except Exception as e:
        # Every failure counts as one attempt; only the last one is reported.
        retry_count_Q5 += 1
        if retry_count_Q5 == 3:
            # Name the step that actually failed (Query 5, not Query 6).
            print(f"Error occurred in Query 5 after 3 attempts: {e}")

# Step 5.4: turn the parthood relationships of Step 5.3 (ResultQ5) into Turtle syntax.
# ResultQ6 starts as None so that the `ResultQ6 is not None` check performed after this
# loop can detect a failed step instead of raising NameError or reading a value left
# over from an earlier run of the notebook.
ResultQ6 = None
retry_count_Q6 = 0
while retry_count_Q6 < 3:
    try:
        # The prompt is built inside the try block on purpose: if ResultQ5 is missing,
        # the resulting NameError is handled like any other failed attempt.
        QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """
        answer_Q6 = rag_chain.invoke(QueryQ6)
        # Keep the text after the first keyword occurrence, or "" if it is absent.
        # Temporaries are used so ResultQ6 stays None unless the attempt fully succeeds.
        answer_parts_Q6 = answer_Q6.split(keyword, 1)
        ResultQ6 = answer_parts_Q6[1].strip() if len(answer_parts_Q6) > 1 else ""
        print(f"ResultQ6:{ResultQ6}")
        break
    except Exception as e:
        # Every failure counts as one attempt; only the last one is reported.
        retry_count_Q6 += 1
        if retry_count_Q6 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}, please pass the QueryQ6 to the ChatGPT for further prompting")
    
    
# Step 5.5 only runs when Step 5.4 produced a result (ResultQ6 is not None).
if ResultQ6 is not None:
    retry_count_Q7 = 0
    while retry_count_Q7 < 3:
        # Step 5.5: Retrieve answers based on the Result 5.4
        try:
            QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""
            ResultQ7 = rag_chain.invoke(QueryQ7)
            # Keep only the text after the first `keyword` marker; empty string when the marker is absent.
            ResultQ7 = ResultQ7.split(keyword, 1)
            ResultQ7 = ResultQ7[1].strip() if len(ResultQ7) > 1 else ""
            print(f"ResultQ7:{ResultQ7}")
            break
        # Except of Step 5.5: Retrieve fails
        except Exception as e:
            retry_count_Q7 += 1
            if retry_count_Q7 == 3:
                # The old message used `{term:answers}`, which applies the format spec "answers"
                # to `term` and raises ValueError inside this handler; it also referenced the
                # leftover loop variable `term`, although Query 7 is not per-term.
                print(f"Error occurred in Query 7 after 3 attempts: {e}，please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}")
else:
    print("Error occurred: ResultQ6 is undefinied")

# Summary of the per-term answers and of the model configuration used for this run.
print(f"ResultQ234{term_answers}")
print(f"LLM:{llm} and embedding model:{embeddings}")

# Persist both Turtle drafts. Turtle documents are UTF-8 encoded, so the encoding is set
# explicitly instead of relying on the platform default, which can raise UnicodeEncodeError
# when the model output contains non-ASCII characters.
with open("zephyr-7b-beta_result_Q6.ttl", "w", encoding="utf-8") as file:
    file.write(str(ResultQ6))

with open("zephyr-7b-beta_result_Q7.ttl", "w", encoding="utf-8") as file:
    file.write(str(ResultQ7))

2024-03-31 20:54:48
ResultQ1:{'pipe': 'A long, narrow tube used to transport water in a water distribution network.', 'valve': 'A device used to regulate or stop the flow of water in a water distribution network.', 'tank': 'A large container used to store water in a water distribution network.', 'pump': 'A machine used to move water from one location to another in a water distribution network.', 'reservoir': 'A large storage facility used to store water in a water distribution network.', 'hydrant': 'A device used to provide a source of water for firefighting or other emergencies in a water distribution network.', 'manhole': 'A large, covered opening used to access and maintain the infrastructure of a water distribution network.', 'service line': 'The pipe that connects a building to the main water distribution network.', 'main': 'The large pipe that distributes water throughout a water distribution network.', 'junction': 'A point where two or more pipes join together in a water distrib

In [89]:
# Print the start time of this run in Amsterdam local time.
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Take a timezone-aware UTC timestamp. datetime.utcnow() returns a naive value, and
# astimezone() treats naive datetimes as the machine's local time, so the previous
# conversion was only correct on hosts whose clock is set to UTC.
utc_now = datetime.datetime.now(datetime.timezone.utc)
# Convert UTC time to Amsterdam time
amsterdam_now = utc_now.astimezone(amsterdam_tz)
# Format the Amsterdam time as a string
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

# Language model: zephyr-7b-beta accessed through the HuggingFaceHub wrapper.
llm = HuggingFaceHub(
    repo_id="huggingfaceh4/zephyr-7b-beta",
    model_kwargs=dict(temperature=0.5, max_length=8192, max_new_tokens=2048),
)

# Retriever over the existing vector store: search_type "mmr" (the author noted
# "similarity" as the alternative), k=10 retrieved documents.
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs=dict(k=10))

# Chain: the retriever fills "context", the raw query is passed through as "query",
# then prompt -> llm -> plain-string output.
rag_chain = {"context": retriever, "query": RunnablePassthrough()} | prompt | llm | StrOutputParser()
# Step 5.1 Domain terminology
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
ResultQ1 = rag_chain.invoke(QueryQ1)

# Excluding the prompt part from the answer. If the keyword is found, keep the text after the keyword, otherwise use an empty string.
keyword = "Assistant"
ResultQ1 = ResultQ1.split(keyword, 1)  # Split only once
ResultQ1 = ResultQ1[1].strip() if len(ResultQ1) > 1 else ""

# Parse the answer as a Python literal (expected: a dict). The former
# json.dumps -> ast.literal_eval round trip was dropped because it only rebuilt the same string.
# An empty or malformed answer makes literal_eval raise; in that case the raw string is kept
# so the isinstance(ResultQ1, dict) check that follows reports the problem instead of the
# cell aborting here.
try:
    ResultQ1 = ast.literal_eval(ResultQ1)
except (ValueError, TypeError, SyntaxError) as e:
    print(f"ResultQ1 could not be parsed as a Python literal: {e}")
print(f"ResultQ1:{ResultQ1}")
        
# Answers of Query234 per term. Created before the type check so the name exists
# (as an empty dict) even when Query 1 did not return a dictionary.
term_answers = {}
# Condition Check if the Result 5.1 is a dictionary
if isinstance(ResultQ1, dict):
    for term, definition in ResultQ1.items():
        # Step 5.2 Retrieve answers of Synonymy, Taxonomy, and Predication for each term.
        # The prompt does not change between retries, so it is built once per term.
        # NOTE(review): the four consecutive double quotes end the f-string early; the format
        # example is therefore a plain string literal and reaches the model with doubled braces
        # ({{...}}) and without surrounding quotes. Left unchanged to keep the prompt identical;
        # confirm whether single braces were intended.
        QueryQ234 = f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """
        retry_count_Q234 = 0
        while retry_count_Q234 < 3:
            try:
                ResultQ234 = rag_chain.invoke(QueryQ234)
                # Keep only the text after the first `keyword` marker; empty string when absent.
                ResultQ234 = ResultQ234.split(keyword, 1)
                ResultQ234 = ResultQ234[1].strip() if len(ResultQ234) > 1 else ""
                term_answers[term] = ResultQ234
                break  # Exit the loop if the query succeeds

            except Exception as e:  # Except of Step 5.2: Retrieve fails three times
                retry_count_Q234 += 1
                if retry_count_Q234 == 3:
                    # Message previously said "Query 6" (copy-paste slip); name the real step and term.
                    print(f"Error occurred in Query 234 for term '{term}' after 3 attempts: {e}")
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# Step 5.3: Retrieve answers based on the Result 5.1
retry_count_Q5 = 0
while retry_count_Q5 < 3:
    try:
        QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """
        ResultQ5 = rag_chain.invoke(QueryQ5)
        # Excluding the prompt part from the answer. If the keyword is found, keep the text after the keyword, otherwise use an empty string.
        ResultQ5 = ResultQ5.split(keyword, 1)
        ResultQ5 = ResultQ5[1].strip() if len(ResultQ5) > 1 else ""
        print(f"ResultQ5:{ResultQ5}")
        break
    # Except of Step 5.3: Retrieve fails three times
    except Exception as e:
        retry_count_Q5 += 1
        if retry_count_Q5 == 3:
            # Message previously said "Query 6" (copy-paste slip); this loop runs Query 5.
            print(f"Error occurred in Query 5 after 3 attempts: {e}")

# Step 5.4: Retrieve answers based on the Result 5.3
# Reset the result first: if all three attempts fail, ResultQ6 stays None instead of being
# undefined (NameError) or holding a stale value from an earlier run when it is checked
# with `ResultQ6 is not None` further down.
ResultQ6 = None
retry_count_Q6 = 0
while retry_count_Q6 < 3:
    try:
        QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """
        answer_q6 = rag_chain.invoke(QueryQ6)
        # Keep only the text after the first `keyword` marker; empty string when the marker is absent.
        answer_q6 = answer_q6.split(keyword, 1)
        ResultQ6 = answer_q6[1].strip() if len(answer_q6) > 1 else ""
        print(f"ResultQ6:{ResultQ6}")
        break
    # Except of Step 5.4: Retrieve fails
    except Exception as e:
        retry_count_Q6 += 1
        if retry_count_Q6 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}, please pass the QueryQ6 to the ChatGPT for further prompting")
    
    
# Step 5.5 only runs when Step 5.4 produced a result (ResultQ6 is not None).
if ResultQ6 is not None:
    retry_count_Q7 = 0
    while retry_count_Q7 < 3:
        # Step 5.5: Retrieve answers based on the Result 5.4
        try:
            QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""
            ResultQ7 = rag_chain.invoke(QueryQ7)
            # Keep only the text after the first `keyword` marker; empty string when the marker is absent.
            ResultQ7 = ResultQ7.split(keyword, 1)
            ResultQ7 = ResultQ7[1].strip() if len(ResultQ7) > 1 else ""
            print(f"ResultQ7:{ResultQ7}")
            break
        # Except of Step 5.5: Retrieve fails
        except Exception as e:
            retry_count_Q7 += 1
            if retry_count_Q7 == 3:
                # The old message used `{term:answers}`, which applies the format spec "answers"
                # to `term` and raises ValueError inside this handler; it also referenced the
                # leftover loop variable `term`, although Query 7 is not per-term.
                print(f"Error occurred in Query 7 after 3 attempts: {e}，please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}")
else:
    print("Error occurred: ResultQ6 is undefinied")

# Summary of the per-term answers and of the model configuration used for this run.
print(f"ResultQ234{term_answers}")
print(f"LLM:{llm} and embedding model:{embeddings}")

2024-03-31 19:02:00
ResultQ1:{'pipe': 'A long, narrow tube used to transport water through a water distribution network.', 'reservoir': 'A large storage facility used to store water for distribution to consumers.', 'pumping station': 'A facility used to increase the pressure of water in a water distribution network.', 'valve': 'A device used to control the flow of water in a water distribution network.', 'manhole': 'A structure used to provide access to the underground water distribution network for inspection and maintenance.', 'hydrant': 'A device used to provide a connection point for firefighting purposes in a water distribution network.', 'water treatment plant': 'A facility used to treat raw water and convert it into safe drinking water for distribution to consumers.'}
ResultQ5:1. Pipe is a part of the water distribution network.
2. Reservoir is a part of the water distribution network (as it stores water for distribution).
3. Pumping station is a part of the water distribution n

In [72]:
# Print the start time of this run in Amsterdam local time.
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Take a timezone-aware UTC timestamp. datetime.utcnow() returns a naive value, and
# astimezone() treats naive datetimes as the machine's local time, so the previous
# conversion was only correct on hosts whose clock is set to UTC.
utc_now = datetime.datetime.now(datetime.timezone.utc)
# Convert UTC time to Amsterdam time
amsterdam_now = utc_now.astimezone(amsterdam_tz)
# Format the Amsterdam time as a string
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

# Step 5.1 Domain terminology
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
ResultQ1 = rag_chain.invoke(QueryQ1)

# Excluding the prompt part from the answer. If the keyword is found, keep the text after the keyword, otherwise use an empty string.
keyword = "Assistant"
ResultQ1 = ResultQ1.split(keyword, 1)  # Split only once
ResultQ1 = ResultQ1[1].strip() if len(ResultQ1) > 1 else ""

# Parse the answer as a Python literal (expected: a dict). The former
# json.dumps -> ast.literal_eval round trip was dropped because it only rebuilt the same string.
# An empty or malformed answer makes literal_eval raise; in that case the raw string is kept
# so the isinstance(ResultQ1, dict) check that follows reports the problem instead of the
# cell aborting here.
try:
    ResultQ1 = ast.literal_eval(ResultQ1)
except (ValueError, TypeError, SyntaxError) as e:
    print(f"ResultQ1 could not be parsed as a Python literal: {e}")
print(f"ResultQ1:{ResultQ1}")
        
# Answers of Query234 per term. Created before the type check so the name exists
# (as an empty dict) even when Query 1 did not return a dictionary.
term_answers = {}
# Condition Check if the Result 5.1 is a dictionary
if isinstance(ResultQ1, dict):
    for term, definition in ResultQ1.items():
        # Step 5.2 Retrieve answers of Synonymy, Taxonomy, and Predication for each term.
        # The prompt does not change between retries, so it is built once per term.
        # NOTE(review): the four consecutive double quotes end the f-string early; the format
        # example is therefore a plain string literal and reaches the model with doubled braces
        # ({{...}}) and without surrounding quotes. Left unchanged to keep the prompt identical;
        # confirm whether single braces were intended.
        QueryQ234 = f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """
        retry_count_Q234 = 0
        while retry_count_Q234 < 3:
            try:
                ResultQ234 = rag_chain.invoke(QueryQ234)
                # Keep only the text after the first `keyword` marker; empty string when absent.
                ResultQ234 = ResultQ234.split(keyword, 1)
                ResultQ234 = ResultQ234[1].strip() if len(ResultQ234) > 1 else ""
                term_answers[term] = ResultQ234
                break  # Exit the loop if the query succeeds

            except Exception as e:  # Except of Step 5.2: Retrieve fails three times
                retry_count_Q234 += 1
                if retry_count_Q234 == 3:
                    # Message previously said "Query 6" (copy-paste slip); name the real step and term.
                    print(f"Error occurred in Query 234 for term '{term}' after 3 attempts: {e}")
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# Step 5.3: Retrieve answers based on the Result 5.1
retry_count_Q5 = 0
while retry_count_Q5 < 3:
    try:
        QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """
        ResultQ5 = rag_chain.invoke(QueryQ5)
        # Excluding the prompt part from the answer. If the keyword is found, keep the text after the keyword, otherwise use an empty string.
        ResultQ5 = ResultQ5.split(keyword, 1)
        ResultQ5 = ResultQ5[1].strip() if len(ResultQ5) > 1 else ""
        print(f"ResultQ5:{ResultQ5}")
        break
    # Except of Step 5.3: Retrieve fails three times
    except Exception as e:
        retry_count_Q5 += 1
        if retry_count_Q5 == 3:
            # Message previously said "Query 6" (copy-paste slip); this loop runs Query 5.
            print(f"Error occurred in Query 5 after 3 attempts: {e}")

# Step 5.4: Retrieve answers based on the Result 5.3
# Reset the result first: if all three attempts fail, ResultQ6 stays None instead of being
# undefined (NameError) or holding a stale value from an earlier run when it is checked
# with `ResultQ6 is not None` further down.
ResultQ6 = None
retry_count_Q6 = 0
while retry_count_Q6 < 3:
    try:
        QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """
        answer_q6 = rag_chain.invoke(QueryQ6)
        # Keep only the text after the first `keyword` marker; empty string when the marker is absent.
        answer_q6 = answer_q6.split(keyword, 1)
        ResultQ6 = answer_q6[1].strip() if len(answer_q6) > 1 else ""
        print(f"ResultQ6:{ResultQ6}")
        break
    # Except of Step 5.4: Retrieve fails
    except Exception as e:
        retry_count_Q6 += 1
        if retry_count_Q6 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}, please pass the QueryQ6 to the ChatGPT for further prompting")
    
    
# Step 5.5 only runs when Step 5.4 produced a result (ResultQ6 is not None).
if ResultQ6 is not None:
    retry_count_Q7 = 0
    while retry_count_Q7 < 3:
        # Step 5.5: Retrieve answers based on the Result 5.4
        try:
            QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""
            ResultQ7 = rag_chain.invoke(QueryQ7)
            # Keep only the text after the first `keyword` marker; empty string when the marker is absent.
            ResultQ7 = ResultQ7.split(keyword, 1)
            ResultQ7 = ResultQ7[1].strip() if len(ResultQ7) > 1 else ""
            print(f"ResultQ7:{ResultQ7}")
            break
        # Except of Step 5.5: Retrieve fails
        except Exception as e:
            retry_count_Q7 += 1
            if retry_count_Q7 == 3:
                # The old message used `{term:answers}`, which applies the format spec "answers"
                # to `term` and raises ValueError inside this handler; it also referenced the
                # leftover loop variable `term`, although Query 7 is not per-term.
                print(f"Error occurred in Query 7 after 3 attempts: {e}，please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}")
else:
    print("Error occurred: ResultQ6 is undefinied")

# Summary of the per-term answers collected in Step 5.2.
print(f"ResultQ234{term_answers}")

2024-03-30 23:56:03
ResultQ1:{'Pipe': 'A long, narrow tube used to transport water through a water distribution network.', 'Valve': 'A device used to control the flow of water in a water distribution network.', 'Reservoir': 'A large container used to store water in a water distribution network.', 'Pump': 'A mechanical device used to move water through a water distribution network.', 'Junction': 'A point in a water distribution network where two or more pipes join together.', 'Manhole': 'A covered opening in a water distribution network providing access to the pipes and other infrastructure.', 'Hydrant': 'A device mounted on a pipe used to provide a water supply for firefighting purposes.', 'Service Connection': 'The point at which a water distribution network connects to a building or property for water supply.', 'Meter': 'A device used to measure the amount of water consumed by a customer in a water distribution network.', 'Filter': 'A device used to remove impurities from water in a 

In [78]:
# Print the start time of this run in Amsterdam local time.
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Take a timezone-aware UTC timestamp. datetime.utcnow() returns a naive value, and
# astimezone() treats naive datetimes as the machine's local time, so the previous
# conversion was only correct on hosts whose clock is set to UTC.
utc_now = datetime.datetime.now(datetime.timezone.utc)
# Convert UTC time to Amsterdam time
amsterdam_now = utc_now.astimezone(amsterdam_tz)
# Format the Amsterdam time as a string
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

# Language model: zephyr-7b-beta accessed through the HuggingFaceHub wrapper
# (this cell uses max_length=100000).
llm = HuggingFaceHub(
    repo_id="huggingfaceh4/zephyr-7b-beta",
    model_kwargs=dict(temperature=0.5, max_length=100000, max_new_tokens=2048),
)

# Retriever over the existing vector store: search_type "mmr" (the author noted
# "similarity" as the alternative), k=10 retrieved documents.
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs=dict(k=10))

# Chain: the retriever fills "context", the raw query is passed through as "query",
# then prompt -> llm -> plain-string output.
rag_chain = {"context": retriever, "query": RunnablePassthrough()} | prompt | llm | StrOutputParser()
# Step 5.1 Domain terminology
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
ResultQ1 = rag_chain.invoke(QueryQ1)

# Excluding the prompt part from the answer. If the keyword is found, keep the text after the keyword, otherwise use an empty string.
keyword = "Assistant"
ResultQ1 = ResultQ1.split(keyword, 1)  # Split only once
ResultQ1 = ResultQ1[1].strip() if len(ResultQ1) > 1 else ""

# Parse the answer as a Python literal (expected: a dict). The former
# json.dumps -> ast.literal_eval round trip was dropped because it only rebuilt the same string.
# An empty or malformed answer makes literal_eval raise; in that case the raw string is kept
# so the isinstance(ResultQ1, dict) check that follows reports the problem instead of the
# cell aborting here.
try:
    ResultQ1 = ast.literal_eval(ResultQ1)
except (ValueError, TypeError, SyntaxError) as e:
    print(f"ResultQ1 could not be parsed as a Python literal: {e}")
print(f"ResultQ1:{ResultQ1}")
        
# Answers of Query234 per term. Created before the type check so the name exists
# (as an empty dict) even when Query 1 did not return a dictionary.
term_answers = {}
# Condition Check if the Result 5.1 is a dictionary
if isinstance(ResultQ1, dict):
    for term, definition in ResultQ1.items():
        # Step 5.2 Retrieve answers of Synonymy, Taxonomy, and Predication for each term.
        # The prompt does not change between retries, so it is built once per term.
        # NOTE(review): the four consecutive double quotes end the f-string early; the format
        # example is therefore a plain string literal and reaches the model with doubled braces
        # ({{...}}) and without surrounding quotes. Left unchanged to keep the prompt identical;
        # confirm whether single braces were intended.
        QueryQ234 = f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """
        retry_count_Q234 = 0
        while retry_count_Q234 < 3:
            try:
                ResultQ234 = rag_chain.invoke(QueryQ234)
                # Keep only the text after the first `keyword` marker; empty string when absent.
                ResultQ234 = ResultQ234.split(keyword, 1)
                ResultQ234 = ResultQ234[1].strip() if len(ResultQ234) > 1 else ""
                term_answers[term] = ResultQ234
                break  # Exit the loop if the query succeeds

            except Exception as e:  # Except of Step 5.2: Retrieve fails three times
                retry_count_Q234 += 1
                if retry_count_Q234 == 3:
                    # Message previously said "Query 6" (copy-paste slip); name the real step and term.
                    print(f"Error occurred in Query 234 for term '{term}' after 3 attempts: {e}")
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# Step 5.3: Retrieve the parthood (meronymy) relations for the entities from Result 5.1.
# The query is attempted up to three times; any exception counts as a failed attempt.
retry_count_Q5 = 0
while retry_count_Q5 < 3:
    try:
        QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """
        ResultQ5 = rag_chain.invoke(QueryQ5)
        # Excluding the prompt part from the answer. If the keyword is found, keep the
        # text after its first occurrence, otherwise fall back to an empty string.
        ResultQ5 = ResultQ5.split(keyword, 1)
        ResultQ5 = ResultQ5[1].strip() if len(ResultQ5) > 1 else ""
        print(f"ResultQ5:{ResultQ5}")
        break
    except Exception as e:
        # Except of Step 5.3: report only once the third attempt has failed.
        retry_count_Q5 += 1
        if retry_count_Q5 == 3:
            # The label used to say "Query 6", which pointed at the wrong step.
            print(f"Error occurred in Query 5 after 3 attempts: {e}")

# Step 5.4: Retrieve the Turtle syntax based on the Result 5.3.
# ResultQ6 starts as None so that the `ResultQ6 is not None` check following this
# loop sees a defined value (instead of a NameError or a stale value from an
# earlier run of the kernel) when all three attempts fail.
ResultQ6 = None
retry_count_Q6 = 0
while retry_count_Q6 < 3:
    try:
        QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """
        ResultQ6 = rag_chain.invoke(QueryQ6)
        # Keep only the text after the first occurrence of the keyword ("" if absent).
        ResultQ6 = ResultQ6.split(keyword, 1)
        ResultQ6 = ResultQ6[1].strip() if len(ResultQ6) > 1 else ""
        print(f"ResultQ6:{ResultQ6}")
        break
    except Exception as e:
        # Except of Step 5.4: discard any partial value from the failed attempt.
        ResultQ6 = None
        retry_count_Q6 += 1
        if retry_count_Q6 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}, please pass the QueryQ6 to the ChatGPT for further prompting")
    
    
# Condition check: only run Step 5.5 when Step 5.4 produced a result (ResultQ6).
# ResultQ7 starts as None so it is defined even when every attempt fails.
ResultQ7 = None
if ResultQ6 is not None:
    retry_count_Q7 = 0
    while retry_count_Q7 < 3:
        # Step 5.5: Retrieve answers based on the Result 5.4
        try:
            QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""
            ResultQ7 = rag_chain.invoke(QueryQ7)
            # Keep only the text after the first occurrence of the keyword ("" if absent).
            ResultQ7 = ResultQ7.split(keyword, 1)
            ResultQ7 = ResultQ7[1].strip() if len(ResultQ7) > 1 else ""
            print(f"ResultQ7:{ResultQ7}")
            break
        except Exception as e:
            # Except of Step 5.5: discard any partial value from the failed attempt.
            ResultQ7 = None
            retry_count_Q7 += 1
            if retry_count_Q7 == 3:
                # The old message used `{term:answers}`, which applies the format
                # spec "answers" to a str and raises ValueError inside this handler;
                # the intended value is the whole `term_answers` dictionary. Query 7
                # is not per-term, so the leftover loop variable `term` is dropped.
                print(f"Error occurred in Query 7 after 3 attempts: {e}，please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}")
else:
    print("Error occurred: ResultQ6 is undefinied")

print(f"ResultQ234{term_answers}")

2024-03-31 07:41:30
ResultQ1:{'pipe': 'A tube or cylindrical structure used to transport water in a water distribution network.', 'valve': 'A device used to regulate or stop the flow of water in a water distribution network.', 'reservoir': 'A large storage tank used to store water in a water distribution network.', 'pump': 'A machine used to move water from one location to another in a water distribution network.', 'hydrant': 'A device used to provide an emergency water supply in a water distribution network.', 'manhole': 'A structure used to provide access to the underground pipes and equipment in a water distribution network.', 'tank': 'A container used to store water in a water distribution network.', 'service connection': 'The point at which a water distribution network connects to a building or property.', 'pipe joint': 'The point at which two pipes are joined together in a water distribution network.', 'pipe fitting': 'A component used to connect two pipes or to change the direct

## Final Pipeline: GPT-3.5-turbo-0125 with OpenAI Embeddings

In [103]:
# Print the current wall-clock time in Amsterdam as a run timestamp.
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
# Take a timezone-aware UTC timestamp. A naive value from utcnow() would be
# interpreted by astimezone() as *system-local* time, which yields a wrong
# Amsterdam time on any machine whose clock is not set to UTC.
utc_now = datetime.datetime.now(datetime.timezone.utc)
# Convert UTC time to Amsterdam time
amsterdam_now = utc_now.astimezone(amsterdam_tz)
# Format the Amsterdam time as a string
amsterdam_now_str = amsterdam_now.strftime("%Y-%m-%d %H:%M:%S")
print(amsterdam_now_str)

# Chat model used for every query in this pipeline.
llm = ChatOpenAI(temperature=0.5, model_name="gpt-3.5-turbo-0125")

# Retriever over the existing vector store: MMR search, 10 retrieved documents.
retriever = vectorstore.as_retriever(search_kwargs={'k': 10}, search_type="mmr")

# RAG chain: the retriever fills "context", the raw input is passed through as
# "query", then prompt -> LLM -> plain-string output.
rag_chain = {"context": retriever, "query": RunnablePassthrough()} | prompt | llm | StrOutputParser()
# Step 5.1 Domain terminology: ask for the domain lexicon as {Entity: Definition}.
QueryQ1 = "We are creating an ontology in the water distribution network domain. The first step is Domain terminology. The domain lexicon listing the domain terms that characterize the water distribution network. The outcome of this step is a domain lexicon, or information structure used to answer questions like - What are the nouns typically used while building the water distribution network? The answers should only include physical entities, exclude the water quality or system characteristics. Please return the complete answer in the format of the python Dictionary {Entity: Definition}."
ResultQ1 = rag_chain.invoke(QueryQ1)

# The prompt asks for a *Python* dictionary, so the reply is not guaranteed to be
# strict JSON: it may be wrapped in a Markdown code fence or use single quotes.
# Unwrap a fenced block if one is present, then try JSON first and fall back to a
# safe Python-literal parse.
ResultQ1 = ResultQ1.strip()
fenced_answer = re.search(r"```[A-Za-z0-9_+-]*\s*(.*?)```", ResultQ1, re.DOTALL)
if fenced_answer:
    ResultQ1 = fenced_answer.group(1).strip()
try:
    ResultQ1 = json.loads(ResultQ1)
except json.JSONDecodeError:
    try:
        # literal_eval only evaluates literals, so it is safe on model output.
        ResultQ1 = ast.literal_eval(ResultQ1)
    except (ValueError, SyntaxError, TypeError):
        # Leave the raw text in place; the isinstance(ResultQ1, dict) check that
        # follows reports the problem instead of the cell crashing here.
        pass
print(f"ResultQ1:{ResultQ1}")
        
# Step 5.2: for every term from Result 5.1, retrieve Synonyms, Taxonomy and Predication.
# The answers dictionary is created before the type check so that later code can
# still read `term_answers` when Query1 did not return a dictionary.
term_answers = {}
# Condition Check if the Result 5.1 is a dictionary
if isinstance(ResultQ1, dict):
    for term, definition in ResultQ1.items():
        # The prompt depends only on the term, so it is built once per term rather
        # than once per retry.
        # NOTE: the `""""` sequences below end the f-string early. Python then
        # concatenates three adjacent literals (f-string, plain string, plain
        # triple-quoted string), so the format example reaches the model with
        # doubled braces and without surrounding quotes. This is kept as-is so the
        # prompt text stays unchanged.
        QueryQ234 = f"""
        We are creating the ontology in the drinking water distribution network domain. Please return the answer of step 2, 3, 4 for the input entity: {term} : {definition} based on step descriptions. There should not be any repeated answer between the step 2, 3, 4. A term can be either Synonyms, taxonomy, or predication. You don't need to explain the answer or specify the input entity in the answer, just return the answers in the following python dictionary format without adding extra dictionary or layer: """"{{Synonyms:[];Taxonomy:[];Predication:[]}}"""".
        Step 2. Domain glossary for input entity. The terms of the lexicon associated with a textual description, indicating also possible synonyms; Having produced a first lexicon, users could, in this step, enrich it by associating a textual description with each entry. Users can enrich the lexicon by associating a textual description with each entry. In identifying synonyms it is necessary to pinpoint the "preferred term" and label the others as synonyms. You can provide 0 to 2 synonyms for the input entity in the form of a dictionary if available. 
        Step 3. Taxonomy for input entity. Domain terms organized in a generalization/specialization (ISA) hierarchy; The first is a taxonomy based on the specialization relation, or the ISA relationship connecting a more specific concept to a more general one (such as invoice ISA business document).  Users must not only identify ISA relations between existing terms but also introduce more abstract terms or generic concepts seldom used in everyday life but that are extremely useful in organizing knowledge. During this step users thus provide feedback to the two previous knowledge levels—lexicon and glossary—since taxonomy building is also an opportunity to validate the two previous levels and extend them with new terms. Users must find a good balance between the breadth of the taxonomy, or average number of children of intermediate nodes, and its depth, or levels of specialization and the granularity of taxonomy leaves. Please provide the taxonomy for the input entity.
        Step 4. Predication for input entity. Terms representing properties from the glossary identified and connected to the entities they characterize; This step is similar to a database design activity, as it concentrates on the properties that, in the domain at hand, characterize the relevant entities. Users generally identify atomic properties (AP) and complex properties (CP). The former can be seen as printable data fields (such as unit price), and the latter exhibit an internal structure and have components (such as address composed of, say, street, city, postal code, and state). Finally, if a property refers to other entities (such as a customer referred to in an invoice) it is called a reference property (RP). In a relational database, an RP is represented by a foreign key. The resulting predicate hierarchy is organized with the entity at the top, then a property hierarchy below it, where nodes are tagged with CP, AP, and RP. Please only return CP,AP,RP for the input entity.
        """
        retry_count_Q234 = 0
        while retry_count_Q234 < 3:
            try:
                ResultQ234 = rag_chain.invoke(QueryQ234)
                term_answers[term] = ResultQ234
                break  # Exit the loop if the query succeeds
            except Exception as e:
                # Except of Step 5.2: report only once the third attempt has failed.
                retry_count_Q234 += 1
                if retry_count_Q234 == 3:
                    # The label used to say "Query 6"; name the real step and term.
                    print(f"Error occurred in Query 234 for term '{term}' after 3 attempts: {e}")
else:
    print(f"Query1 did not return a dictionary.ResultQ1={type(ResultQ1)}")

# Step 5.3: Retrieve the parthood (meronymy) relations for the entities from Result 5.1.
# The query is attempted up to three times; any exception counts as a failed attempt.
retry_count_Q5 = 0
while retry_count_Q5 < 3:
    try:
        QueryQ5 = f"""We are creating a drinking water distribution network ontology. Please only map the definite relationships for the input entities based on step descriptions. There should be no conflicting relationships, such as A being a part of B while B is also a part of A. Input entities:{ResultQ1}.
        Descriptions: Step 5. Parthood (meronymy) -- Complex entity names connected to their components, with all names needing to be present in the glossary; This step concentrates on the 'architectural' structure of business entities, or parts of composite entities, whether objects, processes, or actors, by eliciting their decomposition hierarchy (or part-whole hierarchy). To this end, a user would analyze the structure and components an entity exhibits, creating the hierarchy based on the partOf (inverse hasPart) relationship. Parthood can also be applied to immaterial entities (such as a regulation subdivided into sections and articles or a process subdivided into sub-processes and activities). 
        In sub-processes and activities, users can enrich the model with other relations (such as 'precedence' and 'sync'), a subject beyond the scope of this article. """
        # The chain's answer is used as returned; no prompt-stripping is applied here.
        ResultQ5 = rag_chain.invoke(QueryQ5)
        print(f"ResultQ5:{ResultQ5}")
        break
    except Exception as e:
        # Except of Step 5.3: report only once the third attempt has failed.
        retry_count_Q5 += 1
        if retry_count_Q5 == 3:
            # The label used to say "Query 6", which pointed at the wrong step.
            print(f"Error occurred in Query 5 after 3 attempts: {e}")

# Step 5.4: Retrieve the Turtle syntax based on the Result 5.3.
# ResultQ6 starts as None so that the `ResultQ6 is not None` check following this
# loop sees a defined value (instead of a NameError or a stale value from an
# earlier run of the kernel) when all three attempts fail.
ResultQ6 = None
retry_count_Q6 = 0
while retry_count_Q6 < 3:
    try:
        QueryQ6 = f""" We are creating the ontology in the drinking water distribution network domain. Please return the complete the turtle syntax of the ontology based on the input. Input entities:{ResultQ5}. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
        <http://example.org/person/Mark_Twain>
        <http://example.org/relation/author>
        <http://example.org/books/Huckleberry_Finn> . """
        ResultQ6 = rag_chain.invoke(QueryQ6)
        print(f"ResultQ6:{ResultQ6}")
        break
    except Exception as e:
        # Except of Step 5.4: make sure no partial value survives a failed attempt.
        ResultQ6 = None
        retry_count_Q6 += 1
        if retry_count_Q6 == 3:
            print(f"Error occurred in Query 6 after 3 attempts: {e}, please pass the QueryQ6 to the ChatGPT for further prompting")
    
    
# Condition check: only run Step 5.5 when Step 5.4 produced a result (ResultQ6).
# ResultQ7 starts as None so it is defined even when every attempt fails.
ResultQ7 = None
if ResultQ6 is not None:
    retry_count_Q7 = 0
    while retry_count_Q7 < 3:
        # Step 5.5: Retrieve answers based on the Result 5.4
        try:
            QueryQ7 = f"""We are at the final stage of creating the turtle syntax for the drinking water distribution network. Please use all the input entities and their synonyms, Taxonomy and Predication:{term_answers}, this dictionary contains the concepts in Step 2 and the semantic relations elicited in three steps: a taxonomy (Step 3), further enriched by the predication (Step 4), and parthood (Step 5)); encode all the input into the turtle syntax and then incorporate all of them into the existing turtle syntax:{ResultQ6}. You don't need to give the explaination, return the complete final turtle syntax instead of examples. Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. In computing, Terse RDF Triple Language (Turtle) is a syntax and file format for expressing data in the Resource Description Framework (RDF) data model. RDF represents information using semantic triples, which comprise a subject, predicate, and object. Each item in the triple is expressed as a Web URI. Turtle provides a way to group three URIs to make a triple, and provides ways to abbreviate such information, for example by factoring out common portions of URIs. For example, information about Huckleberry Finn could be expressed as:
            <http://example.org/person/Mark_Twain>
            <http://example.org/relation/author>
            <http://example.org/books/Huckleberry_Finn>."""
            ResultQ7 = rag_chain.invoke(QueryQ7)
            print(f"ResultQ7:{ResultQ7}")
            break
        except Exception as e:
            # Except of Step 5.5: make sure no partial value survives a failed attempt.
            ResultQ7 = None
            retry_count_Q7 += 1
            if retry_count_Q7 == 3:
                # The old message used `{term:answers}`, which applies the format
                # spec "answers" to a str and raises ValueError inside this handler;
                # the intended value is the whole `term_answers` dictionary. Query 7
                # is not per-term, so the leftover loop variable `term` is dropped.
                print(f"Error occurred in Query 7 after 3 attempts: {e}，please pass the following information to the ChatGPT for further prompting. Term_answers:{term_answers}")
else:
    print("Error occurred: ResultQ6 is undefinied")

# Report the per-term answers and the model configuration used for this run.
print(f"ResultQ234{term_answers}")
print(f"LLM:{llm} and embedding model:{embeddings}")

# Save the Turtle output of Step 5.4 and Step 5.5. The encoding is set explicitly
# because open() otherwise uses the platform's locale encoding, which can fail on
# (or mis-encode) non-ASCII characters in the model output.
with open("gpt-3.5-turbo-0125_result_Q6.ttl", "w", encoding="utf-8") as file:
    file.write(str(ResultQ6))

with open("gpt-3.5-turbo-0125_result_Q7.ttl", "w", encoding="utf-8") as file:
    file.write(str(ResultQ7))

2024-03-31 20:17:09
ResultQ1:{'Pipes': 'Tubes or conduits used to transport water within the distribution network', 'Valves': 'Devices used to control the flow of water within the distribution network', 'Pumps': 'Mechanical devices used to increase the pressure and flow of water within the distribution network', 'Tanks': 'Storage containers used to hold water within the distribution network', 'Reservoirs': 'Large bodies of water used to store and supply water to the distribution network', 'Meters': 'Devices used to measure the amount of water flowing through the distribution network', 'Hydrants': 'Outlets installed in the distribution network for accessing water in case of emergencies or maintenance', 'Manholes': 'Access points in the distribution network for maintenance and inspection purposes', 'Junctions': 'Points where different pipes in the distribution network meet or split', 'Crossings': 'Points where pipes in the distribution network intersect or pass over/under each other'}
Re