In [42]:
import datetime
import requests
import json

from pprint import pprint
import logging
from openai import OpenAI
import os
import sys
import time
from pathlib import Path
import hashlib

## Configuration

In [43]:
# Load the OpenAI API key. The key file is preferred (it is kept outside the
# notebook for security reasons); if it does not exist, fall back to the
# OPENAI_API_KEY environment variable. An empty result is rejected later by the
# assertion in the generation cell.
OPENAI_API_KEY_FILE = Path('./OPENAI_API_KEY')
if OPENAI_API_KEY_FILE.is_file():
    # strip(): a trailing newline in the file would otherwise become part of the key
    OPENAI_API_KEY = OPENAI_API_KEY_FILE.read_text().strip()
else:
    OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "").strip()

# Human-readable language names inserted into the prompt, keyed by language code.
LANGUAGE_NAMES = {"en": "English", "de": "German"}

# Language code of the questions to process; it selects the matching question
# text of each dataset item and is used as a key in the cached result.
language = "en"
#language = "de"
# An unsupported code raises KeyError here instead of silently falling back to German.
language_string = LANGUAGE_NAMES[language]

# Timestamp of this run, stored with every generated result.
current_datetime = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

in_filename = "https://raw.githubusercontent.com/KGQA/QALD_9_plus/main/data/qald_9_plus_train_dbpedia.json"

cache_dir = "stored_data"

# prefix of in_filename (file name without extension) used for naming the cached files
prefix = in_filename.split("/")[-1].split(".")[0]

# cf. https://platform.openai.com/docs/models/continuous-model-upgrades
# other candidates: "gpt-3.5-turbo-16k"
OPENAI_MODELS = ["gpt-4", "gpt-3.5-turbo"]
temperature = 0.0

# NOTE: the SHA-1 of this template is part of every cache filename, so any
# change (including the trailing spaces on the first two lines) invalidates
# previously cached results.
prompt_template = """Assume you have to rephrase the following SPARQL query as a natural-language question for non-technical humans. 
Use the {language_string} language. SPARQL query: 
{sparql}
"""

logger = logging.getLogger(__name__)
# Re-running this cell must not stack handlers: remove the ones left over from
# earlier executions before adding a fresh one.
for handler in logger.handlers[:]:
    print('removing handler %s'%handler)
    logger.removeHandler(handler)
logger.addHandler(logging.StreamHandler(stream=sys.stdout))
logger.setLevel(logging.INFO)
# Do not hand records to the root logger as well; otherwise every message is
# printed a second time once the root logger has a handler of its own.
logger.propagate = False


removing handler <StreamHandler stdout (NOTSET)>


## Methods

In [44]:
def get_filename_for_cache(prefix, question_id, prompt_hash, model, temperature, directory=None):
    """Build the path of the JSON cache file for one question/prompt/model setup.

    Args:
        prefix: dataset-specific prefix of the file name.
        question_id: id of the question; left-padded with zeros to 6 characters
            (longer ids are kept unchanged).
        prompt_hash: hash identifying the prompt template.
        model: name of the model that generated the answer.
        temperature: sampling temperature used for the request.
        directory: directory containing the cache files; defaults to the
            module-level ``cache_dir`` when omitted.

    Returns:
        The cache file path as a string, using "/" as separator.
    """
    if directory is None:
        directory = cache_dir
    padded_id = str(question_id).zfill(6)
    return "{}/{}_{}_{}_model-{}_temperature-{}.json".format(
        directory, prefix, prompt_hash, padded_id, model, temperature
    )

In [45]:
def get_cached_data_if_exists(filename):
    """Load previously cached JSON data.

    Args:
        filename: path of the cache file.

    Returns:
        The parsed JSON content, or None if the file does not exist or cannot
        be read/parsed. An unreadable or corrupt file is reported with a
        warning so that it does not go unnoticed.
    """
    try:
        with open(filename, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # the normal "not cached yet" case
        return None
    except (OSError, ValueError) as error:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
        logging.getLogger(__name__).warning(
            "Ignoring unreadable cache file {}: {}".format(filename, error)
        )
        return None

In [46]:
def store_data_in_cache(filename, data):
    """Write data as JSON (4 spaces indentation) to the given cache file.

    Missing parent directories are created, so the first run does not fail
    when the cache directory does not exist yet.

    Args:
        filename: path of the cache file to (over)write.
        data: JSON-serializable object to store.
    """
    target = Path(filename)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
    # Report success only after the file has been closed; use the module
    # logger instead of the root logger, whose default level hides INFO.
    logging.getLogger(__name__).info("Stored data in cache: {}".format(filename))

## Fetch Data 

In [47]:
# Download the dataset. The timeout prevents the cell from hanging forever, and
# raise_for_status() fails loudly on an HTTP error instead of handing an error
# page to the JSON parser.
response = requests.get(in_filename, timeout=30)
response.raise_for_status()
json_data = response.json()
logger.info("Loaded {}: {} questions".format(in_filename, len(json_data["questions"])))

Loaded https://raw.githubusercontent.com/KGQA/QALD_9_plus/main/data/qald_9_plus_train_dbpedia.json: 408 questions


INFO:__main__:Loaded https://raw.githubusercontent.com/KGQA/QALD_9_plus/main/data/qald_9_plus_train_dbpedia.json: 408 questions


In [49]:
# Without a key every request would fail, so stop before doing any work.
assert OPENAI_API_KEY, "OPENAI_API_KEY is empty; please provide a valid OpenAI API key"

# The hash of the prompt template is part of every cache filename, so results
# generated with a different prompt are never reused.
prompt_hash = hashlib.sha1(prompt_template.encode('utf-8')).hexdigest()
print(prompt_hash)

# One client serves all requests; it does not need to be rebuilt per question.
client = OpenAI(api_key=OPENAI_API_KEY)

for openai_model in OPENAI_MODELS:
    # only the first 100 questions of the dataset are processed
    for item in json_data["questions"][:100]:
        question_id = item.get("id")
        sparql = (item.get("query") or {}).get("sparql")
        if not sparql:
            # nothing to rephrase; do not send a prompt containing "None"
            logger.warning("skipping question {}: no SPARQL query available".format(question_id))
            continue

        for question in item["question"]:
            if question["language"] != language:
                continue

            question_string = question.get("string")
            print(question_id, language, question_string, sparql)

            cache_filename = get_filename_for_cache(prefix, question_id, prompt_hash, openai_model, temperature)
            print(cache_filename)

            # check if the data is already stored
            cached_data = get_cached_data_if_exists(cache_filename)
            if not isinstance(cached_data, dict):
                logger.info("no cached data found: {}".format(cache_filename))
                cached_data = {}

            previous_result = cached_data.get("sparql2nl")
            if isinstance(previous_result, dict) and previous_result.get(language):
                logger.debug(json.dumps(cached_data, indent=2))
                logger.warning("cached data already contains the sparql2nl value for the language: {}, file: {}".format(language, cache_filename))
                continue # skip this item, no API call needed
            if cached_data:
                logger.info("cached data doesn't contain the sparql2nl value for the language: {}".format(language))

            prompt = prompt_template.format(language_string=language_string, sparql=sparql)
            logger.debug(prompt)
            assert prompt.strip(), "Please set the prompt"

            # get the data from the OpenAI API; an API error aborts the run, and
            # everything stored so far is reused from the cache on the next run
            chat_completion = client.chat.completions.create(
                model=openai_model,
                temperature=temperature,
                messages=[{"role": "user", "content": prompt}]
            )
            generated_content = chat_completion.choices[0].message.content
            print(generated_content)

            cached_data["id"] = question_id
            cached_data["sparql"] = sparql
            cached_data["sparql2nl"] = {
                language: generated_content,
                "original_{}".format(language): question_string,
                "datetime": current_datetime,
                "temperature": temperature,
                "model": openai_model,
                "prompt": prompt,
                "prompt_hash": prompt_hash
            }

            store_data_in_cache(cache_filename, cached_data)

            # short pause between two API requests
            time.sleep(1)



061ab317b1997f423db4b29caa1126458606c018
1 en List all boardgames by GMT. PREFIX dbo: <http://dbpedia.org/ontology/> PREFIX res: <http://dbpedia.org/resource/> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> SELECT ?uri WHERE { ?uri dbo:publisher res:GMT_Games }
stored_data/qald_9_plus_train_dbpedia_061ab317b1997f423db4b29caa1126458606c018_000001_model-gpt-4_temperature-0.0.json
1 en List all boardgames by GMT. PREFIX dbo: <http://dbpedia.org/ontology/> PREFIX res: <http://dbpedia.org/resource/> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> SELECT ?uri WHERE { ?uri dbo:publisher res:GMT_Games }
stored_data/qald_9_plus_train_dbpedia_061ab317b1997f423db4b29caa1126458606c018_000001_model-gpt-3.5-turbo_temperature-0.0.json
