In [None]:
!pip install rdflib pymongo openai > /dev/null

# Get the datasets

In [None]:
!wget https://raw.githubusercontent.com/KGQA/QALD_9_plus/main/data/qald_9_plus_test_wikidata.json > /dev/null # qald-9-plus-test
!wget https://raw.githubusercontent.com/KGQA/QALD_9_plus/main/data/qald_9_plus_train_wikidata.json > /dev/null # qald-9-plus train
!wget https://raw.githubusercontent.com/KGQA/QALD-10/main/data/qald_10/qald_10.json > /dev/null # qald-10
!wget https://raw.githubusercontent.com/vladislavneon/RuBQ/master/RuBQ_2.0/RuBQ_2.0_dev.json > /dev/null # rubq 2.0 dev
!wget https://raw.githubusercontent.com/vladislavneon/RuBQ/master/RuBQ_2.0/RuBQ_2.0_test.json > /dev/null # rubq 2.0 test

--2024-02-27 13:46:16--  https://raw.githubusercontent.com/KGQA/QALD_9_plus/main/data/qald_9_plus_test_wikidata.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8912790 (8.5M) [text/plain]
Saving to: ‘qald_9_plus_test_wikidata.json’


2024-02-27 13:46:17 (75.1 MB/s) - ‘qald_9_plus_test_wikidata.json’ saved [8912790/8912790]

--2024-02-27 13:46:17--  https://raw.githubusercontent.com/KGQA/QALD_9_plus/main/data/qald_9_plus_train_wikidata.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17324209 (17M) [text/plain]
Saving to: 

# Import dependencies

In [None]:
import re
import os
import json
import random
import logging
import requests
import pandas as pd
from time import sleep
from rdflib import URIRef
from openai import OpenAI
from getpass import getpass
from datetime import datetime
from pymongo import MongoClient
from pyparsing import ParseResults
from rdflib.plugins.sparql.parser import parseQuery

In [None]:
# Ask for the OpenAI API key interactively so it is never stored in the notebook.
token = getpass(prompt='Token Open AI')

Token Open AI··········


# Read datasets

In [None]:
# Load every dataset file from the working directory, keyed by file name.
# Match on the extension only: a substring test for ".json" would also pick up
# wget duplicates (`*.json.1`) and the `<dataset>.json_<lang>.tsv` result files
# written by the experiment cells, which are not valid JSON.
files = sorted(f for f in os.listdir() if f.endswith(".json"))

data_dict = {}

for file in files:
  # The datasets contain Russian and German text; do not rely on the locale's
  # default encoding.
  with open(file, encoding="utf-8") as f:
    data = json.load(f)
  data_dict[file] = data

# Setup DB and OpenAI key

In [None]:
# OpenAI client authenticated with the token entered above.
client = OpenAI(
    api_key=token,
)

# MongoDB connection settings are taken from the environment so that the
# password is not stored in the notebook. Host, user and auth source fall back
# to the values used so far; the password is prompted for when MONGO_PASSWORD
# is not set.
mongo_client = MongoClient(
    os.environ.get('MONGO_HOST', '141.57.8.18:40200'),
    username=os.environ.get('MONGO_USER', 'admin'),
    password=os.environ.get('MONGO_PASSWORD') or getpass('MongoDB password'),
    authSource=os.environ.get('MONGO_AUTH_SOURCE', 'admin'),
)

# Database holding the Wikidata label cache and the cached model answers.
db = mongo_client['QAfiltering']

In [None]:
# Root configuration: timestamped records, INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)

# The notebook's own logger is more verbose than the root logger (DEBUG).
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

In [None]:
# (old, new) substitution pairs: brackets, commas and apostrophes are removed,
# spaces become underscores.
REPLACE = (
    ('[', ''),
    (']', ''),
    (' ', '_'),
    (',', ''),
    ("'", ''),
)

# Prefixes that mark a prefixed name as a Wikidata entity or property.
WIKIDATA_PREFIXES = ['wd:', 'wdt:', 'p:', 'ps:', 'pq:']

# Labels for well-known non-Wikidata terms, resolved without querying Wikidata.
FIXED_LABELS = {
    'rdfs:label': 'label',
    'http://www.w3.org/2000/01/rdf-schema#label': 'label',
    'skos:altLabel': 'alternative label',
    'xsd:integer': 'integer',
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#type': 'type',
    'http://www.w3.org/2001/XMLSchema#integer': 'integer',
    'http://www.w3.org/2001/XMLSchema#gYear': 'year',
    'http://www.w3.org/2001/XMLSchema#double': 'double',
}

# HTTP headers sent with every SPARQL request.
headers = {
    'User-Agent': (
        'wiki_parser_online/0.17.1 '
        '(https://deeppavlov.ai; info@deeppavlov.ai) '
        'deeppavlov/0.17.1'
    ),
}

# Default SPARQL endpoint.
WIKIDATA = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'

In [None]:
from operator import truediv
def extract_entities_recursive(parsed):
    """
    Walk a parsed SPARQL structure and collect the URIs it mentions.

    parsed: a URIRef, a list / ParseResults, a dict, or any other value

    extract_entities_recursive(): list of strings, in traversal order. A URIRef
    contributes its string form, a dict holding both 'prefix' and 'localname'
    contributes 'prefix:localname', containers are searched recursively and
    anything else contributes nothing.
    """
    if isinstance(parsed, URIRef):
        return [str(parsed)]

    if isinstance(parsed, (list, ParseResults)):
        return [
            found
            for child in list(parsed)
            for found in extract_entities_recursive(child)
        ]

    if isinstance(parsed, dict):
        # A prefixed name is a leaf; do not descend into it.
        if 'prefix' in parsed and 'localname' in parsed:
            return [f'{parsed["prefix"]}:{parsed["localname"]}']

        return [
            found
            for child in parsed.values()
            for found in extract_entities_recursive(child)
        ]

    return []

def extract_entities(sparql):
    """
    Parse a SPARQL query and return the URIs / prefixed names of its WHERE part.

    sparql: SPARQL query string

    extract_entities(): list of strings; an empty list when the query cannot be
    parsed or its 'where' part cannot be read.
    """
    try:
        return extract_entities_recursive(parseQuery(sparql)[1]['where'])
    except Exception as e:
        # Best effort by design, but only for ordinary errors: a bare `except`
        # would also swallow KeyboardInterrupt / SystemExit.
        logger.warning(f'Exception in function "extract_entities": {e}')
        return []

def execute(query: str, endpoint_url: str = WIKIDATA, timeout: float = 60.0):
    """
    Send query direct to wikidata.

    query: SPARQL query
    endpoint_url: endpoint of wikidata query service
    timeout: seconds to wait for the HTTP response before giving up

    execute(): json response, or None when the request fails or the endpoint
    answers with a status other than 200
    """
    try:
        # Without a timeout a stalled connection would block the notebook forever.
        r = requests.get(
            endpoint_url,
            headers=headers,
            params={'format': 'json', 'query': query},
            timeout=timeout,
        )

        if r.status_code == 200:
            return r.json()

        logger.error(f'Function "execute" received HTTP status {r.status_code}')

    except Exception as e:
        logger.error(f'Exception in function "execute": {e}')

    return None

def query_wikidata(query: str, repeat: int = 3, timeout: float = 10.0):
    """
    Send query direct to wikidata, retrying on failure.

    query: SPARQL query
    repeat: maximum number of attempts
    timeout: seconds to sleep between two attempts (not an HTTP timeout)

    query_wikidata(): list of dicts (the result bindings), or None when every
    attempt failed
    """
    for attempt in range(1, repeat + 1):
        try:
            response = execute(query)
            if response is not None:
                return response['results']['bindings']

            # execute() already logged the cause; record the failed attempt.
            logger.error(
                f'Exception in function "query_wikidata": no response (attempt {attempt}/{repeat})'
            )

        except Exception as e:
            logger.error(f'Exception in function "query_wikidata": {e}')

        # No point in waiting once there is no attempt left.
        if attempt < repeat:
            sleep(timeout)

    return None

def query_wikidata_label(uri: str, lang: str='en') -> str:
    """
    Look up the label of a Wikidata resource in the given language.

    uri: resource as it should appear in the query (e.g. a prefixed name)
    lang: language code of the wanted label

    query_wikidata_label(): the label of the first result; None when the lookup
    fails or returns nothing. A KeyError is re-raised to the caller.
    """
    # The label is taken either from the resource itself or from a resource it
    # is linked to through owl:sameAs.
    sparql = '\n'.join([
        'SELECT ?label WHERE {',
        '  {',
        f'    {uri} rdfs:label ?label .',
        '  }',
        '  UNION',
        '  {',
        f'    {uri} owl:sameAs+ ?redirect .',
        '    ?redirect rdfs:label ?label .',
        '  }',
        f'  FILTER (lang(?label) = "{lang}")',
        '} LIMIT 1',
    ])

    try:
        bindings = query_wikidata(sparql)
        first = bindings[0]
        return first['label']['value']
    except KeyError:
        # NOTE(review): possibly meant to be KeyboardInterrupt - confirm.
        raise
    except Exception:
        # Failed query (None) or empty result list: no label available.
        return None

def get_wikidata_label_cached(cache, uri: str, lang: str='en') -> str:
    """
    Return the label of a Wikidata id, using the `wikidata_labels` cache collection.

    cache: object exposing a `wikidata_labels` collection with `find_one` / `insert_one`
    uri: Wikidata id without prefix (it is queried as 'wd:<uri>')
    lang: language code of the wanted label

    get_wikidata_label_cached(): the label, or None when there is none or the
    lookup / cache write failed. A cached document is returned as is, even if
    its label is None.
    """
    label = cache.wikidata_labels.find_one({'uri': uri, 'lang': lang})

    if label:
        logger.debug(f"Found label in cache {label}")
        return label['label']

    prefix = 'wd' # it works despite property or entity

    try:
        label = query_wikidata_label(f'{prefix}:{uri}', lang)
        logger.debug(f"get_wikidata_label_cached label: {str(label)}, lang: {str(lang)}")
        if not label:
            label = None

        # NOTE(review): a failed lookup is stored as label None as well, so it
        # will not be retried later - confirm this is intended.
        cache.wikidata_labels.insert_one({ 'uri': uri, 'lang': lang, 'label': label })

    except Exception as e:
        # KeyboardInterrupt is not an Exception subclass, so it still propagates.
        # Log instead of failing silently: otherwise a broken DB or network
        # connection only shows up as labels missing from the prompts.
        logger.error(
            f'Exception in function "get_wikidata_label_cached": {e} (uri: {uri}, lang: {lang})'
        )
        return None
    logger.debug(f"returning label {label} from  get_wikidata_label_cached")

    return label

def get_wikidata_label(cache, literal, lang='en'):
    """
    Resolve a term taken from a SPARQL query to a human-readable label.

    cache: label cache handed on to get_wikidata_label_cached
    literal: prefixed name, URI or plain value
    lang: language code of the wanted label

    get_wikidata_label(): a fixed label for known terms, the literal itself for
    plain values, None for other 'xsd:' terms, otherwise the (cached) Wikidata
    label, which may be None.
    """
    # Drop surrounding parentheses, angle brackets and dots.
    literal = literal.strip('(<>).')

    if literal in FIXED_LABELS:
        return FIXED_LABELS[literal]

    looks_like_resource = any(
        marker in literal for marker in WIKIDATA_PREFIXES + ['http']
    )
    if not looks_like_resource:
        return None if "xsd:" in literal else literal

    if literal.startswith(tuple(WIKIDATA_PREFIXES)):
        # Prefixed name: the id follows the last colon.
        w_id = literal.rsplit(':', 1)[-1]
    else:
        # Full URI: the id is the last path segment.
        tail = literal.rsplit('/', 1)[-1]
        w_id = tail[:-1] if tail.endswith(">") else tail

    return get_wikidata_label_cached(cache, w_id, lang)

def get_question_by_language(item, dataset='qald', lang='en'):
  """
  Return the natural-language question(s) of a dataset item in one language.

  item: one dataset entry
  dataset: 'qald' (questions under item['question'], each with 'language' and
           'string') or 'rubq' ('question_eng' / 'question_text' fields)
  lang: language code; 'rubq' supports only 'en' and 'ru'

  get_question_by_language(): list of question strings (possibly empty for
  'qald'). An unsupported dataset or rubq language fails an assertion.
  """
  if dataset == 'qald':
    return [entry['string'] for entry in item['question'] if entry['language'] == lang]

  if dataset == 'rubq':
    matches = []
    if lang == 'en':
      matches.append(item["question_eng"])
    elif lang == 'ru':
      matches.append(item["question_text"])
    else:
      assert False
    return matches

  assert False

def make_the_prompt(cache, query, prompt_template, lang='en', dataset=None):
    """
    Build the prompt asking a model to verbalise a SPARQL query.

    cache: label cache handed on to get_wikidata_label
    query: SPARQL query to verbalise
    prompt_template: dict with a "type" key ("ZERO_SHOT" or "ONE_SHOT") and one
        entry per language holding the 'head', 'list' and 'tail' templates
        (plus 'shot' for ONE_SHOT)
    lang: language code selecting the templates, labels and example question
    dataset: QALD-format list of items (read via item["query"]["sparql"] and
        get_question_by_language(item, 'qald', lang)); required for ONE_SHOT

    make_the_prompt(): the prompt string. For ONE_SHOT it is prefixed with one
    randomly chosen example: the zero-shot prompt of another query of the
    dataset followed by one of its gold-standard questions.

    Raises ValueError for an unknown template type, or when ONE_SHOT has no
    usable example in the dataset.
    """
    template = prompt_template[lang]

    # One line per entity of the query for which a label could be resolved.
    list_labels = []
    for uri in extract_entities(query):
        label = get_wikidata_label(cache, uri, lang)
        if label:
            list_labels.append(template['list'].format(uri=uri, uriLabel=label))

    body = template["head"].format(query=query) + "\n ".join(list_labels) + template["tail"]

    prompt_type = prompt_template["type"]
    if prompt_type == "ZERO_SHOT":
        return body

    if prompt_type != "ONE_SHOT":
        raise ValueError(f'Unknown prompt type: {prompt_type!r}')

    if not dataset:
        raise ValueError('A non-empty dataset is required to build a ONE_SHOT prompt')

    # Example candidates: every other query that has a question in `lang`.
    # All items sharing the main query are excluded (not only the first one),
    # so the example can never contain the answer being asked for, and items
    # without a question in `lang` are skipped so random.choice never sees an
    # empty list.
    candidates = []
    for item in dataset:
        candidate_query = item["query"]["sparql"]
        if candidate_query == query:
            continue
        candidate_questions = get_question_by_language(item, 'qald', lang)
        if candidate_questions:
            candidates.append((candidate_query, candidate_questions))

    if not candidates:
        raise ValueError(f'No example with a question in language {lang!r} found for the ONE_SHOT prompt')

    shot_query, gold_standard = random.choice(candidates)

    shot = make_the_prompt(cache, shot_query, {"type": "ZERO_SHOT", lang: template}, lang=lang)
    shot += ". " + random.choice(gold_standard)

    return template["shot"].format(shot=shot) + body

def find_in_cache(collection_name: str, filter_dict: dict):
    """
    Fetch one document from a collection of the cache database.

    collection_name: name of the collection in `db`
    filter_dict: filter passed to `find_one`

    find_in_cache(): the matching document, or None when nothing matches or the
    lookup raises
    """
    try:
        return db[collection_name].find_one(filter_dict) or None
    except Exception as e:
        # Reported through the logger, like the other helpers of this notebook.
        logger.error(f'Exception in function "find_in_cache": {e}')
        return None

def cache_gpt(model: str, prompt: str, result: str):
    """
    Store a model answer in the cache database.

    model: model name; also the name of the collection written to
    prompt: prompt that was sent (a string, as passed by ask_openai)
    result: answer that was received (a string, as passed by ask_openai)

    The document also records the current local time under 'date'. Failures are
    logged and otherwise ignored, so a broken cache does not stop a run.
    """
    try:
        db[model].insert_one({
            'prompt': prompt,
            'result': result,
            'date': datetime.now(),
        })
    except Exception as e:
        # Reported through the logger, like the other helpers of this notebook.
        logger.error(f'Exception in function "cache_gpt": {e}')

def ask_openai(prompt, model="gpt-3.5-turbo"):
  """
  Return the model's answer to a prompt, served from the cache when possible.

  prompt: text sent as a single user message
  model: chat model name; also the name of the cache collection

  ask_openai(): content of the first choice of the completion. A fresh answer
  is written to the cache before it is returned.
  """
  cached = find_in_cache(model, {"prompt": prompt})
  if cached:
    return cached["result"]

  messages = [{"role": "user", "content": prompt}]
  completion = client.chat.completions.create(messages=messages, model=model)
  answer = completion.choices[0].message.content

  cache_gpt(model, prompt, answer)
  return answer

In [None]:
# Prompt templates per language:
#   head - introduces the query ({query}),
#   list - one line per labelled entity ({uri}, {uriLabel}),
#   tail - the instruction given to the model.
# NOTE: prompts are used as cache keys by ask_openai, so changing any of these
# strings makes previously cached answers unreachable.
ZERO_SHOT_PROMPT = {
    'type': "ZERO_SHOT",
    'en': {
        'head': 'Having a SPARQL query: {query} \n Where:\n ',
        'list': '{uri} has human-readable name "{uriLabel}."',
        'tail': '\n Transform the SPARQL query to a natural language question. Output just the transformed question',
    },
    'ru': {
        'head': 'Имея следующий SPARQL запрос: {query} \n Где:\n ',
        'list': '{uri} именуется как "{uriLabel}."',
        'tail': '\n Трансформируй SPARQL запрос в вопрос на естественном языке. Выведи только транфсормируемый вопрос',
    },
    'de': {
        'head': 'Gegeben ist die SPARQL-Anfrage: {query} \n Dabei gilt:\n ',
        'list': 'Die Bezeichnung von {uri} ist "{uriLabel}."',
        'tail': '\n Transformiere die SPARQL-Anfrage in eine Frage in natürlicher Sprache. Gib nur die transformierte Frage aus.',
    },
}

# The one-shot templates reuse the zero-shot texts and add 'shot', the wrapper
# placed around the worked example ({shot}) in front of the actual task.
ONE_SHOT_PROMPT = {
    'type': "ONE_SHOT",
    'en': {
        'shot': "---- Start Example ---- \n {shot} \n ----End Example ---- \n",
        **ZERO_SHOT_PROMPT['en'],
    },
    'ru': {
        'shot': "{shot} \n ----------- \n",
        **ZERO_SHOT_PROMPT['ru'],
    },
    'de': {
        'shot': "{shot} \n ----------- \n",
        **ZERO_SHOT_PROMPT['de'],
    },
}

# QALD experiments

In [None]:
def process_qald_dataset(dataset, MODEL=None, lang=None, prompt_template=None):
  """
  Generate a natural-language question for every query of a QALD-format dataset.

  dataset: list of items, each with "id", "query" -> "sparql" and the fields
           read by get_question_by_language(item, 'qald', lang)
  MODEL: model name passed to ask_openai
  lang: language code of the prompts and of the gold-standard questions
  prompt_template: template dict passed to make_the_prompt; when None the
                   notebook-level PROMT_TEMPLATE is used

  process_qald_dataset(): DataFrame with one row per (item, gold-standard
  question) pair and the columns id, prompt, generated_nl, gold_standard_nl,
  lang, model, prompt_type and query.
  """
  column_names = (
      "id", "prompt", "generated_nl", "gold_standard_nl",
      "lang", "model", "prompt_type", "query",
  )
  columns = {name: [] for name in column_names}

  for q in dataset:
    # Looked up per item, so the global is only needed when there is work to do.
    template = PROMT_TEMPLATE if prompt_template is None else prompt_template
    query = q["query"]["sparql"]

    # Items without a question in `lang` produce no rows, so skip them before
    # paying for label lookups and a model call.
    gold_standard_nl_list = get_question_by_language(q, 'qald', lang)
    if not gold_standard_nl_list:
      continue

    prompt = make_the_prompt(db, query, template, lang, dataset)
    predicted_nl = ask_openai(prompt=prompt, model=MODEL)

    for gs in gold_standard_nl_list:
      row = (q["id"], prompt, predicted_nl, gs, lang, MODEL, template["type"], query)
      for name, value in zip(column_names, row):
        columns[name].append(value)

  return pd.DataFrame.from_dict(columns)

## QALD-9-plus experiments

In [None]:
# Experiment settings (process_qald_dataset reads PROMT_TEMPLATE as a global).
languages = ["en", "ru", "de"]
MODEL = "gpt-4-1106-preview"
PROMT_TEMPLATE = ONE_SHOT_PROMPT

QALD_9_TEST_NAME = "qald_9_plus_test_wikidata.json"    # 9-plus-test
QALD_9_TRAIN_NAME = "qald_9_plus_train_wikidata.json"  # 9-plus-train

# One TSV per language and split, written next to the dataset files.
for lang in languages:
  for dataset_name in (QALD_9_TEST_NAME, QALD_9_TRAIN_NAME):
    df = process_qald_dataset(data_dict[dataset_name]["questions"], MODEL, lang)
    df.to_csv(f"{dataset_name}_{lang}.tsv", sep='\t', index=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
DEBUG:__main__:Found label in cache {'_id': ObjectId('655fbecb9355fd358412ed19'), 'uri': 'P19', 'lang': 'ru', 'label': 'место рождения'}
DEBUG:__main__:Found label in cache {'_id': ObjectId('655fea12cf4e087b45310716'), 'uri': 'Q380', 'lang': 'ru', 'label': 'Meta Platforms'}
DEBUG:__main__:Found label in cache {'_id': ObjectId('655fdcca5eb7ff90084b7a49'), 'uri': 'P127', 'lang': 'ru', 'label': 'владельцем является'}
DEBUG:__main__:Found label in cache {'_id': ObjectId('655feaa1cf4e087b45310831'), 'uri': 'Q7378', 'lang': 'ru', 'label': 'слон'}
DEBUG:__main__:Found label in cache {'_id': ObjectId('655fbed69355fd358412ed31'), 'uri': 'P279', 'lang': 'ru', 'label': 'подкласс от'}
DEBUG:__main__:Found label in cache {'_id': ObjectId('655fbec39355fd358412ed0d'), 'uri': 'P161', 'lang': 'ru', 'label': 'в ролях'}
DEBUG:__main__:Found label in cache {'_id': ObjectId('655fd58f9389840180e1adad'), 'uri': 'Q37079', 'lang': 'ru', 'label': 

## QALD-10 experiments

In [None]:
# Experiment settings (process_qald_dataset reads PROMT_TEMPLATE as a global).
languages = ["en", "ru", "de"]
MODEL = "gpt-4-1106-preview"
PROMT_TEMPLATE = ONE_SHOT_PROMPT

QALD_10_NAME = "qald_10.json"

# One TSV per language, written next to the dataset file.
for lang in languages:
  df = process_qald_dataset(data_dict[QALD_10_NAME]["questions"], MODEL, lang)
  df.to_csv(f"{QALD_10_NAME}_{lang}.tsv", sep='\t', index=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
DEBUG:__main__:Found label in cache {'_id': ObjectId('65d5f2c6f36c475f0d1f3106'), 'uri': 'Q79226802', 'lang': 'ru', 'label': 'животное из китайского зодиака'}
DEBUG:__main__:Found label in cache {'_id': ObjectId('65d5f396f36c475f0d1f31b3'), 'uri': 'Q338553', 'lang': 'ru', 'label': 'The Visitors'}
DEBUG:__main__:Found label in cache {'_id': ObjectId('655fd5fe9389840180e1ae7e'), 'uri': 'P577', 'lang': 'ru', 'label': 'дата публикации'}
DEBUG:__main__:Found label in cache {'_id': ObjectId('65d5f290f36c475f0d1f30db'), 'uri': 'Q164815', 'lang': 'ru', 'label': 'Вудсток'}
DEBUG:__main__:Found label in cache {'_id': ObjectId('655fd6109389840180e1aea0'), 'uri': 'P710', 'lang': 'ru', 'label': 'участник(и)'}
DEBUG:__main__:Found label in cache {'_id': ObjectId('65d5f291f36c475f0d1f30dc'), 'uri': 'Q190076', 'lang': 'ru', 'label': 'Джо Кокер'}
DEBUG:__main__:Found label in cache {'_id': ObjectId('65d5f398f36c475f0d1f31b5'), 'uri': 'Q98