# Question Generator for Coronabot

## Define question templates and required parameters

In [62]:
def _question_template(question, question_class, contains_time, last=False):
    """Build a single question-template entry.

    The returned dict has the keys "last", "questions" and "parameters" (in
    that order). "questions" holds the one given text, which may contain
    "[NAME]" placeholders; "parameters" holds the predefined values
    QUESTION_CLASS and CONTAINS_TIME.
    NOTE(review): the meaning of "last" is not visible in this notebook -
    presumably it is evaluated by the consumer of the generated file; confirm.
    """
    return {
        "last": last,
        "questions": [question],
        "parameters": {
            "QUESTION_CLASS": question_class,
            "CONTAINS_TIME": contains_time,
        },
    }


templates = [
    # static by intention: this text contains no placeholders
    _question_template(
        "How many cases were reported in August in Magdeburg?",
        question_class="infection_location_time",
        contains_time=True,
        last=True,
    ),
    _question_template(
        "What was the death toll in [PLACE_LABEL] during the last [MONTH]?",
        question_class="death_location_time",
        contains_time=True,
        last=True,
    ),
    _question_template(
        "How many infections were there in [PLACE_LABEL] in [MONTH] [YEAR]?",
        question_class="infection_location_time",
        contains_time=True,
    ),
    _question_template(
        "What was the number of incidents of [PLACE_LABEL] in [YEAR]?",
        question_class="infection_location_time",
        contains_time=True,
    ),
    _question_template(
        "How many people died in [PLACE_LABEL]?",
        question_class="death_location",
        contains_time=False,
    ),
    _question_template(
        "How many cases were reported on the first of [MONTH] [YEAR]?",
        question_class="infection_time",
        contains_time=True,
    ),
    _question_template(
        "Infections in [PLACE_LABEL] and Kiel?",
        question_class="infection_location",
        contains_time=False,
    ),
]

# Candidate values for the "[NAME]" placeholders used in the question templates.
# PLACE_LABEL and PLACE_URL start empty; they are filled from the Wikidata
# query result further down in this notebook.
parameters = {
    "MONTH": ["January", "February", "March", "April", "May", "June"],
    "YEAR": [2020, 2021],
    "PLACE_LABEL": [],
    # label -> entity URL lookup (indexed by the chosen PLACE_LABEL), hence a
    # mapping and not a list like the independent parameters
    "PLACE_URL": {},
    "LANGUAGE": ["en"]
}

### Add places from Wikidata

In [63]:
from pprint import pprint
from SPARQLWrapper import SPARQLWrapper, JSON # https://sparqlwrapper.readthedocs.io/en/latest/main.html

# SPARQL text sent to Wikidata: selects ?district / ?label pairs restricted to
# English labels; the query itself limits the result to 10 rows.
district_query = """
# get all districts

PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>

SELECT DISTINCT ?district ?label WHERE {
  #VALUES ?district { wd:Q6098 } .
  ?district p:P31/ps:P31/wdt:P279* wd:Q106658 .
  ?district wdt:P17 wd:Q183 .
  ?district rdfs:label ?label .
  FILTER (LANG(?label) = "en").
} 
LIMIT 10
"""

sparql = SPARQLWrapper("https://query.wikidata.org/bigdata/namespace/wdq/sparql")
sparql.setQuery(district_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# One binding per result row; every binding carries the label text and the
# entity URL of the district.
bindings = results["results"]["bindings"]
place_values = [binding["label"]["value"] for binding in bindings]
place_names_map = {
    binding["label"]["value"]: binding["district"]["value"]
    for binding in bindings
}

pprint(place_names_map)

# Publish the places as candidate values for the question placeholders.
parameters.update(PLACE_LABEL=place_values, PLACE_URL=place_names_map)
number = len(parameters["PLACE_LABEL"])
print(f"number of found places: {number}")


{'Altmarkkreis Salzwedel': 'http://www.wikidata.org/entity/Q6061',
 'Dithmarschen': 'http://www.wikidata.org/entity/Q2947',
 'Mittweida': 'http://www.wikidata.org/entity/Q20253',
 'Niederschlesischer Oberlausitzkreis': 'http://www.wikidata.org/entity/Q20259',
 'Nordfriesland district': 'http://www.wikidata.org/entity/Q2937',
 'Schleswig-Flensburg': 'http://www.wikidata.org/entity/Q2941',
 'Steinburg': 'http://www.wikidata.org/entity/Q3011',
 'Stendal District': 'http://www.wikidata.org/entity/Q6057',
 'Warendorf District': 'http://www.wikidata.org/entity/Q2839',
 'Weißeritzkreis': 'http://www.wikidata.org/entity/Q20274'}
number of found places: 10


## Generate questions

In [64]:
import os
import random
import time
# Time-derived seed (milliseconds since the epoch). It is kept in a variable
# and printed so that one particular generated question set can be reproduced
# later by assigning the printed number to RANDOM_SEED instead.
RANDOM_SEED = round(time.time() * 1000)
random.seed(RANDOM_SEED)
print(f"random seed: {RANDOM_SEED}")

def get_random_parameters(parameters, dependent_parameters=None):
    """Pick one random concrete value for every placeholder.

    Args:
        parameters: mapping of placeholder name to its candidates. An
            independent placeholder holds a non-empty sequence of values. A
            dependent placeholder holds a mapping that is indexed by the value
            chosen for its source placeholder.
        dependent_parameters: mapping of dependent placeholder name to the
            name of the placeholder it is derived from. Defaults to
            {"PLACE_URL": "PLACE_LABEL"}.

    Returns:
        dict of placeholder name to the chosen value: independent placeholders
        first (in the order of `parameters`), dependent ones afterwards.

    Raises:
        ValueError: an independent placeholder has no candidate values.
        KeyError: the source of a dependent placeholder was not sampled, or
            the dependent mapping has no entry for the chosen source value.
    """
    if dependent_parameters is None:
        dependent_parameters = {"PLACE_URL": "PLACE_LABEL"}

    result = {}
    for name, candidates in parameters.items():
        if name in dependent_parameters:
            continue  # derived below, once its source value is known
        if len(candidates) == 0:
            raise ValueError(f"no candidate values available for parameter '{name}'")
        result[name] = random.choice(candidates)

    # Dependent values are looked up, not sampled, so that e.g. the URL always
    # belongs to the label that was picked. Dependents that are absent from
    # `parameters` are simply not produced.
    for dependent, source in dependent_parameters.items():
        if dependent in parameters:
            result[dependent] = parameters[dependent][result[source]]

    return result

def get_random_list_of_templates_questions(questions, number_of_questions_to_be_selected):
    """Draw entries from `questions` at random, with replacement.

    Every draw picks a random index into `questions`, so the same entry may
    appear several times in the returned list.

    Args:
        questions: indexable collection to draw from.
        number_of_questions_to_be_selected: number of draws to perform.

    Returns:
        list with one entry per draw.
    """
    last_index = len(questions) - 1
    return [
        questions[random.randint(0, last_index)]
        for _ in range(number_of_questions_to_be_selected)
    ]

# Draw 10 templates (the same template may be drawn repeatedly) and turn every
# question text of each drawn template into a concrete question, using a
# freshly sampled set of placeholder values per question text.
random_templates = get_random_list_of_templates_questions(templates, 10)

generated_questions = []
for template in random_templates:
    predefined_parameters = template.get("parameters")  # may be missing
    for question_text in template["questions"]:
        sampled_parameters = get_random_parameters(parameters)

        # substitute every "[NAME]" marker by the value sampled for NAME
        concrete_question = question_text
        for name, value in sampled_parameters.items():
            marker = '[' + name + ']'
            concrete_question = concrete_question.replace(marker, str(value))

        # predefined template parameters go first; on a key clash the sampled
        # value takes precedence
        if predefined_parameters is None:
            replacements = sampled_parameters
        else:
            replacements = {**predefined_parameters, **sampled_parameters}

        generated_questions.append({
            "question": concrete_question,
            "replacements": replacements,
            "template": template,
        })

pprint(generated_questions, width=120)

# Assemble the complete test definition: connection settings of the Qanary
# system, the generated questions and the names of the SPARQL validation
# templates.
#
# The triplestore credentials can be overridden with the environment variables
# QANARY_TRIPLESTORE_USERNAME and QANARY_TRIPLESTORE_PASSWORD, so that real
# credentials do not have to be stored in the notebook. Without these variables
# the previous defaults are used.
generated_configuration = {}
generated_configuration["qanary"] = {
    "system_url": "https://webengineering.ins.hs-anhalt.de:43740/startquestionansweringwithtextquestion",
    "componentlist": [
        "LD-Shuyo",
        "coronabot-dialog-flow",
        "coronabot-question-classification",
        "coronabot-named-entity-recognition-time-en",
        "coronabot-named-entity-recognition-time-de",
        "coronabot-named-entity-recognition-location-en",
        "coronabot-named-entity-recognition-location-de",
        "OpenTapiocaNED",
        "LocationToGermanDistrict",
        "coronabot-missing-information",
        "coronabot-query-generation",
        "coronabot-data-acquisition",
        "coronabot-answer-generation",
    ],
    "qanary_triplestore_endpoint": "https://webengineering.ins.hs-anhalt.de:40159",
    "qanary_triplestore_database": "qanary",
    "qanary_triplestore_username": os.environ.get("QANARY_TRIPLESTORE_USERNAME", "admin"),
    "qanary_triplestore_password": os.environ.get("QANARY_TRIPLESTORE_PASSWORD", "admin")
}
generated_configuration["tests"] = generated_questions
generated_configuration["validation-sparql-templates"] = [
    "01_language_detection.sparql",
    "02_question_classification.sparql",
    "03_time_recognition.sparql",
    "04_ned.sparql",
    "05_location_to_german_district.sparql",
    "06_coronabot_missing_information.sparql",
    "07_rki_query.sparql",
]
# generated_configuration["custom-validation"] = "dummy" # should not be defined if not required

# NOTE: the printed configuration contains the triplestore credentials; clear
# the cell output before sharing the notebook if real credentials were used.
pprint(generated_configuration, width=120)


[{'question': 'Infections in Nordfriesland district and Kiel?',
  'replacements': {'CONTAINS_TIME': False,
                   'LANGUAGE': 'en',
                   'MONTH': 'June',
                   'PLACE_LABEL': 'Nordfriesland district',
                   'PLACE_URL': 'http://www.wikidata.org/entity/Q2937',
                   'QUESTION_CLASS': 'infection_location',
                   'YEAR': 2021},
  'template': {'last': False,
               'parameters': {'CONTAINS_TIME': False, 'QUESTION_CLASS': 'infection_location'},
               'questions': ['Infections in [PLACE_LABEL] and Kiel?']}},
 {'question': 'How many cases were reported in August in Magdeburg?',
  'replacements': {'CONTAINS_TIME': True,
                   'LANGUAGE': 'en',
                   'MONTH': 'February',
                   'PLACE_LABEL': 'Mittweida',
                   'PLACE_URL': 'http://www.wikidata.org/entity/Q20253',
                   'QUESTION_CLASS': 'infection_location_time',
                   'YEAR

## Output to file

In [65]:
import json 

# Serialize the assembled test definition as indented JSON and store it next
# to the SPARQL validation templates.
output_path = 'sparql_test_query_templates/qanary-test-definition.json'
with open(output_path, mode='w') as test_definition_file:
    test_definition_file.write(json.dumps(generated_configuration, indent=4))