# Question Generator for Coronabot

## Define question templates and required parameters

In [88]:
# Question templates from which the test questions are generated.
# Every entry is a dict with three keys:
#   "last":       True when the phrasings use a relative "last [MONTH]" time
#                 reference (these phrasings contain no [YEAR] placeholder)
#   "questions":  phrasings containing the placeholders [PLACE_LABEL],
#                 [MONTH] and/or [YEAR]
#   "parameters": fixed values (question class, time/place flags) that the
#                 generation cell merges into the replacements of each question
templates = [
    # deaths, place + relative month
    {
        "last": True,
        "questions": [
            "What was the death toll in [PLACE_LABEL] during the last [MONTH]?",
            "How many people died in [PLACE_LABEL] last [MONTH]?",
            "How many deaths were reported last [MONTH] in [PLACE_LABEL]?",
            "covid deaths in [PLACE_LABEL] last [MONTH]",
            "how many deaths did [PLACE_LABEL] have last [MONTH]?"
        ],
        "parameters": {
            "QUESTION_CLASS": "death_location_time",
            "CONTAINS_TIME": True,
            "CONTAINS_PLACE": True
        }
    },
    # deaths, place + month and year
    {
        "last": False,
        "questions": [
            "What was the death toll in [PLACE_LABEL] during [MONTH] of [YEAR]?",
            "How many people died in [PLACE_LABEL] in [MONTH] [YEAR]?",
            "How many deaths were reported in [PLACE_LABEL] in [MONTH] [YEAR]?",
            "covid deaths in [PLACE_LABEL] in [MONTH] of [YEAR]",
            "how many deaths did [PLACE_LABEL] have in [MONTH] [YEAR]?"
        ],
        "parameters": {
            "QUESTION_CLASS": "death_location_time",
            "CONTAINS_TIME": True,
            "CONTAINS_PLACE": True
        }
    },
    # infections, place + month and year
    {
        "last": False,
        "questions": [
            "How many infections were there in [PLACE_LABEL] in [MONTH] [YEAR]?",
            "In [MONTH] [YEAR], how many people were infected in [PLACE_LABEL]?",
            "Give me the number of cases in [PLACE_LABEL] during [MONTH] [YEAR]!",
            "How many cases of Covid did [PLACE_LABEL] report in [MONTH] [YEAR]?",
            "infections in [MONTH] [YEAR] in [PLACE_LABEL]"
        ],
        "parameters": {
            "QUESTION_CLASS": "infection_location_time",
            "CONTAINS_TIME": True,
            "CONTAINS_PLACE": True
        }
    },
    # infections, place + year only
    {
        "last": False,
        "questions": [
            "What was the number of incidents of [PLACE_LABEL] in [YEAR]?",
            "In [YEAR], how many people were infected in [PLACE_LABEL]?",
            "What was the number of infections during [YEAR] in [PLACE_LABEL]?"
        ],
        "parameters": {
            "QUESTION_CLASS": "infection_location_time",
            "CONTAINS_TIME": True,
            "CONTAINS_PLACE": True
        }
    },
    # deaths, place only
    {
        "last": False,
        "questions": [
            "How many people died in [PLACE_LABEL]?",
            "How many deaths were reported in [PLACE_LABEL]?",
            "covid deaths in [PLACE_LABEL]",
            "What is the current death toll in [PLACE_LABEL]?"
        ],
        "parameters": {
            "QUESTION_CLASS": "death_location",
            "CONTAINS_TIME": False,
            "CONTAINS_PLACE": True
        }
    },
    # deaths, month and year only
    {
        "last": False,
        "questions": [
            "How many people died during [MONTH] [YEAR]?",
            "How many deaths were reported in [MONTH] of [YEAR]?",
            "covid deaths on the fifth of [MONTH] [YEAR]",
            "What was the death toll during [MONTH] [YEAR]?",
            "number of deaths from first of [MONTH] until 15th of [MONTH] [YEAR]"
        ],
        "parameters": {
            "QUESTION_CLASS": "death_time",
            "CONTAINS_TIME": True,
            "CONTAINS_PLACE": False
        }
    },
    # deaths, relative month only
    {
        "last": True,
        "questions": [
            "How many people died during last [MONTH]?",
            "How many deaths were reported in last [MONTH]?",
            "covid deaths on the fifth of last [MONTH]",
            "What was the death toll last [MONTH]?",
            "number of deaths from the first until the 15th of last [MONTH]"
        ],
        "parameters": {
            "QUESTION_CLASS": "death_time",
            "CONTAINS_TIME": True,
            "CONTAINS_PLACE": False
        }
    },
    # infections, month and year only
    {
        "last": False,
        "questions": [
            "How many cases were reported on the first of [MONTH] [YEAR]?",
            "What was the number of infections during [MONTH] [YEAR]?",
            "infections between the first of [MONTH] [YEAR] and the 28th of [MONTH] [YEAR]?",
            "How many people were infected with Covid-19 in [MONTH] [YEAR]",
            "corona cases in [MONTH] [YEAR]"
        ],
        "parameters": {
            "QUESTION_CLASS": "infection_time",
            "CONTAINS_TIME": True,
            "CONTAINS_PLACE": False
        }
    },
    # infections, relative month only
    {
        "last": True,
        "questions": [
            "How many cases were reported on the first of last [MONTH]?",
            "What was the number of infections last [MONTH]?",
            "infections between the first and 28th of last [MONTH]?",
            "How many people were infected with Covid-19 last [MONTH]",
            "corona cases last [MONTH]"
        ],
        "parameters": {
            "QUESTION_CLASS": "infection_time",
            "CONTAINS_TIME": True,
            "CONTAINS_PLACE": False
        }
    },
    # infections, place only
    {
        "last": False,
        "questions": [
            "Infections in [PLACE_LABEL] and Kiel?",
            "Cases in [PLACE_LABEL]?",
            "How many people have been infected in [PLACE_LABEL] in total?",
            "[PLACE_LABEL] corona cases",
            "give me the number of infections in [PLACE_LABEL]"
        ],
        "parameters": {
            "QUESTION_CLASS": "infection_location",
            "CONTAINS_TIME": False,
            "CONTAINS_PLACE": True
        }
    }
]

# Upper bounds of one generation run.
max_number_of_locations = 200  # used as LIMIT of the Wikidata places query
max_number_of_used_templates = 50  # number of template draws (drawn with replacement)
max_number_of_generated_questions = 100  # the generated list is cut down to this size

# Candidate values for the placeholders of the question templates.
# MONTH, YEAR, PLACE_LABEL and LANGUAGE are lists to draw from at random.
# PLACE_URL, DISTRICT_URL and DISTRICT_LABEL are lookup tables (dicts) that are
# filled together with PLACE_LABEL from the Wikidata result, so they start out
# as empty dicts rather than lists.
parameters = {
    "MONTH": [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ],
    "YEAR": [2020, 2021],
    "PLACE_LABEL": [],
    "PLACE_URL": {},
    "DISTRICT_URL": {},
    "LANGUAGE": ["en"],
    "DISTRICT_LABEL": {},
}

## SPARQL queries to retrieve places in Germany



In [89]:
# Namespace declarations shared by both place queries.
_sparql_prefixes = """PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
"""

# Closing brace of the WHERE clause plus the row limit, shared by both queries.
_sparql_tail = "} \nLIMIT " + str(max_number_of_locations) + "\n"

# Variant 1: every entity that has a wdt:P1082 statement and an English label
# and that reaches one of the selected districts via wdt:P131*.
any_place_with_population_query = (
    "\n# get any kind of places \n\n"
    + _sparql_prefixes
    + """
SELECT DISTINCT ?district_url ?district_label ?place_url ?place_label WHERE {
  ?district_url p:P31/ps:P31/wdt:P279* wd:Q106658 .
  ?district_url wdt:P17 wd:Q183 .
  ?district_url rdfs:label ?district_label .
  ?place_url wdt:P1082 ?population.
  ?place_url rdfs:label ?place_label .
  ?place_url wdt:P131* ?district_url
  FILTER (LANG(?place_label) = "en").
  FILTER (LANG(?district_label) = "en").
"""
    + _sparql_tail
)

# Variant 2: only the districts themselves; the district is bound as the place
# so that both variants return the same four columns.
only_district_query = (
    "\n# get all districts\n\n"
    + _sparql_prefixes
    + """
SELECT DISTINCT ?district_url ?district_label ?place_url ?place_label WHERE {
  ?district_url p:P31/ps:P31/wdt:P279* wd:Q106658 .
  ?district_url wdt:P17 wd:Q183 .
  ?district_url rdfs:label ?district_label .
  FILTER (LANG(?district_label) = "en").
  BIND(?district_url AS ?place_url) . # district is the place here
  BIND(?district_label AS ?place_label) . # district is the place here
"""
    + _sparql_tail
)

# Select the query variant that is sent to Wikidata.
places_query = any_place_with_population_query
# places_query = only_district_query

### Add places from Wikidata

In [90]:
from pprint import pprint
from SPARQLWrapper import SPARQLWrapper, JSON # https://sparqlwrapper.readthedocs.io/en/latest/main.html

# Send the selected places query to the Wikidata query service and decode the JSON answer.
sparql = SPARQLWrapper("https://query.wikidata.org/bigdata/namespace/wdq/sparql")
sparql.setQuery(places_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Lookup structures built from the result rows:
#   place_values                 list of place labels (candidates for [PLACE_LABEL])
#   place_names_map              place label -> place URL
#   place_to_district_map        place URL   -> district URL
#   place_to_district_label_map  place URL   -> district label
place_values, place_names_map = [], {}
place_to_district_map, place_to_district_label_map = {}, {}

for result in results["results"]["bindings"]:
    # every column of a binding is a dict carrying the actual value under "value"
    place_url, place_label, district_url, district_label = (
        result[column]["value"]
        for column in ("place_url", "place_label", "district_url", "district_label")
    )

    place_values.append(place_label)
    place_names_map[place_label] = place_url
    place_to_district_map[place_url] = district_url
    place_to_district_label_map[place_url] = district_label

# Publish the collected values as placeholder candidates / lookup tables.
parameters.update(
    PLACE_LABEL=place_values,
    PLACE_URL=place_names_map,
    DISTRICT_URL=place_to_district_map,
    DISTRICT_LABEL=place_to_district_label_map,
)

pprint(parameters["PLACE_URL"])
number = len(parameters["PLACE_LABEL"])
print(f"number of found places: {number}")


{'Achtrup': 'http://www.wikidata.org/entity/Q341040',
 'Ahlen': 'http://www.wikidata.org/entity/Q1132',
 'Ahrenshöft': 'http://www.wikidata.org/entity/Q402840',
 'Ahrenviöl': 'http://www.wikidata.org/entity/Q402850',
 'Ahrenviölfeld': 'http://www.wikidata.org/entity/Q48248',
 'Albersdorf': 'http://www.wikidata.org/entity/Q499889',
 'Alkersum': 'http://www.wikidata.org/entity/Q20780',
 'Arlewatt': 'http://www.wikidata.org/entity/Q20745',
 'Aventoft': 'http://www.wikidata.org/entity/Q506341',
 'Beckum': 'http://www.wikidata.org/entity/Q2707',
 'Beelen': 'http://www.wikidata.org/entity/Q2815',
 'Behrendorf': 'http://www.wikidata.org/entity/Q48257',
 'Bondelum': 'http://www.wikidata.org/entity/Q48262',
 'Borgsum': 'http://www.wikidata.org/entity/Q27308',
 'Braderup': 'http://www.wikidata.org/entity/Q523770',
 'Bramstedtlund': 'http://www.wikidata.org/entity/Q522802',
 'Brebel': 'http://www.wikidata.org/entity/Q557015',
 'Bredstedt': 'http://www.wikidata.org/entity/Q20753',
 'Brunsbüttel': 

## Generate questions

In [93]:
import time
import random
# Seed the generator from the wall clock (milliseconds since the epoch) and
# report the value, so that a generated question set can be reproduced later
# by calling random.seed(<printed value>) instead.
random_seed = round(time.time() * 1000)
random.seed(random_seed)
print(f"random seed: {random_seed}")

def get_random_parameters(parameters):
    """Draw one random value per independent parameter and resolve the dependent ones.

    Every key of ``parameters`` except PLACE_URL, DISTRICT_URL and
    DISTRICT_LABEL must map to a non-empty sequence; one element of it is
    picked at random. The three excluded keys are lookup tables: PLACE_URL is
    indexed by the drawn PLACE_LABEL, DISTRICT_URL and DISTRICT_LABEL are
    indexed by the resulting PLACE_URL.

    Args:
        parameters: dict of candidate sequences and the three lookup tables.

    Returns:
        dict with one concrete value per key: the independent keys in the
        order of ``parameters``, followed by PLACE_URL, DISTRICT_URL and
        DISTRICT_LABEL.

    Raises:
        ValueError: if an independent parameter has no candidate values.
        KeyError: if PLACE_LABEL is missing or a lookup table has no entry
            for the drawn place.
    """
    dependent_keys = ("PLACE_URL", "DISTRICT_URL", "DISTRICT_LABEL")  # TODO: needs to be generalized

    result = {}
    for key, candidates in parameters.items():
        if key in dependent_keys:
            continue  # dependent values are derived from the drawn place below
        if len(candidates) == 0:
            # name the offending parameter instead of failing with an opaque randrange error
            raise ValueError(f"no candidate values available for parameter '{key}'")
        result[key] = random.choice(candidates)

    # find the concrete district for the given place
    place_url = parameters["PLACE_URL"][result["PLACE_LABEL"]]
    result["PLACE_URL"] = place_url
    result["DISTRICT_URL"] = parameters["DISTRICT_URL"][place_url]
    result["DISTRICT_LABEL"] = parameters["DISTRICT_LABEL"][place_url]

    return result

def get_random_list_of_templates_questions(questions, number_of_questions_to_be_selected):
    """Return ``number_of_questions_to_be_selected`` random picks from ``questions``.

    Elements are drawn independently (with replacement), so the same element
    may appear several times and the result may be longer than ``questions``.
    """
    pool_size = len(questions)
    return [
        questions[random.randrange(pool_size)]
        for _ in range(number_of_questions_to_be_selected)
    ]

generated_questions = []

# Draw templates (with replacement) and instantiate every phrasing of each drawn template
# with its own freshly drawn set of concrete parameter values.
random_templates = get_random_list_of_templates_questions(templates, max_number_of_used_templates)
for template in random_templates:
    for question_pattern in template["questions"]:
        concrete_parameters = get_random_parameters(parameters)

        # substitute each [PLACEHOLDER] that occurs in the phrasing
        question = question_pattern
        for place_holder in concrete_parameters:
            question = question.replace(f"[{place_holder}]", str(concrete_parameters[place_holder]))

        # there might be predefined parameters of the question template;
        # on a key clash the drawn values take precedence
        predefined_parameters = template.get("parameters")
        if predefined_parameters is None:
            concrete_parameters_merged = concrete_parameters
        else:
            concrete_parameters_merged = {**predefined_parameters, **concrete_parameters}

        generated_questions.append(
            {
                "question": question,
                "replacements": concrete_parameters_merged,
                "template": template,
            }
        )

# Keep only a random subset if more questions were generated than allowed.
if len(generated_questions) > max_number_of_generated_questions:
    random.shuffle(generated_questions)  # compute random order
    generated_questions = generated_questions[:max_number_of_generated_questions]  # reduce number of questions

pprint(generated_questions, width=120)

import os

# Complete test definition: connection settings under "qanary", the generated
# questions under "tests" and the names of the validation query templates.
# The triplestore credentials can be overridden through the environment
# variables QANARY_TRIPLESTORE_USERNAME / QANARY_TRIPLESTORE_PASSWORD so that
# real credentials never have to be stored in the notebook; without them the
# previous defaults are used.
generated_configuration = {}
generated_configuration["qanary"] = {
    "system_url": "https://webengineering.ins.hs-anhalt.de:43740/startquestionansweringwithtextquestion",
    # "componentlist": ["LD-Shuyo", "coronabot-dialog-flow", "coronabot-question-classification", "coronabot-named-entity-recognition-time-en", "coronabot-named-entity-recognition-time-de", "coronabot-named-entity-recognition-location-en", "coronabot-named-entity-recognition-location-de", "OpenTapiocaNED", "LocationToGermanDistrict", "coronabot-missing-information", "coronabot-query-generation", "coronabot-data-acquisition", "coronabot-answer-generation"],
    "componentlist": [
        "LD-Shuyo",
        "coronabot-dialog-flow",
        "coronabot-question-classification",
        "coronabot-named-entity-recognition-time-en",
        "coronabot-named-entity-recognition-location-en",
        "OpenTapiocaNED",
        "LocationToGermanDistrict",
        "coronabot-missing-information",
        "coronabot-query-generation",
        "coronabot-data-acquisition",
        "coronabot-answer-generation",
    ],
    "qanary_triplestore_endpoint": "https://webengineering.ins.hs-anhalt.de:40159",
    "qanary_triplestore_database": "qanary",
    "qanary_triplestore_username": os.environ.get("QANARY_TRIPLESTORE_USERNAME", "admin"),
    "qanary_triplestore_password": os.environ.get("QANARY_TRIPLESTORE_PASSWORD", "admin"),
}
generated_configuration["tests"] = generated_questions
generated_configuration["validation-sparql-templates"] = [
    "01_language_detection.sparql",
    "02_question_classification.sparql",
    "03_time_recognition.sparql",
    "04_ned.sparql",
    "05_location_to_german_district.sparql",
    "06_coronabot_missing_information.sparql",
    "07_rki_query.sparql",
]
# generated_configuration["custom-validation"] = "dummy" # should not be defined if not required

pprint(generated_configuration, width=120)

print("number of questions: ", len(generated_questions))


[{'question': 'How many deaths were reported in Erfde?',
  'replacements': {'CONTAINS_PLACE': True,
                   'CONTAINS_TIME': False,
                   'DISTRICT_LABEL': 'Schleswig-Flensburg',
                   'DISTRICT_URL': 'http://www.wikidata.org/entity/Q2941',
                   'LANGUAGE': 'en',
                   'MONTH': 'February',
                   'PLACE_LABEL': 'Erfde',
                   'PLACE_URL': 'http://www.wikidata.org/entity/Q118540',
                   'QUESTION_CLASS': 'death_location',
                   'YEAR': 2021},
  'template': {'last': False,
               'parameters': {'CONTAINS_PLACE': True, 'CONTAINS_TIME': False, 'QUESTION_CLASS': 'death_location'},
               'questions': ['How many people died in [PLACE_LABEL]?',
                             'How many deaths were reported in [PLACE_LABEL]?',
                             'covid deaths in [PLACE_LABEL]',
                             'What is the current death toll in [PLACE_LABEL]?']}

## Output to file

In [92]:
import json 

# Write the complete test definition as pretty-printed JSON (4-space indent).
with open('sparql_test_query_templates/qanary-test-definition.json', 'w') as outfile:
    outfile.write(json.dumps(generated_configuration, indent=4))