In [22]:
from utils.openai_helpers import query_openai_model, query_openai_model_batch
from utils.preprocess_functions import get_labels_and_descriptions_for_triplets, format_triplets_text

In [4]:
from wikidata.client import Client

In [2]:
wikidata_properties = {
    "Geospatial Information": {
        "Moving Up the Hierarchy": {
            "Located in the administrative territorial entity": "P131",
            "Part of": "P361",
            "Located on terrain feature": "P706"
        },
        "Moving Down the Hierarchy": {
            "Contains administrative territorial entity": "P150",
            "Location": "P276",
            "Located on street": "P669"
        },
        "Lateral Connections": {
            "Location": "P276",
            "Located on terrain feature": "P706"
        },
        "General Context": {
            "Country": "P17",
            "Instance of": "P31"
        }
    },
    "Occupation and Professional Information": {
        "Moving Up the Hierarchy": {
            "Field of work": "P101",
            "Part of": "P361",
            "Academic discipline": "P136"
        },
        "Moving Down the Hierarchy": {
            "Has part": "P527",
            "Subclass of": "P279"
        },
        "Lateral Connections": {
            "Employer": "P108",
            "Member of": "P463",
            "Affiliation": "P1416"
        },
        "General Context": {
            "Occupation": "P106",
            "Position held": "P39"
        }
    },
    "Temporal Information": {
        "Moving Up the Hierarchy": {
            "Follows": "P155",
            "Part of": "P361"
        },
        "Moving Down the Hierarchy": {
            "Has part": "P527",
            "Subclass of": "P279"
        },
        "Lateral Connections": {
            "Concurrent with": "P1072",
            "Followed by": "P156"
        },
        "General Context": {
            "Point in time": "P585",
            "Duration": "P2047"
        }
    },
    "Biological and Taxonomic Information": {
        "Moving Up the Hierarchy": {
            "Parent taxon": "P171",
            "Part of": "P361"
        },
        "Moving Down the Hierarchy": {
            "Taxon rank": "P105",
            "Has part": "P527"
        },
        "Lateral Connections": {
            "Related to": "P2789",
            "Similar to": "P1889"
        },
        "General Context": {
            "Instance of": "P31",
            "Species": "P105"
        }
    },
    "Cultural and Creative Work": {
        "Moving Up the Hierarchy": {
            "Part of the series": "P179",
            "Genre": "P136"
        },
        "Moving Down the Hierarchy": {
            "Has part": "P527",
            "Subgenre": "P741"
        },
        "Lateral Connections": {
            "Inspired by": "P941",
            "Derivative work": "P4969"
        },
        "General Context": {
            "Instance of": "P31",
            "Creative work type": "P1448"
        }
    },
    "Historical and Event Information": {
        "Moving Up the Hierarchy": {
            "Part of": "P361",
            "Significant event": "P793"
        },
        "Moving Down the Hierarchy": {
            "Has part": "P527",
            "Instance of": "P31"
        },
        "Lateral Connections": {
            "Followed by": "P156",
            "Follows": "P155"
        },
        "General Context": {
            "Point in time": "P585",
            "Location": "P276"
        }
    },
    "Organizational Information": {
        "Moving Up the Hierarchy": {
            "Parent organization": "P749",
            "Part of": "P361"
        },
        "Moving Down the Hierarchy": {
            "Subsidiary": "P355",
            "Branch": "P1833"
        },
        "Lateral Connections": {
            "Affiliate": "P1416",
            "Member of": "P463"
        },
        "General Context": {
            "Industry": "P452",
            "Instance of": "P31"
        }
    },
    "Literary and Bibliographic Information": {
        "Moving Up the Hierarchy": {
            "Part of the series": "P179",
            "Genre": "P136"
        },
        "Moving Down the Hierarchy": {
            "Has part": "P527",
            "Subgenre": "P741"
        },
        "Lateral Connections": {
            "Inspired by": "P941",
            "Derivative work": "P4969"
        },
        "General Context": {
            "Instance of": "P31",
            "Author": "P50"
        }
    },
    "Scientific and Technical Information": {
        "Moving Up the Hierarchy": {
            "Field of work": "P101",
            "Part of": "P361"
        },
        "Moving Down the Hierarchy": {
            "Has part": "P527",
            "Subclass of": "P279"
        },
        "Lateral Connections": {
            "Related to": "P2789",
            "Similar to": "P1889"
        },
        "General Context": {
            "Instance of": "P31",
            "Research topic": "P2579"
        }
    },
    "Social and Demographic Information": {
        "Moving Up the Hierarchy": {
            "Part of": "P361",
            "Member of": "P463"
        },
        "Moving Down the Hierarchy": {
            "Subclass of": "P279",
            "Has part": "P527"
        },
        "Lateral Connections": {
            "Related to": "P2789",
            "Similar to": "P1889"
        },
        "General Context": {
            "Instance of": "P31",
            "Demographic group": "P1448"
        }
    }
}



In [5]:
def get_abstraction_nodes(qid, property_dict):
    client = Client()
    entity = client.get(qid, load=True)
    abstraction_nodes = {}

    def fetch_related_nodes(entity, properties):
        related_nodes = []
        for pid in properties.values():
            if pid in entity.data['claims']:
                related_entities = entity.data['claims'][pid]
                related_nodes.extend(
                    related_entity['mainsnak']['datavalue']['value']['id']
                    for related_entity in related_entities
                )
        return related_nodes

    for category, relations in property_dict.items():
        abstraction_nodes[category] = {}
        for relation_type, properties in relations.items():
            abstraction_nodes[category][relation_type] = fetch_related_nodes(entity, properties)

    return abstraction_nodes

In [7]:
qid = 'Q6366688' #'Q76' 
abstraction_nodes = get_abstraction_nodes(qid, wikidata_properties)
abstraction_nodes

{'Geospatial Information': {'Moving Up the Hierarchy': ['Q18094', 'Q5684342'],
  'Moving Down the Hierarchy': ['Q6739512'],
  'Lateral Connections': ['Q6739512'],
  'General Context': ['Q30', 'Q16917']},
 'Occupation and Professional Information': {'Moving Up the Hierarchy': ['Q5684342'],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': []},
 'Temporal Information': {'Moving Up the Hierarchy': ['Q5684342'],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': []},
 'Biological and Taxonomic Information': {'Moving Up the Hierarchy': ['Q5684342'],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': ['Q16917']},
 'Cultural and Creative Work': {'Moving Up the Hierarchy': [],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': ['Q16917']},
 'Historical and Event Information': {'Moving Up the Hierarchy': ['Q5684342'],
  'Moving Down the Hierarchy': ['Q16917'],

In [8]:
client = Client()

def get_label(qid, language='en'):
    entity = client.get(qid, load=True)
    if entity.label:
        if language in entity.label:
            return entity.label[language]
        else:

            return next(iter(entity.label.values()))
    return qid


def convert_qids_to_labels(abstraction_nodes, language='en'):
    abstraction_nodes_names = {}
    for category, relations in abstraction_nodes.items():
        abstraction_nodes_names[category] = {}
        for relation_type, qids in relations.items():
            labels = [get_label(qid, language) for qid in qids]
            abstraction_nodes_names[category][relation_type] = labels
    return abstraction_nodes_names

In [9]:
abstraction_nodes_names = convert_qids_to_labels(abstraction_nodes)

In [10]:
abstraction_nodes_names

{'Geospatial Information': {'Moving Up the Hierarchy': ['Honolulu',
   'Hawaii Pacific Health'],
  'Moving Down the Hierarchy': ['Makiki'],
  'Lateral Connections': ['Makiki'],
  'General Context': ['United States of America', 'hospital']},
 'Occupation and Professional Information': {'Moving Up the Hierarchy': ['Hawaii Pacific Health'],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': []},
 'Temporal Information': {'Moving Up the Hierarchy': ['Hawaii Pacific Health'],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': []},
 'Biological and Taxonomic Information': {'Moving Up the Hierarchy': ['Hawaii Pacific Health'],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': ['hospital']},
 'Cultural and Creative Work': {'Moving Up the Hierarchy': [],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': ['hospital']},
 'Historical and Event Information': {'Mo

In [11]:
combined_abstraction_list = []
for key, sub_dict in abstraction_nodes_names.items():
    for sub_key, value in sub_dict.items():
        combined_abstraction_list.extend(value)

In [13]:
combined_abstraction_list = list(set(combined_abstraction_list))

In [14]:
combined_abstraction_list

['Hawaii Pacific Health',
 'hospital',
 'Makiki',
 'Honolulu',
 'United States of America']

In [19]:
def get_abstraction_mcq_prompt(question, options):
    prompt = f'''
    For the following question, choose all suitable candidates from the provided options that can reasonably asnwer the question:
    
    Question: 
    {question}

    Options:
    {options}
    
    Respond with a JSON object with 'answer' key containing a list of the selected options, or an empty list if no suitable options are present.
    '''
    return prompt.replace('    ', '')

In [20]:
q = 'Where was the 44th president of the United States born?'
prompt = get_abstraction_mcq_prompt(q, combined_abstraction_list)

In [21]:
print(prompt)


For the following question, choose all suitable candidates from the provided options that can reasonably asnwer the question:

Question: 
Where was the 44th president of the United States born?

Options:
['Hawaii Pacific Health', 'hospital', 'Makiki', 'Honolulu', 'United States of America']

Respond with a JSON object with 'answer' key containing a list of the selected options, or an empty list if no suitable options are present.



In [24]:
response = query_openai_model(prompt)

In [26]:
import json

In [27]:
json.loads(response[0])

{'answer': ['Honolulu', 'United States of America']}