In [26]:
from utils.openai_helpers import query_openai_model, query_openai_model_batch
from utils.wiki_helpers import get_abstraction_nodes, convert_abstraction_qids_to_labels, get_label
from utils.prompt_functions import get_abstraction_mcq_prompt

In [3]:
import pandas as pd
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np

In [4]:
wikidata_properties = {
    "Geospatial Information": {
        "Moving Up the Hierarchy": {
            "Located in the administrative territorial entity": "P131",
            "Part of": "P361",
            "Located on terrain feature": "P706"
        },
        "Moving Down the Hierarchy": {
            "Contains administrative territorial entity": "P150",
            "Location": "P276",
            "Located on street": "P669"
        },
        "Lateral Connections": {
            "Location": "P276",
            "Located on terrain feature": "P706"
        },
        "General Context": {
            "Country": "P17",
            "Instance of": "P31"
        }
    },
    "Occupation and Professional Information": {
        "Moving Up the Hierarchy": {
            "Field of work": "P101",
            "Part of": "P361",
            "Academic discipline": "P136"
        },
        "Moving Down the Hierarchy": {
            "Has part": "P527",
            "Subclass of": "P279"
        },
        "Lateral Connections": {
            "Employer": "P108",
            "Member of": "P463",
            "Affiliation": "P1416"
        },
        "General Context": {
            "Occupation": "P106",
            "Position held": "P39"
        }
    },
    "Temporal Information": {
        "Moving Up the Hierarchy": {
            "Follows": "P155",
            "Part of": "P361"
        },
        "Moving Down the Hierarchy": {
            "Has part": "P527",
            "Subclass of": "P279"
        },
        "Lateral Connections": {
            "Concurrent with": "P1072",
            "Followed by": "P156"
        },
        "General Context": {
            "Point in time": "P585",
            "Duration": "P2047"
        }
    },
    "Biological and Taxonomic Information": {
        "Moving Up the Hierarchy": {
            "Parent taxon": "P171",
            "Part of": "P361"
        },
        "Moving Down the Hierarchy": {
            "Taxon rank": "P105",
            "Has part": "P527"
        },
        "Lateral Connections": {
            "Related to": "P2789",
            "Similar to": "P1889"
        },
        "General Context": {
            "Instance of": "P31",
            "Species": "P105"
        }
    },
    "Cultural and Creative Work": {
        "Moving Up the Hierarchy": {
            "Part of the series": "P179",
            "Genre": "P136"
        },
        "Moving Down the Hierarchy": {
            "Has part": "P527",
            "Subgenre": "P741"
        },
        "Lateral Connections": {
            "Inspired by": "P941",
            "Derivative work": "P4969"
        },
        "General Context": {
            "Instance of": "P31",
            "Creative work type": "P1448"
        }
    },
    "Historical and Event Information": {
        "Moving Up the Hierarchy": {
            "Part of": "P361",
            "Significant event": "P793"
        },
        "Moving Down the Hierarchy": {
            "Has part": "P527",
            "Instance of": "P31"
        },
        "Lateral Connections": {
            "Followed by": "P156",
            "Follows": "P155"
        },
        "General Context": {
            "Point in time": "P585",
            "Location": "P276"
        }
    },
    "Organizational Information": {
        "Moving Up the Hierarchy": {
            "Parent organization": "P749",
            "Part of": "P361"
        },
        "Moving Down the Hierarchy": {
            "Subsidiary": "P355",
            "Branch": "P1833"
        },
        "Lateral Connections": {
            "Affiliate": "P1416",
            "Member of": "P463"
        },
        "General Context": {
            "Industry": "P452",
            "Instance of": "P31"
        }
    },
    "Literary and Bibliographic Information": {
        "Moving Up the Hierarchy": {
            "Part of the series": "P179",
            "Genre": "P136"
        },
        "Moving Down the Hierarchy": {
            "Has part": "P527",
            "Subgenre": "P741"
        },
        "Lateral Connections": {
            "Inspired by": "P941",
            "Derivative work": "P4969"
        },
        "General Context": {
            "Instance of": "P31",
            "Author": "P50"
        }
    },
    "Scientific and Technical Information": {
        "Moving Up the Hierarchy": {
            "Field of work": "P101",
            "Part of": "P361"
        },
        "Moving Down the Hierarchy": {
            "Has part": "P527",
            "Subclass of": "P279"
        },
        "Lateral Connections": {
            "Related to": "P2789",
            "Similar to": "P1889"
        },
        "General Context": {
            "Instance of": "P31",
            "Research topic": "P2579"
        }
    },
    "Social and Demographic Information": {
        "Moving Up the Hierarchy": {
            "Part of": "P361",
            "Member of": "P463"
        },
        "Moving Down the Hierarchy": {
            "Subclass of": "P279",
            "Has part": "P527"
        },
        "Lateral Connections": {
            "Related to": "P2789",
            "Similar to": "P1889"
        },
        "General Context": {
            "Instance of": "P31",
            "Demographic group": "P1448"
        }
    }
}



In [5]:
df = pd.read_csv('inputs/simple_multihop_examples - examples.csv')

In [6]:
df

Unnamed: 0,GENERATED_QUESTION,ITEM_1,ITEM_2,ITEM_3,ITEM_4,PROP_1,PROP_2,PROP_3
0,What is the seal described by the country that...,Q4871749,Q30,Q171663,Q14213,P17,P418,P2378
1,What is the author of the national anthem of t...,Q5248175,Q145,Q40807,Q1606590,P27,P85,P50
2,What is the main article of the list that is t...,Q669125,Q9726,Q3047934,Q207628,P86,P1455,P360
3,What is the geography associated with the coun...,Q3974535,Q1237470,Q38,Q216989,P175,P495,P2633
4,What is the location of the headquarters of th...,Q1721298,Q2736,Q253414,Q72,P641,P3719,P159
...,...,...,...,...,...,...,...,...
95,What is the name day associated with the first...,Q76535,Q292691,Q2390,Q110,P735,P1750,P361
96,What is the head of government of the administ...,Q2084020,Q170578,Q43783,Q10304754,P19,P131,P6
97,What is the currency used by the country that ...,Q7497242,Q1033,Q5440850,Q1485655,P27,P155,P38
98,What is the history of the writing system used...,Q6174806,Q5576697,Q8229,Q3772237,P734,P282,P2184


In [7]:
# prompts_list = []
# for i in range(len(df)):
#     question = df.iloc[i]['GENERATED_QUESTION']
#     answer_qid = df.iloc[i]['ITEM_4']
#     abstraction_nodes = get_abstraction_nodes(answer_qid, wikidata_properties)
#     abstraction_nodes_names = convert_abstraction_qids_to_labels(abstraction_nodes)
#     combined_abstraction_list = []
#     for key, sub_dict in abstraction_nodes_names.items():
#         for sub_key, value in sub_dict.items():
#             combined_abstraction_list.extend(value)
#     combined_abstraction_list = list(set(combined_abstraction_list))
#     prompt = get_abstraction_mcq_prompt(question, combined_abstraction_list)
#     prompts_list.append(prompt)

In [8]:
def process_row(index, row, wikidata_properties):
    question = row['GENERATED_QUESTION']
    answer_qid = row['ITEM_4']
    answer = get_label(answer_qid)
    abstraction_nodes = get_abstraction_nodes(answer_qid, wikidata_properties)
    abstraction_nodes_names, mapping_label_to_qid = convert_abstraction_qids_to_labels(abstraction_nodes)
    combined_abstraction_list = list(mapping_label_to_qid.keys())

    
    prompt = get_abstraction_mcq_prompt(question, combined_abstraction_list)
    # print(f"Processed row {index}")
    return index, prompt, mapping_label_to_qid, answer

def generate_prompts_multi(df, wikidata_properties, max_workers=10):
    prompts_dict = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_row, i, df.iloc[i], wikidata_properties): i for i in range(len(df))}
        with tqdm(total=len(df)) as pbar:
            for future in as_completed(futures):
                index, result, mapping_dict_abs, answer = future.result()
                prompts_dict[index] = result, mapping_dict_abs, answer
                pbar.update(1)
    return prompts_dict

In [8]:
prompts_data = generate_prompts_multi(df, wikidata_properties)

100%|██████████| 100/100 [03:31<00:00,  2.12s/it]


In [9]:
process_row(0, df.iloc[0], wikidata_properties)?

Processed row 0


(0,
 "\nFor the following question, choose all suitable candidates from the provided options that can reasonably asnwer the question:\n\nQuestion: \nWhat is the seal described by the country that hosted the Battle of Mobley's Meeting House issued by?\n\nOptions:\n['United States Cabinet', 'United States of America', 'public office', 'foreign minister', 'secretary of state', 'Secretary of State']\n\nRespond with a JSON object with 'answer' key containing a list of the selected options, or an empty list if no suitable options are present.\n",
 {'United States Cabinet': 'Q639738',
  'United States of America': 'Q30',
  'public office': 'Q294414',
  'foreign minister': 'Q7330070',
  'secretary of state': 'Q736559',
  'Secretary of State': 'Q533501'})

In [10]:

np.save('inputs/prompts_list_simple_examples.npy', prompts_data)

In [9]:
prompts_data = np.load('inputs/prompts_list_simple_examples.npy', allow_pickle=True).item()

In [10]:
prompts_data

{9: ("\nFor the following question, choose all suitable candidates from the provided options that can reasonably asnwer the question:\n\nQuestion: \nWhat is the birthplace of the patron saint of the sport played in the 2009 NCAA Division I Baseball Tournament?\n\nOptions:\n['Cascia', 'Italy', 'frazione']\n\nRespond with a JSON object with 'answer' key containing a list of the selected options, or an empty list if no suitable options are present.\n",
  {'Cascia': 'Q20384', 'Italy': 'Q38', 'frazione': 'Q1134686'}),
 3: ("\nFor the following question, choose all suitable candidates from the provided options that can reasonably asnwer the question:\n\nQuestion: \nWhat is the geography associated with the country of origin of the performer who played Stormbringer Ruler?\n\nOptions:\n['Italy', 'geography of geographic location', 'geography of Europe']\n\nRespond with a JSON object with 'answer' key containing a list of the selected options, or an empty list if no suitable options are present

In [11]:
prompts = [prompts_data[i][0] for i in range(len(prompts_data))]
mapping_dics = [prompts_data[i][1] for i in range(len(prompts_data))]

In [12]:
mapping_dics

[{'United States Cabinet': 'Q639738',
  'United States of America': 'Q30',
  'public office': 'Q294414',
  'foreign minister': 'Q7330070',
  'secretary of state': 'Q736559',
  'Secretary of State': 'Q533501'},
 {'human': 'Q5', 'poet': 'Q49757', 'composer': 'Q36834', 'writer': 'Q36180'},
 {'type of work of art': 'Q116474095',
  'musical work': 'Q2188189',
  'composer': 'Q36834',
  'composition': 'Q462437',
  'musical work/composition': 'Q105543609',
  'literary composing': 'Q1333743',
  'music composing': 'Q11895763',
  'music composition': 'Q105107008'},
 {'Italy': 'Q38',
  'geography of geographic location': 'Q46865913',
  'geography of Europe': 'Q119716'},
 {'Zürich District': 'Q660732',
  'Greater Zurich Area': 'Q30998',
  'RZU': 'Q95080684',
  'Zurich metropolitan area': 'Q690149',
  'Canton of Zürich': 'Q11943',
  'Kreis 1': 'Q445559',
  'District 2': 'Q456153',
  'Kreis 3': 'Q675017',
  'Kreis 4': 'Q677133',
  'Kreis 5': 'Q460885',
  'District 6': 'Q456170',
  'District 7': 'Q456

In [13]:
for i in range(len(prompts)):
    if len(mapping_dics[i]) == 0:
        print(f"Empty mapping dict for index {i}")
        continue

In [14]:
responses = query_openai_model_batch(prompts)

Processing prompts:   0%|          | 0/100 [00:00<?, ?it/s]

Processing prompts: 100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


In [15]:
responses

{0: ('{\n  "answer": [\n    "United States of America"\n  ]\n}',
  CompletionUsage(completion_tokens=16, prompt_tokens=124, total_tokens=140)),
 3: ('{\n  "answer": [\n    "geography of Europe"\n  ]\n}',
  CompletionUsage(completion_tokens=16, prompt_tokens=110, total_tokens=126)),
 1: ('{\n  "answer": ["poet", "composer", "writer"]\n}',
  CompletionUsage(completion_tokens=16, prompt_tokens=105, total_tokens=121)),
 7: ('{\n  "answer": [\n    "Finland"\n  ]\n}',
  CompletionUsage(completion_tokens=14, prompt_tokens=204, total_tokens=218)),
 6: ('{\n  "answer": [\n    "As-Salam al-Malaki"\n  ]\n}',
  CompletionUsage(completion_tokens=19, prompt_tokens=116, total_tokens=135)),
 2: ('{\n  "answer": [\n    "musical work",\n    "composition",\n    "musical work/composition",\n    "music composition"\n  ]\n}',
  CompletionUsage(completion_tokens=32, prompt_tokens=139, total_tokens=171)),
 5: ('{\n  "answer": [\n    "United States of America",\n    "secretary of state",\n    "Secretary of Sta

In [19]:
abstract_valid_answers = {}
for i in range(len(responses)):
    response = responses[i][0]
    mapping_dict = mapping_dics[i]
    response = json.loads(response)
    abstract_valid_answers[i] = response['answer']

In [None]:
abstract_valid_answers

In [23]:
df['ABSTRACT_VALID'] = list(abstract_valid_answers.values())

In [24]:
df

Unnamed: 0,GENERATED_QUESTION,ITEM_1,ITEM_2,ITEM_3,ITEM_4,PROP_1,PROP_2,PROP_3,ABSTRACT_VALID
0,What is the seal described by the country that...,Q4871749,Q30,Q171663,Q14213,P17,P418,P2378,[United States of America]
1,What is the author of the national anthem of t...,Q5248175,Q145,Q40807,Q1606590,P27,P85,P50,"[poet, composer, writer]"
2,What is the main article of the list that is t...,Q669125,Q9726,Q3047934,Q207628,P86,P1455,P360,"[musical work, composition, musical work/compo..."
3,What is the geography associated with the coun...,Q3974535,Q1237470,Q38,Q216989,P175,P495,P2633,[geography of Europe]
4,What is the location of the headquarters of th...,Q1721298,Q2736,Q253414,Q72,P641,P3719,P159,"[Zürich District, Greater Zurich Area, Zurich ..."
...,...,...,...,...,...,...,...,...,...
95,What is the name day associated with the first...,Q76535,Q292691,Q2390,Q110,P735,P1750,P361,[]
96,What is the head of government of the administ...,Q2084020,Q170578,Q43783,Q10304754,P19,P131,P6,[governor of Sergipe]
97,What is the currency used by the country that ...,Q7497242,Q1033,Q5440850,Q1485655,P27,P155,P38,[Nigeria]
98,What is the history of the writing system used...,Q6174806,Q5576697,Q8229,Q3772237,P734,P282,P2184,[aspect of history]


In [25]:
df.to_csv('outputs/simple_multihop_examples_with_abstract_valid.csv', index=False)

In [18]:
json.loads(responses[1][0])

{'answer': ['poet', 'composer', 'writer']}

In [4]:
qid = 'Q6366688' #'Q76' 
abstraction_nodes = get_abstraction_nodes(qid, wikidata_properties)
abstraction_nodes

{'Geospatial Information': {'Moving Up the Hierarchy': ['Q18094', 'Q5684342'],
  'Moving Down the Hierarchy': ['Q6739512'],
  'Lateral Connections': ['Q6739512'],
  'General Context': ['Q30', 'Q16917']},
 'Occupation and Professional Information': {'Moving Up the Hierarchy': ['Q5684342'],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': []},
 'Temporal Information': {'Moving Up the Hierarchy': ['Q5684342'],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': []},
 'Biological and Taxonomic Information': {'Moving Up the Hierarchy': ['Q5684342'],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': ['Q16917']},
 'Cultural and Creative Work': {'Moving Up the Hierarchy': [],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': ['Q16917']},
 'Historical and Event Information': {'Moving Up the Hierarchy': ['Q5684342'],
  'Moving Down the Hierarchy': ['Q16917'],

In [5]:
abstraction_nodes_names = convert_abstraction_qids_to_labels(abstraction_nodes)

In [6]:
abstraction_nodes_names

{'Geospatial Information': {'Moving Up the Hierarchy': ['Honolulu',
   'Hawaii Pacific Health'],
  'Moving Down the Hierarchy': ['Makiki'],
  'Lateral Connections': ['Makiki'],
  'General Context': ['United States of America', 'hospital']},
 'Occupation and Professional Information': {'Moving Up the Hierarchy': ['Hawaii Pacific Health'],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': []},
 'Temporal Information': {'Moving Up the Hierarchy': ['Hawaii Pacific Health'],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': []},
 'Biological and Taxonomic Information': {'Moving Up the Hierarchy': ['Hawaii Pacific Health'],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': ['hospital']},
 'Cultural and Creative Work': {'Moving Up the Hierarchy': [],
  'Moving Down the Hierarchy': [],
  'Lateral Connections': [],
  'General Context': ['hospital']},
 'Historical and Event Information': {'Mo

In [7]:
combined_abstraction_list = []
for key, sub_dict in abstraction_nodes_names.items():
    for sub_key, value in sub_dict.items():
        combined_abstraction_list.extend(value)

In [8]:
combined_abstraction_list = list(set(combined_abstraction_list))

In [9]:
combined_abstraction_list

['hospital',
 'Honolulu',
 'Makiki',
 'United States of America',
 'Hawaii Pacific Health']

In [11]:
q = 'Where was the 44th president of the United States born?'
prompt = get_abstraction_mcq_prompt(q, combined_abstraction_list)

In [12]:
print(prompt)


For the following question, choose all suitable candidates from the provided options that can reasonably asnwer the question:

Question: 
Where was the 44th president of the United States born?

Options:
['hospital', 'Honolulu', 'Makiki', 'United States of America', 'Hawaii Pacific Health']

Respond with a JSON object with 'answer' key containing a list of the selected options, or an empty list if no suitable options are present.



In [24]:
response = query_openai_model(prompt)

In [27]:
json.loads(response[0])

{'answer': ['Honolulu', 'United States of America']}