In [1]:
# prepare the few_shot samples

import os
import argparse
import os
import json
import ast
import re
import logging
import multiprocessing as mp
import wandb
import numpy as np
import datetime
import random
import aiohttp
import asyncio
import copy
from src.qa_prediction.evaluate_results import eval_result
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from functools import partial
from openai import OpenAI
from datasets import load_dataset
from src import utils
from src.utils import prompt_list_cwq, prompt_list_webqsp

dataset_name = "rmanluo/RoG-webqsp"

dataset = load_dataset(dataset_name, split="train")

  from .autonotebook import tqdm as notebook_tqdm


[2024-03-23 15:51:21,655] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
id_list = [
   "WebQTrn-471",
   "WebQTrn-485",
   "WebQTrn-928",
   "WebQTrn-999",
   "WebQTrn-2016"
]

shot_list = []
for data in tqdm(dataset):
   if data["id"] in id_list:
      shot_list.append(data)

100%|██████████| 2826/2826 [00:52<00:00, 54.18it/s]


In [3]:
def convert_tuples_to_string(tuples):
   elements = []
   for tup in tuples:
      for element in tup:
         if element not in elements:
               elements.append(element)
   return "->".join(elements) 

In [4]:
sample_list = []

In [5]:
def get_embedding(texts, model="text-embedding-3-small"):
    client = OpenAI(api_key="xx")
    response = client.embeddings.create(
        model=model, input=texts
    )
    return [item.embedding for item in response.data]

In [6]:
def prepare_options_for_each_step(q_entity, query, graph, next_entity="None") -> list:
    """
    prepare options for each step of the reasoning path
    """
    if next_entity == "None":
        raw_options = utils.get_entity_edges([q_entity], graph)
    else:
        raw_options = utils.get_entity_edges([next_entity], graph) # get edges of the entities 
    
    def vector_rag_engine(query, options, top_k=10):
        print("query: ", query)
        print("options: ", options)
        texts = [query] + options
        embeddings = get_embedding(texts)
        query_embedding = np.array(embeddings[0])
        option_embeddings = np.array(embeddings[1:])
        similarities = cosine_similarity([query_embedding], option_embeddings)
        top_k_indices = np.argsort(similarities[0])[-top_k:][::-1]
        top_k_options = [options[i] for i in top_k_indices]
        # corresponding_neighbors = [neighbors[i] for i in top_k_indices]
        
        return top_k_options
        # return [f"{i+1}: {option} -> {neighbor}" for i, (option, neighbor) in enumerate(zip(top_k_options, corresponding_neighbors))]
    
    retrieved_options = vector_rag_engine(query, raw_options) # de-duplicate the same options
    processed_options = []
    for i, option in enumerate(retrieved_options):
        processed_options.append(f"{i+1}: {option}")

    return processed_options

In [7]:
async with aiohttp.ClientSession() as session:
   for data in shot_list:
      graph = utils.build_graph(data["graph"])
      shortest_paths = utils.get_truth_paths(data["q_entity"], data["a_entity"], graph)
      negative_path_1_hop = utils.get_negative_paths(data["q_entity"], data["a_entity"], graph, n_neg=2, hop=2)
      negative_path_2_hop = utils.get_negative_paths(data["q_entity"], data["a_entity"], graph, n_neg=2, hop=3)
      shortest_paths_str_list = [convert_tuples_to_string(path) for path in shortest_paths]
      negative_path_1_hop_str_list = [convert_tuples_to_string(path) for path in negative_path_1_hop]
      negative_path_2_hop_str_list = [convert_tuples_to_string(path) for path in negative_path_2_hop]
      # raw_options, neighbors = utils.get_entity_edges(data["q_entity"], graph)
      # raw_options = [f"{i+1}: {option}" for i, option in enumerate(raw_options)]
      
      raw_options = prepare_options_for_each_step(data["q_entity"], data["question"], graph)
      
      index_list = []
      for i, option in enumerate(raw_options):
         for path in shortest_paths_str_list:
            if path.__contains__(option.split(": ")[1]):
               index_list.append(i+1)
               
      raw_options = "\n".join(raw_options)
      sample_list.append({
         "id": data["id"],
         "question": data["question"],
         "q_entity": data["q_entity"],
         "a_entity": data["a_entity"],
         "raw_options": raw_options,
         "index_list": index_list,
         "shortest_paths": shortest_paths_str_list,
         "negative_path_1_hop": negative_path_1_hop_str_list,
         "negative_path_2_hop": negative_path_2_hop_str_list
      })

query:  what art movements was henri matisse involved in
options:  ([], [])


TypeError: can only concatenate list (not "tuple") to list

In [None]:
for data in shot_list:
   graph = utils.build_graph(data["graph"])
   

In [None]:
with open(f'prompts/few_shot_samples_webqsp.txt', 'w') as file:
   file.write(json.dumps(sample_list, indent=3))

In [None]:
shortest_paths_str_list

['Selena Gomez->people.person.places_lived->m.04hmnxs->people.place_lived.location->New York City',
 'Selena Gomez->people.person.nationality->United States of America->location.location.containedby->New York City']

In [None]:
negative_path_2_hop

[[('Selena Gomez', 'music.artist.label', 'Avex Trax'),
  ('Avex Trax',
   'music.record_label.releases',
   'Super Eurobeat, Volume 110: Millennium Anniversary Non-Stop Megamix')],
 [('Selena Gomez', 'music.artist.track', 'Do It'),
  ('Do It', 'music.artist.track', 'Selena Gomez')]]

In [None]:
shortest_paths

[[('Selena Gomez', 'people.person.places_lived', 'm.04hmnxs'),
  ('m.04hmnxs', 'people.place_lived.location', 'New York City')],
 [('Selena Gomez', 'people.person.nationality', 'United States of America'),
  ('United States of America',
   'location.location.containedby',
   'New York City')]]

In [None]:
data["q_entity"]

['Selena Gomez']

In [None]:
x = "adasda"

In [None]:
y = "1: adasda"

False

In [9]:
x = "Given a question and the starting entity from a knowledge graph, you are asked to answer whether it's sufficient for you to answer the question with the following reasoning path. \nQuestion: what did james k polk do before he was president \nReasoning path: James K. Polk->\nbase.kwebbase.kwtopic.has_sentences->In 1825 Polk was elected to the U.S. House of Representatives, where he continued his support of Jackson, even though the House had passed him over as President in favor of John Quincy Adams .\nbase.kwebbase.kwsentence.kwtopic->In 1835, Jackson rewarded Polk by making him Speaker of the House.\nbase.kwebbase.kwsentence.kwtopic->In 1839, in an effort to win back Tennessee for the Democrats, Polk resigned as Speaker and ran for State Governor.\nbase.kwebbase.kwsentence.kwtopic->As he set about replacing federalists with loyal Democrats, Polk was deluged with job applicants, and some of his choices nettled the administration.\nbase.kwebbase.kwsentence.kwtopic->Four years later, when Jackson became President, Polk worked tirelessly on Jackson's program.\nIf it is sufficient to answer the question, respond with 'Yes'; otherwise, respond with 'No'"

In [10]:
x

"Given a question and the starting entity from a knowledge graph, you are asked to answer whether it's sufficient for you to answer the question with the following reasoning path. \nQuestion: what did james k polk do before he was president \nReasoning path: James K. Polk->\nbase.kwebbase.kwtopic.has_sentences->In 1825 Polk was elected to the U.S. House of Representatives, where he continued his support of Jackson, even though the House had passed him over as President in favor of John Quincy Adams .\nbase.kwebbase.kwsentence.kwtopic->In 1835, Jackson rewarded Polk by making him Speaker of the House.\nbase.kwebbase.kwsentence.kwtopic->In 1839, in an effort to win back Tennessee for the Democrats, Polk resigned as Speaker and ran for State Governor.\nbase.kwebbase.kwsentence.kwtopic->As he set about replacing federalists with loyal Democrats, Polk was deluged with job applicants, and some of his choices nettled the administration.\nbase.kwebbase.kwsentence.kwtopic->Four years later, whe

In [11]:
x= "Answer: [Yes, Yes, Yes, Yes, Yes]"

In [12]:
termination_checks = ast.literal_eval(x)

SyntaxError: invalid syntax (<unknown>, line 1)

In [1]:
x = "Jamaican English"

In [2]:
x.split(",")

['Jamaican English']

In [3]:
x.replace("Answer: ", "").replace("[", "").replace("]", "").split(", ")

['Jamaican English']

In [5]:
import re
# The string from which you want to extract the list of numbers
input_string = "Selected reasoning paths: [31, 32, 33, 34, 35]"

# Use regular expression to find all occurrences of one or more digits
matches = re.findall(r'\d+', input_string)

# Convert the found strings to integers
extracted_numbers = [int(match) for match in matches]

In [6]:
extracted_numbers

[31, 32, 33, 34, 35]