In [1]:
from datasets import load_dataset
import torch
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import time
from tqdm.notebook import tqdm
import numpy as np # linear algebra
import pandas as pd 

In [2]:
# Path prefix (including the trailing separator) under which create_txt
# writes one "question_<idx>.txt" file per question.
OUT_PATH = "D://sweb//extracted//"

# Question index referenced only by the commented-out single-question run.
question_id = 2

# Module-level counter; hop_forward declares it global but the visible cells
# never read or update it.
timer = 0

In [3]:
# Load the Mintaka question-answering dataset (all splits) through the
# `datasets` library.
dataset = load_dataset(
    "AmazonScience/mintaka",
    trust_remote_code=True,
)

In [4]:

class MyDataset(torch.utils.data.Dataset):
    """Thin ``torch`` Dataset wrapper around an indexable, sized collection."""

    def __init__(self, data):
        """Keep a reference to ``data``; no copy is made."""
        self.data = data

    def __getitem__(self, idx):
        """Return ``self.data[idx]`` unchanged."""
        sample = self.data[idx]
        return sample

    def __len__(self):
        """Return the number of items in the wrapped collection."""
        size = len(self.data)
        return size

# Only the test split is used in this notebook.
data = dataset['test']

# Wrap the split so it can be consumed as a torch Dataset.
dataset2 = MyDataset(data)

# Materialise every sample as one DataFrame row; the extraction helpers
# below look questions up in `df` by position.
df = pd.DataFrame(list(dataset2))
df

Unnamed: 0,id,lang,question,answerText,category,complexityType,questionEntity,answerEntity
0,fae46b21,en,What man was a famous American author and also...,Mark Twain,history,intersection,"[{'name': 'Q1497', 'entityType': 'entity', 'la...","[{'name': 'Q7245', 'label': 'Mark Twain'}]"
1,bc8713cc,en,How many Academy Awards has Jake Gyllenhaal be...,1,movies,count,"[{'name': 'Q133313', 'entityType': 'entity', '...","[{'name': 'Q106291', 'label': 'Academy Award f..."
2,d2a03f72,en,"Who is older, The Weeknd or Drake?",Drake,music,comparative,"[{'name': 'Q2121062', 'entityType': 'entity', ...","[{'name': 'Q33240', 'label': 'Drake'}]"
3,9a296167,en,How many children did Donald Trump have?,5,history,count,"[{'name': 'Q22686', 'entityType': 'entity', 'l...","[{'name': 'Q3713655', 'label': 'Donald Trump J..."
4,e343ad26,en,Is the main hero in Final Fantasy IX named Kuja?,No,videogames,yesno,"[{'name': 'Q474573', 'entityType': 'entity', '...",[]
...,...,...,...,...,...,...,...,...
3995,d52b03ee,en,Who was the first woman mayor of San Francisco?,Dianne Feinstein,politics,ordinal,"[{'name': 'Q62', 'entityType': 'entity', 'labe...","[{'name': 'Q230733', 'label': 'Dianne Feinstei..."
3996,07f2947c,en,Where was the last Republican mayor of Boston ...,"Portland, Maine",politics,ordinal,"[{'name': 'Q100', 'entityType': 'entity', 'lab...","[{'name': 'Q49201', 'label': 'Portland'}]"
3997,58727fb0,en,How long was the 53rd mayor of Boston in office?,"20 years, 6 months",politics,ordinal,"[{'name': 'Q100', 'entityType': 'entity', 'lab...",[]
3998,a25818a2,en,Who was the first British monarch to have a pr...,George I,politics,ordinal,"[{'name': 'Q145', 'entityType': 'entity', 'lab...","[{'name': 'Q130805', 'label': 'George I of Gre..."


In [5]:
def extract_question_entities(question_index):
    """Return the ``name`` of every question entity of one dataset row.

    Args:
        question_index: positional row index into the module-level ``df``.

    Returns:
        A list with the ``"name"`` value of each entry in that row's
        ``questionEntity`` cell, in the original order.
    """
    row = df.iloc[question_index]
    return [entity["name"] for entity in row["questionEntity"]]



In [6]:
def run_query(entities):
    """Fetch the direct Wikidata statements of the given entities.

    Args:
        entities: list of Wikidata entity IDs without the ``wd:`` prefix.
            The list is only read; it is never modified.

    Returns:
        ``(objects, triples)`` on success:

        * ``triples`` holds one ``"subject : predicate : object"`` string
          (built from the returned labels) per statement.
        * ``objects`` holds, for every statement whose raw value differs
          from its label, the last ``/``-separated segment of that value.

        ``([], [])`` when ``entities`` is empty, and ``None`` when the
        request fails (the error is printed).
    """
    if len(entities)==0:
        return [],[]

    # Build the VALUES list as a new string instead of rewriting
    # entities[0] in place: the caller's list must stay usable afterwards.
    values_clause = "".join(" wd:" + entity for entity in entities)

    endpoint_url = "https://query.wikidata.org/sparql"
    sparql = SPARQLWrapper(endpoint_url)

    query = """
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        PREFIX wikibase: <http://wikiba.se/ontology#>
        
        SELECT ?subLabel ?p2Label ?valueLabel ?value
        WHERE {{
          VALUES ?entity {{ {0} }}
          ?entity ?property ?value.
          BIND(?entity as ?sub)

          ?p2 wikibase:directClaim ?property.   
          FILTER(STRSTARTS(STR(?property), "http://www.wikidata.org/prop/direct/"))
          SERVICE wikibase:label {{
            bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".
          }}

    }}
     """.format(values_clause)

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    try:
        results = sparql.query().convert()
    except Exception as e:
        print("Error executing SPARQL query:", e)
        return None

    objects=[]
    triples=[]
    for statement in results["results"]["bindings"]:
        object_id = statement["value"]["value"]
        property_value = statement["p2Label"]["value"]
        object_value = statement["valueLabel"]["value"]
        sub_value = statement["subLabel"]["value"]

        # A value whose label differs from the raw value is kept as a
        # candidate for the next hop, reduced to its last URI segment.
        if object_id!=object_value:
            objects.append(str(object_id).split("/")[-1])
        triples.append(f"{sub_value} : {property_value} : {object_value}")

    return objects,triples



In [7]:
# Module-level accumulator that hop_forward appends every found triple to.
trips = []
# Maximum number of entities sent to run_query in a single request.
z = 300


def hop_forward(entities,k):
    """Expand ``entities`` outwards for up to ``k`` hops, collecting triples.

    Each hop queries the current entities through ``run_query`` in chunks of
    ``z``, appends the returned triples to the module-level ``trips`` list
    and uses the returned objects as the entities of the next hop.
    Expansion stops when ``k`` hops are done or a hop yields no objects.

    Returns nothing; results are only available through ``trips``. If
    ``run_query`` returns ``None`` the tuple unpacking raises ``TypeError``.
    """
    frontier = entities
    hops_left = k
    while hops_left != 0 and frontier != []:
        next_frontier = []
        for start in range(0, len(frontier), z):
            found_objects, found_triples = run_query(frontier[start:start + z])
            next_frontier.extend(found_objects)
            trips.extend(found_triples)
        frontier = next_frontier
        hops_left -= 1


In [8]:
def create_csv(all_triples,idx):
    """Write triples to ``question_{idx}.csv`` in the current directory.

    Args:
        all_triples: iterable of ``"subject : predicate : object"`` strings.
        idx: question index used in the output file name.

    The file has the columns ``subject``, ``predicate`` and ``object`` plus
    the default pandas index column.
    """
    # Split on the first two separators only: a label that itself contains
    # " : " would otherwise yield more than three fields and make the
    # DataFrame constructor raise. Any extra separators stay in the object.
    rows = [triple.split(" : ", 2) for triple in all_triples]
    df_triples = pd.DataFrame(rows,columns=["subject","predicate","object"])
    df_triples.to_csv(f"question_{idx}.csv")

In [9]:
def create_txt(all_triples,idx):
    """Write the triples of one question to ``{OUT_PATH}question_{idx}.txt``.

    Every ``" : "`` separator in a triple is replaced by a single space and
    each triple is written on its own line, UTF-8 encoded. An existing file
    is overwritten.
    """
    target = f"{OUT_PATH}question_{idx}.txt"
    with open(target,"w",encoding='utf-8') as out_file:
        out_file.writelines(
            triple.replace(" : ", " ") + "\n" for triple in all_triples
        )


In [10]:
def find_query(entity):
    """Look up the Wikidata ID of an item by its exact English label.

    Args:
        entity: the label text to search for.

    Returns:
        The ID (the part of the item URI after the ``wd:`` namespace) of the
        first binding returned, or ``None`` if the request fails (the error
        is printed).

    Raises:
        IndexError: if no item carries this exact English label.
    """
    # The label ends up inside a double-quoted SPARQL string literal, so
    # backslashes, quotes and line breaks must be escaped; otherwise such a
    # label produces a malformed (or altered) query. Backslash goes first so
    # the escapes added afterwards are not doubled.
    escaped_label = (
        entity.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    endpoint_url = "https://query.wikidata.org/sparql"
    sparql = SPARQLWrapper(endpoint_url)

    query = """
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        PREFIX wikibase: <http://wikiba.se/ontology#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        SELECT DISTINCT ?qid
        WHERE {{

        BIND( STRLANG("{0}", "en") AS ?label ) .
        ?item rdfs:label ?label .

        BIND(STRAFTER(STR(?item), STR(wd:)) AS ?qid) .

    }}
     """.format(escaped_label)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    try:
        results = sparql.query().convert()
    except Exception as e:
        print("Error executing SPARQL query:", e)
        return None

    statements = results["results"]["bindings"]
    return statements[0]['qid']['value']

In [11]:
def remove_occurences(l):
    """Normalise a list of entity references to Wikidata item IDs.

    Args:
        l: list of strings, each either an item ID (``Q`` followed by
            digits), the placeholder ``'None'``, or a label.

    Returns:
        A new list containing the item IDs as given and, for every other
        non-empty entry, the ID returned by ``find_query``. Entries that are
        empty, equal to ``'None'``, or cannot be resolved are dropped.
    """
    resolved = []
    for item in l:
        # Empty entries would otherwise fail on the first-character check.
        if not item or item == 'None':
            continue
        # Only "Q<digits>" is an item ID; a label that merely starts with
        # "Q" must go through the label lookup like any other text.
        if item[0] == "Q" and item[1:].isdigit():
            resolved.append(item)
            continue
        try:
            qid = find_query(item)
        except Exception:
            # Best-effort lookup: a label without a match (or any lookup
            # error) just means this entry contributes no entity.
            continue
        # find_query returns None when the request itself failed; keeping it
        # would put a non-string into the entity list.
        if qid is not None:
            resolved.append(qid)
    return resolved

In [12]:
def _one_hop_triples(entities, chunk_size):
    """Query ``entities`` in chunks of ``chunk_size`` and return their triples."""
    collected = []
    for start in range(0, len(entities), chunk_size):
        result = run_query(entities[start:start + chunk_size])
        if result is None:
            raise RuntimeError("SPARQL query failed while expanding neighbours")
        collected.extend(result[1])
    return collected


def ask_question(idx):
    """Collect and store the Wikidata triples around one question.

    The question's entities are de-duplicated and normalised with
    ``remove_occurences``, their statements are fetched with ``run_query``,
    and every object found is expanded by one more hop (in chunks of ``z``).
    The combined triples are written with ``create_txt``.

    Args:
        idx: positional index of the question.

    Returns:
        The list of triples: the question entities' own statements first,
        followed by the one-hop statements.

    Raises:
        RuntimeError: if ``run_query`` signals a failed request by returning
            ``None``; nothing is written for that question.
    """
    # dict.fromkeys de-duplicates while keeping the original order, so the
    # output does not depend on set iteration order.
    unique_entities = list(dict.fromkeys(extract_question_entities(idx)))
    question_entities = remove_occurences(unique_entities)

    result = run_query(question_entities)
    if result is None:
        raise RuntimeError(f"SPARQL query failed for question {idx}")
    objects, triples = result

    # Accumulate in locals only: this function runs concurrently in several
    # threads, and a shared module-level list would mix the triples of
    # different questions.
    all_triples = list(triples)
    all_triples.extend(_one_hop_triples(objects, z))

    create_txt(all_triples,idx)
    return all_triples

In [13]:
# all_triples = ask_question(question_id)
# len(all_triples)
#2565
#2721
#2722
# start=3000
# end=4000
# for i in tqdm(range(start,end)):
#     ask_question(i)
#     print(f"Question {i} : Done ")

In [14]:
import threading

# Worker executed by each thread
def thread_function(iterable, thread_id):
    """Run ``ask_question`` for every question index in ``iterable``.

    Args:
        iterable: question indices this worker is responsible for.
        thread_id: number shown in the progress-bar label and in messages.

    Returns:
        The list of indices whose processing raised an exception, in the
        order they were attempted. Failures never stop the worker.
    """
    progress_bar = tqdm(iterable, desc=f"Thread {thread_id}")
    failed = []

    for i in progress_bar:
        try:
            ask_question(i)
            progress_bar.set_postfix({"question": i})
        except Exception as exc:
            # Keep going on errors, but leave a trace: a silently skipped
            # question is indistinguishable from one never attempted.
            failed.append(i)
            print(f"Thread {thread_id}: question {i} failed: {exc!r}")
    return failed
# One range of question indices per worker thread
iterables = [
    range(3000, 3300),  # thread 0: questions 3000-3299
    range(3300, 3600),  # thread 1: questions 3300-3599
    range(3600, 4000),  # thread 2: questions 3600-3999
]

# Create one thread per range, then start them in order
threads = [
    threading.Thread(target=thread_function, args=(chunk, worker_id))
    for worker_id, chunk in enumerate(iterables)
]
for worker in threads:
    worker.start()

# Block until every worker has finished
for worker in threads:
    worker.join()


Thread 2:   0%|          | 0/400 [00:00<?, ?it/s]

Thread 0:   0%|          | 0/300 [00:00<?, ?it/s]

Thread 1:   0%|          | 0/300 [00:00<?, ?it/s]

In [None]:
import os
# Read every entry of OUT_PATH back into memory, one string per file.
l = []
for file_name in os.listdir(OUT_PATH):
    with open(OUT_PATH + file_name, "r", encoding='utf-8') as handle:
        l.append(handle.read())