# Hierarchy construction of TabFact-Wiki

In [None]:
import pandas as pd
from collections import deque
import ast
import os
import pickle
import networkx as nx
import os.path
import pickle
import sys

import io
import requests

from concurrent.futures import ThreadPoolExecutor
import networkx as nx

In [None]:
# Query the chain of superclasses (the sub-hierarchy) above each lowest-level type
def query_wikidata_api(wiki_url, timeout=60):
    """Query Wikidata for the class chain of the item described by a wiki page.

    The SPARQL query resolves ``wiki_url`` to its item via ``schema:about``,
    takes each ``instance of`` (P31) class and follows six consecutive
    ``subclass of`` (P279) links. None of the patterns is OPTIONAL, so an
    item whose chain is shorter than six levels yields no bindings.

    Args:
        wiki_url: URL of the wiki page whose subject should be looked up.
        timeout: Seconds to wait for the endpoint before giving up. Without
            a timeout a stalled connection would block its worker thread
            indefinitely.

    Returns:
        The decoded JSON response, or ``None`` when the request fails,
        the endpoint answers with a non-200 status, or the body is not JSON.
    """
    # Wikidata SPARQL endpoint URL
    endpoint_url = "https://query.wikidata.org/sparql"

    # Define the request headers with the user agent
    headers = {
        "User-Agent": "My-App/1.0"
    }

    # Define the request parameters
    sparql_query = f"""
    SELECT ?x ?xLabel ?classLabel ?superclassLabel ?superclass2Label ?superclass3Label ?superclass4Label ?superclass5Label ?superclass6Label WHERE {{
    <{wiki_url}> schema:about ?x.
    ?x wdt:P31 ?class. #instance Of
    ?class wdt:P279 ?superclass. #subclass of superclass1
    ?superclass wdt:P279 ?superclass2. #subclass of superclass2
    ?superclass2 wdt:P279 ?superclass3. #subclass of superclass3
    ?superclass3 wdt:P279 ?superclass4. #subclass of superclass4
    ?superclass4 wdt:P279 ?superclass5. #subclass of superclass5
    ?superclass5 wdt:P279 ?superclass6. #subclass of superclass6
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """

    # Send the GET request to the Wikidata API. Connection problems are
    # reported and turned into None so that one bad URL does not abort a
    # whole batch of parallel queries.
    try:
        response = requests.get(
            endpoint_url,
            params={"format": "json", "query": sparql_query},
            headers=headers,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"Error: Unable to fetch data for {wiki_url}. ({exc})")
        return None

    if response.status_code != 200:
        print(f"Error: Unable to fetch data for {wiki_url}.")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # A 200 response with a non-JSON body (e.g. an HTML error page).
        print(f"Error: Unable to fetch data for {wiki_url}. ({exc})")
        return None




In [None]:
# Parallel querying

def query_wikidata_parallel(wiki_urls):
    """Query Wikidata for many pages concurrently and save one CSV per page.

    Args:
        wiki_urls: Mapping of output file name -> wiki URL. Each URL is sent
            to ``query_wikidata_api``; the key is used as the CSV file name
            inside ``datasets/TabFact/Label`` under the current directory.

    Returns:
        A list with one DataFrame per URL that produced at least one result
        row, in the iteration order of ``wiki_urls``. URLs whose query failed
        or returned no bindings are skipped and get no CSV.
    """
    # Materialise names and URLs once; keys() and values() of the same dict
    # iterate in matching order, and executor.map preserves input order.
    file_names = list(wiki_urls.keys())
    urls = list(wiki_urls.values())

    # Query the Wikidata API for each URL in parallel
    with ThreadPoolExecutor() as executor:
        results_json_list = list(executor.map(query_wikidata_api, urls))

    # Make sure the output directory exists before the first to_csv call.
    label_dir = os.path.join(os.getcwd(), "datasets/TabFact/Label")
    os.makedirs(label_dir, exist_ok=True)

    # Variables selected by the SPARQL query, in output column order.
    fields = (
        "x",
        "xLabel",
        "classLabel",
        "superclassLabel",
        "superclass2Label",
        "superclass3Label",
        "superclass4Label",
        "superclass5Label",
        "superclass6Label",
    )

    # Convert the JSON results to DataFrames
    dataframes = []
    for file_name, results_json in zip(file_names, results_json_list):
        if not results_json:
            continue
        rows = [
            {field: item[field]["value"] for field in fields}
            for item in results_json["results"]["bindings"]
        ]
        if not rows:
            continue
        df = pd.DataFrame(rows)
        df.to_csv(os.path.join(label_dir, file_name))
        dataframes.append(df)

    return dataframes

#Start crawling
def parallel_crawling(gt_csv: pd.DataFrame, chunk_size=400, max_chunks=11):
    """Crawl Wikidata for the tables listed in ``gt_csv``, one chunk at a time.

    Rows are taken in consecutive, non-overlapping chunks. For each chunk a
    mapping of column 0 -> column 2 is passed to ``query_wikidata_parallel``,
    which uses the keys as output file names and the values as wiki URLs.

    Args:
        gt_csv: DataFrame indexed by position with ``.iloc``.
        chunk_size: Number of rows queried per batch.
        max_chunks: Upper bound on the number of batches; ``None`` processes
            every row. The default keeps the previous cap of 11 batches.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    total_rows = len(gt_csv)
    # Ceiling division: the last chunk may be shorter than chunk_size.
    total_chunks = -(-total_rows // chunk_size)
    if max_chunks is not None:
        total_chunks = min(total_chunks, max_chunks)

    for chunk_index in range(total_chunks):
        start = chunk_index * chunk_size
        # The slice end is exclusive, so start + chunk_size covers exactly
        # chunk_size rows and no row between two chunks is skipped.
        chunk = gt_csv.iloc[start:start + chunk_size]

        result_diction = dict(zip(chunk.iloc[:, 0], chunk.iloc[:, 2]))

        query_wikidata_parallel(result_diction)


In [None]:

def compute_max_distance(graph, start_node, target_node):
    """Walk ``graph`` breadth-first from ``start_node`` towards ``target_node``.

    Neighbours are obtained through ``graph.successors(node)``. Each node is
    processed only the first time it is dequeued.

    Returns:
        The largest depth recorded up to and including the moment
        ``target_node`` is dequeued. If the target is never reached, the
        largest depth recorded over every node reachable from ``start_node``.
    """
    seen = set()
    farthest = -1
    pending = deque()
    pending.append((start_node, 0))  # entries are (node, distance)

    while pending:
        current, depth = pending.popleft()
        if current in seen:
            continue

        seen.add(current)
        if depth > farthest:
            farthest = depth

        if current == target_node:
            return farthest

        pending.extend((nxt, depth + 1) for nxt in graph.successors(current))

    return farthest

## Start constructing the data

In [None]:
# the target path of TabFact data
# Root directory of the TabFact data, relative to the current working directory.
target_path = os.path.join(os.getcwd(), "datasets/TabFact/")


# Labels treated as too abstract; checked with `in` / `not in` by the
# (currently disabled) schema.org variant of the graph construction.
# Note: 'historical source' and the following 'interaction' used to be fused
# into one string by a missing comma; they are separate entries now.
abstract = [ 'PhysicalActivity','object', 'result', 'temporal entity', 'inconsistency', 'noun', 'noun phrase', 'remains', 'use',
            'independent continuant', 'observable entity', 'artificial entity', 'natural physical object',
            'occurrence', 'relation', 'group of physical objects', 'economic entity', 'group of works',
            'concrete object', 'three-dimensional object', 'part', 'geographic entity', 'artificial geographic entity',
            'source', 'group or class of physical objects', 'role', 'phenomenon', 'physical entity', 'means',
            'spatio-temporal entity', 'spatial entity', 'one-dimensional space', 'physical object',
            'continuant', 'collective entity', 'space object', 'type', 'information', 'anatomical entity',
            'output', 'abstract object', 'class', 'non-physical entity', 'integral', 'quantity', 'former entity',
            'occurrent', 'cause', 'idiom', 'lect', 'modification', 'alteration', 'control', 'consensus',
            'social relation', 'process', 'rivalry', 'mental process', 'condition',
            'social phenomenon', 'manifestation', 'work', 'source of information', 'knowledge type', 'action',
            'time interval', 'interaction', 'record', 'language variety', 'intentional human activity',
            'status', 'group of living things', 'agent', 'sign', 'content', 'converter', 'resource', 'metaclass',
            'unit', 'human activity','effect', 'archives', 'sub-fonds', 'evaluation',
            'interface', 'contributing factor', 'undesirable characteristic', 'structure', 'method', 'matter', 'change',
            'physical phenomenon', 'binary relation', 'building work', 'power', 'management', 'long, thin object',
            'definite integral', 'physical property', 'multi-organism process', 'data', 'multiset', 'line',
            'proper noun', 'physicochemical process', 'group', 'collection', 'historical source',
            'interaction', 'information resource', 'list', 'plan', 'scale', 'memory', 'social structure',
            'source text', 'open content', 'written work', 'strategy', 'group of humans', 'system', 'deformation',
            'representation', 'multicellular organismal process', 'operator', 'social system']
# Type names at which the disabled schema.org variant stops climbing a label chain.
top = ['Place', 'Action', 'Intangible', 'Organization', 'CreativeWork', 'MedicalEntity', 'BioChemEntity', 'Event', 'Product', 'Person', 'Taxon']



In [None]:


# Load the source-table index and compare it with the label files already
# present in the Label directory.
ground_label_name1 = "01SourceTables.csv"
data_path = os.path.join(os.getcwd(), "datasets/TabFact/", ground_label_name1)
ground_truth_csv = pd.read_csv(data_path, encoding='latin-1')

# Positional lookups: column 0 -> column 2 and column 0 -> column 4.
# NOTE(review): presumably column 0 is the file name; confirm against the CSV header.
first_column = ground_truth_csv.iloc[:, 0]
result_dict = {key: value for key, value in zip(first_column, ground_truth_csv.iloc[:, 2])}

# File names listed in the index vs. files that exist in the Label directory.
names = ground_truth_csv["fileName"].unique()
labels = os.listdir(os.path.join(os.getcwd(), "datasets/TabFact/Label"))
no_labels = [name for name in names if name not in labels]
# ground_truth_csv = ground_truth_csv[ground_truth_csv["fileName"].isin(no_labels)]
ground_truth = {key: value for key, value in zip(ground_truth_csv.iloc[:, 0], ground_truth_csv.iloc[:, 4])}


# Build a lookup word -> similar word(s) from the pickled similarity data.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load a "filter_sim_all.pkl" that comes from a trusted source.
similar_words = {}
with open("filter_sim_all.pkl", "rb") as file:
    all_sims = pickle.load(file)

# Each value of all_sims is iterated as a collection of pairs; element 0 of a
# pair is the word and element 1 its similar word.
# NOTE(review): the meaning of the outer keys is not visible here; they are unused.
for sim_pairs in all_sims.values():
    # `pair` rather than `tuple`, so the builtin is not rebound at notebook scope.
    for pair in sim_pairs:
        word = pair[0]
        matches = similar_words.setdefault(word, [])
        if pair[1] not in matches:
            matches.append(pair[1])

# Collapse single-element lists to the bare value; longer lists stay lists.
# Only values are replaced, so mutating during iteration is safe.
for word, similar_word_list in similar_words.items():
    if len(similar_word_list) == 1:
        similar_words[word] = similar_word_list[0]
print(similar_words)
#unique_items = list(set(similar_words.values()))



# Build the type hierarchy as a directed graph G.
# Every edge is added as (labels_table[i + 1], labels_table[i]) for adjacent
# non-null cells of a label-file row.
# NOTE(review): columns 3:9 of a label file are presumably classLabel ..
# superclass5Label (as written by query_wikidata_parallel with the index as
# column 0), which would make edges point from a superclass to its subclass
# and leave superclass6Label unused - confirm against an actual label file.
# node_length is not used anywhere in this block.
node_length = 0
G = nx.DiGraph()
for index, row in ground_truth_csv.iterrows():
    # Tables that have a label file: derive edges from every row of that file.
    if row["fileName"] in labels:
        label_path = os.path.join(os.getcwd(), "datasets/TabFact/Label")
        df = pd.read_csv(os.path.join(label_path, row["fileName"]), encoding='UTF-8').iloc[:, 3:9]
        for _, row2 in df.iterrows():
            # Non-null cells of the row, in column order.
            labels_table = row2.dropna().tolist()
            for i in range(len(labels_table) - 1):
                # Skip pairs where both cells hold the same label (would be a self-loop).
                if labels_table[i + 1] != labels_table[i]:
                    #if labels_table[i + 1] not in abstract and labels_table[i] not in abstract:
                        child_type = labels_table[i]
                        if labels_table[i + 1] in G.nodes():
                                # Only add the edge when the child is not already an
                                # ancestor of the parent; otherwise it would close a cycle.
                                if labels_table[i] not in nx.ancestors(G, labels_table[i + 1]):
                                            G.add_edge(labels_table[i + 1], child_type)
                                            # This `continue` is the last statement of the loop body.
                                            continue
                        else:
                                        # Parent not in the graph yet, so no cycle is possible.
                                        G.add_edge(labels_table[i + 1], child_type)
                                        continue
    else:
        # Tables without a label file: fall back to the "class"/"superclass"
        # columns of the index, unless "class" is a single space.
        if row["class"] != " ":
            # Note the naming: `superclass` holds row["class"] and `classX`
            # holds row["superclass"]; the edge goes from row["class"] to
            # row["superclass"].
            # NOTE(review): confirm this direction matches the label-file branch.
            superclass = row["class"]
            classX = row["superclass"]
            all_nodes = {superclass, classX}
            all_nodes = all_nodes - set(G.nodes())
            # add_edge would create missing nodes as well; they are added explicitly here.
            G.add_nodes_from(all_nodes)
            G.add_edge(superclass,classX)


In [None]:
# Dump hierarchy


# Serialise the hierarchy graph G into the TabFact data directory.
graph_dump_path = os.path.join(target_path, "graphGroundTruth2.pkl")
with open(graph_dump_path, "wb") as file:
    pickle.dump(G, file)

In [None]:
# Disabled variant of the graph construction that combines the hierarchy with
# schema.org. It is kept as a bare string literal, so it never executes.
# Compared with the active loop it skips labels listed in `abstract`, maps
# labels through `similar_words`, stops a chain at types listed in `top`,
# and ignores labels containing "process".
"""
for index, row in ground_truth_csv.iterrows():
    if row["fileName"] in labels:
        label_path = os.path.join(os.getcwd(), "datasets/TabFact/Label")
        df = pd.read_csv(os.path.join(label_path, row["fileName"]), encoding='UTF-8').iloc[:, 3:9]
        for _, row2 in df.iterrows():
            labels_table = row2.dropna().tolist()
            for i in range(len(labels_table) - 1):
                if labels_table[i + 1] != labels_table[i]:
                    if labels_table[i + 1] not in abstract and labels_table[i] not in abstract:
                        child_type = similar_words[labels_table[i]] \
                            if labels_table[i] in similar_words.keys() else labels_table[i]
                        if child_type in top:
                            break
                        else:
                            if labels_table[i + 1] in G.nodes():
                                if labels_table[i] not in nx.ancestors(G, labels_table[i + 1]):
                                    if labels_table[i + 1] not in similar_words.keys():
                                        if labels_table[i + 1] != child_type \
                                                and "process" not in labels_table[i + 1].lower() \
                                                and "process" not in child_type.lower():
                                            G.add_edge(labels_table[i + 1], child_type)

                                            continue
                                    else:
                                        if similar_words[labels_table[i + 1]] != child_type and "process" not in \
                                                labels_table[i + 1].lower() \
                                                and "process" not in child_type.lower():
                                            G.add_edge(similar_words[labels_table[i + 1]], child_type)
                                            break
                            else:
                                if labels_table[i + 1] not in similar_words.keys():
                                    if labels_table[i + 1] != child_type and "process" not in labels_table[
                                        i + 1].lower() \
                                            and "process" not in child_type.lower():
                                        G.add_edge(labels_table[i + 1], child_type)
                                        continue
                                else:
                                    if similar_words[labels_table[i + 1]] != child_type and "process" not in \
                                            labels_table[i + 1].lower() \
                                            and "process" not in child_type.lower():
                                        G.add_edge(similar_words[labels_table[i + 1]], child_type)
                                        break
"""

### Read the hierarchy
We first detect which nodes are at the top level of the hierarchy (nodes with no incoming edges).


In [8]:

# Reload the pickled hierarchy and list its root nodes, i.e. the nodes that
# have no incoming edge.
# NOTE: pickle.load raises EOFError when the file is empty, for example when
# an earlier dump did not complete.
target_path = os.path.join(os.getcwd(), "datasets/TabFact/")
file_path = os.path.join(target_path, "graphGroundTruth2.pkl")
print(file_path)
with open(file_path, "rb") as file:
    G = pickle.load(file)
Top_level_nodes = [node for node in G.nodes if G.in_degree(node) == 0]
print(Top_level_nodes, len(Top_level_nodes))

/mnt/d/CurrentDataset/datasets/TabFact/graphGroundTruth2.pkl


EOFError: Ran out of input

In [None]:
# For every indexed table that has a label file, record its lowest-level
# types and the root types of the hierarchy G above them, then write the
# updated index to "new_test_origin.csv".
ground_label_name = "01SourceTables.csv"
label_path = os.path.join(os.getcwd(), "datasets/TabFact/Label")
labels = os.listdir(label_path)
data_path = os.path.join(os.getcwd(), "datasets/TabFact/", ground_label_name)
ground_truth_csv = pd.read_csv(data_path, encoding='latin-1')
for index, row in ground_truth_csv.iterrows():
    if row["fileName"] not in labels:
        continue

    df = pd.read_csv(os.path.join(label_path, row["fileName"]), encoding='UTF-8').iloc[:, 3:9]
    # Distinct values of the first selected column of the label file.
    lowest_types = df.iloc[:, 0].unique()

    # Collect, without duplicates, every ancestor that has no incoming edge.
    top_level_types = []
    for type_low in lowest_types:
        if type_low not in G.nodes():
            continue
        for candidate in nx.ancestors(G, type_low):
            if G.in_degree(candidate) == 0 and candidate not in top_level_types:
                top_level_types.append(candidate)

    # `index` is the row label from iterrows() but is used positionally here;
    # the two coincide for the default index produced by read_csv above.
    # NOTE(review): assigning an array/list to a single cell through iloc may
    # not store it as one object in every pandas version - verify the output.
    ground_truth_csv.iloc[index, 4] = lowest_types
    ground_truth_csv.iloc[index, 5] = top_level_types
ground_truth_csv.to_csv(os.path.join(target_path, "new_test_origin.csv"))