In [1]:
import json

# Load the raw mapping results. The encoding is spelled out so the read does
# not depend on the platform's default locale encoding.
with open("./wikidata_mapping_matched.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# For every mapping result keep the Wikidata id, the label, and the first
# element of each entry in "merged_types" (a missing "merged_types" key
# yields an empty list).
extracted_list = []
for item in data["mapping_results"]:
    extracted_info = {
        "wikidata_id": item["wikidata_id"],
        "wikidata_label": item["wikidata_label"],
        "data_types": [sub_item[0] for sub_item in item.get("merged_types", [])],
    }
    extracted_list.append(extracted_info)

# json.dump keeps its default ensure_ascii=True here, so the bytes written are
# plain ASCII; the explicit encoding just makes the write platform-independent.
with open("./wikidata_mapping_extracted.json", "w", encoding="utf-8") as f:
    json.dump(extracted_list, f, indent=4)





In [2]:
# Re-read the extracted mapping and report how many distinct Wikidata ids it
# contains. The file is opened as UTF-8 explicitly, matching the other cells
# that read it, instead of relying on the platform default encoding.
with open("./wikidata_mapping_extracted.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Entries that lack a "wikidata_id" key are skipped rather than raising.
unique_wikidata_ids = {item["wikidata_id"] for item in data if "wikidata_id" in item}

print(f"Unique wikidata_id: {len(unique_wikidata_ids)}")




Unique wikidata_id: 889


In [3]:
import json
from collections import defaultdict

# Collapse the extracted entries so that each Wikidata id appears once, with
# the union of all data types seen for it.
with open("./wikidata_mapping_extracted.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# wikidata_id -> {"wikidata_label": str, "data_types": set}
merged_data = defaultdict(lambda: {"wikidata_label": "", "data_types": set()})

for item in data:
    # Only entries carrying all three keys take part in the merge.
    if "wikidata_id" in item and "wikidata_label" in item and "data_types" in item:
        wid = item["wikidata_id"]
        # If an id occurs more than once, the label of the last occurrence is kept.
        merged_data[wid]["wikidata_label"] = item["wikidata_label"]
        merged_data[wid]["data_types"].update(item["data_types"])

# Sets have no stable iteration order (string hashing varies between kernel
# runs), so the data types are sorted to make the written file reproducible.
# key=str keeps the sort well-defined even if a non-string value (e.g. a JSON
# null) is present among the data types.
merged_list = [
    {
        "wikidata_id": wid,
        "wikidata_label": info["wikidata_label"],
        "data_types": sorted(info["data_types"], key=str),
    }
    for wid, info in merged_data.items()
]

with open("./wikidata_mapping_merged.json", "w", encoding="utf-8") as f:
    json.dump(merged_list, f, indent=4, ensure_ascii=False)

print(len(merged_list))


889


In [4]:
import json
import pandas as pd

# Load the extracted mapping JSON
with open("wikidata_mapping_extracted.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Flatten the entries: every data type of an entry becomes its own
# [datatype, label, qid] row, so an entry with several data types yields
# several rows and an entry with none yields no row.
rows = []
for entry in data:
    qid, label, dtypes = entry["wikidata_id"], entry["wikidata_label"], entry["data_types"]
    rows.extend([dtype, label, qid] for dtype in dtypes)

# Build the table
df = pd.DataFrame(rows, columns=["datatype", "wiki_word", "qid"])

# Persist it as CSV (no index column)
df.to_csv("datatype_wiki.csv", index=False, encoding="utf-8")


In [9]:
# Load the Wikidata matches for the isolated data types
with open("./wikidata_isolated_matched.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# One [datatype, label, qid] row per entry under "isolated_type". The entry's
# "datatype" value is stored as-is; it is not expanded into several rows.
rows = []
for entry in data["isolated_type"]:
    qid, label = entry["wikidata_id"], entry["wikidata_label"]
    rows.append([entry["datatype"], label, qid])

# Build the table
df = pd.DataFrame(rows, columns=["datatype", "wiki_word", "qid"])

# Number of distinct non-null qids
print(df["qid"].nunique())

# Persist it as CSV (no index column)
df.to_csv("datatype_isolated_wiki.csv", index=False, encoding="utf-8")

179


In [11]:
# Stack the mapping-derived table and the isolated-type table into one frame.
df1 = pd.read_csv("datatype_wiki.csv")
df2 = pd.read_csv("datatype_isolated_wiki.csv")
df_all = pd.concat([df1, df2])

# Number of distinct non-null datatypes, then the shape before de-duplication.
print(df_all["datatype"].nunique())
print(df_all.shape)

# Drop exact duplicate rows and write the merged file once. The earlier
# version also wrote the un-deduplicated frame to this same path first, which
# was immediately overwritten; that redundant write is gone.
df = df_all.drop_duplicates()
df.to_csv("datatype_wiki_merged.csv", index=False, encoding="utf-8")
print(df.shape)

3894
(3894, 3)
(3894, 3)


In [7]:
# Load the two-ended relation table
data_types = pd.read_csv('./datatypes_relations_two_end.csv')

# Rename both endpoint columns and their label columns in a single pass:
#   word  -> qid_1        word_label  -> wiki_word_1
#   word2 -> qid_2        word2_label -> wiki_word_2
# Columns that are not present are left untouched by rename.
column_renames = {
    'word': 'qid_1',
    'word2': 'qid_2',
    'word_label': 'wiki_word_1',
    'word2_label': 'wiki_word_2',
}
data_types = data_types.rename(columns=column_renames)

# Written back to the same path, i.e. the source file is overwritten
data_types.to_csv('datatypes_relations_two_end.csv', index=False)

In [8]:
import pandas as pd
from itertools import product

# Load the qid/datatype table and the relation table
datatype_wiki = pd.read_csv("./datatype_wiki_merged.csv")
relations = pd.read_csv("./datatypes_relations_two_end.csv")

# qid -> list of datatypes (one qid may map to several datatypes)
qid_to_datatype = datatype_wiki.groupby("qid")["datatype"].apply(list).to_dict()

# qid -> wiki_word; when a qid occurs on several rows the last row's label wins
qid_to_wiki_word = dict(zip(datatype_wiki["qid"], datatype_wiki["wiki_word"]))

# Expand the relations: a relation whose endpoints map to several datatypes is
# emitted once per (datatype_1, datatype_2) combination.
expanded_rows = []

# Iterate over just the four needed columns in lockstep. This avoids
# DataFrame.iterrows(), which builds a Series for every row (slow, and it
# coerces all values of a row to one common dtype).
relation_columns = zip(
    relations["qid_1"],
    relations["qid_2"],
    relations["relation"],
    relations["relation_label"],
)
for qid_1, qid_2, relation, relation_label in relation_columns:
    # Datatypes for each endpoint. An unknown qid falls back to a single
    # empty-string datatype (not an empty list), so the relation still
    # produces a row with a blank datatype column.
    datatypes_1 = qid_to_datatype.get(qid_1, [""])
    datatypes_2 = qid_to_datatype.get(qid_2, [""])

    # Labels come from the datatype table; unknown qids get an empty label.
    wiki_word_1 = qid_to_wiki_word.get(qid_1, "")
    wiki_word_2 = qid_to_wiki_word.get(qid_2, "")

    # One output row per (datatype_1, datatype_2) pair
    for data_1, data_2 in product(datatypes_1, datatypes_2):
        expanded_rows.append(
            [qid_1, wiki_word_1, data_1, relation, relation_label, qid_2, wiki_word_2, data_2]
        )

# Build the expanded table
expanded_df = pd.DataFrame(
    expanded_rows,
    columns=[
        "qid_1", "wiki_word_1", "datatype_1",
        "relation", "relation_label",
        "qid_2", "wiki_word_2", "datatype_2",
    ],
)

# Save the result
expanded_df.to_csv("expanded_datatype_relations_two_end.csv", index=False)
