In [3]:
import pandas as pd

# Words without a QID come from the Excel sheet; its 'word' column is
# renamed to 'modality' so it lines up with the other table.
words_no_qid_update = (
    pd.read_excel('words_no_qid_update.xlsx')
    .rename(columns={'word': 'modality'})
)

# Words that already have a QID come from the CSV. After the same rename,
# 'wiki_word' is created as a copy of 'modality' and the columns are put in
# the order modality, wiki_word, qid.
words_with_qid = (
    pd.read_csv('words_with_qid.csv')
    .rename(columns={'word': 'modality'})
    .assign(wiki_word=lambda frame: frame['modality'])
    .loc[:, ['modality', 'wiki_word', 'qid']]
)

# Stack the two tables (rows of the no-QID table first) and write the
# combined table to disk without the index column.
words = pd.concat([words_no_qid_update, words_with_qid])
words.to_csv('modality_wiki.csv', index=False)


### Two-end

In [7]:
# Load the two-end relations table.
modalities = pd.read_csv('./modality_relations_two_end.csv')

# Rename the headers to the *_1 / *_2 scheme in a single pass:
# the two 'word' columns become QID columns and the two label columns
# become wiki_word columns. Headers that are not present are left untouched.
modalities = modalities.rename(
    columns={
        'word': 'qid_1',
        'word2': 'qid_2',
        'word_label': 'wiki_word_1',
        'word_label2': 'wiki_word_2',
    }
)

# The renamed table is written back over the file it was read from.
modalities.to_csv('modality_relations_two_end.csv', index=False)



In [8]:
import pandas as pd
from itertools import product

# Read the modality lookup table and the two-end relations table.
modality_wiki = pd.read_csv("modality_wiki.csv")
relations = pd.read_csv("modality_relations_two_end.csv")

# qid -> list of modalities; a single qid may map to several modalities.
modalities_per_qid = modality_wiki.groupby("qid")["modality"].apply(list)
qid_to_modality = modalities_per_qid.to_dict()

# qid -> wiki_word; if a qid occurs more than once, the last row wins.
qid_to_wiki_word = {
    qid: wiki_word
    for qid, wiki_word in zip(modality_wiki["qid"], modality_wiki["wiki_word"])
}

# Expand every relation into one output row per (modality_1, modality_2) pair.
expanded_rows = []

for _, relation_row in relations.iterrows():
    left_qid = relation_row["qid_1"]
    right_qid = relation_row["qid_2"]
    predicate = relation_row["relation"]
    predicate_label = relation_row["relation_label"]

    # A qid missing from the lookup falls back to a single empty-string
    # modality, so the relation still yields at least one output row.
    left_modalities = qid_to_modality.get(left_qid, [""])
    right_modalities = qid_to_modality.get(right_qid, [""])

    # wiki_word labels come from the lookup table; unknown qids get "".
    left_label = qid_to_wiki_word.get(left_qid, "")
    right_label = qid_to_wiki_word.get(right_qid, "")

    # Cartesian product of both modality lists, one row per combination.
    expanded_rows.extend(
        [left_qid, left_label, left_mod, predicate, predicate_label, right_qid, right_label, right_mod]
        for left_mod, right_mod in product(left_modalities, right_modalities)
    )

# Assemble the expanded rows into a DataFrame with a fixed column order.
expanded_df = pd.DataFrame(
    expanded_rows,
    columns=[
        "qid_1",
        "wiki_word_1",
        "modality_1",
        "relation",
        "relation_label",
        "qid_2",
        "wiki_word_2",
        "modality_2",
    ],
)

# Save the result.
expanded_df.to_csv("expanded_modality_relations_two_end.csv", index=False)


### One-end

In [10]:
# Load the one-end relations table.
modalities = pd.read_csv('./modality_relations_one_end.csv')

# Rename the headers to the *_1 / *_2 scheme in a single pass:
# 'word' / 'related_word' become the QID columns and their label columns
# become the wiki_word columns. Headers that are not present are left untouched.
modalities = modalities.rename(
    columns={
        'word': 'qid_1',
        'related_word': 'qid_2',
        'word_label': 'wiki_word_1',
        'related_word_label': 'wiki_word_2',
    }
)

# The renamed table is written back over the file it was read from.
modalities.to_csv('modality_relations_one_end.csv', index=False)

In [11]:
import pandas as pd
from itertools import product

# Read the modality lookup table and the one-end relations table.
modality_wiki = pd.read_csv("modality_wiki.csv")
relations_one_end = pd.read_csv("./modality_relations_one_end.csv")

# qid -> list of modalities; a single qid may map to several modalities.
modalities_per_qid = modality_wiki.groupby("qid")["modality"].apply(list)
qid_to_modality = modalities_per_qid.to_dict()

# qid -> wiki_word; if a qid occurs more than once, the last row wins.
qid_to_wiki_word = {
    qid: wiki_word
    for qid, wiki_word in zip(modality_wiki["qid"], modality_wiki["wiki_word"])
}

# Expand every relation into one output row per modality of qid_1 only;
# the qid_2 side is carried through unchanged.
expanded_rows_one_end = []
for _, relation_row in relations_one_end.iterrows():
    left_qid = relation_row["qid_1"]
    right_qid = relation_row["qid_2"]
    predicate = relation_row["relation"]
    predicate_label = relation_row["relation_label"]

    # A qid_1 missing from the lookup falls back to a single empty-string
    # modality, so the relation still yields one output row.
    left_modalities = qid_to_modality.get(left_qid, [""])

    # wiki_word_1 comes from the lookup table ("" when unknown), whereas
    # wiki_word_2 is taken directly from the relations row.
    left_label = qid_to_wiki_word.get(left_qid, "")
    right_label = relation_row["wiki_word_2"]

    # One output row per modality of qid_1.
    expanded_rows_one_end.extend(
        [left_qid, left_label, left_mod, predicate, predicate_label, right_qid, right_label]
        for left_mod in left_modalities
    )

# Assemble the expanded rows into a DataFrame with a fixed column order.
expanded_df_one_end = pd.DataFrame(
    expanded_rows_one_end,
    columns=[
        "qid_1",
        "wiki_word_1",
        "modality_1",
        "relation",
        "relation_label",
        "qid_2",
        "wiki_word_2",
    ],
)

# Save the expanded one-end relations.
expanded_df_one_end.to_csv("expanded_modality_relations_one_end.csv", index=False)

