In [1]:
import csv
import json
import os

import pandas as pd

# Generate indices: entity-to-index mapping, edge index, and relation list (saved as idxes.json)

In [2]:
def load_data(file_path):
    """Read a `|`-separated triple file and return its unique rows.

    Each line is expected to hold `entity1|relation|entity2` with no header row.

    Every field is read as a literal string so that entity names survive
    unchanged:

    * ``dtype=str`` stops numeric-looking fields (e.g. a year, or a title
      made of digits) from being type-inferred, which could otherwise leave
      the same text stored under different types in one column.
    * ``keep_default_na=False`` stops tokens such as ``NA``, ``None`` or
      ``null`` from being replaced by NaN.
    * ``quoting=csv.QUOTE_NONE`` keeps double quotes as ordinary characters,
      so a name like ``"Crocodile" Dundee`` is not altered by CSV quote
      handling.

    Parameters
    ----------
    file_path : str or path-like
        Location of the triple file.

    Returns
    -------
    pandas.DataFrame
        Columns ``entity1``, ``relation``, ``entity2`` (all strings) with
        exact duplicate rows removed. The original row labels are kept, so
        the index may contain gaps.
    """
    df = pd.read_csv(
        file_path,
        sep='|',
        header=None,
        names=['entity1', 'relation', 'entity2'],
        dtype=str,
        keep_default_na=False,
        quoting=csv.QUOTE_NONE,
    )
    return df.drop_duplicates()

def create_entity_to_idx(df):
    """Assign a consecutive integer index to every distinct entity.

    Indices start at 0 and are handed out in order of first appearance,
    scanning the rows top to bottom and, within a row, ``entity1`` before
    ``entity2``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``entity1`` and ``entity2``.

    Returns
    -------
    dict
        Maps each entity value to its integer index.
    """
    entity_to_idx = {}

    # Iterate the two columns directly instead of using iterrows(), which
    # builds a fresh Series for every row and is far slower.
    for head, tail in zip(df['entity1'], df['entity2']):
        # len() is evaluated before insertion, so it is always the next free index.
        entity_to_idx.setdefault(head, len(entity_to_idx))
        entity_to_idx.setdefault(tail, len(entity_to_idx))

    return entity_to_idx

def create_edge_index_and_relations(df, entity_to_idx):
    """Translate the triples into index pairs plus a parallel relation list.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``entity1``, ``relation`` and ``entity2``.
    entity_to_idx : dict
        Maps every entity occurring in ``df`` to its integer index.

    Returns
    -------
    tuple
        ``(edge_index, relations)`` where ``edge_index`` is a list holding
        one ``[index_of_entity1, index_of_entity2]`` pair per row of ``df``
        and ``relations`` is the list of ``relation`` values in the same
        row order.

    Raises
    ------
    KeyError
        If an entity in ``df`` is missing from ``entity_to_idx``.
    """
    # Iterate the columns directly instead of using iterrows(), which
    # builds a fresh Series for every row and is far slower.
    edge_index = [
        [entity_to_idx[head], entity_to_idx[tail]]
        for head, tail in zip(df['entity1'], df['entity2'])
    ]
    relations = df['relation'].tolist()

    return edge_index, relations


def save_data_json(entity_to_idx, edge_index, relations, filename='../Datasets/MetaQA_dataset/processed/idxes.json'):
    """Write the entity mapping, edge list and relation list to one JSON file.

    The file holds a single object with the keys ``entity_to_idx``,
    ``edge_index`` and ``relations``. An existing file is overwritten.

    Parameters
    ----------
    entity_to_idx : dict
        Entity-to-index mapping.
    edge_index : list
        Index pairs, one per edge.
    relations : list
        Relation labels, parallel to ``edge_index``.
    filename : str or path-like, optional
        Destination path. Missing parent directories are created.
    """
    # Create the output directory up front so a fresh checkout without the
    # `processed/` folder does not fail with FileNotFoundError.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    payload = {
        'entity_to_idx': entity_to_idx,
        'edge_index': edge_index,
        'relations': relations
    }
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(payload, f)

In [3]:
# Run the pipeline end to end: read the raw triples, index the entities,
# express the edges in those indices, and persist everything as JSON.
path = '../Datasets/MetaQA_dataset/kb.txt'
df_unique = load_data(file_path=path)

# The entity mapping is built first because the edge list is expressed in its indices.
entity_to_idx = create_entity_to_idx(df=df_unique)
edge_index, relations = create_edge_index_and_relations(
    df=df_unique,
    entity_to_idx=entity_to_idx,
)

# No filename is passed, so save_data_json writes to its default location.
save_data_json(
    entity_to_idx=entity_to_idx,
    edge_index=edge_index,
    relations=relations,
)