In [3]:
import csv
import json
import os

def build_vocab_from_csvs(folder_path):
    """Build integer ID vocabularies from a folder of triple CSV files.

    Every entry in ``folder_path`` whose name ends with ``.csv`` is read.
    The first row of each file is treated as a header and skipped. Each
    remaining non-empty row must contain exactly three fields (subject,
    relation, object); surrounding whitespace is stripped from each field.

    Args:
        folder_path: Directory containing the CSV files.

    Returns:
        A tuple ``(entity2id, relation2id)``. Each is a dict mapping a
        distinct entity / relation string to its index in sorted order.

    Raises:
        ValueError: If a non-empty data row does not have exactly three
            fields.
    """
    entities = set()
    relations = set()

    for filename in os.listdir(folder_path):
        if not filename.endswith(".csv"):
            continue
        filepath = os.path.join(folder_path, filename)
        # newline='' lets the csv module do its own line-ending handling,
        # which keeps newlines embedded in quoted fields intact.
        with open(filepath, mode='r', encoding='utf-8', newline='') as csvfile:
            reader = csv.reader(csvfile)
            # Skip header; the default keeps an empty file from raising
            # StopIteration.
            next(reader, None)
            for row in reader:
                if not row:
                    # csv.reader yields [] for a blank line; nothing to read.
                    continue
                if len(row) != 3:
                    raise ValueError(
                        f"{filepath}, line {reader.line_num}: expected 3 "
                        f"fields (subject, relation, object), got {len(row)}"
                    )
                subject, relation, obj = (field.strip() for field in row)
                entities.update((subject, obj))
                relations.add(relation)

    # Sorting makes the assigned IDs independent of file and row order.
    entity2id = {entity: idx for idx, entity in enumerate(sorted(entities))}
    relation2id = {rel: idx for idx, rel in enumerate(sorted(relations))}

    return entity2id, relation2id

def csvs_to_json(folder_path, output_json_path):
    """Convert a folder of triple CSV files into one JSON file of records.

    The ID vocabularies are obtained from ``build_vocab_from_csvs``. Then
    every entry in ``folder_path`` whose name ends with ``.csv`` is read in
    sorted filename order. The first row of each file is treated as a
    header and skipped. Each remaining non-empty row must contain exactly
    three fields (subject, relation, object); surrounding whitespace is
    stripped from each field.

    Each row becomes one dict with these keys:
        ``instruction``: the fixed task prompt.
        ``input``: the triple rendered as text.
        ``output``: always the string ``"True"``.
        ``embedding_ids``: ``[subject id, relation id, object id]``.

    Args:
        folder_path: Directory containing the CSV files.
        output_json_path: Path of the JSON file to write (overwritten).

    Raises:
        ValueError: If a non-empty data row does not have exactly three
            fields.
    """
    entity2id, relation2id = build_vocab_from_csvs(folder_path)

    # The prompt is identical for every record, so build it once.
    instruction = (
        "Given a triple from a knowledge graph. Each triple consists of a head entity, "
        "a relation, and a tail entity. Please determine the correctness of the triple and response True or False."
    )
    data = []

    # os.listdir returns names in arbitrary order; sorting keeps the order
    # of records in the output reproducible across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue
        filepath = os.path.join(folder_path, filename)
        # newline='' lets the csv module do its own line-ending handling,
        # which keeps newlines embedded in quoted fields intact.
        with open(filepath, mode='r', encoding='utf-8', newline='') as csvfile:
            reader = csv.reader(csvfile)
            # Skip header; the default keeps an empty file from raising
            # StopIteration.
            next(reader, None)
            for row in reader:
                if not row:
                    # csv.reader yields [] for a blank line; nothing to read.
                    continue
                if len(row) != 3:
                    raise ValueError(
                        f"{filepath}, line {reader.line_num}: expected 3 "
                        f"fields (subject, relation, object), got {len(row)}"
                    )
                subject, relation, obj = (field.strip() for field in row)

                data.append({
                    "instruction": instruction,
                    "input": f"\nThe input triple: \n( {subject}, {relation}, {obj} )\n",
                    "output": "True",
                    "embedding_ids": [
                        entity2id[subject],
                        relation2id[relation],
                        entity2id[obj],
                    ],
                })

    with open(output_json_path, mode='w', encoding='utf-8') as jsonfile:
        json.dump(data, jsonfile, indent=4)

In [4]:
# Convert all triple CSVs in the dataset folder into a single JSON file.
csvs_to_json(
    folder_path='Leo/Riverside/Triples_CSVs',
    output_json_path="test.json",
)