<a href="https://colab.research.google.com/github/RegNLP/GraphRAG4RegGraph/blob/main/RegSumDatasetSplit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import random

def split_json_data(input_file, train_file, validation_file, test_file, train_ratio=0.7, validation_ratio=0.15, *, seed=None):
    """
    Reads a JSON file, shuffles the data, splits it into training, validation, and test sets,
    and writes each set to separate JSON files.

    Args:
        input_file (str): Path to the input JSON file. Its top-level value must be a JSON array.
        train_file (str): Path for saving the training set.
        validation_file (str): Path for saving the validation set.
        test_file (str): Path for saving the test set.
        train_ratio (float): Proportion of data to use for training (between 0 and 1).
        validation_ratio (float): Proportion of data to use for validation (between 0 and 1).
            The remainder will be used for testing.
        seed (int, optional): If given, the shuffle uses a private random generator seeded
            with this value, so the same input always produces the same split. If None
            (the default), the module-level random generator is used.

    Raises:
        ValueError: If a ratio is outside [0, 1], if the two ratios sum to more than 1,
            or if the input file does not contain a JSON array.
    """
    # Reject ratios that would silently produce empty or mis-sized splits
    if not 0 <= train_ratio <= 1 or not 0 <= validation_ratio <= 1:
        raise ValueError(
            f"train_ratio and validation_ratio must each be between 0 and 1, "
            f"got {train_ratio} and {validation_ratio}."
        )
    # Small tolerance so that float round-off in e.g. 0.9 + 0.1 is not rejected
    if train_ratio + validation_ratio > 1 + 1e-9:
        raise ValueError(
            f"train_ratio + validation_ratio must not exceed 1, "
            f"got {train_ratio + validation_ratio}."
        )

    # Load the JSON data from the input file (explicit encoding: do not depend on the locale)
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Shuffling and slicing only make sense for a list of items
    if not isinstance(data, list):
        raise ValueError(
            f"Expected a JSON array at the top level of {input_file!r}, "
            f"got {type(data).__name__}."
        )

    # Shuffle the data to ensure random splitting
    if seed is None:
        random.shuffle(data)
    else:
        # A private generator keeps the global random state untouched
        random.Random(seed).shuffle(data)

    # Calculate split indices (fractions are truncated; leftovers go to the test set)
    total = len(data)
    train_end = int(total * train_ratio)
    validation_end = train_end + int(total * validation_ratio)

    # Split the data
    train_data = data[:train_end]
    validation_data = data[train_end:validation_end]
    test_data = data[validation_end:]

    # Write the splits to separate JSON files
    for path, subset in (
        (train_file, train_data),
        (validation_file, validation_data),
        (test_file, test_data),
    ):
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(subset, f, indent=4)

    print(f"Data split into {len(train_data)} training, {len(validation_data)} validation, and {len(test_data)} test items.")

if __name__ == '__main__':
    # The input file and the three output files all live in the same folder
    data_dir = '/content/drive/MyDrive/Colab Notebooks/ExtractedSummaries'

    input_json = f'{data_dir}/summaries_content.json'
    train_json = f'{data_dir}/train_RegSum_Data.json'
    validation_json = f'{data_dir}/validation_RegSum_Data.json'
    test_json = f'{data_dir}/test_RegSum_Data.json'

    # No ratios are passed, so the defaults apply (70% train, 15% validation, 15% test)
    split_json_data(
        input_file=input_json,
        train_file=train_json,
        validation_file=validation_json,
        test_file=test_json,
    )


Data split into 247 training, 53 validation, and 54 test items.
