# In [1]:
import os
import json

def _split_train_test(rows):
    '''
    Split problem rows into (train, test) lists, preserving their order.

    A row is a test item when either of its first two cells contains a '?'
    placeholder. Rows with fewer than two cells are judged on the cells they
    have instead of raising IndexError.
    '''
    train_rows = []
    test_rows = []
    for row in rows:
        # NOTE(review): only the first two columns are inspected, as in the
        # previous implementation; confirm whether problems with more columns
        # can carry a '?' further to the right.
        if any('?' in cell for cell in row[:2]):
            test_rows.append(row)
        else:
            train_rows.append(row)
    return train_rows, test_rows


def _path_component(name):
    '''Return name as a string with path separators replaced by underscores.'''
    text = str(name)
    for separator in ('/', '\\'):
        text = text.replace(separator, '_')
    return text


def init_phonological_generalizations_data(directory=None, output_dir=None):
    '''
    Load phonological generalization problems and group them by problem type.

    Parameters:
        directory: Path of the folder that holds the problem JSON files. When
            None (the default), 'phonological-generalizations/data/problems'
            relative to the working directory is used, and the GitHub
            repository is cloned first if that folder does not exist. When a
            directory is given, nothing is cloned.
        output_dir: Optional folder. When truthy, every problem is also written
            to '<output_dir>/<type>/<source language>_<type>.json'. If several
            problems of one run map to the same file name, the later ones get
            a numeric suffix ('_2', '_3', ...) instead of overwriting the
            first.

    Returns:
        A dictionary mapping problem type to a list of problem records. The
        keys 'morphology', 'transliteration', 'stress' and 'multilingual' are
        always present; any other type found in the data is added as its own
        key. Each record has the keys 'source_language', 'target_language',
        'type', 'families', 'train' and 'test'.

    Raises:
        json.JSONDecodeError: If a '.json' file in the folder is not valid JSON.
    '''
    url = 'https://github.com/saujasv/phonological-generalizations.git'
    default_dir = 'phonological-generalizations/data/problems'

    # Dictionary to hold categorized problems
    categorized_problems = {
        'morphology': [],
        'transliteration': [],
        'stress': [],
        'multilingual': [],
    }

    if directory is None:
        json_dir = default_dir
        # Clone the repository if it does not exist
        if not os.path.exists(json_dir):
            import subprocess

            try:
                # Argument list instead of a shell string: no shell parsing.
                subprocess.run(['git', 'clone', url], check=False)
            except OSError:
                # Cloning stays best-effort (e.g. git is not installed); the
                # directory check below then yields the empty result.
                pass
    else:
        json_dir = directory

    if not os.path.isdir(json_dir):
        return categorized_problems

    # Output path (without extension) -> number of problems written to it in
    # this run; used to keep same-named problems from overwriting each other.
    written_counts = {}

    # Sorted so that result order and numeric suffixes are reproducible.
    for filename in sorted(os.listdir(json_dir)):
        if not filename.endswith('.json'):
            continue

        with open(os.path.join(json_dir, filename), 'r', encoding='utf-8') as file:
            problem_data_list = json.load(file)

        # Only files whose top-level value is a list of problems are used.
        if not isinstance(problem_data_list, list):
            continue

        for problem_data in problem_data_list:
            if not isinstance(problem_data, dict):
                continue

            # Problems without a usable 'type' cannot be categorized.
            problem_type = problem_data.get('type')
            if not problem_type or not isinstance(problem_type, str):
                continue

            languages = problem_data.get('languages') or []
            source_language = languages[0] if len(languages) > 0 else ""
            target_language = languages[1] if len(languages) > 1 else ""

            train_data, test_data = _split_train_test(problem_data.get('data') or [])

            new_problem_data = {
                "source_language": source_language,
                "target_language": target_language,
                "type": problem_type,
                "families": problem_data.get('families', []),
                "train": train_data,
                "test": test_data,
            }

            # setdefault keeps problems of an unexpected type instead of
            # failing with KeyError.
            categorized_problems.setdefault(problem_type, []).append(new_problem_data)

            if not output_dir:
                continue

            type_component = _path_component(problem_type)
            category_dir = os.path.join(output_dir, type_component)
            os.makedirs(category_dir, exist_ok=True)

            base_name = f"{_path_component(source_language)}_{type_component}"
            base_path = os.path.join(category_dir, base_name)
            occurrence = written_counts.get(base_path, 0) + 1
            written_counts[base_path] = occurrence
            suffix = "" if occurrence == 1 else f"_{occurrence}"

            with open(f"{base_path}{suffix}.json", 'w', encoding='utf-8') as pf:
                json.dump(new_problem_data, pf, indent=4)

    return categorized_problems

# Usage example: load the problems from the default location (the function
# clones the repository into the working directory when the problems folder is
# missing) and write one JSON file per problem under
# ./categorized_problems1/<problem type>/.
output_directory = "./categorized_problems1"
categorized_problems = init_phonological_generalizations_data(output_dir=output_directory)
