In [7]:
import sys
import pickle
import json
from dataclasses import is_dataclass, fields

sys.path.insert(0,r"/Users/feiyu/Desktop/code/NewCoEditor/src")
from coeditor.common import *
from coeditor.dataset import *

In [8]:
def instance_to_json(obj: Any) -> Dict[str, Any]:
    """
    Serialize a dataclass instance or dict instance to a JSON-serializable dictionary.
    
    Args:
        obj: A dataclass instance to serialize
        include_class_name: Whether to include the class name as a key in the output
    
    Returns:
        A JSON-serializable dictionary representing the dataclass
    
    Example:
        >>> @dataclass
        >>> class B:
        >>>     num: int
        >>> @dataclass
        >>> class A:
        >>>     obj: B
        >>> dataclass_to_json(A(obj=B(num=1)))
        {'A': {'obj': {'B': {'num': 1}}}}
    """
    if not is_dataclass(obj) and not isinstance(obj, dict):
        raise TypeError(f"Expected a dataclass instance, got {type(obj).__name__}")
    
    def _serialize(value: Any) -> Any:
        if is_dataclass(value):
            result = {"__class__": type(value).__name__}

            for field in fields(value):
                field_value = getattr(value, field.name)
                result[field.name] = _serialize(field_value)

            return result
        elif isinstance(value, dict):
            return {k: _serialize(v) for k, v in value.items()}
        elif isinstance(value, (list, tuple, set)):
            return [_serialize(item) for item in value]
        # Handle NumPy arrays
        elif isinstance(value, np.ndarray):
            return value.tolist()
        # Handle range objects
        elif isinstance(value, range):
            return {
                "__type__": "range",
                "start": value.start,
                "stop": value.stop,
                "step": value.step
            }
        else:
            return value
    
    return _serialize(obj)

def serialize_to_json(obj: Any, fp: Optional[str | Path] = None, indent: int = 2) -> Optional[str]:
    """
    Serialize a dataclass instance to a JSON string or file.
    
    Args:
        obj: A dataclass instance to serialize
        fp: Optional file path to write the JSON to
        indent: Number of spaces for indentation in the JSON output
    
    Returns:
        JSON string if fp is None, otherwise None (writes to file)
    """
    data = instance_to_json(obj)
    
    if fp is not None:
        with open(fp, 'w') as f:
            json.dump(data, f, indent=indent)
        return None
    
    return json.dumps(data, indent=indent)

In [9]:
pickle_abs_path = Path("/Users/feiyu/Desktop/code/NewCoEditor/datasets_root/perm2k/processed/C3ProblemGenerator(VERSION=3.1, analyzer=())/deepseek-ai~DeepSeek-V3(1000, is_training=False)")
json_abs_path = Path(WORK_DIR) / "datasets_root"

split = ("train", "test", "valid")

with open(pickle_abs_path, "rb") as f:
    problems: C3ProblemDataset[C3Problem] = pickle.load(f)

def serialize_dataset(split: Tuple[str, ...]) -> None:
        filtered_problems = {s: problems[s] for s in split}
        serialize_to_json(filtered_problems, json_abs_path / f"problems_{'_'.join(split)}.json")

def serialize_single_instance(split: str, id: int) -> None:
    with open(pickle_abs_path, "rb") as f:
        problems: C3ProblemDataset[C3Problem] = pickle.load(f)
        serialize_to_json(problems[split][id], json_abs_path / f"problems_{split}_{id}.json")

In [10]:
# serialize_dataset(split)
serialize_single_instance("train", 0)

TypeError: list indices must be integers or slices, not str