# Data Utils

> Utility functions for handling data in more convenient ways.

In [None]:
#| default_exp utility.data_utils
#| export
import json
import hashlib
import dataclasses
import datetime
import numpy as np
from typing import Dict, Any, Type, T
import importlib

### dataclass convenience functions  

In [None]:
#| export
def f_data_class_to_dict(thing: object) -> Dict[str, Any]:
    """
    Convert a dataclass instance to a dictionary.

    Top-level fields whose names are listed in the instance's optional
    ``_hash_exclude_`` attribute are left out of the result.

    Parameters:
    - thing: the dataclass instance to convert to a dictionary

    Returns:
    - a dictionary mapping field names to values; nested dataclasses are
      converted recursively by ``dataclasses.asdict``

    Raises:
    - TypeError: if ``thing`` is a class or is not a dataclass instance
    """
    if isinstance(thing, type):
        # Raise an error if the function is called on a class, rather than an instance
        raise TypeError(f'got {thing!r}, expected instance')
    if not dataclasses.is_dataclass(thing):
        # Callers such as f_to_dict rely on TypeError to detect non-dataclass values
        raise TypeError(
            f'got {type(thing).__name__}, expected a dataclass instance')

    # Names of fields to skip; an absent attribute means nothing is excluded
    exclude = getattr(thing, '_hash_exclude_', ())

    # asdict handles the recursive conversion; the exclusion is applied to the
    # top-level field names only
    return {
        name: value
        for name, value in dataclasses.asdict(thing).items()
        if name not in exclude
    }

In [None]:
#| export
def f_to_dict(item):
    """
    Turn ``item`` into a JSON-friendly value.

    Dataclass instances are handed to f_data_class_to_dict first; everything
    else is converted according to its type.

    Parameters:
    - item: the value to convert

    Returns:
    - the converted value: a dict for dataclasses, a stripped string for
      bytes, a list for NumPy arrays, an ISO string for datetimes, or the
      item itself for the basic built-in types

    Raises:
    - TypeError: if the value is of none of the supported types
    """
    # Dataclass path: a TypeError here means "not a dataclass instance",
    # so fall through to the type-based handling below
    try:
        return f_data_class_to_dict(item)
    except TypeError:
        pass

    # bytes are decoded as UTF-8 and stripped of surrounding whitespace
    if isinstance(item, bytes):
        return item.decode("utf-8").strip()

    # Built-in types handed back unchanged
    passthrough_types = (int, float, str, bool, dict, list, tuple, type(None))
    if isinstance(item, passthrough_types):
        return item

    # NumPy arrays become (nested) lists
    if isinstance(item, np.ndarray):
        return item.tolist()

    # datetime objects become their ISO 8601 string
    if isinstance(item, datetime.datetime):
        return item.isoformat()

    # Nothing matched: report the offending type
    raise TypeError(f"Object of type {type(item).__name__} is not JSON serializable")




In [None]:
#| export
def f_get_hash(item: object, prefix: str = 'h1') -> str:
    """
    Generate a stable hash string for the given item.

    The item is serialized to compact JSON with sorted keys (values that json
    cannot encode natively go through f_to_dict), and the MD5 hex digest of
    the UTF-8 encoded JSON is appended to the prefix.

    Parameters:
    - item: the object to generate a hash for
    - prefix: text placed before the digest, joined with an underscore
      (default 'h1')

    Returns:
    - a string of the form '<prefix>_<md5 hex digest>'

    Raises:
    - TypeError: if the item contains a value f_to_dict cannot convert

    originally inspired by https://death.andgravity.com/stable-hashing
    """
    # Sorted keys and fixed separators make the JSON text independent of
    # dictionary insertion order
    item_as_json = json.dumps(item, default=f_to_dict, ensure_ascii=False,
                              sort_keys=True, indent=None, separators=(',', ':'))

    # Generate a hash of the JSON string
    item_as_hash = hashlib.md5(item_as_json.encode('utf-8')).hexdigest()

    # Return the prefix and hex digest joined by an underscore
    return f'{prefix}_{item_as_hash}'

In [None]:
#| export
def f_from_dict(item: Dict[str, Any]) -> object:
    """
    Convert a dictionary to an object, based on the marker keys it contains.

    Recognized markers, checked in this order:
    - '__type__' (with '__module__'): the named type is imported; dataclasses
      are built with f_data_class_from_dict, other types with their
      ``from_dict`` method
    - '__ndarray__': the value is turned into a NumPy array
    - '__datetime__': the ISO string is parsed into a datetime object

    Parameters:
    - item: the dictionary to convert to an object (it is not modified)

    Returns:
    - the reconstructed object, or ``item`` itself when no marker is present

    Raises:
    - KeyError: if '__type__' is present but '__module__' is missing
    - ImportError / AttributeError: if the module or type cannot be found
    """
    if '__type__' in item:
        # Work on a shallow copy so the caller's dictionary keeps its markers
        payload = dict(item)
        type_name = payload.pop('__type__')
        module_name = payload.pop('__module__')

        # NOTE(security): this imports whatever module the data names; only
        # use it on dictionaries from a trusted source
        module = importlib.import_module(module_name)
        type_ = getattr(module, type_name)

        if dataclasses.is_dataclass(type_):
            # Convert the remaining keys to a dataclass object
            return f_data_class_from_dict(type_, payload)
        # Otherwise rely on the type's own from_dict method
        return type_.from_dict(payload)

    if '__ndarray__' in item:
        # Convert a list of lists back to a NumPy array
        return np.array(item['__ndarray__'])

    if '__datetime__' in item:
        # Convert a string representation back to a datetime object
        return datetime.datetime.fromisoformat(item['__datetime__'])

    # If the dictionary doesn't contain type information, just return it as-is
    return item

In [None]:
#| export
def f_data_class_from_dict(type_: type, data: Dict[str, Any]) -> object:
    """
    Convert a dictionary to a dataclass object.

    Only keys that match an ``__init__`` field of the dataclass are used;
    other keys in ``data`` are ignored. Fields absent from ``data`` are left
    to the dataclass constructor, so declared defaults and default factories
    apply.

    Parameters:
    - type_: the dataclass type to instantiate
    - data: the dictionary holding the field values

    Returns:
    - an object of the specified type, initialized with the data from the dictionary

    Raises:
    - TypeError: if ``type_`` is not a dataclass, or a field without a default
      is missing from ``data``
    """
    # Pass only what the caller supplied. Filling gaps with field.default
    # would inject the dataclasses.MISSING sentinel for required and
    # default_factory fields; init=False fields are not constructor arguments.
    init_kwargs = {
        field.name: data[field.name]
        for field in dataclasses.fields(type_)
        if field.init and field.name in data
    }

    # Create a new object of the specified type and initialize it with the field values
    return type_(**init_kwargs)

In [None]:
#| export
def f_dict_to_obj(d: Dict[str, Any], cls: Type[T]) -> T:
    """
    Build an instance of ``cls`` from a dictionary of constructor arguments.

    Parameters:
    - d: dictionary whose items are passed to the constructor as keyword arguments
    - cls: the class to instantiate

    Returns:
    - the object produced by calling ``cls`` with the dictionary's items
    """
    # Every key of d becomes a keyword argument of the constructor call
    instance = cls(**d)
    return instance

### test

In [None]:
#|eval: false
def test_f_get_hash():
    """Check f_get_hash against a known digest for a one-entry dictionary."""
    sample = {"key": "value"}
    # Default prefix, underscore, then the digest of the sample's JSON form
    assert f_get_hash(sample) == 'h1_a7353f7cddce808de0032747a0b7be50'

def test_f_to_dict():
    """Check f_to_dict on a plain dictionary and on a datetime."""
    # A plain dictionary comes back equal to what went in
    plain = {"key": "value"}
    assert f_to_dict(plain) == {"key": "value"}

    # A datetime comes back as its ISO-formatted string
    moment = datetime.datetime(2020, 1, 1, 12, 0, 0)
    assert f_to_dict(moment) == "2020-01-01T12:00:00"

def test_f_data_class_to_dict():
    """Check that a two-field dataclass instance converts to the matching dict."""
    # Minimal dataclass used only by this test
    @dataclasses.dataclass
    class DataClass:
        field1: str
        field2: int

    instance = DataClass(field1="value1", field2=123)

    # Each field should appear under its own name with its value
    assert f_data_class_to_dict(instance) == {"field1": "value1", "field2": 123}

@dataclasses.dataclass
class DataClass:
    # Simple two-field dataclass used by test_f_from_dict. It is defined at
    # module level because that test refers to it by name through the
    # "__type__" / "__module__" keys, so it has to be an attribute of the module.
    field1: str
    field2: int
def test_f_from_dict():
    """Check that a type-tagged dictionary is rebuilt into a DataClass instance."""
    # The "__type__" / "__module__" keys name the module-level DataClass
    payload = {"__type__": "DataClass", "__module__": __name__, "field1": "value1", "field2": 123}
    assert f_from_dict(payload) == DataClass("value1", 123)

def test_dict_to_obj():
    """Check that f_dict_to_obj builds an object from keyword-style data."""
    # Plain class with value-based equality so the result can be compared
    class SimpleClass:
        def __init__(self, field1, field2):
            self.field1, self.field2 = field1, field2

        def __eq__(self, other):
            # Equal when both field values match
            return (self.field1, self.field2) == (other.field1, other.field2)

    kwargs = {"field1": "value1", "field2": 123}

    # Build the object from the dictionary and compare with a direct construction
    built = f_dict_to_obj(kwargs, SimpleClass)
    assert built == SimpleClass("value1", 123)


In [None]:
#|eval: false
# Run every test in the same order as before; the first failing assertion
# stops the run.
for run_test in (
    test_f_get_hash,
    test_f_to_dict,
    test_f_data_class_to_dict,
    test_f_from_dict,
    test_dict_to_obj,
):
    run_test()

In [None]:
#| hide
# Trigger nbdev's export step for this notebook project.
import nbdev

nbdev.nbdev_export()