There are only 3 possibilities:
1. Expected JSON key is absent (key shortage)
2. JSON includes unexpected key (key excess)
3. Expected JSON key provides unexpected type (key mismatch)
    -  Null when expected to be required 
       -  `type` -> `Optional[type]` 
    -  Union type when expected singular type 
       - `List[str]` -> `Literal[False] | List[str]` (yes there are people doing this)
       -  `Literal[False]` -> `bool`


Assuming a representative sample of JSON, ChatGPT will handle all of `2` and `3`, while not noticing instances of `1`.

We can generate a TypedDict and a class by pasting the JSON samples into ChatGPT, and then use a simple loop over the available data to find each conditionally absent key.

In [30]:
# ChatGPT refuses to use the TypedDict("Type", {...}) syntax, even when prompted,
# unless you copy-paste its class-based output back and reply "Convert these to the syntax: Type = TypedDict("Type", {...})"

# Select all phrase "TypedDict", move multiple cursor to single quotes around type name, and replace with double
# Select all single quotes after a space, replace space with newline, tab
# Select all single quote after an open bracket, replace with newline

from typing import TypedDict, Optional, List

# First-pass models of the JSON payload: every key is declared as required.
# Leaf shapes come first so that the composite shapes below can refer to them.

# Phone contacts; each value is a string or null.
ContactDict = TypedDict(
    "ContactDict",
    {
        "home": Optional[str],
        "work": Optional[str],
    },
)

# Postal address of a person.
AddressDict = TypedDict(
    "AddressDict",
    {
        "street": str,
        "city": str,
        "zipcode": str,
    },
)

# A person; "address" may be null, "contacts" is always an object.
PersonDict = TypedDict(
    "PersonDict",
    {
        "name": str,
        "age": int,
        "address": Optional[AddressDict],
        "contacts": ContactDict,
        "hobbies": List[str],
    },
)

# Location shape shared by companies and events.
LocationDict = TypedDict(
    "LocationDict",
    {
        "country": str,
        "state": str,
        "city": str,
    },
)

# One department's teams and its lead.
TeamDict = TypedDict(
    "TeamDict",
    {
        "teams": List[str],
        "team_lead": str,
    },
)

# The departments a company is expected to have.
DepartmentDict = TypedDict(
    "DepartmentDict",
    {
        "engineering": TeamDict,
        "marketing": TeamDict,
    },
)

CompanyDict = TypedDict(
    "CompanyDict",
    {
        "name": str,
        "location": LocationDict,
        "departments": DepartmentDict,
        "industry": str,
    },
)

ParticipantDict = TypedDict(
    "ParticipantDict",
    {
        "name": str,
        "role": str,
    },
)

EventDict = TypedDict(
    "EventDict",
    {
        "name": str,
        "date": str,
        "location": LocationDict,
        "codes": List[int],
        "participants": List[ParticipantDict],
    },
)

# Topmost shape: one deserialized JSON object.
DataDict = TypedDict(
    "DataDict",
    {
        "valid": bool,
        "person": PersonDict,
        "company": CompanyDict,
        "events": List[EventDict],
    },
)

The above would go into its own file, either in a `typings` or `models` folder.


Copy the above declarations to ChatGPT with prompt "Make me Python classes with __init__ methods that take one singular parameter of the dictionary's type":

In [31]:
from typing import Optional, List

# class ContactDict:
#     def __init__(self, data: Optional[dict] = None):
#         self.home = data.get('home') if data and 'home' in data else None
#         self.work = data.get('work') if data and 'work' in data else None

# class AddressDict:
#     def __init__(self, data: Optional[dict] = None):
#         self.street = data.get('street') if data and 'street' in data else ''
#         self.city = data.get('city') if data and 'city' in data else ''
#         self.zipcode = data.get('zipcode') if data and 'zipcode' in data else ''

# class PersonDict:
#     def __init__(self, data: Optional[dict] = None):
#         self.name = data.get('name') if data and 'name' in data else ''
#         self.age = data.get('age') if data and 'age' in data else 0
#         self.address = AddressDict(data.get('address')) if data and 'address' in data else None
#         self.contacts = ContactDict(data.get('contacts')) if data and 'contacts' in data else None
#         self.hobbies = data.get('hobbies') if data and 'hobbies' in data else []

# class LocationDict:
#     def __init__(self, data: Optional[dict] = None):
#         self.country = data.get('country') if data and 'country' in data else ''
#         self.state = data.get('state') if data and 'state' in data else ''
#         self.city = data.get('city') if data and 'city' in data else ''

# class TeamDict:
#     def __init__(self, data: Optional[dict] = None):
#         self.teams = data.get('teams') if data and 'teams' in data else []
#         self.team_lead = data.get('team_lead') if data and 'team_lead' in data else ''

# class DepartmentDict:
#     def __init__(self, data: Optional[dict] = None):
#         self.engineering = TeamDict(data.get('engineering')) if data and 'engineering' in data else None
#         self.marketing = TeamDict(data.get('marketing')) if data and 'marketing' in data else None

# class CompanyDict:
#     def __init__(self, data: Optional[dict] = None):
#         self.name = data.get('name') if data and 'name' in data else ''
#         self.location = LocationDict(data.get('location')) if data and 'location' in data else None
#         self.departments = DepartmentDict(data.get('departments')) if data and 'departments' in data else None
#         self.industry = data.get('industry') if data and 'industry' in data else ''

# class ParticipantDict:
#     def __init__(self, data: Optional[dict] = None):
#         self.name = data.get('name') if data and 'name' in data else ''
#         self.role = data.get('role') if data and 'role' in data else ''

# class EventDict:
#     def __init__(self, data: Optional[dict] = None):
#         self.name = data.get('name') if data and 'name' in data else ''
#         self.date = data.get('date') if data and 'date' in data else ''
#         self.location = LocationDict(data.get('location')) if data and 'location' in data else None
#         self.codes = data.get('codes') if data and 'codes' in data else []
#         self.participants = [ParticipantDict(participant) for participant in data.get('participants', [])] if data and 'participants' in data else []

# class DataDict:
#     def __init__(self, data: Optional[dict] = None):
#         self.valid = data.get('valid') if data and 'valid' in data else False
#         self.person = PersonDict(data.get('person')) if data and 'person' in data else None
#         self.company = CompanyDict(data.get('company')) if data and 'company' in data else None
#         self.events = [EventDict(event) for event in data.get('events', [])] if data and 'events' in data else []


Here, ChatGPT made some assumptions about how we want to handle the nested initialization. It assumes that if an inner layer is passed None, we still want to initialize that layer with default values. It applies this assumption so consistently that it even extends it to the outermost layer. This implies that it did not pick up on the idea that we are deserializing JSON.

In most real cases you wouldn't necessarily do that, particularly if you are ingesting this data. If you want to ensure that the structure is maintained after deserialization, this may be desirable (such as when *serving* an API, so that your consumers can easily know the total list of possible fields).

In our case, we will reprompt with:

"Make me Python classes with __init__ methods that take one singular parameter of their dictionary's type.
I want to pass deserialized JSON into the topmost class."

In [32]:
from typing import Optional, List, TypedDict

class Contact:
    """Phone contacts of a person, built from a ``ContactDict``."""

    def __init__(self, data: ContactDict):
        # Keys are indexed directly on purpose: a missing key raises KeyError.
        self.home, self.work = data["home"], data["work"]

class Address:
    """Postal address, built from an ``AddressDict``."""

    def __init__(self, data: AddressDict):
        # All three keys are read by index, so any absent key raises KeyError.
        self.street, self.city, self.zipcode = (
            data["street"],
            data["city"],
            data["zipcode"],
        )

class Person:
    """A person, built from a ``PersonDict``.

    ``address`` becomes an ``Address`` when the payload carries a truthy
    value for it and ``None`` otherwise; ``contacts`` is always wrapped in a
    ``Contact``.
    """

    def __init__(self, data: PersonDict):
        self.name = data["name"]
        self.age = data["age"]

        raw_address = data["address"]
        if raw_address:
            self.address = Address(raw_address)
        else:
            self.address = None

        self.contacts = Contact(data["contacts"])
        self.hobbies = data["hobbies"]

class Location:
    """Country / state / city triple, built from a ``LocationDict``."""

    def __init__(self, data: LocationDict):
        # Direct indexing: every key is treated as mandatory at this stage.
        self.country, self.state, self.city = (
            data["country"],
            data["state"],
            data["city"],
        )

class Team:
    """Teams of one department and their lead, built from a ``TeamDict``."""

    def __init__(self, data: TeamDict):
        # The list under "teams" is stored as-is (no copy is made).
        self.teams, self.team_lead = data["teams"], data["team_lead"]

class Department:
    """Departments of a company, built from a ``DepartmentDict``."""

    def __init__(self, data: DepartmentDict):
        # Both departments are assumed present; a missing key raises KeyError.
        engineering_raw = data["engineering"]
        self.engineering = Team(engineering_raw)

        marketing_raw = data["marketing"]
        self.marketing = Team(marketing_raw)

class Company:
    """A company, built from a ``CompanyDict``.

    The nested ``location`` and ``departments`` objects are wrapped in
    ``Location`` and ``Department``; the remaining fields are copied over.
    """

    def __init__(self, data: CompanyDict):
        self.name = data["name"]

        location_raw = data["location"]
        self.location = Location(location_raw)

        departments_raw = data["departments"]
        self.departments = Department(departments_raw)

        self.industry = data["industry"]

class Participant:
    """One participant of an event, built from a ``ParticipantDict``."""

    def __init__(self, data: ParticipantDict):
        # Both keys are mandatory here; absence surfaces as KeyError.
        self.name, self.role = data["name"], data["role"]

class Event:
    """An event, built from an ``EventDict``.

    ``location`` is wrapped in a ``Location`` and every entry of
    ``participants`` in a ``Participant``; ``codes`` is stored as received.
    """

    def __init__(self, data: EventDict):
        self.name = data["name"]
        self.date = data["date"]
        self.location = Location(data["location"])
        self.codes = data["codes"]
        # One Participant per raw entry, in payload order.
        self.participants = list(map(Participant, data["participants"]))

class Data:
    """Topmost model: wraps one deserialized JSON object (a ``DataDict``).

    Every key is indexed directly, so a payload lacking any expected key
    fails with ``KeyError`` while the object is being constructed.
    """

    def __init__(self, data: DataDict):
        self.valid = data["valid"]

        person_raw = data["person"]
        self.person = Person(person_raw)

        company_raw = data["company"]
        self.company = Company(company_raw)

        # One Event per raw entry, in payload order.
        self.events = list(map(Event, data["events"]))


Now it understands the idea we want. We want to assume safe access on initialization and intentionally fail during a test, so that we can correct the models to match what the source JSON truly provides (again, assuming a representative sample of source JSON, where all union type variants occur).

In [33]:
import json

def test() -> List[Data]:
    """Run the models against a known-bad payload and the sample file.

    A payload that lacks every expected key is passed to ``Data`` first; the
    resulting ``KeyError`` is caught and reported. Then each object in
    ``./complex_json.json`` (expected to hold a JSON list) is turned into a
    ``Data``. That second step is deliberately unguarded.

    Returns:
        One ``Data`` per object in the sample file.

    Raises:
        KeyError: A sample object lacks a key that the models index
            directly; this is the signal to update the models.
        OSError: The sample file cannot be opened.
        json.JSONDecodeError: The sample file is not valid JSON.
    """
    invalid_text = """
    {
        "unexpected_key": true,
        "missing all the necessary keys": true
    }
    """

    # Name the encoding explicitly so that decoding the file does not depend
    # on the platform's default encoding.
    with open("./complex_json.json", encoding="utf-8") as f:
        # No need to explicitly type here... It would be a lie anyway
        valid_json_list = json.load(f)

    # Kept under a separate name so the raw text is not rebound to a dict.
    invalid_payload = json.loads(invalid_text)

    try:
        Data(invalid_payload)  # only the raised exception matters here
    except KeyError as e:
        print("Expected failure on invalid JSON: Missing key", e)
        # Excerpt of the traceback this produces:
        # def __init__(self, data: DataDict):
        #  60     self.valid = data['valid'] <-------- Missing
        #  61     self.person = Person(data['person'])
        #  62     self.company = Company(data['company'])

    # We plan for the possibility of missing keys, and run the following test to update our models:
    return [Data(obj) for obj in valid_json_list]
_ = test()


Expected failure on invalid JSON: Missing key 'valid'


KeyError: 'home'

`KeyError: 'home'` now tells us that the JSON we received sometimes doesn't send the key 'home'. Therefore, we update our model of the expected JSON:

In [46]:
from typing import NotRequired

ContactDict = TypedDict("ContactDict", {
    'home': NotRequired[Optional[str]], 
    'work': NotRequired[Optional[str]]
})
AddressDict = TypedDict("AddressDict", {
    'street': str, 
    'city': str, 
    'zipcode': str
})
PersonDict = TypedDict("PersonDict", {
    'name': str, 
    'age': int, 
    'address': Optional[AddressDict], 
    'contacts': ContactDict, 
    'hobbies': List[str]
})
LocationDict = TypedDict("LocationDict", {
    'country': NotRequired[str], 
    'state': NotRequired[str], 
    'city': str
})
TeamDict = TypedDict("TeamDict", {
    'teams': List[str], 
    'team_lead': str
})
DepartmentDict = TypedDict("DepartmentDict", {
    'engineering': NotRequired[TeamDict], 
    'marketing': NotRequired[TeamDict]
})
CompanyDict = TypedDict("CompanyDict", {
    'name': str, 
    'location': LocationDict, 
    'departments': DepartmentDict, 
    'industry': str
})
ParticipantDict = TypedDict("ParticipantDict", {
    'name': str, 
    'role': str
})
EventDict = TypedDict("EventDict", {
    'name': str, 
    'date': str, 
    'location': LocationDict, 
    'codes': List[int], 
    'participants': List[ParticipantDict]
})
DataDict = TypedDict("DataDict", {
    'valid': bool, 
    'person': PersonDict, 
    'company': CompanyDict, 
    'events': List[EventDict]
})

class Contact:
    """Phone contacts of a person, built from a ``ContactDict``.

    Either attribute is ``None`` when its key is missing from the payload.
    """

    def __init__(self, data: ContactDict):
        # "home" and "work" are read with .get(): indexing them directly
        # (data['home']) now lint-errors, while key autocomplete stays active.
        self.home, self.work = data.get("home"), data.get("work")

class Address:
    """Postal address, built from an ``AddressDict``."""

    def __init__(self, data: AddressDict):
        # Every key is still indexed directly; an absent key raises KeyError.
        self.street, self.city, self.zipcode = (
            data["street"],
            data["city"],
            data["zipcode"],
        )

class Person:
    """A person, built from a ``PersonDict``.

    ``address`` is an ``Address`` when the payload holds a truthy value under
    that key and ``None`` otherwise; ``contacts`` is always a ``Contact``.
    """

    def __init__(self, data: PersonDict):
        self.name = data["name"]
        self.age = data["age"]

        raw_address = data["address"]
        if raw_address:
            self.address = Address(raw_address)
        else:
            self.address = None

        self.contacts = Contact(data["contacts"])
        self.hobbies = data["hobbies"]

class Location:
    """Location built from a ``LocationDict``.

    ``country`` and ``state`` fall back to ``None`` when absent; ``city`` is
    still mandatory and raises ``KeyError`` if missing.
    """

    def __init__(self, data: LocationDict):
        # .get() for the two keys the type checker now reports as not required.
        self.country, self.state = data.get("country"), data.get("state")
        self.city = data["city"]

class Team:
    """Teams of one department and their lead, built from a ``TeamDict``."""

    def __init__(self, data: TeamDict):
        # The list under "teams" is stored as-is (no copy is made).
        self.teams, self.team_lead = data["teams"], data["team_lead"]

class Department:
    """Departments of a company, built from a ``DepartmentDict``.

    Each attribute is a ``Team`` when the payload holds a truthy value under
    the matching key, and ``None`` otherwise.
    """

    def __init__(self, data: DepartmentDict):
        # The type checker leads us directly to the places that need a special
        # fallback and/or a correctly labeled Optional type.
        engineering_raw = data.get("engineering")
        if engineering_raw:
            self.engineering = Team(engineering_raw)
        else:
            self.engineering = None

        marketing_raw = data.get("marketing")
        if marketing_raw:
            self.marketing = Team(marketing_raw)
        else:
            self.marketing = None

class Company:
    """A company, built from a ``CompanyDict``.

    The nested ``location`` and ``departments`` objects are wrapped in
    ``Location`` and ``Department``; the remaining fields are copied over.
    """

    def __init__(self, data: CompanyDict):
        self.name = data["name"]

        location_raw = data["location"]
        self.location = Location(location_raw)

        departments_raw = data["departments"]
        self.departments = Department(departments_raw)

        self.industry = data["industry"]

class Participant:
    """One participant of an event, built from a ``ParticipantDict``."""

    def __init__(self, data: ParticipantDict):
        # Both keys are mandatory; absence surfaces as KeyError.
        self.name, self.role = data["name"], data["role"]

class Event:
    """An event, built from an ``EventDict``.

    ``location`` is wrapped in a ``Location`` and every entry of
    ``participants`` in a ``Participant``; ``codes`` is stored as received.
    """

    def __init__(self, data: EventDict):
        self.name = data["name"]
        self.date = data["date"]
        self.location = Location(data["location"])
        self.codes = data["codes"]
        # One Participant per raw entry, in payload order.
        self.participants = list(map(Participant, data["participants"]))

class Data:
    """Topmost model: wraps one deserialized JSON object (a ``DataDict``).

    The four top-level keys are indexed directly, so a payload lacking any of
    them fails with ``KeyError`` while the object is being constructed.
    """

    def __init__(self, data: DataDict):
        self.valid = data["valid"]

        person_raw = data["person"]
        self.person = Person(person_raw)

        company_raw = data["company"]
        self.company = Company(company_raw)

        # One Event per raw entry, in payload order.
        self.events = list(map(Event, data["events"]))

- KeyError: 'work',
- KeyError: 'country',
- KeyError: 'state',
- KeyError: 'engineering'
- KeyError: 'marketing'

And now we have a mostly functioning class.

In [47]:
# Notebook needs us to redefine this here to update context, a problem arising due to the demonstration context
def test() -> List[Data]:
    """Run the updated models against a known-bad payload and the sample file.

    A payload that lacks every expected key is passed to ``Data`` first; the
    resulting ``KeyError`` is caught and reported. Then each object in
    ``./complex_json.json`` (expected to hold a JSON list) is turned into a
    ``Data``. That second step is deliberately unguarded.

    Returns:
        One ``Data`` per object in the sample file.

    Raises:
        KeyError: A sample object lacks a key that the models index
            directly; this is the signal to update the models.
        OSError: The sample file cannot be opened.
        json.JSONDecodeError: The sample file is not valid JSON.
    """
    invalid_text = """
    {
        "unexpected_key": true,
        "missing all the necessary keys": true
    }
    """

    # Name the encoding explicitly so that decoding the file does not depend
    # on the platform's default encoding.
    with open("./complex_json.json", encoding="utf-8") as f:
        # No need to explicitly type here... It would be a lie anyway
        valid_json_list = json.load(f)

    # Kept under a separate name so the raw text is not rebound to a dict.
    invalid_payload = json.loads(invalid_text)

    try:
        Data(invalid_payload)  # only the raised exception matters here
    except KeyError as e:
        print("Expected failure on invalid JSON: Missing key", e)
        # Excerpt of the traceback this produces:
        # def __init__(self, data: DataDict):
        #  60     self.valid = data['valid'] <-------- Missing
        #  61     self.person = Person(data['person'])
        #  62     self.company = Company(data['company'])

    # We plan for the possibility of missing keys, and run the following test to update our models:
    return [Data(obj) for obj in valid_json_list]

# Demonstrates what the typed models give us once deserialization succeeds.
data = test() # the type checker knows this as List[Data]

# Contact stores None for "home" when the key is absent, so guard before
# calling a str method on it.
home = data[0].person.contacts.home
if home:
    string = home.capitalize()

# "codes" is declared as List[int]; pop() removes and returns its last element.
integer = data[2].events[0].codes.pop()

# Department sets "marketing" to None when the payload has no usable value for
# it, so the chained attribute access below raises AttributeError at runtime.
try:
    marketing = data[1].company.departments.marketing.teams[0].lower()  # Type checking recognizes this as an unsafe dot access
except AttributeError as e:
    print(e)

# data[1].company.departments.
#                           -> Marketing
#                           -> Engineering...


Expected failure on invalid JSON: Missing key 'valid'
'NoneType' object has no attribute 'teams'


If you pay close attention to the JSON example, however, we should be able to access different fields here:

"departments": {
    "development": {
        "teams": ["frontend", "backend", "testing"],
        "team_lead": "Frank Rodriguez"
    },
    "sales": {
        "teams": ["enterprise", "retail", "partnerships"],
        "team_lead": "Gina White"
    }
}