In [1]:
from pydantic import BaseModel
from typing import List, Optional, Literal, Union

class Text(BaseModel): # Holds paragraph, sentence
    confirm: bool = False
    paragraph: str = ""
    sentence: str = ""

class Structure(BaseModel): # Holds info if information was extracted from table, caption or para
    table: Text
    caption: Text
    para: Text


class Power(BaseModel):
    base: float
    exponent: float

class Amount(BaseModel):
    value: float
    unit: str
    multiplier: Optional[Power] = 1

class RateConstant(BaseModel):
    constant: Amount
    type: Literal["K_O3", "K_OH", "K_TOTAL"]
    var: str

class Compound(BaseModel):
    name: str
    abbrev: Optional[str] = ""

class Ph(BaseModel):
    value: float
    regulator: Optional[str] = ""

class Scavenger(BaseModel):
    name: str
    conc: Amount

class O3_info(BaseModel):
    type: str
    info: Amount

class O3(BaseModel):
    o3_conc: Amount
    o3_info: Optional[O3_info] = "" # e.g. Information about Ozone pressure in kPa or ratio in ppm which 
                                    # cannot be directly transferred to concentration

class Conditions(BaseModel):
    ph: Ph
    temp: Amount
    order: str
    scavenger: Scavenger
    comp_conc: Amount
    o3: O3

class Reference(BaseModel):
    index: str  # Points to actual reference under "References"
    ref: str    # Full reference

class MetaData(BaseModel):
    doi: str
    relevance: bool = False
    title: str
    keywords: List[str]
    abstract: str

class Data(BaseModel):
    rate_constant: RateConstant
    comp: Compound
    cond: Conditions
    ref: Optional[Reference] = ""

class Document(BaseModel):
    entries: List[Data]
    meta: MetaData




#Exectuion

sample_data = {
    "entries": [
        {
            "rate_constant": {
                "constant": {
                    "value": 1.23,
                    "unit": "L/mol/s",
                    "multiplier": {"base": 10, "exponent": 3}
                },
                "type": "K_OH",
                "var": "OH"
            },
            "comp": {
                "name": "Compound A",
                "abbrev": "CpdA"
            },
            "cond": {
                "ph": {
                    "value": 7.0,
                    "regulator": "Buffer"
                },
                "temp": {
                    "value": 298.15,
                    "unit": "K"
                },
                "order": "second",
                "scavenger": {
                    "name": "Scavenger X",
                    "conc": {
                        "value": 0.01,
                        "unit": "M",
                        "multiplier": {"base": 10, "exponent": -3}
                    }
                },
                "comp_conc": {
                    "value": 0.001,
                    "unit": "M"
                },
                "o3": {
                    "o3_conc": {
                        "value": 1.5,
                        "unit": "ppm"
                    },
                    "o3_info": {
                        "type": "Pressure",
                        "info": {
                            "value": 101.3,
                            "unit": "kPa"
                        }
                    }
                }
            },
            "ref": {
                "index": "Ref1",
                "ref": "Doe, J. et al. (2024). Journal of Chemistry."
            }
        }
    ],
    "meta": {
        "doi": "10.1000/xyz123",
        "relevance": True,
        "title": "A Comprehensive Study of Compound A",
        "keywords": ["chemistry", "kinetics", "compound A"],
        "abstract": "This study explores the kinetics of Compound A under various conditions."
    }
}

# Validating the data using your schema
document = Document(**sample_data)

# Print the validated and structured data
print(document.json())


{"entries":[{"rate_constant":{"constant":{"value":1.23,"unit":"L/mol/s","multiplier":{"base":10.0,"exponent":3.0}},"type":"K_OH","var":"OH"},"comp":{"name":"Compound A","abbrev":"CpdA"},"cond":{"ph":{"value":7.0,"regulator":"Buffer"},"temp":{"value":298.15,"unit":"K","multiplier":1},"order":"second","scavenger":{"name":"Scavenger X","conc":{"value":0.01,"unit":"M","multiplier":{"base":10.0,"exponent":-3.0}}},"comp_conc":{"value":0.001,"unit":"M","multiplier":1},"o3":{"o3_conc":{"value":1.5,"unit":"ppm","multiplier":1},"o3_info":{"type":"Pressure","info":{"value":101.3,"unit":"kPa","multiplier":1}}}},"ref":{"index":"Ref1","ref":"Doe, J. et al. (2024). Journal of Chemistry."}}],"meta":{"doi":"10.1000/xyz123","relevance":true,"title":"A Comprehensive Study of Compound A","keywords":["chemistry","kinetics","compound A"],"abstract":"This study explores the kinetics of Compound A under various conditions."}}


  Expected `Power` but got `int` - serialized value may not be as expected
  Expected `Power` but got `int` - serialized value may not be as expected
  Expected `Power` but got `int` - serialized value may not be as expected
  Expected `Power` but got `int` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_json(
