# In [9]:
import requests
import json
import re
from typing import Optional

# Model identifiers accepted by ``LLM.__init__``; passing any other name
# raises ValueError there, and this list is echoed in that error message.
# The "(IN)" / "(US)" suffixes are part of the identifier strings
# (presumably a deployment region -- confirm with the proxy owners).
available_models = [
    "bedrock-claude-3-sonnet-(IN)",
    "gpt-4o-(US)",
    "gemini-2.0-flash",
    "bedrock-claude-3.5-(US)",
    "gpt-4.1-(US)",
    "bedrock-llama3-2-11b-(US)",
    "bedrock-llama3-2-90b-(US)",
    "bedrock-llama3-70b-instruct-v1-(IN)",
    "bedrock-claude-3-haiku-(IN)"
]

class LLMResponse:
    """
    Container for the text returned by a language-model call.

    The text is exposed as the ``content`` attribute, which is what
    ``JsonOutputParser.parse`` looks for when handed a response object.
    """
    def __init__(self, content: str):
        self.content = content

    def __repr__(self) -> str:
        # Show the payload instead of the default "<... object at 0x...>".
        return f"{type(self).__name__}(content={self.content!r})"

class LLM:
    """
    A small HTTP client for a chat-completions endpoint.

    ``generate`` POSTs a JSON payload containing ``model``, ``messages`` and
    ``temperature`` (plus ``max_tokens`` when set) and returns the text found
    at ``choices[0].message.content`` of the JSON reply.
    """
    # SECURITY NOTE(review): the default ``authorization_token`` below is a
    # credential committed in source. It is kept so existing ``LLM()`` callers
    # keep working, but it should be rotated and supplied by the caller
    # (for example from an environment variable) instead.
    def __init__(
        self,
        authorization_token: str = "Bearer sk-h_pNVQpyaHWjyOpwKfE5Mw",
        endpoint: str = "https://llmproxy.go-yubi.in/chat/completions",
        temperature: float = 0,
        model: str = "gpt-4.1-(US)",
        max_tokens: Optional[int] = None,
        timeout: Optional[float] = None,
    ):
        """
        Args:
            authorization_token: Value sent verbatim in the ``Authorization`` header.
            endpoint: URL the request is POSTed to.
            temperature: Sampling temperature placed in the request payload.
            model: Model identifier; must be one of ``available_models``.
            max_tokens: Optional completion limit; omitted from the payload when None.
            timeout: Seconds passed to ``requests.post``. None (the default)
                sets no timeout, which is the previous behaviour.

        Raises:
            ValueError: If ``model`` is not listed in ``available_models``.
        """
        if model not in available_models:
            raise ValueError(f"Model {model} is not available. Available models: {available_models}")
        self.authorization_token = authorization_token
        self.endpoint = endpoint
        self.temperature = temperature
        self.model = model
        self.max_tokens = max_tokens
        self.timeout = timeout

    @staticmethod
    def _extract_content(payload) -> str:
        """
        Return the stripped ``choices[0].message.content`` text of a reply.

        A missing, empty or null ``choices`` / ``message`` / ``content`` yields
        an empty string instead of raising IndexError or AttributeError.
        """
        choices = payload.get("choices") or [{}]
        message = choices[0].get("message") or {}
        content = message.get("content") or ""
        return content.strip()

    def generate(self, prompt: str, system_prompt: Optional[str] = None) -> "LLMResponse":
        """
        Send ``prompt`` (and an optional system prompt) to the endpoint.

        Args:
            prompt: Text sent as the ``user`` message.
            system_prompt: When truthy, sent first as a ``system`` message.

        Returns:
            An ``LLMResponse`` whose ``content`` is the reply text, or an empty
            string when the reply carries no text.

        Raises:
            Exception: If the HTTP call fails or returns an error status; the
                underlying ``requests`` exception is attached as ``__cause__``.
        """
        headers = {
            "Authorization": self.authorization_token,
            "Content-Type": "application/json"
        }
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})
        data = {
            "model": self.model,
            "messages": messages,
            "temperature": self.temperature
        }

        if self.max_tokens is not None:
            data["max_tokens"] = self.max_tokens

        try:
            response = requests.post(
                self.endpoint, headers=headers, json=data, timeout=self.timeout
            )
            response.raise_for_status()
            payload = response.json()
        except requests.exceptions.RequestException as e:
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"Error calling LLM API: {e}") from e
        return LLMResponse(self._extract_content(payload))

    def invoke(self, prompt: str, system_prompt: Optional[str] = None) -> "LLMResponse":
        """
        Invoke the LLM with the given prompt and system prompt.

        Alias for ``generate``.
        """
        return self.generate(prompt, system_prompt=system_prompt)

    def __or__(self, other):
        """
        Allows chaining of LLM components using the | operator.

        Returns a ``Pipeline`` with this LLM on the left and ``other`` on the right.
        """
        return Pipeline(self, other)

class JsonOutputParser:
    """
    Extracts JSON from LLM output and validates it against a model class.

    ``pydantic_object`` must expose ``model_json_schema()`` and
    ``model_validate()``, and the validated instance must expose
    ``model_dump()``. ``parse`` returns the result of ``model_dump()``,
    not the model instance itself.
    """
    def __init__(self, pydantic_object):
        self.pydantic_object = pydantic_object

    def get_format_instructions(self) -> str:
        """
        Returns prompt text describing the expected JSON, built from the
        model's ``model_json_schema()``.
        """
        return (
            "Output must be a valid JSON object that adheres to the following schema:\n" +
            json.dumps(self.pydantic_object.model_json_schema(), indent=2)
        )

    def parse(self, text):
        """
        Parses LLM output into validated data.

        Args:
            text: A string, or any object with a ``content`` attribute
                (such as ``LLMResponse``) holding the string.

        Returns:
            The value returned by ``model_dump()`` on the validated model.

        Raises:
            Exception: If the text is empty/None, or if JSON decoding or model
                validation fails (the original error is attached as ``__cause__``).
        """
        if hasattr(text, "content"):
            text = text.content
        # Treat None like empty text so it reports "no content" instead of
        # failing with AttributeError on .strip().
        text = (text or "").strip()
        if not text:
            raise Exception("Warning: No JSON content received from LLM.")

        # Prefer a ```json fenced block; fall back to any ``` fence. The
        # "|$" alternative tolerates a fence that was never closed.
        code_fence_re = r"```[\s]*json[\s]*\n(.*?)(?:```|$)"
        match = re.search(code_fence_re, text, re.DOTALL | re.IGNORECASE)
        if not match:
            code_fence_re2 = r"```(.*?)(?:```|$)"
            match = re.search(code_fence_re2, text, re.DOTALL)
        if match:
            text = match.group(1).strip()

        # Drop any prose around the outermost {...} or [...] span.
        json_match = re.search(r"({.*}|\[.*\])", text, re.DOTALL)
        if json_match:
            text = json_match.group(1).strip()

        try:
            # Double a lone backslash that does not start a JSON escape
            # sequence, so json.loads does not reject it.
            text = re.sub(r'(?<!\\)\\(?![\\/"bfnrtu])', r'\\\\', text)
            parsed_json = json.loads(text)
            model_obj = self.pydantic_object.model_validate(parsed_json)
            return model_obj.model_dump()
        except Exception as e:
            # One readable message plus explicit chaining, rather than a
            # two-argument exception that prints as a tuple.
            raise Exception(f"Error parsing JSON output: {e}") from e

    def __call__(self, llm_output):
        """Makes the parser usable as a plain callable; same as ``parse``."""
        return self.parse(llm_output)

class PromptTemplate:
    """
    A ``str.format``-style prompt template.

    ``partial_variables`` are values fixed ahead of time; values passed to
    ``format`` are layered on top of them and win on a name clash.
    """
    def __init__(self, template: str, input_variables: list, partial_variables: Optional[dict] = None):
        self.template = template
        self.input_variables = input_variables
        self.partial_variables = partial_variables or {}

    def format(self, **kwargs) -> str:
        """
        Formats the prompt template with the given keyword arguments.

        Raises whatever ``str.format`` raises (e.g. KeyError for a
        placeholder with no value).
        """
        # Copy so the stored partial variables are never mutated by a call.
        context = self.partial_variables.copy()
        context.update(kwargs)
        return self.template.format(**context)

    def __or__(self, other):
        """
        Allows chaining of LLM components using the | operator.

        Returns a ``Pipeline`` with this template on the left and ``other`` on the right.
        """
        return Pipeline(self, other)

    @classmethod
    def from_template(cls, template: str):
        """
        Creates a PromptTemplate from a template string.

        ``input_variables`` is filled with the placeholder names found in
        ``template``, each listed once, in order of first appearance.
        """
        import string
        field_names = (
            field
            for _, field, _, _ in string.Formatter().parse(template)
            if field is not None
        )
        # dict.fromkeys de-duplicates while preserving first-seen order, so a
        # placeholder used twice is not reported twice.
        return cls(template, list(dict.fromkeys(field_names)), {})

class Pipeline:
    """
    A two-step chain (``left`` then ``right``) built with the | operator.

    Chaining more than two components nests pipelines to the left:
    ``a | b | c`` is ``Pipeline(Pipeline(a, b), c)``.
    """
    def __init__(self, left, right):
        self.left = left
        self.right = right

    def __or__(self, other):
        """
        Allows chaining of LLM components using the | operator.
        """
        return Pipeline(self, other)

    @staticmethod
    def _contains_parser(step):
        """
        Returns True if ``step`` is a JsonOutputParser or a Pipeline holding one.
        """
        if isinstance(step, JsonOutputParser):
            return True
        if isinstance(step, Pipeline):
            return (
                Pipeline._contains_parser(step.left)
                or Pipeline._contains_parser(step.right)
            )
        return False

    def _has_parser(self):
        """
        Checks whether a JsonOutputParser appears anywhere in the right step.
        """
        return self._contains_parser(self.right)

    def _append_json_markdown_instruction(self, prompt_str):
        """
        Appends a JSON markdown instruction to the prompt string if it is not already present.

        The presence check is case-insensitive; trailing whitespace of the
        prompt is stripped before the instruction is added.
        """
        instruction = "You must always return valid JSON fenced by a markdown code block. Do not return any additional text."
        if instruction.lower() not in prompt_str.lower():
            prompt_str = prompt_str.rstrip() + "\n\n" + instruction
        return prompt_str

    def invoke(self, input_data):
        """
        Invokes the pipeline with the given input data.

        Args:
            input_data: Passed to the left step. For a PromptTemplate it must
                be a mapping of template variables; for an LLM it is the prompt.

        Returns:
            The output of the right step. For a JsonOutputParser this is the
            parsed data; for an LLM it is an ``LLMResponse``.
        """
        return self._invoke(input_data, parser_downstream=False)

    def _invoke(self, input_data, parser_downstream):
        """
        Runs both steps; ``parser_downstream`` says whether an enclosing
        pipeline applies a JsonOutputParser after this one.
        """
        # In ``prompt | llm | parser`` the template sits in the inner
        # Pipeline(prompt, llm) while the parser is the outer right step, so
        # the inner pipeline must be told about it by its parent.
        parser_in_chain = parser_downstream or self._has_parser()

        if isinstance(self.left, Pipeline):
            input_for_right = self.left._invoke(input_data, parser_in_chain)
        elif isinstance(self.left, PromptTemplate):
            prompt_str = self.left.format(**input_data)
            if parser_in_chain:
                prompt_str = self._append_json_markdown_instruction(prompt_str)
            input_for_right = prompt_str
        elif isinstance(self.left, LLM):
            # ``llm | parser``: run the model on the input instead of handing
            # the LLM object itself to the next step.
            input_for_right = self.left.generate(input_data)
        elif callable(self.left):
            input_for_right = self.left(input_data)
        else:
            # A non-callable left step is treated as a constant input.
            input_for_right = self.left

        if isinstance(self.right, Pipeline):
            return self.right._invoke(input_for_right, parser_downstream)
        if isinstance(self.right, LLM):
            # An existing response is passed through rather than re-sent.
            if isinstance(input_for_right, LLMResponse):
                return input_for_right
            return self.right.generate(input_for_right)
        if isinstance(self.right, JsonOutputParser):
            parsed = self.right.parse(input_for_right)
            if hasattr(parsed, "model_dump") and callable(parsed.model_dump):
                return parsed.model_dump()
            return parsed
        if callable(self.right):
            return self.right(input_for_right)
        return self.right

def extract_json_from_response(text):
    """
    Pull the JSON portion out of a model reply.

    Tries, in order, a closed ```json fenced block and then the widest
    ``{...}`` span in the text. Returns the stripped match, or None when
    neither is found.
    """
    attempts = (
        # JSON inside markdown fences: ```json ... ```
        (r'```[\s]*json[\s]*\n(.*?)```', re.DOTALL | re.IGNORECASE),
        # Fallback: first "{" through last "}"
        (r'(\{.*\})', re.DOTALL),
    )
    for regex, flags in attempts:
        found = re.search(regex, text, flags)
        if found is not None:
            return found.group(1).strip()
    return None


def main(fname: str = "Singla"):
    """
    Extract financial-statement data from a text file into a JSON file.

    Reads ``{fname}.txt``, sends its content plus the extraction instructions
    to the LLM, pulls the JSON out of the reply and writes it to
    ``NF_{fname}_Json.json``. Progress and failures are reported with
    ``print``; nothing is returned.

    Args:
        fname: Base name (without extension) of the input document. Defaults
            to "Singla", the name that was previously hard-coded.
    """
    with open(f'{fname}.txt', 'r', encoding='utf-8') as f:
        file_content = f.read()
    # NOTE(review): the document text is interpolated at the very top of the
    # prompt, while the closing line "Here is the document to process:" has
    # nothing after it. The prompt text is left untouched here; confirm
    # whether the document was meant to follow that line.
    prompt_str = f""" {file_content}

        1. Yearly Data Handling
            1. For each financial year present in the document:
                - Store the year as a key under "financials".
                - Inside each year, add a "Year" field with the year value.
                - Add "unit" field (e.g., "INR", "INR_Lakh", "INR_Crore") as it appears in the document.
            2. If the document contains multiple years and any line item or variable does not have a value for one or more financial years:
                - Explicitly set its value as null for that year.
                - Do not infer, copy, or generate missing values.
                - Only assign values if they are explicitly stated under the correct financial year.
                - Do not use values from other years as placeholders (no forward-fill or backfill logic).
            3. Previous year data may appear in the following formats:
                - A dedicated "(Previous Year)" row at the bottom of tables.
                - Column labels with year references like “2022” or “as on 31.03.2022”.
                - Footnote-style blocks labeled “(Previous Year)” or “Note: Previous Year data”.
            4. These formats **must be parsed and mapped** as valid data for the correct financial year, even if presented below current year values.
            5. Values labeled as “Grand Total”, “Net Block Written Down Value as on 31.03.2022”, or other summary rows must also be extracted and assigned to the correct year.
            6. Do not assign any value to a year unless it is explicitly stated under that year or labeled as “Previous Year”.
            7. If values appear in tables or blocks where "Previous Year" is shown as a summary row (e.g., under Total or Grand Total), those values must be parsed correctly and mapped under that year.
            8. If a line item appears for a year but its value is blank or missing, store it as null.
            9. Never copy a value from one year to another (e.g., do not use the "Opening Balance" of 2023 as the "Closing Balance" for 2022 unless explicitly stated in the document).
            10. When reading line-by-line notes or block entries (e.g., Notes 3 or 11), do not skip "(Previous Year)" rows or summary totals; these must be processed and mapped just like tabular entries.

        2. Data Organization per Year
            For each financial year, structure the output in this order:
                1. "Year": Financial year (e.g., "2023")
                2. "unit": Unit of measure (e.g., "INR_Lakh")
                3. "Balance sheet": Extract all Balance Sheet data excluding notes.
                4. "Balance sheet notes": Notes related to Balance Sheet (see Notes Handling below).
                5. "Profit loss": Extract all Profit and Loss account data excluding notes.
                6. "Profit loss notes": Notes related to Profit and Loss.
                7. "Trading": Extract Trading account data (if present).
                8. "Trading notes": Notes related to Trading account.

       3. Data Mapping & Hierarchy Rules
            Use exact headings and subheadings from the document as keys.
            Extract hierarchies accurately (e.g., “Tangible Assets” > “Block A” > sub-entries).
            If subtotals are present, extract them under the key "Total [Subheading]".
            For deduction or addition parts in Balance Sheet, include them under correct sections. Include Total Additions and Total Deductions if present.
            Continue this nesting for any further hierarchical levels if present.

        4. Special Line Items Handling
            Extract Gross Profit C/d and Gross Profit B/d only if explicitly present.
                If both Trading and Profit & Loss are on the same page, do not auto-balance them — extract the values as given.
            In Trading and Profit & Loss, extract:
                total debit
                total credit
                Store these totals under the respective section (Trading or Profit loss).
        
        5. Notes Extraction
            For each notes section (Balance Sheet Notes, Profit Loss Notes, Trading Notes):
            1. Extract:
                Note number or Note title
                Balance sheet item/Profit Loss item/Trading item (line item the note relates to)
                Classification (valid values: equity, current_assets, non_current_assets, current_liabilities, non_current_liabilities, assets_miscellaneous, liabilities_miscellaneous)
                Notes: Actual data under the note
            2. If the note contains addition sections split by dates (e.g., “UP TO 30.09.2023” and “AFTER 01.10.2023”), extract both sections under the correct year with clear keys.
            3. Preserve all nested subheadings and sub-structures inside notes exactly as they appear in the document.
            4. If no notes are present for a section, store an empty array.
        
        6. Handling Schedules/Notes and References
            If a main statement (Balance Sheet, Profit & Loss, or Trading) references a schedule or note (e.g., “Note 1”, “Schedule A”), extract the note/schedule data under the corresponding notes section:
                "Balance sheet notes"
                "Profit loss notes"
                "Trading notes"
            Extra Notes
                Extract any remaining notes not referenced by Balance Sheet, Profit & Loss, or Trading statements under the key: "Extra notes" (only once).
                Ensure no duplication of notes across different sections.
        
        General Rules
            Preserve original numeric formatting (do not convert numbers to strings).
            Do not infer or generate values — store only what is present.
            If any data (line-item or subtotal) is missing in the document for any year, set its value as null.
            Use exact structure, headings, and labels as in the document.
            Give both Total of line items which are already present in the document along with the line items,don't miss out anything.'
            Return only a valid JSON object (no markdown, no explanations, no extra text).
        
     Here is the document to process: """

    # Uses the LLM defaults (endpoint, model, token) declared on the class.
    llm = LLM()
    output = llm.generate(prompt_str)
    print('Extraction Successful')

    # --------------------------------------------------------------
    # Converting to Json file
    # --------------------------------------------------------------
    response_str = output.content

    # Extract the JSON text
    json_text = extract_json_from_response(response_str)
    if json_text:
        try:
            # Parse the JSON to a dictionary
            data = json.loads(json_text)
            # Save to a file; explicit encoding keeps the output independent
            # of the platform's default locale.
            with open(f'NF_{fname}_Json.json', 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=4)
            print(f"Data successfully saved to NF_{fname}_Json.json.")
        except json.JSONDecodeError as e:
            print("Error decoding JSON:", e)
    else:
        print("Could not find a valid JSON in the response.")

# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

# Extraction Successful
# Data successfully saved to NF_Singla_Json.json.
