In [2]:
import json
import os
from pathlib import Path

from dotenv import load_dotenv
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import AzureChatOpenAI

In [3]:
# Load variables from a local .env file (python-dotenv) into the process environment.
# The AZURE_OPENAI_* values read with os.getenv(...) when the LLM client is built
# are presumably defined there -- confirm against your .env.
load_dotenv()

True

In [21]:
# Test case 1: "attention is all you need" BibTeX text + paper-citations schema.
# NOTE: every test-case cell rebinds `passage` and `schema`; run only the one
# you want to feed to the chain before invoking it.
case_dir = Path("testcases") / "test case 1"

# Build paths with pathlib so they resolve on any OS (a backslash-separated
# literal is only a path on Windows), and decode as UTF-8 explicitly instead of
# relying on the locale's default encoding.
passage = (case_dir / "NIPS-2017-attention-is-all-you-need-Bibtex.txt").read_text(encoding="utf-8")

with (case_dir / "paper citations_schema.json").open("r", encoding="utf-8") as file:
    schema = json.load(file)

In [24]:
# Test case 2: GitHub Actions sample input + GitHub Actions schema.
# NOTE: every test-case cell rebinds `passage` and `schema`; run only the one
# you want to feed to the chain before invoking it.
case_dir = Path("testcases") / "test case 2"

# Build paths with pathlib so they resolve on any OS (a backslash-separated
# literal is only a path on Windows), and decode as UTF-8 explicitly instead of
# relying on the locale's default encoding.
passage = (case_dir / "github actions sample input.txt").read_text(encoding="utf-8")

with (case_dir / "github_actions_schema.json").open("r", encoding="utf-8") as file:
    schema = json.load(file)

In [14]:
# Test case 3: resume text + target resume schema.
# NOTE: every test-case cell rebinds `passage` and `schema`; run only the one
# you want to feed to the chain before invoking it.
case_dir = Path("testcases") / "test case 3"

# Build paths with pathlib so they resolve on any OS (a backslash-separated
# literal is only a path on Windows), and decode as UTF-8 explicitly instead of
# relying on the locale's default encoding.
passage = (case_dir / "resume.txt").read_text(encoding="utf-8")

with (case_dir / "convert your resume to this schema.json").open("r", encoding="utf-8") as file:
    schema = json.load(file)

In [25]:
# Connection settings for the Azure OpenAI deployment, all taken from the
# environment (os.getenv yields None for any variable that is not set).
azure_settings = {
    "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
    "azure_deployment": os.getenv("AZURE_OPENAI_LLM_DEPLOYMENT"),
    "api_version": os.getenv("AZURE_OPENAI_API_VERSION"),
    "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
}

# Sampling parameters passed to the chat model.
sampling_settings = {"temperature": 0.4, "top_p": 0.9}

# Chat model used by the extraction chain; model_kwargs forwards a
# response_format of type "json_object" with each request.
llm = AzureChatOpenAI(
    **azure_settings,
    **sampling_settings,
    model_kwargs={"response_format": {"type": "json_object"}},
)

In [26]:
# Parser placed at the end of the chain; it also supplies the format
# instructions that are injected into the prompt below.
parser = JsonOutputParser()

# Prompt text for schema-guided extraction. Placeholders:
#   {passage}            - the raw input text
#   {schema}             - the target JSON Schema, serialized as a string
#   {format_instruction} - pre-filled from the parser (see partial_variables)
EXTRACTION_TEMPLATE = """
   You are a strict JSON generator. Your task is to:
    1. Carefully read the passage below.
    2. Analyze the given JSON Schema.
    3. Generate a valid JSON object using only the keys defined in the schema.
    4. Use correct data types as defined in the schema, including:
    - Only lowercase booleans (`true`, `false`)
    - No strings like `"true"` or `"false"`
    - No uppercase booleans (`True`, `False`)
    5. Do not include any extra keys or explanations. Return only valid and strict JSON output.
    6. Ensure that all values match their expected types, formats, and regular expressions as defined in the schema.

    **Passage**: {passage}

    **Schema**: {schema} 

    **Output**: (Only return the JSON. No markdown, no comments, no extra text.)

    {format_instruction}
    """

prompt = PromptTemplate(
    template=EXTRACTION_TEMPLATE,
    # Supplied by the caller on every invoke.
    input_variables=["passage", "schema"],
    # Bound once here so callers never have to pass it.
    partial_variables={"format_instruction": parser.get_format_instructions()},
)

# Pipeline: fill the prompt -> call the chat model -> parse the reply with `parser`.
chain = prompt | llm | parser

In [27]:
# Inputs for the chain: the raw passage plus the schema serialized to a JSON string.
chain_inputs = {"passage": passage, "schema": json.dumps(schema)}

# Run prompt -> model -> parser and keep the chain's output.
extracted = chain.invoke(chain_inputs)

# Pretty-print the chain output as indented JSON text.
result = json.dumps(extracted, indent=2)
print(result)

{
  "name": "MkDocs Publisher",
  "author": "DevRel Team",
  "description": "A simple action to build an MkDocs site and push it to the gh-pages branch. Should be easy to use.",
  "inputs": {
    "python-version": {
      "description": "The version of Python to set up for building.",
      "required": false,
      "default": "3.11"
    },
    "requirements-file": {
      "description": "Path to the Python requirements file",
      "required": true
    },
    "gh-token": {
      "description": "GitHub token for deployment.",
      "required": true,
      "deprecationMessage": "Prefer using GITHUB_TOKEN environment variable directly if permissions allow."
    }
  },
  "runs": {
    "using": "composite",
    "steps": [
      {
        "name": "Checkout Code",
        "uses": "actions/checkout@v4"
      },
      {
        "name": "Setup Python",
        "uses": "actions/setup-python@v5",
        "with": {
          "python-version": "${{ inputs.python-version }}"
        },
        "id": 