In [1]:
import os
import json
import re
from dotenv import load_dotenv
from typing import Dict, Any, List, Optional, Tuple
from IPython.display import Image, display

from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langgraph.graph import StateGraph, END

In [2]:
# Populate the process environment via python-dotenv, then look up the Gemini
# API key. The lookup yields None when GEMINI_API_KEY is not set.
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GEMINI_API_KEY")

In [3]:
# Chat model client shared by the cells below; it is given the API key read
# from the environment in the previous cell.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=GOOGLE_API_KEY,
)

In [5]:
# Sample HTML page that is sent to the model for analysis. The double-brace
# tokens (e.g. {{company_name}}) are the placeholders the model is asked to
# identify.
html_template = """
<!DOCTYPE html>
<html>
<head>
    <title>{{company_name}} - Official Website</title>
</head>
<body>
    <header>
        <h1>Welcome to {{company_name}}</h1>
        <p>{{company_tagline}}</p>
    </header>
    
    <section class="mission">
        <h2>Our Mission</h2>
        <p>{{company_mission}}</p>
    </section>
    
    <section class="services">
        <h2>Our Services</h2>
        <p>{{company_services}}</p>
    </section>
    
    <footer>
        <p>Contact us at: {{company_email}}</p>
    </footer>
</body>
</html>
"""

In [14]:
def clean_json_output(response):
    """Parse the JSON payload contained in a model response string.

    The response may be bare JSON, JSON wrapped in a Markdown code fence
    (with or without a language tag, in any letter case), or a fenced /
    inline JSON value surrounded by extra prose.

    Args:
        response: Text returned by the model.

    Returns:
        The decoded JSON value (typically a dict).

    Raises:
        json.JSONDecodeError: If no JSON object or array can be decoded.
    """
    # First attempt: drop the fence markers and parse whatever is left.
    stripped = re.sub(r"```json|```", "", response).strip()
    try:
        return json.loads(stripped)
    except json.JSONDecodeError:
        # Fallback for replies with prose around the payload or an unexpected
        # fence tag (e.g. ```JSON): prefer the text inside the first fence,
        # otherwise search the whole de-fenced reply.
        fenced = re.search(r"```[A-Za-z]*\s*(.*?)```", response, flags=re.DOTALL)
        search_space = fenced.group(1) if fenced else stripped

        # Try to decode a JSON object/array starting at each opening bracket.
        decoder = json.JSONDecoder()
        for opener in re.finditer(r"[{\[]", search_space):
            try:
                value, _ = decoder.raw_decode(search_space, opener.start())
            except json.JSONDecodeError:
                continue
            return value

        # Nothing decodable was found: surface the original parse error.
        raise

In [15]:
# Instructions telling the model to list the template's placeholders as a
# JSON dictionary with null values.
system_prompt = """
    You are an HTML analyzer. Your task is to identify placeholders in an HTML template that should be replaced 
    with user-specific information. Typical placeholders might be enclosed in double curly braces like {{company_name}} 
    or might appear as generic text like 'Company Mission Goes Here'.
    
    Analyze the HTML and return a JSON dictionary where:
    - Keys are descriptive field names (e.g., "company_name", "company_mission")
    - Values are initially set to null
    
    Return ONLY the JSON dictionary without any additional text.
    """

# The system text is supplied as a ready-made SystemMessage so it is NOT run
# through the prompt template: as a templated ("system", ...) entry, the
# f-string formatter would collapse the literal {{company_name}} example into
# {company_name}, contradicting the "double curly braces" wording.
prompt = ChatPromptTemplate.from_messages([
    SystemMessage(content=system_prompt),
    ("human", "Analyze this HTML template and identify all placeholders:\n\n{html_template}")
])

# format_messages() preserves the separate system/human messages; format()
# would flatten the conversation into one string sent as a single human turn.
response = llm.invoke(prompt.format_messages(html_template=html_template))

# Parse unconditionally: clean_json_output also accepts JSON without a code
# fence, and skipping the call would leave json_data undefined (or stale from
# an earlier cell) whenever the model omits the ```json marker.
json_data = clean_json_output(response.content)

In [16]:
# Display the placeholder dictionary parsed from the model's response.
json_data

{'company_name': None,
 'company_tagline': None,
 'company_mission': None,
 'company_services': None,
 'company_email': None}

In [13]:
import json
import re

def clean_json_output(response):
    """Parse the JSON payload contained in a model response string.

    NOTE: a function with this same name is also defined in an earlier cell
    of this notebook; whichever cell runs last wins, so keep the two in sync
    or remove one of them.

    The response may be bare JSON, JSON wrapped in a Markdown code fence
    (with or without a language tag, in any letter case), or a fenced /
    inline JSON value surrounded by extra prose.

    Args:
        response: Text returned by the model.

    Returns:
        The decoded JSON value (typically a dict).

    Raises:
        json.JSONDecodeError: If no JSON object or array can be decoded.
    """
    # First attempt: drop the fence markers and parse whatever is left.
    stripped = re.sub(r"```json|```", "", response).strip()
    try:
        return json.loads(stripped)
    except json.JSONDecodeError:
        # Fallback for replies with prose around the payload or an unexpected
        # fence tag (e.g. ```JSON): prefer the text inside the first fence,
        # otherwise search the whole de-fenced reply.
        fenced = re.search(r"```[A-Za-z]*\s*(.*?)```", response, flags=re.DOTALL)
        search_space = fenced.group(1) if fenced else stripped

        # Try to decode a JSON object/array starting at each opening bracket.
        decoder = json.JSONDecoder()
        for opener in re.finditer(r"[{\[]", search_space):
            try:
                value, _ = decoder.raw_decode(search_space, opener.start())
            except json.JSONDecodeError:
                continue
            return value

        # Nothing decodable was found: surface the original parse error.
        raise

# Example usage: a hand-written response wrapped in a ```json code fence,
# used to check clean_json_output without calling the model.
raw_response = """```json
{
  "company_name": "TechCorp",
  "company_tagline": "Innovating the Future",
  "company_mission": "To revolutionize technology.",
  "company_services": "AI Solutions, Cloud Computing",
  "company_email": "contact@techcorp.com"
}
```"""

# NOTE(review): this rebinds json_data, the same name the HTML-analysis cell
# assigns from the model response; when cells run top to bottom, this sample
# data replaces that result. Consider a distinct name for the example.
json_data = clean_json_output(raw_response)
print(json_data)


{'company_name': 'TechCorp', 'company_tagline': 'Innovating the Future', 'company_mission': 'To revolutionize technology.', 'company_services': 'AI Solutions, Cloud Computing', 'company_email': 'contact@techcorp.com'}
