In [1]:
# Environment sanity check: print the installed PyTorch build and whether CUDA is usable.
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())


Torch version: 2.5.1+cu121
CUDA available: True


In [14]:
import os
import pandas as pd
import torch
from PIL import Image
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration

# --------------------------
# Settings
# --------------------------
# Hugging Face Hub checkpoint id handed to both from_pretrained() calls below.
MODEL_NAME = "llava-hf/llava-v1.6-mistral-7b-hf"
IMAGE_DIR = "inference_images"   # folder with wireframe images
# CSV written at the end of this cell, one row per processed image.
OUTPUT_FILE = "llava_next_results.csv"

# Default instruction for infer(); it becomes the text part of the single user turn.
PROMPT_TEXT = """You are analyzing a webpage wireframe. 
Give a thorough, section-by-section description of the layout. 
Include details such as:
- The structure and arrangement of elements
- Exact labels of buttons, menus, and CTAs
- Headings, subheadings, and text content
- Relationships between sections (e.g., grid layout, columns, rows)
- Notes about emphasis, grouping, and visual hierarchy

Do not summarize. Instead, expand with full descriptive sentences that would help a designer or developer rebuild this wireframe faithfully.
"""

# --------------------------
# Load model & processor
# --------------------------
# 4-bit quantisation is requested through an explicit BitsAndBytesConfig: the bare
# `load_in_4bit=True` keyword is deprecated in transformers (see the warning in this
# cell's output) and is slated for removal.
from transformers import BitsAndBytesConfig

processor = LlavaNextProcessor.from_pretrained(MODEL_NAME)
model = LlavaNextForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="cuda",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# --------------------------
# Helper function
# --------------------------
def infer(image_path, prompt_text=PROMPT_TEXT):
    """Describe one wireframe image with the loaded LLaVA-NeXT model.

    Args:
        image_path: Path of the image file to open.
        prompt_text: Instruction placed in the user turn ahead of the image.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        not echoed back).
    """
    # Open inside a context manager so the file handle is released promptly;
    # convert() yields an independent in-memory copy that outlives the handle.
    with Image.open(image_path) as img:
        image = img.convert("RGB")

    # conversation: text first, then image
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_text},
                {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

    output = model.generate(**inputs, max_new_tokens=512)

    # generate() returns the prompt tokens followed by the continuation. Slice the
    # prompt off so the stored description does not start with the instruction text.
    prompt_len = inputs["input_ids"].shape[1]
    return processor.decode(output[0][prompt_len:], skip_special_tokens=True)

# --------------------------
# Run over all images
# --------------------------
results = []
# os.listdir() yields entries in arbitrary order; sort so the CSV rows come out
# in the same order on every run and on every platform.
for fname in sorted(os.listdir(IMAGE_DIR)):
    if not fname.lower().endswith((".png", ".jpg", ".jpeg")):
        continue
    path = os.path.join(IMAGE_DIR, fname)
    print(f"üîç Processing {fname}...")
    try:
        desc = infer(path)
    except Exception as e:
        # Best-effort batch: keep going, record the failure in the row, and also
        # print it so a failed image is visible without opening the CSV.
        print(f"ERROR on {fname}: {e}")
        desc = f"ERROR: {e}"
    results.append({"image": fname, "description": desc})

# --------------------------
# Save results
# --------------------------
df = pd.DataFrame(results)
df.to_csv(OUTPUT_FILE, index=False)
print(f"\n‚úÖ Done! Results saved to {OUTPUT_FILE}")


Fetching 2 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<?, ?it/s]
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:37<00:00,  9.43s/it]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


üîç Processing 3AD8509F-A870-4A05-BF80-909BED5EED6A.png...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


üîç Processing 3C0A4C08-F7BF-42DD-A72D-4A379B66B529.png...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


üîç Processing image_139_48.png...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


üîç Processing image_79_37.png...

‚úÖ Done! Results saved to llava_next_results.csv


In [18]:
import os
import pandas as pd
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info  # comes with the model repo

# Hub checkpoint id used for both the model and the processor in this cell.
MODEL_NAME = "Qwen/Qwen2-VL-7B-Instruct"
# Folder scanned for .png/.jpg/.jpeg files by the loop at the end of the cell.
IMAGE_DIR = "inference_images"
# CSV that receives one {"image", "description"} row per processed file.
OUTPUT_FILE = "qwen2vl_results.csv"

# Default instruction for infer(); sent as the text part after the image.
PROMPT = (
    "You are analyzing a webpage wireframe. "
    "Give a very detailed section-by-section description, including:\n"
    "- Header and navigation items\n"
    "- Hero section text and call-to-action\n"
    "- Content blocks with titles, descriptions, and buttons\n"
    "- Footer elements\n\n"
    "Be exhaustive, using full sentences that would help a designer rebuild the layout."
)

# Choose the 4-bit compute dtype from the hardware instead of hard-coding it:
# bfloat16 where the GPU reports support for it, float16 otherwise.
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",   # best balance for accuracy
    bnb_4bit_compute_dtype=compute_dtype,
)

# --------------------------
# Load processor & model
# --------------------------
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    torch_dtype="auto",
    device_map="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_NAME)

def infer(image_path, prompt=PROMPT):
    """Generate a description of one image with the loaded Qwen2-VL model.

    Args:
        image_path: Path placed in the image part of the chat message.
        prompt: Instruction text placed after the image.

    Returns:
        Decoded text of the generated continuation (prompt tokens removed).
    """
    # Single user turn: the image first, then the instruction.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": prompt},
        ],
    }
    messages = [user_turn]

    # Render the chat template to a string and collect the vision inputs.
    chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    batch = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    )
    batch = batch.to(model.device)

    generated_ids = model.generate(**batch, max_new_tokens=1024)

    # Each output row begins with its prompt; keep only what follows it.
    completions = []
    for prompt_ids, full_ids in zip(batch.input_ids, generated_ids):
        completions.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return decoded[0]

# --------------------------
# Process folder
# --------------------------
results = []
# os.listdir() yields entries in arbitrary order; sort so the CSV rows come out
# in the same order on every run and on every platform.
for fname in sorted(os.listdir(IMAGE_DIR)):
    if not fname.lower().endswith((".png", ".jpg", ".jpeg")):
        continue
    path = os.path.join(IMAGE_DIR, fname)
    print(f"üîç Processing {fname}...")
    try:
        desc = infer(path)
    except Exception as e:
        # Best-effort batch: keep going, record the failure in the row, and also
        # print it so a failed image is visible without opening the CSV.
        print(f"ERROR on {fname}: {e}")
        desc = f"ERROR: {e}"
    results.append({"image": fname, "description": desc})

# --------------------------
# Save results
# --------------------------
df = pd.DataFrame(results)
df.to_csv(OUTPUT_FILE, index=False)
print(f"\n‚úÖ Done! Results saved to {OUTPUT_FILE}")


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:09<00:00,  1.87s/it]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 964.65it/s]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 828.75it/s]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]


üîç Processing 3AD8509F-A870-4A05-BF80-909BED5EED6A.png...
üîç Processing 3C0A4C08-F7BF-42DD-A72D-4A379B66B529.png...
üîç Processing image_139_48.png...
üîç Processing image_5578_23.png...
üîç Processing image_79_37.png...

‚úÖ Done! Results saved to qwen2vl_results.csv


In [1]:
import os
import pandas as pd
import numpy as np
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig

# ---- Simple loader (resize only) ----
def load_image_simple(image_file, input_size=448):
    """Load an image as one normalised square tile.

    The image is converted to RGB, resized to ``input_size`` x ``input_size``
    (aspect ratio is not preserved), scaled to [0, 1] and standardised with
    the ImageNet per-channel mean/std. Without that scaling the model would
    receive raw 0-255 values.

    Args:
        image_file: Path (or file object) accepted by ``PIL.Image.open``.
        input_size: Side length of the square tile in pixels.

    Returns:
        float32 tensor of shape [1, 3, input_size, input_size].
    """
    # Context manager releases the file handle; convert()/resize() return copies.
    with Image.open(image_file) as img:
        image = img.convert("RGB").resize((input_size, input_size))

    # HWC uint8 -> CHW float in [0, 1]
    pixels = torch.tensor(np.array(image)).permute(2, 0, 1).float() / 255.0

    # ImageNet statistics, shaped to broadcast over [3, H, W].
    mean = torch.tensor((0.485, 0.456, 0.406)).view(3, 1, 1)
    std = torch.tensor((0.229, 0.224, 0.225)).view(3, 1, 1)

    return ((pixels - mean) / std).unsqueeze(0)  # [1,3,H,W]

# ---- Model Setup ----
# Hub checkpoint id used for both the model and the tokenizer below.
MODEL_NAME = "OpenGVLab/InternVL3_5-8B"
# Folder scanned for .png/.jpg/.jpeg files by the batch loop.
IMAGE_DIR = "inference_images"
# CSV that receives one {"image", "description"} row per processed file.
OUTPUT_FILE = "internvl35_results.csv"

# 4-bit NF4 quantisation with double quantisation; matmuls computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16   # use float16 if bf16 not supported
)

# trust_remote_code=True lets transformers run the modelling code shipped in the
# checkpoint repository; .eval() puts the returned module in inference mode.
model = AutoModel.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True
).eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True, use_fast=False)

# Passed unchanged to model.chat() in infer(); do_sample=False disables sampling.
generation_config = dict(max_new_tokens=1024, do_sample=False)

# Default instruction for infer().
PROMPT = (
    "You are analyzing a webpage wireframe. "
    "Give a very detailed section-by-section description, including:\n"
    "- Header and navigation items\n"
    "- Hero section text and call-to-action\n"
    "- Content blocks with titles, descriptions, and buttons\n"
    "- Footer elements\n\n"
    "Be exhaustive, using full sentences that would help a designer rebuild the layout."
)

# ---- Inference ----
def infer(image_path, prompt=PROMPT):
    """Ask the loaded InternVL model to describe one image.

    Args:
        image_path: Path handed to ``load_image_simple``.
        prompt: Question/instruction passed to ``model.chat``.

    Returns:
        Whatever ``model.chat`` returns for this image and prompt.
    """
    tile = load_image_simple(image_path)
    # Move to the model's device and cast to half precision in one step.
    tile = tile.to(model.device, dtype=torch.float16)
    return model.chat(tokenizer, tile, prompt, generation_config)

# ---- Batch Run ----
results = []
# os.listdir() yields entries in arbitrary order; sort so the CSV rows come out
# in the same order on every run and on every platform.
for fname in sorted(os.listdir(IMAGE_DIR)):
    if not fname.lower().endswith((".png", ".jpg", ".jpeg")):
        continue
    path = os.path.join(IMAGE_DIR, fname)
    print(f"üîç Processing {fname}...")
    try:
        desc = infer(path)
    except Exception as e:
        # Best-effort batch: keep going, record the failure in the row, and also
        # print it so a failed image is visible without opening the CSV.
        print(f"ERROR on {fname}: {e}")
        desc = f"ERROR: {e}"
    results.append({"image": fname, "description": desc})

pd.DataFrame(results).to_csv(OUTPUT_FILE, index=False)
print(f"\n‚úÖ Done! Results saved to {OUTPUT_FILE}")


  from .autonotebook import tqdm as notebook_tqdm


FlashAttention2 is not installed.


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:40<00:00, 10.08s/it]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


üîç Processing 3AD8509F-A870-4A05-BF80-909BED5EED6A.png...


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


üîç Processing 3C0A4C08-F7BF-42DD-A72D-4A379B66B529.png...


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


üîç Processing image_139_48.png...


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


üîç Processing image_5578_23.png...


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


üîç Processing image_79_37.png...

‚úÖ Done! Results saved to internvl35_results.csv


In [1]:
import os
import pandas as pd
import torch
from pathlib import Path
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info  # from the model repo

# --------------------------
# Config
# --------------------------
# Hub checkpoint id used by load_model_and_processor().
MODEL_NAME = "Qwen/Qwen2-VL-7B-Instruct"
# Folder scanned by main() for wireframe images.
IMAGE_DIR = "inference_images"
# CSV written by main(): description plus both HTML variants per image.
OUTPUT_CSV = "qwen2vl_results.csv"
# Directory (created at import time below) that receives the generated .html files.
HTML_OUT_DIR = "outputs_html"
# Generation budgets: description calls vs. the two HTML-producing calls.
MAX_NEW_TOKENS_DESC = 1024
MAX_NEW_TOKENS_HTML = 1400

# Default prompt of describe_wireframe().
PROMPT_DESC = (
    "You are analyzing a webpage wireframe. "
    "Give a very detailed section-by-section description, including:\n"
    "- Header and navigation items\n"
    "- Hero section text and call-to-action\n"
    "- Content blocks with titles, descriptions, and buttons\n"
    "- Footer elements\n\n"
    "Be exhaustive, using full sentences that would help a designer rebuild the layout."
)

# Default prompt of html_from_image_only(): HTML conditioned on the image alone.
PROMPT_HTML_IMAGE_ONLY = (
    "You are an expert front-end developer. Given ONLY the wireframe image, "
    "generate a complete, minimal, responsive HTML5 page that approximates the layout.\n\n"
    "Requirements:\n"
    "- Use semantic HTML tags (header, nav, main, section, footer, etc.).\n"
    "- Include a minimal <style> block (no external CSS/JS). Keep CSS concise.\n"
    "- Use placeholder text for headlines, paragraphs, buttons, links.\n"
    "- Structure should reflect the wireframe‚Äôs hierarchy (header, hero, content blocks, footer, etc.).\n"
    "- Avoid any JavaScript.\n\n"
    "Output ONLY the HTML document."
)

# Default prompt of html_from_image_and_desc(): HTML conditioned on image + description.
PROMPT_HTML_WITH_DESC = (
    "You are an expert front-end developer. Use BOTH the wireframe image AND the provided textual description "
    "to generate a complete, minimal, responsive HTML5 page that matches the layout.\n\n"
    "Requirements:\n"
    "- Follow the described hierarchy (header/nav, hero, content blocks, footer) as closely as possible.\n"
    "- Use semantic HTML tags and a small <style> block (no external CSS/JS).\n"
    "- Use the description‚Äôs section names and CTA labels as placeholders.\n"
    "- Avoid any JavaScript.\n\n"
    "Output ONLY the HTML document."
)

# --------------------------
# Model loading (CUDA-aware)
# --------------------------
def load_model_and_processor():
    """Load Qwen2-VL and its processor, quantised to 4-bit when CUDA is present.

    Returns:
        A ``(model, processor)`` tuple. With CUDA the model is loaded with an
        NF4 BitsAndBytes config and ``device_map="auto"``; without it the model
        is loaded unquantised on the CPU.
    """
    if torch.cuda.is_available():
        # 4-bit quant when GPU is available
        placement = "auto"
        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
    else:
        placement = "cpu"
        quant_cfg = None

    loaded_model = Qwen2VLForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        quantization_config=quant_cfg,
        torch_dtype="auto",
        device_map=placement,
        low_cpu_mem_usage=True
    )
    return loaded_model, AutoProcessor.from_pretrained(MODEL_NAME)

# Module-level singletons used by every helper below.
model, processor = load_model_and_processor()
os.makedirs(HTML_OUT_DIR, exist_ok=True)

# --------------------------
# Core generation helpers
# --------------------------
def _gen_from_messages(messages, max_new_tokens=1024):
    """
    Generic chat invoke for Qwen2-VL using apply_chat_template + process_vision_info.

    Args:
        messages: Chat messages (list of role/content dicts) for one conversation.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns string output: the decoded continuation with the prompt tokens removed.
    """
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,          # deterministic for consistency
            # Unset every sampling-only knob, not just temperature: the values
            # left over from the checkpoint's generation config are what trigger
            # the "generation flags are not valid" warning under greedy decoding.
            temperature=None,
            top_p=None,
            top_k=None,
        )

    # Trim the prompt tokens
    trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    output = processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return output[0]

def describe_wireframe(image_path, prompt=PROMPT_DESC):
    """Return the model's textual description of the wireframe at ``image_path``.

    The request is a single user turn (image, then ``prompt``) capped at
    ``MAX_NEW_TOKENS_DESC`` generated tokens.
    """
    image_part = {"type": "image", "image": image_path}
    text_part = {"type": "text",  "text": prompt}
    conversation = [{"role": "user", "content": [image_part, text_part]}]
    return _gen_from_messages(conversation, max_new_tokens=MAX_NEW_TOKENS_DESC)

def html_from_image_only(image_path, prompt=PROMPT_HTML_IMAGE_ONLY):
    """Return HTML generated from the wireframe image alone.

    The request is a single user turn (image, then ``prompt``) capped at
    ``MAX_NEW_TOKENS_HTML`` generated tokens.
    """
    image_part = {"type": "image", "image": image_path}
    text_part = {"type": "text",  "text": prompt}
    conversation = [{"role": "user", "content": [image_part, text_part]}]
    return _gen_from_messages(conversation, max_new_tokens=MAX_NEW_TOKENS_HTML)

def html_from_image_and_desc(image_path, description, prompt=PROMPT_HTML_WITH_DESC):
    """Return HTML generated from the wireframe image plus a textual description.

    The user turn carries three parts in order: the image, the description
    wrapped in a short header/footer, and finally ``prompt``.
    """
    header = "Here is the textual description you MUST follow where possible:\n\n"
    footer = "‚Äî End of description ‚Äî"
    desc_block = header + f"{description}\n\n" + footer

    parts = [
        {"type": "image", "image": image_path},
        {"type": "text",  "text": desc_block},
        {"type": "text",  "text": prompt},
    ]
    conversation = [{"role": "user", "content": parts}]
    return _gen_from_messages(conversation, max_new_tokens=MAX_NEW_TOKENS_HTML)

# --------------------------
# Batch over folder
# --------------------------
def main():
    """Run description + both HTML generations over every image in IMAGE_DIR.

    For each image a row is built with the description, the image-only HTML and
    the image+description HTML; each HTML variant is also written to
    HTML_OUT_DIR. A failing step stores ``"ERROR: ..."`` in its column and the
    remaining steps still run. All rows are written to OUTPUT_CSV at the end.
    """
    image_dir = Path(IMAGE_DIR)
    supported = (".png", ".jpg", ".jpeg", ".webp")
    rows = []

    for fname in sorted(os.listdir(image_dir)):
        if fname.lower().endswith(supported):
            path = str(image_dir / fname)
            stem = Path(fname).stem
            print(f"üîç Processing {fname}...")

            row = {"image": fname, "description": "", "html_from_image": "", "html_from_image_plus_desc": ""}

            # Step 1: textual description
            try:
                row["description"] = describe_wireframe(path)
            except Exception as e:
                row["description"] = f"ERROR: {e}"

            # Step 2: HTML from the image alone, also saved to disk
            try:
                html_img = html_from_image_only(path)
                row["html_from_image"] = html_img
                with open(os.path.join(HTML_OUT_DIR, f"{stem}__img_only.html"), "w", encoding="utf-8") as f:
                    f.write(html_img)
            except Exception as e:
                row["html_from_image"] = f"ERROR: {e}"

            # Step 3: HTML from image + description, skipped when step 1 produced nothing usable
            description = row["description"]
            try:
                if not description or description.startswith("ERROR:"):
                    row["html_from_image_plus_desc"] = "SKIPPED: No valid description to condition on."
                else:
                    html_plus = html_from_image_and_desc(path, description)
                    row["html_from_image_plus_desc"] = html_plus
                    with open(os.path.join(HTML_OUT_DIR, f"{stem}__img_plus_desc.html"), "w", encoding="utf-8") as f:
                        f.write(html_plus)
            except Exception as e:
                row["html_from_image_plus_desc"] = f"ERROR: {e}"

            rows.append(row)

    pd.DataFrame(rows).to_csv(OUTPUT_CSV, index=False)
    print(f"\n‚úÖ Done! Results saved to {OUTPUT_CSV}\nüìÇ HTML files saved in: {HTML_OUT_DIR}")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:46<00:00,  9.35s/it]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


üîç Processing 3AD8509F-A870-4A05-BF80-909BED5EED6A.png...
üîç Processing 3C0A4C08-F7BF-42DD-A72D-4A379B66B529.png...
üîç Processing image_139_48.png...
üîç Processing image_5578_23.png...
üîç Processing image_79_37.png...

‚úÖ Done! Results saved to qwen2vl_results.csv
üìÇ HTML files saved in: outputs_html


Chain-of-thought reasoning (Qwen2-VL)

In [1]:
# script1_cot_html.py
import os
import re
import pandas as pd
import torch
from pathlib import Path
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

# --------------------------
# Config
# --------------------------
# Hub checkpoint id used by load_model_and_processor().
MODEL_NAME = "Qwen/Qwen2-VL-7B-Instruct"
# Folder scanned by main() for wireframe images.
IMAGE_DIR = "inference_images"
# CSV written by main(): raw model reply and extracted HTML per image.
OUTPUT_CSV = "qwen2vl_results_cot_html.csv"
# Directory (created below) for the per-image .html and raw .txt outputs.
HTML_OUT_DIR = "outputs_html_cot"
# Single generation budget covering both the <analysis> and <final> sections.
MAX_NEW_TOKENS = 1700

# Two-section prompt: free-form reasoning in <analysis>, the HTML document in
# <final>. extract_final() relies on the <final>...</final> tags requested here.
PROMPT_HTML_IMAGE_ONLY = (
    "You are an expert front-end developer. Given ONLY the wireframe image, produce exactly two sections:\n\n"
    "1) <analysis>  Provide your step-by-step reasoning about how the layout maps to semantic HTML. "
    "Identify header/nav items, hero structure, content blocks, and footer; explain layout choices (grid/flex), "
    "responsiveness, and any assumptions you must make. Keep this as clear prose or short bullet points. </analysis>\n\n"
    "2) <final>  Output a complete, minimal, responsive HTML5 document that reflects the layout. "
    "Include <!doctype html>, <html>, <head> with <meta charset> and <meta name=\"viewport\">, "
    "and a minimal <style> block in <head> (no external CSS/JS). Use semantic tags (header, nav, main, section, footer), "
    "placeholder text for headings, paragraphs, and buttons, and accessible attributes where sensible. "
    "Avoid all JavaScript.  </final>\n\n"
    "Rules:\n"
    "- Do not repeat or paraphrase these instructions in your output.\n"
    "- Start your answer with <analysis> and end with </final>.\n"
)


# --------------------------
# Load model
# --------------------------
def load_model_and_processor():
    """Load Qwen2-VL and its processor, quantised to 4-bit when CUDA is present.

    Returns:
        A ``(model, processor)`` tuple. With CUDA the model is loaded with an
        NF4 BitsAndBytes config (float16 compute) and ``device_map="auto"``;
        without it the model is loaded unquantised on the CPU.
    """
    if torch.cuda.is_available():
        placement = "auto"
        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )
    else:
        placement = "cpu"
        quant_cfg = None

    loaded_model = Qwen2VLForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        quantization_config=quant_cfg,
        torch_dtype="auto",
        device_map=placement,
        low_cpu_mem_usage=True
    )
    return loaded_model, AutoProcessor.from_pretrained(MODEL_NAME)

# Module-level singletons used by the helpers below.
model, processor = load_model_and_processor()
os.makedirs(HTML_OUT_DIR, exist_ok=True)

# --------------------------
# Helpers
# --------------------------
def extract_final(output_text: str):
    """Return the content of the ``<final>`` section of a model reply.

    Args:
        output_text: Full decoded reply, expected to contain
            ``<analysis>...</analysis>`` followed by ``<final>...</final>``.

    Returns:
        The stripped text between ``<final>`` and ``</final>``. If the closing
        tag is missing, everything after ``<final>`` is returned. If there is
        no ``<final>`` tag at all, the whole stripped reply is returned.
    """
    closed = re.search(r"<final>(.*?)</final>", output_text, re.DOTALL)
    if closed:
        return closed.group(1).strip()

    # Generation can hit the token limit before </final> is emitted. Keep what
    # follows the opening tag so the analysis prose is not saved as HTML.
    _, opener, tail = output_text.partition("<final>")
    if opener:
        return tail.strip()

    return output_text.strip()

def html_from_image_only(image_path):
    """Generate the two-section (analysis + HTML) reply for one wireframe image.

    Args:
        image_path: Path of the wireframe image file.

    Returns:
        The decoded continuation only (prompt tokens removed); pass it to
        ``extract_final`` to obtain the HTML.
    """
    # Local import: this cell's import block does not bring in PIL.
    from PIL import Image

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": PROMPT_HTML_IMAGE_ONLY},
        ],
    }]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Give the processor a decoded image instead of a path string; whether path
    # strings are accepted in `images=` depends on the transformers release.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    inputs = processor(text=[text], images=[image], return_tensors="pt").to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)

    # generate() returns prompt + continuation. Always slice the prompt off, so an
    # empty continuation decodes to "" rather than echoing the prompt back as HTML.
    prompt_len = inputs.input_ids.shape[1]
    output_texts = processor.batch_decode(
        generated_ids[:, prompt_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return output_texts[0]

# --------------------------
# Main
# --------------------------
def main():
    """Generate chain-of-thought HTML for every image in IMAGE_DIR.

    Per image, the raw reply and the extracted ``<final>`` HTML are stored in a
    row and written to HTML_OUT_DIR (``*__img_only.html`` and
    ``*__img_only_raw.txt``). On any failure both row fields are set to
    ``"ERROR: ..."``. All rows are written to OUTPUT_CSV at the end.
    """
    accepted = (".png", ".jpg", ".jpeg", ".webp")
    candidates = [name for name in sorted(os.listdir(IMAGE_DIR)) if name.lower().endswith(accepted)]

    rows = []
    for fname in candidates:
        path = os.path.join(IMAGE_DIR, fname)
        stem = Path(fname).stem
        print(f"üîç Processing {fname}...")

        html_path = os.path.join(HTML_OUT_DIR, f"{stem}__img_only.html")
        raw_path = os.path.join(HTML_OUT_DIR, f"{stem}__img_only_raw.txt")
        row = {"image": fname, "html_raw": "", "html_final": ""}

        try:
            raw_output = html_from_image_only(path)
            final_output = extract_final(raw_output)
            row.update(html_raw=raw_output, html_final=final_output)

            # Extracted HTML on its own
            with open(html_path, "w", encoding="utf-8") as f:
                f.write(final_output)

            # Full reply (reasoning + HTML)
            with open(raw_path, "w", encoding="utf-8") as f:
                f.write(raw_output)
        except Exception as e:
            failure = f"ERROR: {e}"
            row["html_raw"] = failure
            row["html_final"] = failure

        rows.append(row)

    pd.DataFrame(rows).to_csv(OUTPUT_CSV, index=False)
    print(f"\n‚úÖ Done! Results saved to {OUTPUT_CSV}\nüìÇ Outputs in: {HTML_OUT_DIR}")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:43<00:00,  8.62s/it]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 750.59it/s]
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


üîç Processing 3AD8509F-A870-4A05-BF80-909BED5EED6A.png...
üîç Processing 3C0A4C08-F7BF-42DD-A72D-4A379B66B529.png...
üîç Processing image_139_48.png...
üîç Processing image_5578_23.png...
üîç Processing image_79_37.png...

‚úÖ Done! Results saved to qwen2vl_results_cot_html.csv
üìÇ Outputs in: outputs_html_cot


In [6]:
# compare_wireframes.py
import os
import re
import pandas as pd
import torch
from pathlib import Path

# Assume model & processor are already loaded:
#   model = Qwen2VLForConditionalGeneration.from_pretrained(...).to("cuda")
#   processor = AutoProcessor.from_pretrained(...)

# Folder holding the ground-truth wireframe images.
GT_DIR = "inference_images"
# Folder holding the generated images; files are matched to GT_DIR by identical file name.
GEN_DIR = "direct_desc_gen"
# CSV written by main(): explanation text and parsed score per image pair.
OUTPUT_CSV = "qwen2vl_comparisons.csv"
# Generation budget for one comparison reply.
MAX_NEW_TOKENS = 800

# Evaluator prompt. It asks for <analysis>...</analysis> followed by
# <score>n</score>, and explicitly allows decimal scores.
PROMPT_COMPARE = (
    "You are a strict UI/UX evaluator.\n"
    "Compare the first image (ground-truth wireframe) with the second image (generated wireframe).\n"
    "Do the following:\n"
    "1. Provide a detailed section-by-section comparison (header, hero, content blocks, footer).\n"
    "2. Explain key differences in structure, layout, and visual hierarchy.\n"
    "3. Assign a similarity score from 1 to 5 (1 = very different, 5 = nearly identical). Be very critical and you can assign scores in decimals.\n\n"
    "Output format:\n"
    "<analysis> ... reasoning here ... </analysis>\n"
    "<score>n</score>\n"
)

def extract_score(text: str):
    """Parse the similarity score from a ``<score>...</score>`` tag.

    The evaluator prompt allows decimal scores, so both ``<score>4</score>``
    and ``<score>3.5</score>`` are accepted, with optional whitespace inside
    the tag. Values outside the 1-5 scale are ignored.

    Args:
        text: Model reply to search.

    Returns:
        The first in-range score: an ``int`` when written as a whole number, a
        ``float`` when written with a decimal point. ``None`` if none is found.
    """
    for match in re.finditer(r"<score>\s*([0-9]+(?:\.[0-9]+)?)\s*</score>", text):
        token = match.group(1)
        if 1 <= float(token) <= 5:
            # Whole-number scores stay ints, as before; decimals become floats.
            return float(token) if "." in token else int(token)
    return None

def compare_images(gt_path, gen_path):
    """Ask the model to compare a ground-truth wireframe with a generated one.

    Relies on module-level ``model`` and ``processor`` objects that were
    loaded beforehand (this cell does not create them).

    Args:
        gt_path: Path of the ground-truth image (first image in the prompt).
        gen_path: Path of the generated image (second image in the prompt).

    Returns:
        ``(explanation, score)``. ``explanation`` is the decoded model reply
        without the prompt; ``score`` is ``extract_score(explanation)``.
    """
    # Local import: this cell's import block does not bring in PIL.
    from PIL import Image

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": gt_path},
            {"type": "image", "image": gen_path},
            {"type": "text", "text": PROMPT_COMPARE},
        ],
    }]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Give the processor decoded images instead of path strings; whether path
    # strings are accepted in `images=` depends on the transformers release.
    images = []
    for image_path in (gt_path, gen_path):
        with Image.open(image_path) as img:
            images.append(img.convert("RGB"))
    inputs = processor(text=[text], images=images, return_tensors="pt").to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)

    # generate() returns prompt + continuation. Decode only the continuation, so the
    # saved explanation and the score search do not include the evaluator prompt.
    prompt_len = inputs.input_ids.shape[1]
    output_text = processor.batch_decode(generated_ids[:, prompt_len:], skip_special_tokens=True)[0]
    return output_text, extract_score(output_text)

def main():
    """Compare every ground-truth image with its same-named generated image.

    Images in GT_DIR without a counterpart in GEN_DIR are reported and skipped.
    Each compared pair yields a row with the explanation and score, or an
    ``"ERROR: ..."`` explanation with a ``None`` score when the comparison
    raises. All rows are written to OUTPUT_CSV.
    """
    image_exts = (".png", ".jpg", ".jpeg")
    gt_files = sorted(name for name in os.listdir(GT_DIR) if name.lower().endswith(image_exts))

    rows = []
    for fname in gt_files:
        gt_path = os.path.join(GT_DIR, fname)
        gen_path = os.path.join(GEN_DIR, fname)

        if os.path.exists(gen_path):
            print(f"üîç Comparing {fname}...")
            try:
                explanation, score = compare_images(gt_path, gen_path)
                record = {"image": fname, "explanation": explanation, "score": score}
            except Exception as e:
                record = {"image": fname, "explanation": f"ERROR: {e}", "score": None}
            rows.append(record)
        else:
            print(f"‚ö†Ô∏è Skipping {fname} (no generated version found)")

    comparison_table = pd.DataFrame(rows)
    comparison_table.to_csv(OUTPUT_CSV, index=False)
    print(f"\n‚úÖ Done! Results saved to {OUTPUT_CSV}")

if __name__ == "__main__":
    main()


üîç Comparing 3AD8509F-A870-4A05-BF80-909BED5EED6A.png...
üîç Comparing 3C0A4C08-F7BF-42DD-A72D-4A379B66B529.png...
üîç Comparing image_139_48.png...
üîç Comparing image_5578_23.png...
üîç Comparing image_79_37.png...

‚úÖ Done! Results saved to qwen2vl_comparisons.csv


Qwen2.5-VL


Chain-of-thought (CoT) HTML generation

In [1]:
# script1_cot_html.py
import os
import re
import pandas as pd
import torch
from pathlib import Path
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

# --------------------------
# Config
# --------------------------
# Hub checkpoint id used by load_model_and_processor().
MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"
IMAGE_DIR = "inference_images"   # <-- updated directory name
# CSV written by main(): raw model reply and extracted HTML per image.
OUTPUT_CSV = "qwen25vl_results_cot_html.csv"
HTML_OUT_DIR = "outputs_html_cot_25"   # <-- updated output dir
# Single generation budget covering both the <analysis> and <final> sections.
MAX_NEW_TOKENS = 1700

# Two-section prompt: free-form reasoning in <analysis>, the HTML document in
# <final>. extract_final() relies on the <final>...</final> tags requested here.
PROMPT_HTML_IMAGE_ONLY = (
    "You are an expert front-end developer. Given ONLY the wireframe image, produce exactly two sections:\n\n"
    "1) <analysis>  Provide your step-by-step reasoning about how the layout maps to semantic HTML. "
    "Identify header/nav items, hero structure, content blocks, and footer; explain layout choices (grid/flex), "
    "responsiveness, and any assumptions you must make. Keep this as clear prose or short bullet points. </analysis>\n\n"
    "2) <final>  Output a complete, minimal, responsive HTML5 document that reflects the layout. "
    "Include <!doctype html>, <html>, <head> with <meta charset> and <meta name=\"viewport\">, "
    "and a minimal <style> block in <head> (no external CSS/JS). Use semantic tags (header, nav, main, section, footer), "
    "placeholder text for headings, paragraphs, and buttons, and accessible attributes where sensible. "
    "Avoid all JavaScript.  </final>\n\n"
    "Rules:\n"
    "- Do not repeat or paraphrase these instructions in your output.\n"
    "- Start your answer with <analysis> and end with </final>.\n"
)

# --------------------------
# Load model
# --------------------------
def load_model_and_processor():
    """Load Qwen2.5-VL and its processor, quantised to 4-bit when CUDA is present.

    Returns:
        A ``(model, processor)`` tuple. With CUDA the model is loaded with an
        NF4 BitsAndBytes config (float16 compute) and ``device_map="auto"``;
        without it the model is loaded unquantised on the CPU.
    """
    if torch.cuda.is_available():
        placement = "auto"
        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )
    else:
        placement = "cpu"
        quant_cfg = None

    loaded_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        quantization_config=quant_cfg,
        torch_dtype="auto",
        device_map=placement,
        low_cpu_mem_usage=True
    )
    return loaded_model, AutoProcessor.from_pretrained(MODEL_NAME)

# Module-level singletons used by the helpers below.
model, processor = load_model_and_processor()
os.makedirs(HTML_OUT_DIR, exist_ok=True)

# --------------------------
# Helpers
# --------------------------
def extract_final(output_text: str):
    """Return the content of the ``<final>`` section of a model reply.

    Args:
        output_text: Full decoded reply, expected to contain
            ``<analysis>...</analysis>`` followed by ``<final>...</final>``.

    Returns:
        The stripped text between ``<final>`` and ``</final>``. If the closing
        tag is missing, everything after ``<final>`` is returned. If there is
        no ``<final>`` tag at all, the whole stripped reply is returned.
    """
    closed = re.search(r"<final>(.*?)</final>", output_text, re.DOTALL)
    if closed:
        return closed.group(1).strip()

    # Generation can hit the token limit before </final> is emitted. Keep what
    # follows the opening tag so the analysis prose is not saved as HTML.
    _, opener, tail = output_text.partition("<final>")
    if opener:
        return tail.strip()

    return output_text.strip()

def html_from_image_only(image_path):
    """Run one generation for a wireframe image with PROMPT_HTML_IMAGE_ONLY.

    Args:
        image_path: Path of the image; it is placed in the chat message and
            also handed to the processor as the image input.

    Returns:
        The decoded reply for the single item in the batch.
    """
    chat = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": PROMPT_HTML_IMAGE_ONLY},
        ],
    }]
    prompt = processor.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    batch = processor(text=[prompt], images=[image_path], return_tensors="pt").to(model.device)

    with torch.no_grad():
        sequences = model.generate(**batch, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)

    # Cut the prompt tokens off the front of each sequence; a sequence that is
    # not longer than its prompt is decoded unchanged.
    completions = [
        seq[len(prompt_ids):] if len(seq) > len(prompt_ids) else seq
        for prompt_ids, seq in zip(batch.input_ids, sequences)
    ]

    decoded = processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return decoded[0]

# --------------------------
# Main
# --------------------------
def main():
    """Generate analysis + HTML for every image in IMAGE_DIR and log the results.

    For each image file (.png/.jpg/.jpeg/.webp, in sorted order) the raw model
    reply is written to "<stem>__img_only_raw.txt" and the extracted <final>
    section to "<stem>__img_only.html" inside HTML_OUT_DIR. One row per image
    is collected and written to OUTPUT_CSV.

    A failure on one image does not stop the run: the error is printed, the
    "html_final" column receives "ERROR: ...", and "html_raw" keeps the model
    reply if one was already produced (otherwise it also receives the error).
    """
    results = []
    for fname in sorted(os.listdir(IMAGE_DIR)):
        if not fname.lower().endswith((".png", ".jpg", ".jpeg", ".webp")):
            continue

        path = os.path.join(IMAGE_DIR, fname)
        stem = Path(fname).stem
        print(f"üîç Processing {fname}...")

        row = {"image": fname, "html_raw": "", "html_final": ""}

        try:
            raw_output = html_from_image_only(path)
            row["html_raw"] = raw_output

            final_output = extract_final(raw_output)
            row["html_final"] = final_output

            # Save only clean HTML inside <final> tags
            with open(os.path.join(HTML_OUT_DIR, f"{stem}__img_only.html"), "w", encoding="utf-8") as f:
                f.write(final_output)

            # Save full reasoning + final separately
            with open(os.path.join(HTML_OUT_DIR, f"{stem}__img_only_raw.txt"), "w", encoding="utf-8") as f:
                f.write(raw_output)

        except Exception as e:
            # Make the failure visible during the run, not only inside the CSV.
            print(f"   failed on {fname}: {e}")
            if not row["html_raw"]:
                row["html_raw"] = f"ERROR: {e}"
            # html_final always carries the error marker for a failed row.
            row["html_final"] = f"ERROR: {e}"

        results.append(row)

    pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False)
    print(f"\n‚úÖ Done! Results saved to {OUTPUT_CSV}\nüìÇ Outputs in: {HTML_OUT_DIR}")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:46<00:00,  9.27s/it]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


üîç Processing 3AD8509F-A870-4A05-BF80-909BED5EED6A.png...
üîç Processing 3C0A4C08-F7BF-42DD-A72D-4A379B66B529.png...
üîç Processing image_139_48.png...
üîç Processing image_5578_23.png...
üîç Processing image_79_37.png...

‚úÖ Done! Results saved to qwen25vl_results_cot_html.csv
üìÇ Outputs in: outputs_html_cot_25


In [1]:
# script1_direct_html.py
import os
import pandas as pd
import torch
from pathlib import Path
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

# --------------------------
# Config
# --------------------------
# Checkpoint id passed to from_pretrained() for both the model and the processor.
MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"
IMAGE_DIR = "inference_images"   # same directory as before
# CSV written by main(): one row per processed image.
OUTPUT_CSV = "qwen25vl_results_direct_html.csv"
# Folder receiving one "<stem>__direct.html" file per image.
HTML_OUT_DIR = "outputs_html_direct_25"
# Passed as max_new_tokens to model.generate().
MAX_NEW_TOKENS = 1700

# Single-turn instruction sent together with the image in html_from_image_only().
PROMPT_HTML_IMAGE_ONLY = (
    "You are an expert front-end developer. Given ONLY the wireframe image, "
    "generate a complete, minimal, responsive HTML5 page that approximates the layout.\n\n"
    "Requirements:\n"
    "- Use semantic HTML tags (header, nav, main, section, footer, etc.).\n"
    "- Include a minimal <style> block (no external CSS/JS). Keep CSS concise.\n"
    "- Use placeholder text for headlines, paragraphs, buttons, links.\n"
    "- Structure should reflect the wireframe‚Äôs hierarchy (header, hero, content blocks, footer, etc.).\n"
    "- Avoid any JavaScript.\n\n"
    "Output ONLY the HTML document."
)

# --------------------------
# Load model
# --------------------------
def load_model_and_processor():
    """Build the model and processor for MODEL_NAME.

    On a CUDA machine the model is loaded with a 4-bit bitsandbytes config
    (NF4, double quantization, float16 compute) and device_map="auto"; on a
    machine without CUDA it is loaded without quantization on "cpu".

    Returns:
        tuple: (model, processor)
    """
    cuda_ok = torch.cuda.is_available()

    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    ) if cuda_ok else None

    vlm = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        quantization_config=quant_cfg,
        torch_dtype="auto",
        device_map="auto" if cuda_ok else "cpu",
        low_cpu_mem_usage=True
    )
    proc = AutoProcessor.from_pretrained(MODEL_NAME)
    return vlm, proc

model, processor = load_model_and_processor()
os.makedirs(HTML_OUT_DIR, exist_ok=True)

# --------------------------
# Helpers
# --------------------------
def html_from_image_only(image_path):
    """Ask the model for an HTML page based on a single wireframe image.

    Args:
        image_path: Path of the image; used in the chat message and as the
            processor's image input.

    Returns:
        The decoded reply of the first (only) batch item.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": PROMPT_HTML_IMAGE_ONLY},
        ],
    }
    prompt = processor.apply_chat_template([user_turn], tokenize=False, add_generation_prompt=True)
    batch = processor(text=[prompt], images=[image_path], return_tensors="pt").to(model.device)

    with torch.no_grad():
        sequences = model.generate(**batch, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)

    # Keep only the tokens after the prompt; sequences that are not longer
    # than their prompt are passed through untouched.
    completions = []
    for prompt_ids, seq in zip(batch.input_ids, sequences):
        n_prompt = len(prompt_ids)
        completions.append(seq[n_prompt:] if len(seq) > n_prompt else seq)

    return processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

# --------------------------
# Main
# --------------------------
def main():
    """Generate HTML directly from every image in IMAGE_DIR.

    Each image file (.png/.jpg/.jpeg/.webp, sorted by name) is sent to
    html_from_image_only(); the reply is written to
    "<stem>__direct.html" in HTML_OUT_DIR and recorded in a row that ends up in
    OUTPUT_CSV. If an image fails, the error is printed, stored as
    "ERROR: ..." in the "html_output" column, and the loop continues.
    """
    results = []
    for fname in sorted(os.listdir(IMAGE_DIR)):
        if not fname.lower().endswith((".png", ".jpg", ".jpeg", ".webp")):
            continue

        path = os.path.join(IMAGE_DIR, fname)
        stem = Path(fname).stem
        print(f"üîç Processing {fname}...")

        row = {"image": fname, "html_output": ""}

        try:
            html_output = html_from_image_only(path)
            row["html_output"] = html_output

            # Save HTML directly
            with open(os.path.join(HTML_OUT_DIR, f"{stem}__direct.html"), "w", encoding="utf-8") as f:
                f.write(html_output)

        except Exception as e:
            # Report the failure immediately; otherwise it is only discoverable in the CSV.
            print(f"   failed on {fname}: {e}")
            row["html_output"] = f"ERROR: {e}"

        results.append(row)

    pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False)
    print(f"\n‚úÖ Done! Results saved to {OUTPUT_CSV}\nüìÇ HTML outputs in: {HTML_OUT_DIR}")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:10<00:00,  2.20s/it]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


üîç Processing 3AD8509F-A870-4A05-BF80-909BED5EED6A.png...
üîç Processing 3C0A4C08-F7BF-42DD-A72D-4A379B66B529.png...
üîç Processing image_139_48.png...
üîç Processing image_5578_23.png...
üîç Processing image_79_37.png...

‚úÖ Done! Results saved to qwen25vl_results_direct_html.csv
üìÇ HTML outputs in: outputs_html_direct_25


HTML generation with description

In [1]:
# script2_desc_html.py
import os
import pandas as pd
import torch
from pathlib import Path
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

# --------------------------
# Config
# --------------------------
# Checkpoint id passed to from_pretrained() for both the model and the processor.
MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"
# Folder scanned by main() for input images.
IMAGE_DIR = "inference_images"
# CSV written by main(): image name, description, generated HTML.
OUTPUT_CSV = "qwen25vl_results_desc_html.csv"
# Folder receiving the per-image description (.txt) and HTML files.
HTML_OUT_DIR = "outputs_html_desc_25"
# Generation budgets for the description step and the HTML step respectively.
MAX_NEW_TOKENS_DESC = 1200
MAX_NEW_TOKENS_HTML = 1700

# Step 1 prompt: used by describe_wireframe() to obtain a textual description.
PROMPT_DESC = (
    "You are analyzing a webpage wireframe. "
    "Give a very detailed section-by-section description, including:\n"
    "- Header and navigation items\n"
    "- Hero section text and call-to-action\n"
    "- Content blocks with titles, descriptions, and buttons\n"
    "- Footer elements\n\n"
    "Be exhaustive, using full sentences that would help a designer rebuild the layout."
)

# Step 2 prompt: used by html_from_image_and_desc(), sent after the description text.
PROMPT_HTML_WITH_DESC = (
    "You are an expert front-end developer. Use BOTH the wireframe image AND the provided textual description "
    "to generate a complete, minimal, responsive HTML5 page that matches the layout.\n\n"
    "Requirements:\n"
    "- Follow the described hierarchy (header/nav, hero, content blocks, footer) as closely as possible.\n"
    "- Use semantic HTML tags and a small <style> block (no external CSS/JS).\n"
    "- Use the description‚Äôs section names and CTA labels as placeholders.\n"
    "- Avoid any JavaScript.\n\n"
    "Output ONLY the HTML document."
)

# --------------------------
# Load model
# --------------------------
def load_model_and_processor():
    """Instantiate the model and processor identified by MODEL_NAME.

    CUDA present: 4-bit bitsandbytes quantization (NF4, double quantization,
    float16 compute) with device_map="auto".
    CUDA absent: no quantization config, device_map="cpu".

    Returns:
        tuple: (model, processor)
    """
    load_kwargs = {
        "quantization_config": None,
        "torch_dtype": "auto",
        "device_map": "cpu",
        "low_cpu_mem_usage": True,
    }
    if torch.cuda.is_available():
        load_kwargs["device_map"] = "auto"
        load_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )

    vlm = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_NAME, **load_kwargs)
    proc = AutoProcessor.from_pretrained(MODEL_NAME)
    return vlm, proc

model, processor = load_model_and_processor()
os.makedirs(HTML_OUT_DIR, exist_ok=True)

# --------------------------
# Helpers
# --------------------------
def generate_text(messages, max_new_tokens=1024):
    """Run one greedy generation for a chat and return the decoded reply.

    Args:
        messages: Chat messages in the processor's chat-template format. Image
            inputs are taken from every content entry of the form
            {"type": "image", "image": ...}, in conversation order.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The decoded continuation for the single batch item.
    """
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Collect images by inspecting the content entries themselves. Matching on
    # the word "image" in str(message) would also fire on plain text that
    # merely mentions an image, and assuming content[0] misses images placed
    # after a text part.
    images = [
        part["image"]
        for message in messages
        for part in message.get("content", [])
        if isinstance(part, dict) and part.get("type") == "image" and "image" in part
    ]

    # Text-only chats pass None rather than an empty image list.
    inputs = processor(text=[text], images=images or None, return_tensors="pt").to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # Remove the prompt tokens from the front of each generated sequence.
    outputs = []
    for in_ids, out_ids in zip(inputs.input_ids, generated_ids):
        trimmed = out_ids[len(in_ids):] if len(out_ids) > len(in_ids) else out_ids
        outputs.append(trimmed)

    return processor.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

def describe_wireframe(image_path):
    """Return the model's description of one wireframe image.

    Sends the image followed by PROMPT_DESC as a single user turn and
    generates at most MAX_NEW_TOKENS_DESC tokens.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": PROMPT_DESC},
        ],
    }
    return generate_text([user_turn], max_new_tokens=MAX_NEW_TOKENS_DESC)

def html_from_image_and_desc(image_path, description):
    """Generate HTML from a wireframe image plus its textual description.

    The user turn contains, in order: the image, the description prefixed with
    a "MUST follow" sentence, and PROMPT_HTML_WITH_DESC. At most
    MAX_NEW_TOKENS_HTML tokens are generated.
    """
    parts = [
        {"type": "image", "image": image_path},
        {"type": "text", "text": "Here is the textual description you MUST follow:\n\n" + description},
        {"type": "text", "text": PROMPT_HTML_WITH_DESC},
    ]
    return generate_text([{"role": "user", "content": parts}], max_new_tokens=MAX_NEW_TOKENS_HTML)

# --------------------------
# Main
# --------------------------
def main():
    """Two-step pipeline over IMAGE_DIR: describe each wireframe, then build HTML.

    For every image file (.png/.jpg/.jpeg/.webp, sorted by name):
      1. describe_wireframe() output is saved to "<stem>__desc.txt".
      2. html_from_image_and_desc() output is saved to
         "<stem>__html_from_image_plus_desc.html".
    Both files go to HTML_OUT_DIR; one row per image is written to OUTPUT_CSV.

    On failure the error is printed and "html_output" is set to "ERROR: ...".
    A description that was already generated is kept in the row; only a
    missing description is replaced by the error text.
    """
    results = []
    for fname in sorted(os.listdir(IMAGE_DIR)):
        if not fname.lower().endswith((".png", ".jpg", ".jpeg", ".webp")):
            continue

        path = os.path.join(IMAGE_DIR, fname)
        stem = Path(fname).stem
        print(f"üîç Processing {fname}...")

        row = {"image": fname, "description": "", "html_output": ""}

        try:
            # Step 1: Get description
            desc = describe_wireframe(path)
            row["description"] = desc
            with open(os.path.join(HTML_OUT_DIR, f"{stem}__desc.txt"), "w", encoding="utf-8") as f:
                f.write(desc)

            # Step 2: Generate HTML with description
            html_output = html_from_image_and_desc(path, desc)
            row["html_output"] = html_output
            with open(os.path.join(HTML_OUT_DIR, f"{stem}__html_from_image_plus_desc.html"), "w", encoding="utf-8") as f:
                f.write(html_output)

        except Exception as e:
            print(f"   failed on {fname}: {e}")
            # Do not throw away a description that step 1 already produced.
            if not row["description"]:
                row["description"] = f"ERROR: {e}"
            row["html_output"] = f"ERROR: {e}"

        results.append(row)

    pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False)
    print(f"\n‚úÖ Done! Results saved to {OUTPUT_CSV}\nüìÇ Outputs in: {HTML_OUT_DIR}")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:10<00:00,  2.20s/it]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


üîç Processing 3AD8509F-A870-4A05-BF80-909BED5EED6A.png...
üîç Processing 3C0A4C08-F7BF-42DD-A72D-4A379B66B529.png...
üîç Processing image_139_48.png...
üîç Processing image_5578_23.png...
üîç Processing image_79_37.png...

‚úÖ Done! Results saved to qwen25vl_results_desc_html.csv
üìÇ Outputs in: outputs_html_desc_25


In [8]:
# compare_wireframes_25.py
import os
import re
import pandas as pd

# main() pairs files by identical file name across these two folders.
GT_DIR = "inference_images"        # Ground-truth wireframes
GEN_DIR = "desc_gen_25"        # Generated wireframes
# CSV written by main(): image name, model explanation, parsed score.
OUTPUT_CSV = "qwen25vl_comparisons_desc.csv"
# Passed as max_new_tokens to model.generate() in compare_images().
MAX_NEW_TOKENS = 800

# Evaluation instruction sent after the two images. The <score> tag requested
# here is what extract_score() parses.
PROMPT_COMPARE = (
    "You are a strict UI/UX evaluator.\n"
    "Compare the first image (ground-truth wireframe) with the second image (generated wireframe).\n"
    "Do the following:\n"
    "1. Provide a detailed section-by-section comparison (header, hero, content blocks, footer).\n"
    "2. Explain key differences in structure, layout, and visual hierarchy.\n"
    "3. Assign a similarity score from 1 to 5 (1 = very different, 5 = nearly identical). "
    "You may use decimal values like 3.2 or 4.7. Be very critical.\n\n"
    "Output format:\n"
    "<analysis> ... reasoning here ... </analysis>\n"
    "<score>n</score>\n"
)

def extract_score(text: str):
    """Parse the similarity score from a "<score>...</score>" tag.

    Whitespace around the number is tolerated. Tags whose content is not a
    plain integer/decimal, or whose value is above 5, are skipped; the first
    tag holding a value in the range 0-5 wins.

    Args:
        text: Model reply (may contain several <score> tags).

    Returns:
        The score as float, or None when no usable tag is present.
    """
    for found in re.finditer(r"<score>\s*(\d+(?:\.\d+)?)\s*</score>", text):
        # The pattern only admits digits with an optional fraction, so float() cannot fail.
        value = float(found.group(1))
        if value <= 5.0:
            return value
    return None

def compare_images(gt_path, gen_path, model, processor):
    """Ask the model to compare a ground-truth wireframe with a generated one.

    Args:
        gt_path: Path of the ground-truth image (first image in the prompt).
        gen_path: Path of the generated image (second image in the prompt).
        model: Model exposing generate() and a device attribute.
        processor: Processor exposing apply_chat_template() and batch_decode().

    Returns:
        tuple: (reply_text, score) where reply_text is the model's answer
        without the echoed prompt and score is the value parsed by
        extract_score() (None when no score tag is found).
    """
    # This cell does not import torch at the top, so bring it into scope here.
    import torch

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": gt_path},
            {"type": "image", "image": gen_path},
            {"type": "text", "text": PROMPT_COMPARE},
        ],
    }]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[gt_path, gen_path], return_tensors="pt").to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)

    # The generated sequences start with the prompt tokens; decode only the
    # continuation so the stored explanation does not begin with the chat
    # template and the instructions.
    prompt_len = inputs.input_ids.shape[1]
    output_text = processor.batch_decode(generated_ids[:, prompt_len:], skip_special_tokens=True)[0]
    return output_text, extract_score(output_text)

def main(model, processor):
    """Compare every ground-truth image with its same-named generated image.

    Files in GT_DIR with a .png/.jpg/.jpeg/.webp extension are processed in
    sorted order; an image without a counterpart in GEN_DIR is skipped with a
    message. Each comparison contributes a row (image, explanation, score) to
    OUTPUT_CSV. A failing comparison is printed and stored with
    "ERROR: ..." as explanation and None as score.

    Args:
        model: Forwarded to compare_images().
        processor: Forwarded to compare_images().
    """
    results = []
    gt_files = sorted(
        f for f in os.listdir(GT_DIR)
        if f.lower().endswith((".png", ".jpg", ".jpeg", ".webp"))
    )

    for fname in gt_files:
        gt_path = os.path.join(GT_DIR, fname)
        gen_path = os.path.join(GEN_DIR, fname)
        if not os.path.exists(gen_path):
            print(f"‚ö†Ô∏è Skipping {fname} (no generated version found)")
            continue

        print(f"üîç Comparing {fname}...")
        try:
            explanation, score = compare_images(gt_path, gen_path, model, processor)
            results.append({"image": fname, "explanation": explanation, "score": score})
        except Exception as e:
            # Surface the failure during the run instead of hiding it in the CSV.
            print(f"   failed on {fname}: {e}")
            results.append({"image": fname, "explanation": f"ERROR: {e}", "score": None})

    df = pd.DataFrame(results)
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"\n‚úÖ Done! Results saved to {OUTPUT_CSV}")

# Usage:
if __name__ == "__main__":
    main(model, processor)




üîç Comparing 3AD8509F-A870-4A05-BF80-909BED5EED6A.png...
üîç Comparing 3C0A4C08-F7BF-42DD-A72D-4A379B66B529.png...
üîç Comparing image_139_48.png...
üîç Comparing image_5578_23.png...
üîç Comparing image_79_37.png...

‚úÖ Done! Results saved to qwen25vl_comparisons_desc.csv


Hierarchy generation

In [2]:
import os
import json
import torch
import pandas as pd
from pathlib import Path
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

# --------------------------
# Config
# --------------------------
# Checkpoint id passed to from_pretrained() for both the model and the processor.
MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"
# Input image folder. The default is a machine-specific absolute path, so it can
# be overridden with the WIREFRAME_IMAGE_DIR environment variable.
# Alternative used in this notebook: "inference_images"
IMAGE_DIR = os.environ.get(
    "WIREFRAME_IMAGE_DIR",
    r"C:\Users\fa076154\Desktop\CAP6411\Inference\Web2Code_image\WebSight_images_new",
)
# Folder receiving "<stem>_hierarchy.json" / "<stem>_raw.txt" files.
# Alternative: "outputs_hierarchy_json_25"
OUTPUT_JSON_DIR = "outputs_hierarchy_DPO_json_25"
# CSV log written by main(). Alternative: "qwen25vl_component_hierarchy.csv"
OUTPUT_CSV = "qwen25vl_component_hierarchy_DPO.csv"
# Passed as max_new_tokens to model.generate().
MAX_NEW_TOKENS = 2500


# Instruction sent with each image in extract_hierarchy(). It asks for a nested
# JSON tree of {'type', 'attributes', 'children'} nodes rooted at 'page'.
PROMPT_HIERARCHY = (
    "You are an expert UI layout analyzer. "
    "Analyze this wireframe and output a precise, hierarchical JSON structure that captures the full layout and nesting of components.\n\n"
    "Representation rules:\n"
    "- Each component is an object with:\n"
    "  ‚Ä¢ 'type': the component name (e.g., page, header, nav, hero, section, row, column, button, image, card, footer)\n"
    "  ‚Ä¢ 'attributes': a dictionary of visual attributes such as color, position (top/left), size, alignment, and text content if visible.\n"
    "  ‚Ä¢ 'children': a list of components visually contained within that element.\n\n"
    "Hierarchy and structure guidelines:\n"
    "- The root node must be the full page ('type': 'page').\n"
    "- Preserve the **visual and logical nesting** ‚Äî if elements appear inside a container, section, or div, they must be represented as its children.\n"
    "- Group horizontally aligned components together within a 'row' container.\n"
    "- Within each 'row', represent vertical stacking as separate 'column' components where appropriate.\n"
    "- Maintain left-to-right and top-to-bottom order strictly as seen in the layout.\n"
    "- Ensure sibling components appear in correct sequence and avoid flattening nested structures.\n\n"
    "Output requirements:\n"
    "- Output only **valid JSON** ‚Äî no text or explanation outside the JSON.\n"
    "- The structure must be complete and balanced (all brackets closed).\n"
    "- Pay particular attention to the **arrangement and nesting of divs**, preserving all parent‚Äìchild relationships exactly as seen visually."
)


# PROMPT_HIERARCHY = (
#     "You are an expert UI layout analyzer. "
#     "Analyze this wireframe and output all visible components in a hierarchical JSON structure.\n\n"
#     "Each component should be represented as an object with:\n"
#     "- 'type': the component name (e.g., header, nav, hero, button, image, card, footer)\n"
#     "- 'attributes': a dictionary with attributes like color, position, size, alignment, and text content if visible\n"
#     "- 'children': a list of nested components inside it\n\n"
#     "The root node should represent the full page as 'page'.\n"
#     "Follow the visual hierarchy (top to bottom, left to right). Output valid JSON only‚Äîno text outside the JSON."
# )

# --------------------------
# Load Model
# --------------------------
def load_model_and_processor():
    """Create the model and processor for MODEL_NAME.

    With CUDA: 4-bit bitsandbytes loading (NF4, double quantization, float16
    compute) and device_map="auto". Without CUDA: no quantization config and
    device_map="cpu".

    Returns:
        tuple: (model, processor)
    """
    on_gpu = torch.cuda.is_available()

    quant_cfg = None
    if on_gpu:
        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )
    placement = "auto" if on_gpu else "cpu"

    vlm = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        quantization_config=quant_cfg,
        torch_dtype="auto",
        device_map=placement,
        low_cpu_mem_usage=True
    )
    return vlm, AutoProcessor.from_pretrained(MODEL_NAME)


# --------------------------
# Inference Function
# --------------------------
def extract_hierarchy(image_path, model, processor):
    """Ask the model for the component hierarchy of one wireframe image.

    Args:
        image_path: Path of the image; used in the chat message and as the
            processor's image input.
        model: Model exposing generate() and a device attribute.
        processor: Processor exposing apply_chat_template() and batch_decode().

    Returns:
        The model's reply as stripped text, without the echoed prompt.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": PROMPT_HIERARCHY},
        ],
    }]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image_path], return_tensors="pt").to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)

    # The generated sequences begin with the prompt tokens. Decoding them too
    # would prepend the chat roles and the instructions to the reply, which
    # guarantees that json.loads() on the result fails.
    prompt_len = inputs.input_ids.shape[1]
    output_text = processor.batch_decode(
        generated_ids[:, prompt_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]
    return output_text.strip()


# --------------------------
# Main Function
# --------------------------
def main():
    """Extract a component hierarchy for every image in IMAGE_DIR.

    Loads the model, then for each image file (.png/.jpg/.jpeg/.webp, sorted):
      - reply parses as JSON  -> "<stem>_hierarchy.json", status "parsed"
      - reply is not JSON     -> "<stem>_raw.txt", status "invalid_json"
      - any other exception   -> printed, status "error", message in raw_output
    Files go to OUTPUT_JSON_DIR; one row per image is written to OUTPUT_CSV.
    """
    model, processor = load_model_and_processor()
    os.makedirs(OUTPUT_JSON_DIR, exist_ok=True)

    results = []
    image_dir = Path(IMAGE_DIR)

    for fname in sorted(os.listdir(image_dir)):
        if not fname.lower().endswith((".png", ".jpg", ".jpeg", ".webp")):
            continue

        img_path = str(image_dir / fname)
        stem = Path(fname).stem
        print(f"üîç Extracting hierarchy for {fname}...")

        row = {"image": fname, "raw_output": "", "status": ""}

        try:
            raw_json = extract_hierarchy(img_path, model, processor)
            row["raw_output"] = raw_json

            # Attempt to parse JSON
            try:
                parsed_json = json.loads(raw_json)
                json_path = os.path.join(OUTPUT_JSON_DIR, f"{stem}_hierarchy.json")
                with open(json_path, "w", encoding="utf-8") as f:
                    json.dump(parsed_json, f, indent=2)
                row["status"] = "parsed"
            except json.JSONDecodeError:
                # Save raw text if JSON invalid
                with open(os.path.join(OUTPUT_JSON_DIR, f"{stem}_raw.txt"), "w", encoding="utf-8") as f:
                    f.write(raw_json)
                row["status"] = "invalid_json"

        except Exception as e:
            # Show the failure while the (long) run is in progress.
            print(f"   failed on {fname}: {e}")
            row["raw_output"] = f"ERROR: {e}"
            row["status"] = "error"

        results.append(row)

    pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False)
    print(f"\n‚úÖ Extraction complete! Results saved to {OUTPUT_CSV}\nüìÇ JSONs in: {OUTPUT_JSON_DIR}")


if __name__ == "__main__":
    main()


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:10<00:00,  2.16s/it]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]


üîç Extracting hierarchy for image_0_1.png...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


üîç Extracting hierarchy for image_0_10.png...
üîç Extracting hierarchy for image_0_11.png...
üîç Extracting hierarchy for image_0_12.png...
üîç Extracting hierarchy for image_0_13.png...
üîç Extracting hierarchy for image_0_15.png...
üîç Extracting hierarchy for image_0_16.png...
üîç Extracting hierarchy for image_0_18.png...
üîç Extracting hierarchy for image_0_19.png...
üîç Extracting hierarchy for image_0_2.png...
üîç Extracting hierarchy for image_0_20.png...
üîç Extracting hierarchy for image_0_21.png...
üîç Extracting hierarchy for image_0_22.png...
üîç Extracting hierarchy for image_0_23.png...
üîç Extracting hierarchy for image_0_24.png...
üîç Extracting hierarchy for image_0_25.png...
üîç Extracting hierarchy for image_0_26.png...
üîç Extracting hierarchy for image_0_27.png...
üîç Extracting hierarchy for image_0_28.png...
üîç Extracting hierarchy for image_0_29.png...
üîç Extracting hierarchy for image_0_3.png...
üîç Extracting hierarchy for image_0_30.p

KeyboardInterrupt: 

Verification

In [1]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

# --------------------------
# Model Configuration
# --------------------------
MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"

# Quantization settings handed to from_pretrained() below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # Efficient 4-bit quantization
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16   # use bfloat16 if your GPU supports it
)

# --------------------------
# Load Model & Processor
# --------------------------
# `model` and `processor` are module-level names; the next cell passes them to
# main(model, processor).
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage=True
)

processor = AutoProcessor.from_pretrained(MODEL_NAME)

print("‚úÖ Qwen 2.5-VL loaded successfully!")


  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:10<00:00,  2.10s/it]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]


‚úÖ Qwen 2.5-VL loaded successfully!


In [2]:
import os
import re
import json
import torch
import pandas as pd
from pathlib import Path

# --------------------------
# Config
# --------------------------
# Inputs and outputs of the verification step. The trailing "#..." fragments on
# some lines are alternative values left in place by the author.
RAW_HIERARCHY_DIR = "outputs_hierarchy_sample_json_25"     # where *_raw.txt files are
IMAGE_DIR = "sample"#"inference_images"
OUTPUT_HTML_DIR = "outputs_html_verify_sample_25"#"outputs_html_verify_25"
OUTPUT_CSV = "qwen25vl_verify_hierarchy_results_sample_from_rawtxt.csv"#"qwen25vl_verify_hierarchy_results_from_rawtxt.csv"
# Passed as max_new_tokens to model.generate().
MAX_NEW_TOKENS = 2500

# Instruction used by generate_html_from_json_and_image(); the hierarchy JSON is
# sent as a separate text part after this prompt.
PROMPT_HTML_FROM_JSON = (
    "You are an expert front-end developer. "
    "Use BOTH the following hierarchical component JSON and the provided wireframe image "
    "to generate a complete, minimal, responsive HTML5 layout.\n\n"
    "Guidelines:\n"
    "- Each node's 'type' corresponds to an HTML section or element.\n"
    "- Use semantic HTML tags (header, nav, main, section, article, footer, etc.).\n"
    "- Use node 'attributes' to infer inline styles (colors, alignment, size).\n"
    "- Preserve the hierarchy: parent nodes contain their children in proper order.\n"
    "- Include a minimal <style> block in <head> but no external CSS or JavaScript.\n"
    "- Use placeholder text for headings, paragraphs, or buttons.\n"
    "- The visual layout should reflect the image as closely as possible.\n\n"
    "Output ONLY valid HTML code, starting with <!doctype html>."
)

# --------------------------
# Helper: Extract JSON from raw text
# --------------------------
def _close_open_json(fragment: str) -> str:
    """Append the closers a truncated JSON fragment is missing.

    Walks the fragment once, ignoring brackets inside string literals, and
    remembers every '{' / '[' that is still open. An unterminated string is
    closed, a trailing comma is dropped, and the open containers are closed in
    reverse order of opening.
    """
    pending = []          # closers still owed, innermost last
    in_string = False
    escaped = False
    for ch in fragment:
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "{":
            pending.append("}")
        elif ch == "[":
            pending.append("]")
        elif ch in "}]" and pending:
            pending.pop()

    repaired = fragment + '"' if in_string else fragment
    repaired = repaired.rstrip().rstrip(",")
    return repaired + "".join(reversed(pending))


def extract_json_from_text(text: str):
    """
    Extract JSON code from a text block that may contain roles and markdown fences.

    Lookup order:
      1. the content of the first ```json ... ``` fenced block;
      2. otherwise the span from the first '{' to the last '}'.
    If that candidate does not parse, one repair attempt is made that closes
    unterminated strings, objects and arrays (typical for replies cut off by
    the token limit).

    Returns:
        The parsed JSON value, or None when nothing JSON-like is found or the
        repaired candidate still does not parse.
    """
    # 1. Look for a ```json ... ``` block first
    match = re.search(r"```json(.*?)```", text, re.DOTALL)
    if match:
        json_str = match.group(1).strip()
    else:
        # 2. Fallback: widest brace-delimited span
        match = re.search(r"\{[\s\S]*\}", text)
        if not match:
            print("‚ö†Ô∏è No JSON found in file.")
            return None
        json_str = match.group(0)

    # 3. Try to parse cleanly
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"‚ö†Ô∏è JSON parse error: {e}")

    # 4. Repair truncation. Counting braces alone is not enough: open arrays
    #    must be closed too, and in the right order relative to the objects.
    try:
        return json.loads(_close_open_json(json_str))
    except ValueError:
        return None

# --------------------------
# Core Function
# --------------------------
def generate_html_from_json_and_image(json_data, image_path, model, processor):
    """Generate HTML from a hierarchy JSON together with its wireframe image.

    Args:
        json_data: Parsed hierarchy; serialised with indent=2 into the prompt.
        image_path: Path of the image; used in the chat message and as the
            processor's image input.
        model: Model exposing generate() and a device attribute.
        processor: Processor exposing apply_chat_template() and batch_decode().

    Returns:
        The stripped reply text without the prompt tokens.
    """
    content = [
        {"type": "image", "image": image_path},
        {"type": "text", "text": PROMPT_HTML_FROM_JSON},
        {"type": "text", "text": f"Here is the JSON:\n\n{json.dumps(json_data, indent=2)}"},
    ]
    chat = [{"role": "user", "content": content}]

    prompt = processor.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    batch = processor(
        text=[prompt],
        images=[image_path],
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        sequences = model.generate(
            **batch,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            temperature=None
        )

    # Strip the prompt tokens from the front of every generated sequence.
    completions = []
    for prompt_ids, seq in zip(batch.input_ids, sequences):
        completions.append(seq[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return decoded[0].strip()

# --------------------------
# Main
# --------------------------
def main(model, processor):
    """Generate one HTML file per raw-hierarchy text file and log the outcome.

    For every ``*_raw.txt`` file in ``RAW_HIERARCHY_DIR`` (processed in sorted
    order) this looks for an image with the same stem in ``IMAGE_DIR``,
    extracts the JSON from the text file, asks the model for HTML, and writes
    it to ``OUTPUT_HTML_DIR/<stem>__verify.html``. One row per attempted file
    is written to ``OUTPUT_CSV``.

    Files without a matching image are skipped and do not appear in the CSV.
    Any exception raised while handling a file is recorded in that file's
    ``status`` column and reported on stdout; it does not abort the batch.

    Args:
        model: Generation model forwarded to
            ``generate_html_from_json_and_image``.
        processor: Processor forwarded to
            ``generate_html_from_json_and_image``.

    Returns:
        None. Results are written to disk.
    """
    os.makedirs(OUTPUT_HTML_DIR, exist_ok=True)
    results = []

    raw_suffix = "_raw.txt"
    raw_files = sorted(f for f in os.listdir(RAW_HIERARCHY_DIR) if f.endswith(raw_suffix))

    for fname in raw_files:
        raw_path = os.path.join(RAW_HIERARCHY_DIR, fname)
        # Strip only the trailing "_raw.txt". A blanket replace("_raw", "")
        # would also eat "_raw" inside the name ("my_rawdata_raw.txt" would
        # become "mydata") and then look up the wrong image.
        stem = fname[: -len(raw_suffix)]
        # Order matters: the first existing candidate wins.
        image_candidates = [
            os.path.join(IMAGE_DIR, f"{stem}{ext}") for ext in (".png", ".jpg", ".jpeg")
        ]
        image_path = next((p for p in image_candidates if os.path.exists(p)), None)

        if not image_path:
            print(f"‚ö†Ô∏è Skipping {fname} ‚Äî No matching image found.")
            continue

        print(f"üîç Processing {fname} with image {Path(image_path).name}...")

        row = {"raw_file": fname, "image_file": Path(image_path).name, "status": "", "html_file": "", "raw_output": ""}

        try:
            with open(raw_path, "r", encoding="utf-8") as f:
                raw_text = f.read()

            json_data = extract_json_from_text(raw_text)
            if json_data is None:
                row["status"] = "error: no valid JSON"
                results.append(row)
                continue

            html_output = generate_html_from_json_and_image(json_data, image_path, model, processor)
            row["raw_output"] = html_output

            out_html_path = os.path.join(OUTPUT_HTML_DIR, f"{stem}__verify.html")
            with open(out_html_path, "w", encoding="utf-8") as f:
                f.write(html_output)

            row["html_file"] = out_html_path
            row["status"] = "success"

        except Exception as e:
            # Best-effort batch: keep going, but surface the failure in the
            # cell output as well as in the CSV so it is not missed.
            row["status"] = f"error: {e}"
            print(f"Failed on {fname}: {e}")

        results.append(row)

    pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False)
    print(f"\n‚úÖ Verification done! HTMLs saved in {OUTPUT_HTML_DIR}")
    print(f"üìä Log saved to {OUTPUT_CSV}")

# --------------------------
# Example usage
# --------------------------
# Kick off the batch run. `model` and `processor` are expected to already
# exist in the kernel namespace (presumably loaded by an earlier cell —
# confirm this holds under a fresh Restart & Run All).
main(model, processor)


🔍 Processing 1_raw.txt with image 1.png...
🔍 Processing 2_raw.txt with image 2.png...
🔍 Processing 3_raw.txt with image 3.png...

✅ Verification done! HTMLs saved in outputs_html_verify_sample_25
📊 Log saved to qwen25vl_verify_hierarchy_results_sample_from_rawtxt.csv
