# Form2SDCTemplate: Convert Forms to SDC4 Templates

**Version 4.2.3** | [PyPI](https://pypi.org/project/form2sdc/) | [GitHub](https://github.com/SemanticDataCharter/Form2SDCTemplate)

This notebook converts PDF, DOCX, and image forms into SDC4-compliant markdown templates using Google Gemini.

**How it works:**
1. Upload your form (PDF, DOCX, PNG, JPG)
2. Gemini analyzes the form structure and extracts fields
3. (Optional) SDCStudio catalog is searched for reusable components
4. A validated SDC4 markdown template is generated with reuse references
5. Download the template for use with SDCStudio

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/SemanticDataCharter/Form2SDCTemplate/blob/main/notebooks/form_to_template.ipynb)

In [None]:
# Cell 1: Setup - Install dependencies
import subprocess
import sys

# Run pip through the kernel's own interpreter so the packages land in the
# environment this notebook is actually executing in.
pip_command = [
    sys.executable, "-m", "pip", "install", "-q",
    "google-genai>=1.0", "form2sdc[gemini]",
]
subprocess.check_call(pip_command)
print("Dependencies installed successfully!")

## API Key Setup

This notebook uses **Google Gemini** to analyze your forms. You need a free API key:

1. Go to [Google AI Studio](https://aistudio.google.com/apikey)
2. Click **Create API Key** (sign in with any Google account)
3. Copy the key — you'll paste it in the next cell when prompted

The free tier is sufficient for form conversion. Your key is entered via a masked input field and is not stored.

## SDCStudio Component Reuse (Optional)

If you have an [SDCStudio](https://sdcstudio.axius-sdc.com) account, you can connect to the **component catalog** so Gemini can reuse existing definitions (e.g., standard address clusters, coded value sets) instead of generating them from scratch.

**Get your API key:**
1. Log in to [SDCStudio](https://sdcstudio.axius-sdc.com) (60-day free trial, no charge)
2. Click your profile icon (top-right) > **Settings** > **API Keys**
3. Click **Generate API Key** and copy it immediately (it won't be shown again)

If you skip this step, the notebook works fine — templates are generated without reuse references.

In [None]:
# Cell 2: Configuration
import getpass
import os

# Google AI API key (required).
# Keys are stripped because a pasted key often carries a trailing space or
# newline, which would make every later request fail authentication. An
# empty environment variable is treated as "not set" so the prompt still runs.
api_key = os.environ.get("GOOGLE_API_KEY", "").strip()
if api_key:
    print("Using GOOGLE_API_KEY from environment.")
else:
    api_key = getpass.getpass("Enter your Google AI API key: ").strip()
    if api_key:
        print("Google AI API key set.")
    else:
        print("No Google AI API key entered. Re-run this cell before analyzing a form.")

# SDCStudio API key (optional; enables component reuse).
# `sdcstudio_key` is None whenever no usable key was supplied; later cells
# test its truthiness to decide whether to query the catalog.
SDCSTUDIO_API_URL = "https://sdcstudio.axius-sdc.com/api/v1/catalog/components/"

sdcstudio_key = os.environ.get("SDCSTUDIO_API_KEY", "").strip() or None
if sdcstudio_key:
    print("Using SDCSTUDIO_API_KEY from environment.")
else:
    sdcstudio_key = getpass.getpass(
        "Enter your SDCStudio API key (press Enter to skip): "
    ).strip() or None
    if sdcstudio_key:
        print("SDCStudio API key set \u2014 component reuse enabled.")
    else:
        print("SDCStudio skipped \u2014 generating without component reuse.")

# Model selection
model = "gemini-3-flash-preview"
print(f"\nModel: {model}")

In [None]:
# Cell 3: Upload your form
# On success, `uploaded_file` holds the form's raw bytes and
# `uploaded_filename` its name; both stay None when nothing was provided.
uploaded_file = None
uploaded_filename = None

try:
    # Google Colab file upload
    from google.colab import files as colab_files
    uploaded = colab_files.upload()
    if uploaded:
        # The result is indexed by filename; only the first entry is converted.
        uploaded_filename = next(iter(uploaded))
        uploaded_file = uploaded[uploaded_filename]
        if len(uploaded) > 1:
            print("Multiple files uploaded; only the first one will be used.")
        print(f"Uploaded: {uploaded_filename} ({len(uploaded_file):,} bytes)")
except ImportError:
    # Jupyter fallback - use file path
    from pathlib import Path
    file_path_str = input("Enter the path to your form file: ").strip()
    if file_path_str:
        # expanduser() lets "~/forms/x.pdf" work as typed.
        p = Path(file_path_str).expanduser()
        if p.is_file():
            uploaded_filename = p.name
            uploaded_file = p.read_bytes()
            print(f"Loaded: {uploaded_filename} ({len(uploaded_file):,} bytes)")
        elif p.exists():
            # A directory (or other non-file) would make read_bytes() raise.
            print(f"Not a file: {file_path_str}")
        else:
            print(f"File not found: {file_path_str}")

if uploaded_file is None:
    print("No file uploaded. Please run this cell again.")

In [None]:
# Cell 4: Analyze the form (extract fields and structure)
from pathlib import Path
from form2sdc.analyzer import GeminiAnalyzer

# Look the name up in the namespace so this cell prints guidance instead of
# raising NameError when Cell 3 has never been run in this session.
if globals().get("uploaded_file") is None:
    print("Please upload a file in Cell 3 first.")
else:
    print(f"Analyzing {uploaded_filename}...")

    # `analyzer` and `analysis` stay at notebook scope: Cell 6 calls
    # analyzer.analyze() again and later cells read `analysis`.
    analyzer = GeminiAnalyzer(api_key=api_key, model=model)
    analysis = analyzer.analyze(
        file_path=Path(uploaded_filename),
        file_content=uploaded_file,
    )

    # Show what Gemini extracted
    print(f"\nDataset: {analysis.dataset_name}")
    if analysis.domain:
        print(f"Domain: {analysis.domain}")
    print(f"Language: {analysis.source_language}")

    # Collect all field names and types for catalog search (used by Cell 5).
    # Only the root cluster and its direct sub-clusters are visited.
    all_columns = list(analysis.root_cluster.columns)
    cluster_names = []
    for sub in analysis.root_cluster.sub_clusters:
        cluster_names.append(sub.name)
        all_columns.extend(sub.columns)

    print(f"Fields: {len(all_columns)}")
    print(f"Sub-clusters: {len(analysis.root_cluster.sub_clusters)}")

    print("\nExtracted fields:")
    for col in all_columns:
        print(f"  - {col.name} ({col.column_type.value})")
    if cluster_names:
        print(f"\nCluster groupings: {', '.join(cluster_names)}")

    print("\nAnalysis complete!")

In [None]:
# Cell 5: Search SDCStudio catalog for reusable components (optional)
import urllib.request
import urllib.parse
import json

# Text summary of catalog matches; stays empty when the lookup is skipped or
# finds nothing. Cell 6 reads it to decide whether to re-analyze with reuse.
catalog_context = ""

if sdcstudio_key and 'analysis' in dir():
    print("Searching SDCStudio catalog for reusable components...\n")

    # Build search terms from extracted field names and cluster names.
    # Sorted so the query order, and the resulting prompt context, is the
    # same on every run instead of following set iteration order.
    search_terms = sorted(
        {term for term in [col.name for col in all_columns] + list(cluster_names) if term}
    )

    # Query catalog for each term, collect unique matches
    seen_ids = set()
    matches = []
    failed_terms = []
    last_error = None

    for term in search_terms:
        params = urllib.parse.urlencode({"search": term})
        url = f"{SDCSTUDIO_API_URL}?{params}"
        req = urllib.request.Request(url, headers={
            "Authorization": f"Token {sdcstudio_key}",
        })
        try:
            with urllib.request.urlopen(req, timeout=15) as resp:
                data = json.loads(resp.read().decode("utf-8"))
            results = data.get("results", [])
        except Exception as exc:
            # Best-effort lookup: one failed search must not abort the others,
            # but remember it so the user learns the results are incomplete.
            failed_terms.append(term)
            last_error = exc
            continue

        for c in results:
            # Skip malformed entries rather than losing the rest of the page.
            if not isinstance(c, dict):
                continue
            ct_id = c.get("ct_id")
            if ct_id is None or ct_id in seen_ids or not c.get("reuse_ref"):
                continue
            seen_ids.add(ct_id)
            matches.append(c)

    if failed_terms:
        print(
            f"Warning: {len(failed_terms)} of {len(search_terms)} catalog searches failed "
            f"(last error: {last_error}). Results may be incomplete.\n"
        )

    if matches:
        lines = [f"Found {len(matches)} reusable components matching your form fields:\n"]
        for c in matches:
            lines.append(
                f"  - {c.get('label', '(unlabeled)')} ({c.get('component_type', 'unknown type')}) \u2014 "
                f"reuse with: {c['reuse_ref']}"
            )
            if c.get("description"):
                # Keep the context handed to Gemini short.
                lines.append(f"    {str(c['description'])[:120]}")
        catalog_context = "\n".join(lines)
        print(catalog_context)
    else:
        print("No matching components found in catalog. Template will define all components fresh.")

elif sdcstudio_key:
    # A key is configured but Cell 4 has not produced `analysis` yet.
    print("Run Cell 4 first to analyze the form.")
else:
    print("SDCStudio not configured \u2014 skipping catalog lookup.")

In [None]:
# Cell 6: Build template (with catalog reuse if available)
from form2sdc.template_builder import TemplateBuilder
from form2sdc.validator import Form2SDCValidator

# The catalog search cell is optional. If it was never run, fall back to
# "no catalog matches" instead of failing with NameError below.
catalog_context = globals().get("catalog_context", "")

if 'analysis' not in dir():
    print("Please run Cell 4 first.")
else:
    # If catalog matches were found, re-analyze with reuse instructions
    if catalog_context:
        print("Re-analyzing with catalog reuse context...\n")
        reuse_instructions = (
            "The following components already exist in the SDCStudio catalog. "
            "Where a catalog component matches a field in this form, use the "
            "reuse reference (e.g., @ProjectName:Label) instead of defining "
            "a new component.\n\n" + catalog_context
        )
        # Replaces the earlier `analysis` with one produced under the reuse
        # instructions; `analyzer` is the instance created in Cell 4.
        analysis = analyzer.analyze(
            file_path=Path(uploaded_filename),
            file_content=uploaded_file,
            additional_instructions=reuse_instructions,
        )

    # Build and validate; `template` and `validation` are read by Cells 7-10.
    builder = TemplateBuilder()
    template = builder.build(analysis)

    validator = Form2SDCValidator()
    validation = validator.validate(template, document=uploaded_filename)

    print("Template built.")
    if catalog_context:
        print("Component reuse: applied from SDCStudio catalog.")

In [None]:
# Cell 7: Validation results
# Prints a pass/fail headline, then each non-empty group of findings
# (errors, warnings, suggestions), then a one-line total.
if 'validation' in dir():
    print("VALIDATION PASSED" if validation.valid else "VALIDATION ISSUES FOUND")

    error_count = len(validation.errors)
    warning_count = len(validation.warnings)
    suggestion_count = len(validation.suggestions)

    if validation.errors:
        print(f"\n--- CRITICAL ERRORS ({error_count}) ---")
        for err in validation.errors:
            print(f"  [{err.code}] Line {err.line}: {err.message}")
            if err.fix:
                print(f"    Fix: {err.fix}")

    if validation.warnings:
        print(f"\n--- WARNINGS ({warning_count}) ---")
        for warn in validation.warnings:
            print(f"  [{warn.code}] Line {warn.line}: {warn.message}")

    if validation.suggestions:
        print(f"\n--- SUGGESTIONS ({suggestion_count}) ---")
        for hint in validation.suggestions:
            print(f"  [{hint.code}] {hint.message}")

    print(f"\nTotal: {error_count} errors, {warning_count} warnings, {suggestion_count} suggestions")
else:
    print("Please run Cell 6 first.")

In [None]:
# Cell 8: Preview template as rendered markdown
# Falls back to plain text when IPython's display helpers are unavailable.
if 'template' in dir():
    try:
        from IPython.display import Markdown, display
        display(Markdown(template))
    except ImportError:
        print(template)
else:
    print("Please run Cell 6 first.")

In [None]:
# Cell 9: Raw markdown (for copying)
# Emits the unrendered template text, or a hint when Cell 6 has not run yet.
if 'template' in dir():
    print(template)
else:
    print("Please run Cell 6 first.")

In [None]:
# Cell 10: Download template
import re


def safe_template_filename(dataset_name):
    """Return a filesystem-safe ``<name>_template.md`` for *dataset_name*.

    Spaces become underscores and the name is lower-cased. Any other
    character that is not a word character, ``.`` or ``-`` is also replaced
    with an underscore, so path separators and characters such as ``:``
    cannot end up in the filename. Leading and trailing dots are removed, and
    an empty result falls back to ``form``.
    """
    safe_name = (dataset_name or "").replace(" ", "_").lower()
    safe_name = re.sub(r"[^\w.-]", "_", safe_name).strip(".")
    return f"{safe_name or 'form'}_template.md"


if 'template' not in dir():
    print("Please run Cell 6 first.")
else:
    # Generate filename from dataset name (the name is produced by the model,
    # so it is sanitized before being used as a path).
    output_filename = safe_template_filename(analysis.dataset_name)

    # Save to file
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(template)
    print(f"Saved: {output_filename}")

    # Trigger download in Colab
    try:
        from google.colab import files as colab_files
        colab_files.download(output_filename)
        print("Download started!")
    except ImportError:
        print(f"File saved to: {output_filename}")