# Form2SDCTemplate: Convert Forms to SDC4 Templates

This notebook converts PDF, DOCX, and image forms into SDC4-compliant markdown templates using Google Gemini.

**How it works:**
1. Upload your form (PDF, DOCX, PNG, JPG)
2. Gemini analyzes the form structure
3. A validated SDC4 markdown template is generated
4. Download the template for use with SDCStudio

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/SemanticDataCharter/Form2SDCTemplate/blob/main/notebooks/form_to_template.ipynb)

In [None]:
# Cell 1: Setup - Install dependencies
# Installs the notebook's requirements into the interpreter that is running
# this kernel (sys.executable), so the packages land in the right environment.
import subprocess
import sys

_pip_install = [sys.executable, "-m", "pip", "install", "-q"]
_requirements = ["google-genai>=1.0", "form2sdc[gemini]"]

# check_call raises CalledProcessError if pip exits with a non-zero status,
# so the success message below is only printed after a clean install.
subprocess.check_call(_pip_install + _requirements)
print("Dependencies installed successfully!")

## API Key Setup

This notebook uses **Google Gemini** to analyze your forms. You need a free API key:

1. Go to [Google AI Studio](https://aistudio.google.com/apikey)
2. Click **Create API Key** (sign in with any Google account)
3. Copy the key — you'll paste it in the next cell when prompted

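The free tier is sufficient for form conversion. Your key is entered via a masked input field (or read from the `GOOGLE_API_KEY` environment variable, if set) and is kept only in memory for this session — it is not saved in the notebook.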

## SDCStudio Component Reuse (Optional)

If you have an [SDCStudio](https://sdcstudio.axius-sdc.com) account, you can connect to the **component catalog** so Gemini can reuse existing definitions (e.g., standard address clusters, coded value sets) instead of generating them from scratch.

**Get your API key:**
1. Log in to [SDCStudio](https://sdcstudio.axius-sdc.com) (60-day free trial, no charge)
2. Click your profile icon (top-right) > **Settings** > **API Keys**
3. Click **Generate API Key** and copy it immediately (it won't be shown again)

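If you skip this step (press Enter at the prompt), the notebook works fine — templates are generated without reuse references. You can also supply the key through the `SDCSTUDIO_API_KEY` environment variable instead of typing it.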

In [None]:
# Cell 2: Configuration
#
# Defines the settings read by the later cells:
#   api_key           - Google AI API key (required)
#   sdcstudio_key     - SDCStudio API key, or None when component reuse is skipped
#   SDCSTUDIO_API_URL - SDCStudio component catalog endpoint
#   model             - Gemini model name
#
# Each key is taken from its environment variable when that variable exists;
# otherwise it is requested through a masked prompt. Surrounding whitespace is
# stripped because pasted keys often carry a stray space or newline.
import getpass
import os

# Google AI API key (required)
if "GOOGLE_API_KEY" in os.environ:
    api_key = os.environ["GOOGLE_API_KEY"].strip()
    if api_key:
        print("Using GOOGLE_API_KEY from environment.")
else:
    api_key = getpass.getpass("Enter your Google AI API key: ").strip()
    if api_key:
        print("Google AI API key set.")

if not api_key:
    # Warn instead of raising so the remaining settings below are still
    # defined; the user only has to re-run this cell with a real key.
    print(
        "WARNING: No Google AI API key provided. "
        "Re-run this cell and enter a key before analyzing a form."
    )

# SDCStudio API key (optional — enables component reuse)
sdcstudio_key = None
SDCSTUDIO_API_URL = "https://sdcstudio.axius-sdc.com/api/v1/catalog/components/"

if "SDCSTUDIO_API_KEY" in os.environ:
    # A blank variable means "skip"; normalise it to None rather than
    # reporting a key that is not really there.
    sdcstudio_key = os.environ["SDCSTUDIO_API_KEY"].strip() or None
    if sdcstudio_key:
        print("Using SDCSTUDIO_API_KEY from environment.")
    else:
        print("SDCStudio skipped \u2014 generating without component reuse.")
else:
    sdcstudio_key = getpass.getpass(
        "Enter your SDCStudio API key (press Enter to skip): "
    ).strip() or None
    if sdcstudio_key:
        print("SDCStudio API key set \u2014 component reuse enabled.")
    else:
        print("SDCStudio skipped \u2014 generating without component reuse.")

# Model selection
model = "gemini-2.5-flash"  # Change to "gemini-2.5-pro" for higher quality
print(f"\nModel: {model}")

In [None]:
# Cell 3: Upload your form
#
# Sets two names used by the later cells:
#   uploaded_filename - the form's file name (no directory part in the
#                       Jupyter fallback)
#   uploaded_file     - the form's raw content
# Both stay None when nothing was provided.
uploaded_file = None
uploaded_filename = None

try:
    # Only the import is guarded: its failure is what tells us we are not
    # running inside Google Colab.
    from google.colab import files as colab_files
except ImportError:
    # Jupyter fallback - use file path
    from pathlib import Path
    file_path_str = input("Enter the path to your form file: ").strip()
    if file_path_str:
        # expanduser() lets "~/forms/intake.pdf" work as typed.
        p = Path(file_path_str).expanduser()
        if p.is_file():
            uploaded_filename = p.name
            uploaded_file = p.read_bytes()
            print(f"Loaded: {uploaded_filename} ({len(uploaded_file):,} bytes)")
        elif p.is_dir():
            # exists() alone would accept a directory and read_bytes() would
            # then fail with IsADirectoryError.
            print(f"Path is a directory, not a file: {file_path_str}")
        else:
            print(f"File not found: {file_path_str}")
else:
    # Google Colab file upload
    uploaded = colab_files.upload()
    if uploaded:
        uploaded_filename = next(iter(uploaded))
        uploaded_file = uploaded[uploaded_filename]
        print(f"Uploaded: {uploaded_filename} ({len(uploaded_file):,} bytes)")
        if len(uploaded) > 1:
            # Only one form is processed per run; say so instead of silently
            # dropping the rest.
            print(f"Note: {len(uploaded) - 1} additional file(s) ignored.")

if uploaded_file is None:
    print("No file uploaded. Please run this cell again.")

In [None]:
# Cell 4: Search SDCStudio catalog for reusable components (optional)
#
# Queries the SDCStudio component catalog with a search term derived from the
# uploaded filename and stores a plain-text summary of the matches in
# `catalog_context`. The variable stays "" when the lookup is skipped, fails,
# or returns nothing usable.
catalog_context = ""

if sdcstudio_key and uploaded_filename:
    import json
    import urllib.parse
    import urllib.request

    # Extract a search term from the filename: drop the extension and treat
    # underscores and hyphens as word separators.
    search_term = uploaded_filename.rsplit(".", 1)[0].replace("_", " ").replace("-", " ")
    params = urllib.parse.urlencode({"search": search_term, "page": 1})
    url = f"{SDCSTUDIO_API_URL}?{params}"

    req = urllib.request.Request(url, headers={
        "Authorization": f"Token {sdcstudio_key}",
    })

    # Only the network round-trip and JSON decoding sit inside the try block,
    # so a malformed catalog entry is not misreported as a connectivity
    # problem and does not discard the remaining matches.
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except Exception as e:
        # Deliberately broad: the lookup is best-effort and must never block
        # template generation.
        print(f"Could not reach SDCStudio catalog: {e}")
        print("Proceeding without component reuse.")
    else:
        if not isinstance(data, dict):
            print("Unexpected response from SDCStudio catalog.")
            print("Proceeding without component reuse.")
        else:
            count = data.get("count", 0)
            results = data.get("results")
            if not isinstance(results, list):
                results = []

            lines = []
            # Cap the summary at 20 entries to keep the prompt context small.
            for c in results[:20]:
                if not isinstance(c, dict):
                    continue
                label = c.get("label")
                reuse_ref = c.get("reuse_ref")
                if not (label and reuse_ref):
                    # Without a label and a reuse reference the entry cannot
                    # be offered for reuse, so skip it.
                    continue
                component_type = c.get("component_type", "unknown")
                lines.append(
                    f"  - {label} ({component_type}) \u2014 "
                    f"reuse with: {reuse_ref}"
                )
                description = c.get("description")
                if description:
                    lines.append(f"    {str(description)[:120]}")

            if lines:
                lines.insert(0, f"Found {count} reusable components in SDCStudio catalog:\n")
                catalog_context = "\n".join(lines)
                print(catalog_context)
            else:
                print(f"No catalog matches for '{search_term}'. Proceeding without reuse.")
elif sdcstudio_key and not uploaded_filename:
    print("Upload a file first (Cell 3), then re-run this cell.")
else:
    print("SDCStudio not configured \u2014 skipping catalog lookup.")

In [None]:
# Cell 5: Analyze the form
# Runs the uploaded form through the form2sdc pipeline and keeps the outcome
# in `result`, then prints a short summary of the analysis.
from pathlib import Path
from form2sdc.analyzer import GeminiAnalyzer
from form2sdc.core import FormToTemplatePipeline

if uploaded_file is not None:
    print(f"Analyzing {uploaded_filename}...")

    analyzer = GeminiAnalyzer(api_key=api_key, model=model)
    pipeline = FormToTemplatePipeline(analyzer)

    # Catalog matches, when present, are handed to the pipeline as extra
    # instructions; otherwise an empty string is passed.
    reuse_instructions = (
        "The following components already exist in the SDCStudio catalog. "
        "Where a catalog component matches a field in this form, use the "
        "reuse reference (e.g., @ProjectName:Label) instead of defining "
        "a new component.\n\n" + catalog_context
    ) if catalog_context else ""

    result = pipeline.process(
        file_path=Path(uploaded_filename),
        file_content=uploaded_file,
        additional_instructions=reuse_instructions,
    )

    # Summary
    analysis = result.analysis
    root = analysis.root_cluster
    # Columns on the root cluster plus those on each of its direct sub-clusters.
    col_count = len(root.columns) + sum(len(sub.columns) for sub in root.sub_clusters)

    print(f"\nDataset: {analysis.dataset_name}")
    if analysis.domain:
        print(f"Domain: {analysis.domain}")
    print(f"Language: {analysis.source_language}")
    print(f"Columns: {col_count}")
    print(f"Sub-clusters: {len(root.sub_clusters)}")
    if catalog_context:
        print("Component reuse: enabled (SDCStudio catalog)")
    print("\nAnalysis complete!")
else:
    print("Please upload a file in Cell 3 first.")

In [None]:
# Cell 6: Validation results
# Prints the validation outcome stored on `result`: a pass/fail headline,
# then errors, warnings and suggestions (each section only when non-empty),
# and finally a one-line total.
if 'result' in dir():
    report = result.validation

    print("VALIDATION PASSED" if report.valid else "VALIDATION ISSUES FOUND")

    if report.errors:
        print(f"\n--- CRITICAL ERRORS ({len(report.errors)}) ---")
        for issue in report.errors:
            print(f"  [{issue.code}] Line {issue.line}: {issue.message}")
            # Errors may carry a suggested fix; show it when present.
            if issue.fix:
                print(f"    Fix: {issue.fix}")

    if report.warnings:
        print(f"\n--- WARNINGS ({len(report.warnings)}) ---")
        for issue in report.warnings:
            print(f"  [{issue.code}] Line {issue.line}: {issue.message}")

    if report.suggestions:
        print(f"\n--- SUGGESTIONS ({len(report.suggestions)}) ---")
        for issue in report.suggestions:
            print(f"  [{issue.code}] {issue.message}")

    n_errors = len(report.errors)
    n_warnings = len(report.warnings)
    n_suggestions = len(report.suggestions)
    print(f"\nTotal: {n_errors} errors, {n_warnings} warnings, {n_suggestions} suggestions")
else:
    print("Please run Cell 5 first.")

In [None]:
# Cell 7: Preview template as rendered markdown
# Renders the generated template with IPython's Markdown display; when
# IPython cannot be imported the raw text is printed instead.
if 'result' in dir():
    try:
        from IPython.display import Markdown, display
        rendered = Markdown(result.template)
        display(rendered)
    except ImportError:
        print(result.template)
else:
    print("Please run Cell 5 first.")

In [None]:
# Cell 8: Raw markdown (for copying)
# Prints the template text exactly as generated, without rendering it.
if 'result' in dir():
    print(result.template)
else:
    print("Please run Cell 5 first.")

In [None]:
# Cell 9: Download template
import re


def template_filename_for(dataset_name):
    """Return a filesystem-safe ``<name>_template.md`` filename.

    Every character of *dataset_name* that is not a letter, digit,
    underscore, dot or hyphen is replaced with ``_`` and the result is
    lower-cased, so names containing ``/``, ``:`` or similar characters
    cannot break ``open()`` or point outside the working directory.
    Leading dots are dropped to avoid creating a hidden file. If nothing
    alphanumeric remains, ``"form"`` is used as the base name.
    """
    safe_name = re.sub(r"[^\w.-]", "_", str(dataset_name)).lower().lstrip(".")
    if not any(ch.isalnum() for ch in safe_name):
        safe_name = "form"
    return f"{safe_name}_template.md"


if 'result' not in dir():
    print("Please run Cell 5 first.")
else:
    # Generate filename from dataset name
    output_filename = template_filename_for(result.analysis.dataset_name)

    # Save to file (in the current working directory)
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(result.template)
    print(f"Saved: {output_filename}")

    # Trigger download in Colab; outside Colab the saved file is the result.
    try:
        from google.colab import files as colab_files
        colab_files.download(output_filename)
        print("Download started!")
    except ImportError:
        print(f"File saved to: {output_filename}")