# Form2SDCTemplate: Convert Forms to SDC4 Templates

This notebook converts PDF, DOCX, and image forms into SDC4-compliant markdown templates using Google Gemini.

**How it works:**
1. Upload your form (PDF, DOCX, PNG, JPG)
2. Gemini analyzes the form structure
3. A validated SDC4 markdown template is generated
4. Download the template for use with SDCStudio

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/SemanticDataCharter/Form2SDCTemplate/blob/main/notebooks/form_to_template.ipynb)

In [None]:
# Cell 1: Setup - Install dependencies
import subprocess
import sys

# Run pip through the interpreter that is executing this notebook
# (sys.executable) rather than whatever "pip" happens to be on PATH.
pip_command = [sys.executable, "-m", "pip", "install", "-q"]
required_packages = ["google-genai>=1.0", "form2sdc[gemini]"]

# check_call raises CalledProcessError on a non-zero exit status, so the
# success message below is only printed when the install actually worked.
subprocess.check_call(pip_command + required_packages)
print("Dependencies installed successfully!")

## API Key Setup

This notebook uses **Google Gemini** to analyze your forms. You need a free API key:

1. Go to [Google AI Studio](https://aistudio.google.com/apikey)
2. Click **Create API Key** (sign in with any Google account)
3. Copy the key — you'll paste it in the next cell when prompted

The free tier is sufficient for form conversion. Your key is entered via a masked input field and is not stored.

In [None]:
# Cell 2: Configuration
import getpass
import os

# Get API key: prefer the GOOGLE_API_KEY environment variable, otherwise
# prompt with masked input. Surrounding whitespace is stripped because a
# stray space or newline picked up while copy-pasting makes the key invalid.
# An unset *or empty* environment variable falls through to the prompt.
api_key = os.environ.get("GOOGLE_API_KEY", "").strip()
if api_key:
    print("Using GOOGLE_API_KEY from environment.")
else:
    api_key = getpass.getpass("Enter your Google AI API key: ").strip()
    if api_key:
        print("API key set.")
    else:
        # Do not claim success for an empty key; later cells would only
        # fail with a less obvious authentication error.
        print("No API key entered. Please run this cell again.")

# Model selection
model = "gemini-2.5-flash"  # Change to "gemini-2.5-pro" for higher quality
print(f"Model: {model}")

In [None]:
# Cell 3: Upload your form
# Produces two notebook-level names read by Cell 4:
#   uploaded_file     - raw bytes of the form, or None if nothing was loaded
#   uploaded_filename - the file's name, or None if nothing was loaded
uploaded_file = None
uploaded_filename = None

try:
    # Google Colab file upload
    from google.colab import files as colab_files
    uploaded = colab_files.upload()
    if uploaded:
        # Only one form is processed per run: take the first uploaded entry.
        uploaded_filename = next(iter(uploaded))
        uploaded_file = uploaded[uploaded_filename]
        if len(uploaded) > 1:
            # Make the "first file wins" behavior visible instead of silent.
            print(f"Multiple files uploaded; using only: {uploaded_filename}")
        print(f"Uploaded: {uploaded_filename} ({len(uploaded_file):,} bytes)")
except ImportError:
    # Jupyter fallback - use file path
    from pathlib import Path
    file_path_str = input("Enter the path to your form file: ").strip()
    if file_path_str:
        # expanduser() lets "~/forms/intake.pdf" work as typed.
        p = Path(file_path_str).expanduser()
        if p.is_file():
            uploaded_filename = p.name
            uploaded_file = p.read_bytes()
            print(f"Loaded: {uploaded_filename} ({len(uploaded_file):,} bytes)")
        elif p.exists():
            # A directory (or other non-file) exists at this path; reading
            # it as bytes would raise, so report it instead.
            print(f"Not a file: {file_path_str}")
        else:
            print(f"File not found: {file_path_str}")

if uploaded_file is None:
    print("No file uploaded. Please run this cell again.")

In [None]:
# Cell 4: Analyze the form
from pathlib import Path
from form2sdc.analyzer import GeminiAnalyzer
from form2sdc.core import FormToTemplatePipeline

# Guard against cells being run out of order. globals().get() covers both
# "Cell 3 never ran" (name undefined) and "Cell 3 ran but loaded nothing"
# (name is None) without raising NameError.
if globals().get("uploaded_file") is None:
    print("Please upload a file in Cell 3 first.")
elif "api_key" not in globals() or "model" not in globals():
    print("Please run Cell 2 first.")
else:
    print(f"Analyzing {uploaded_filename}...")

    analyzer = GeminiAnalyzer(api_key=api_key, model=model)
    pipeline = FormToTemplatePipeline(analyzer)

    # `result` is read by Cells 5-8 (validation, preview, download).
    result = pipeline.process(
        file_path=Path(uploaded_filename),
        file_content=uploaded_file,
    )

    # Summary
    analysis = result.analysis
    print(f"\nDataset: {analysis.dataset_name}")
    if analysis.domain:
        print(f"Domain: {analysis.domain}")
    print(f"Language: {analysis.source_language}")

    # Columns on the root cluster plus those on its direct sub-clusters.
    # NOTE(review): only one level of sub_clusters is counted here; confirm
    # whether sub-clusters can themselves nest further.
    root = analysis.root_cluster
    col_count = len(root.columns) + sum(
        len(sub.columns) for sub in root.sub_clusters
    )
    print(f"Columns: {col_count}")
    print(f"Sub-clusters: {len(root.sub_clusters)}")
    print("\nAnalysis complete!")

In [None]:
# Cell 5: Validation results
# Prints the validation verdict for the template produced in Cell 4,
# followed by each error, warning and suggestion, and a final tally.
if "result" in globals():
    report = result.validation

    # Headline verdict.
    print("VALIDATION PASSED" if report.valid else "VALIDATION ISSUES FOUND")

    # Errors carry a line number and, optionally, a suggested fix.
    if report.errors:
        print(f"\n--- CRITICAL ERRORS ({len(report.errors)}) ---")
        for err in report.errors:
            print(f"  [{err.code}] Line {err.line}: {err.message}")
            if err.fix:
                print(f"    Fix: {err.fix}")

    # Warnings carry a line number but no fix.
    if report.warnings:
        print(f"\n--- WARNINGS ({len(report.warnings)}) ---")
        for warning in report.warnings:
            print(f"  [{warning.code}] Line {warning.line}: {warning.message}")

    # Suggestions are printed without a line number.
    if report.suggestions:
        print(f"\n--- SUGGESTIONS ({len(report.suggestions)}) ---")
        for suggestion in report.suggestions:
            print(f"  [{suggestion.code}] {suggestion.message}")

    print(
        f"\nTotal: {len(report.errors)} errors, "
        f"{len(report.warnings)} warnings, "
        f"{len(report.suggestions)} suggestions"
    )
else:
    print("Please run Cell 4 first.")

In [None]:
# Cell 6: Preview template as rendered markdown
# Shows the template from Cell 4 as rich markdown when IPython is available;
# otherwise falls back to printing the raw text.
if "result" in globals():
    try:
        from IPython.display import Markdown, display
        rendered = Markdown(result.template)
        display(rendered)
    except ImportError:
        # No IPython in this environment: plain text is the best we can do.
        print(result.template)
else:
    print("Please run Cell 4 first.")

In [None]:
# Cell 7: Raw markdown (for copying)
# Dumps the template text from Cell 4 unrendered so it can be selected and
# copied as-is.
if "result" in globals():
    print(result.template)
else:
    print("Please run Cell 4 first.")

In [None]:
# Cell 8: Download template
# Writes the template from Cell 4 to "<dataset>_template.md" in the current
# working directory and, when running in Colab, triggers a browser download.
import re

if 'result' not in dir():
    print("Please run Cell 4 first.")
else:
    # Generate filename from dataset name. Spaces become underscores and the
    # name is lower-cased as before; in addition, every character that is not
    # a word character or "-" (e.g. "/", "\\", ":", "?") is replaced with "_"
    # so the dataset name can never be interpreted as a path or produce a
    # filename the OS rejects. An empty name falls back to "form".
    raw_name = (result.analysis.dataset_name or "").replace(" ", "_").lower()
    safe_name = re.sub(r"[^\w-]", "_", raw_name) or "form"
    output_filename = f"{safe_name}_template.md"

    # Save to file
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(result.template)
    print(f"Saved: {output_filename}")

    # Trigger download in Colab
    try:
        from google.colab import files as colab_files
        colab_files.download(output_filename)
        print("Download started!")
    except ImportError:
        # Not running in Colab: the file on disk is the deliverable.
        print(f"File saved to: {output_filename}")