In [None]:
# Cell 1: Install dependencies
# 🔧 Install required libraries (run once per fresh runtime)
!pip install -q diffusers transformers accelerate safetensors xformers opencv-python controlnet-aux

In [None]:
# Cell 2: Imports
import torch
import numpy as np
import cv2
from PIL import Image
import matplotlib.pyplot as plt
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
from controlnet_aux import OpenposeDetector

# Pick the compute device once; half precision is only selected together with CUDA.
if torch.cuda.is_available():
    device = "cuda"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

print("Using device:", device)

Using device: cuda


In [None]:
# Cell 3: Load Pipeline (Switched to Lineart for Sketches)
from controlnet_aux import LineartDetector

# 1. Load the Lineart Preprocessor (The "Eye" for sketches)
print("Loading Lineart Detector...")
preprocessor = LineartDetector.from_pretrained("lllyasviel/Annotators")

# 2. Load the Lineart ControlNet Model (The "Brain" for sketches)
# This model understands pencil shading better than Canny
controlnet_id = "lllyasviel/control_v11p_sd15_lineart"
print(f"Loading ControlNet: {controlnet_id}")

controlnet = ControlNetModel.from_pretrained(
    controlnet_id,
    torch_dtype=torch_dtype,
)

# 3. Load the base Stable Diffusion model and attach the ControlNet to it
base_model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
print("Loading Base Model...")

pipe = StableDiffusionControlNetPipeline.from_pretrained(
    base_model_id,
    controlnet=controlnet,
    safety_checker=None,  # the safety checker is deliberately not loaded
    torch_dtype=torch_dtype,
)

# Swap the pipeline's default scheduler for UniPC, reusing its existing config.
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.to(device)

if device == "cuda":
    # xformers is an optional speed/memory optimisation. Its wheels are tied to
    # specific torch/CUDA builds, so enabling it can fail on a fresh runtime;
    # in that case keep going with the pipeline's default attention instead of
    # aborting the whole cell.
    try:
        pipe.enable_xformers_memory_efficient_attention()
    except Exception as err:
        print(f"⚠️ xformers attention unavailable ({err}); using default attention.")

print("✅ Lineart Pipeline is ready.")

Loading Lineart Detector...
Loading ControlNet: lllyasviel/control_v11p_sd15_lineart
Loading Base Model...


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

✅ Lineart Pipeline is ready.


In [None]:
# Cell 3.5: Load Image Captioning Model (BLIP)
from transformers import BlipProcessor, BlipForConditionalGeneration

print("Loading Captioning AI (BLIP)...")
# Load a model that can describe images.
# The model is placed on `device` (selected in the imports cell) instead of a
# hard-coded "cuda", so this cell does not crash on a CPU-only runtime.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)

print("✅ Captioning AI ready.")

def auto_caption(image):
    """Describe an image in words using the BLIP captioning model.

    Args:
        image: The image to caption (callers in this notebook pass PIL images).

    Returns:
        The decoded caption string with special tokens removed.
    """
    # Send the inputs to wherever the model actually lives rather than
    # assuming "cuda"; a device mismatch would otherwise raise in generate().
    inputs = processor(image, return_tensors="pt").to(caption_model.device)
    out = caption_model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)

Loading Captioning AI (BLIP)...
✅ Captioning AI ready.


In [None]:
# Cell 4: Preprocessing - Balanced Mode
def preprocess_canny(sketch_pil, low_threshold=50, high_threshold=150,
                     size=(512, 512), blur_kernel=(5, 5)):
    """Balanced edge detection: keeps features, removes shading noise.

    Args:
        sketch_pil: Input sketch as a PIL image.
        low_threshold: Lower Canny hysteresis threshold (default 50).
        high_threshold: Upper Canny hysteresis threshold (default 150).
        size: (width, height) of the returned edge map (default 512x512).
        blur_kernel: Gaussian kernel size applied before edge detection.

    Returns:
        An RGB PIL image of the edge map, resized to `size`.
    """
    gray = np.array(sketch_pil.convert("L"))

    # Mostly-bright images are inverted so the strokes become the bright part.
    if gray.mean() > 127:
        gray = 255 - gray

    # Smooth first so fine shading texture does not turn into edges.
    gray = cv2.GaussianBlur(gray, blur_kernel, 0)

    edges = cv2.Canny(gray, low_threshold, high_threshold)
    edges = cv2.resize(edges, size, interpolation=cv2.INTER_LINEAR)

    print(f"✅ Using Balanced Threshold ({low_threshold},{high_threshold})")

    return Image.fromarray(edges).convert("RGB")

In [None]:
# Cell 5: Smart Generation (Lineart)
import random

def generate_auto(sketch_pil, seed=None):
    """Caption a sketch, build prompts from the caption, and render a photo.

    Args:
        sketch_pil: Input sketch as a PIL image.
        seed: Seed for the torch generator; a random 32-bit value is drawn
            when None.

    Returns:
        Tuple (control_img, generated_image, final_prompt, raw_caption).
    """
    import re  # function-scope import: only the caption clean-up needs it

    if seed is None:
        seed = random.randint(0, 2**32 - 1)

    # 1. Auto-Caption
    raw_caption = auto_caption(sketch_pil)

    # 2. Smart Prompt Logic
    # Rewrite medium words as WHOLE words only. A plain substring replace of
    # "art" would also eat the middle of words such as "portrait" or "heart".
    clean_caption = re.sub(r"\b(?:sketch(?:es)?|drawings?)\b", "photograph", raw_caption)
    clean_caption = re.sub(r"\b(?:pencil|art)\b", "", clean_caption)
    clean_caption = re.sub(r"\s{2,}", " ", clean_caption).strip()

    # Default to dark hair unless the caption already names a light/red colour.
    # Whole-word match so "covered" or "colored" do not count as "red".
    if not re.search(r"\b(?:blonde|white hair|red)\b", clean_caption):
        clean_caption += ", dark natural hair"

    final_prompt = (
        f"close up portrait, {clean_caption}, "
        "raw photo, unedited, authentic skin texture, "
        "soft natural lighting, 8k, shot on 85mm"
    )

    # Build smart negatives
    negative_prompt = "cartoon, anime, sketch, 3d render, plastic, deformed, blur, low quality"

    # If the description doesn't explicitly say "beard", forbid it.
    # This stops neck shading from turning into beards.
    if "beard" not in raw_caption and "mustache" not in raw_caption:
        negative_prompt += ", beard, stubble, facial hair"

    # 3. Create Control Image (Using Lineart, NOT Canny)
    # coarse=True helps ignore messy shading lines
    print("   🎨 Processing sketch lines...")
    control_img = preprocessor(sketch_pil, coarse=True)

    # 4. Generate
    generator = torch.Generator(device=device).manual_seed(seed)

    out = pipe(
        prompt=final_prompt,
        negative_prompt=negative_prompt,
        image=control_img,
        num_inference_steps=30,
        guidance_scale=7.0,
        controlnet_conditioning_scale=1.0,
        generator=generator,
    )

    return control_img, out.images[0], final_prompt, raw_caption

## PHASE TWO

In [None]:
# Cell 7: Two-Stage Logic (Analyze -> Render)
import random
from PIL import Image

# Shared state between the analyze and render steps: the uploaded sketch and
# the control image derived from it. Both start out empty.
CURRENT_SKETCH = CURRENT_CONTROL = None

def step1_analyze(sketch_pil):
    """Generate the wireframe and caption without running the full diffusion.

    On success the sketch and its wireframe are stored in the module-level
    CURRENT_SKETCH / CURRENT_CONTROL for step2_render to pick up.

    Args:
        sketch_pil: Input sketch as a PIL image.

    Returns:
        Tuple (wireframe, caption).
    """
    global CURRENT_SKETCH, CURRENT_CONTROL

    # 1. Build the wireframe and 2. get the AI description BEFORE touching the
    # globals. If either call raises, the previously stored sketch/wireframe
    # pair stays intact instead of ending up as a new sketch paired with a
    # stale wireframe.
    wireframe = preprocessor(sketch_pil, coarse=True)
    caption = auto_caption(sketch_pil)

    CURRENT_SKETCH, CURRENT_CONTROL = sketch_pil, wireframe

    return wireframe, caption

def step2_render(hair, eyes, skin, beard, gender, seed=None):
    """Render a photo from the stored wireframe, steered by the chosen traits.

    Reads the module-level CURRENT_SKETCH / CURRENT_CONTROL, so step1_analyze
    must have been run first.

    Args:
        hair, eyes, skin, beard: Trait names; "Original/Auto" leaves the trait
            out of the prompt. beard == "Clean Shaven" additionally adds
            facial-hair terms to the negative prompt.
        gender: "Man" or "Woman" rewrites gendered words in the caption; any
            other value (e.g. "Auto-Detect") leaves the caption untouched.
        seed: Seed for the torch generator; a random 32-bit value is drawn
            when None.

    Returns:
        (image, prompt) on success, or (None, message) when no sketch has
        been analyzed yet.
    """
    import re  # function-scope import: only the caption rewriting needs it

    if CURRENT_SKETCH is None or CURRENT_CONTROL is None:
        return None, "No image loaded yet! Run Step 1 first."

    if seed is None:
        seed = random.randint(0, 2**32 - 1)

    # 1. Prepare Base Caption
    # Whole-word rewrites only: a substring replace of "art" would also
    # corrupt words such as "portrait".
    raw_caption = auto_caption(CURRENT_SKETCH)
    clean_caption = re.sub(r"\b(?:sketch(?:es)?|drawings?)\b", "photograph", raw_caption)
    clean_caption = re.sub(r"\b(?:pencil|art)\b", "", clean_caption)
    clean_caption = re.sub(r"\s{2,}", " ", clean_caption).strip()

    # 2. Handle Gender Override
    # Done in a single whole-word pass. Chained substring replaces turned
    # "woman" into "wowoman" and "female" into "fefemale" for the "Woman" case.
    if gender == "Man":
        swaps = {"woman": "man", "girl": "man", "female": "male"}
    elif gender == "Woman":
        swaps = {"man": "woman", "boy": "woman", "male": "female"}
    else:
        swaps = {}
    if swaps:
        swap_pattern = r"\b(?:" + "|".join(swaps) + r")\b"
        clean_caption = re.sub(swap_pattern, lambda m: swaps[m.group(0)], clean_caption)

    # 3. Attribute Injection & Dynamic Strength
    # ControlNet strength is lowered when hair or beard is overridden.
    strength = 1.0
    traits = []

    if hair != "Original/Auto":
        # "Bald hair" is self-contradictory, so bald gets its own phrasing.
        traits.append("(bald head:1.5)" if hair == "Bald" else f"({hair} hair:1.5)")
        strength -= 0.1
    if eyes != "Original/Auto":
        traits.append(f"(bright {eyes} eyes:1.6)")
    if skin != "Original/Auto":
        traits.append(f"({skin} skin texture:1.3)")

    beard_negative = ""
    if beard != "Original/Auto":
        if beard == "Clean Shaven":
            traits.append("clean shaven")
            beard_negative = "beard, mustache, stubble"
        else:
            traits.append(f"({beard}:1.5)")
            strength -= 0.2

    # Never let the conditioning scale drop below 0.65.
    strength = max(strength, 0.65)

    # 4. Final Prompting
    # Join only the non-empty pieces so an empty trait list does not leave a
    # dangling ", ," in the prompt (or a trailing ", " in the negative prompt).
    prompt_parts = [
        "close up portrait",
        *traits,
        clean_caption,
        "raw photo, unedited, authentic skin texture, "
        "soft natural lighting, 8k, shot on 85mm",
    ]
    final_prompt = ", ".join(part for part in prompt_parts if part)

    negative_parts = [
        "cartoon, anime, sketch, 3d render, plastic, deformed, blur, low quality, black and white, grayscale",
        beard_negative,
    ]
    negative_prompt = ", ".join(part for part in negative_parts if part)

    # 5. Generate
    generator = torch.Generator(device=device).manual_seed(seed)

    out = pipe(
        prompt=final_prompt,
        negative_prompt=negative_prompt,
        image=CURRENT_CONTROL,
        num_inference_steps=35,
        guidance_scale=8.0,
        controlnet_conditioning_scale=strength,
        generator=generator,
    )

    return out.images[0], final_prompt

In [None]:
# Cell 8: Two-Step Studio Dashboard (With Reset Button)
import ipywidgets as widgets
from IPython.display import display, clear_output
from google.colab import files
import io
import matplotlib.pyplot as plt
from PIL import Image

# --- Layout & Widgets ---
header = widgets.HTML("<h2>Phase Two: Sketch Studio</h2>")

# Step 1: upload button and the output area its handler writes into.
step1_label = widgets.HTML("<b>Step 1: Input Analysis</b>")
btn_upload = widgets.Button(
    description="1. Upload & View Wireframe",
    button_style='info',
    layout=widgets.Layout(width='300px'),
)
out_step1 = widgets.Output()

step2_label = widgets.HTML("<br><b>Step 2: Customization & Render</b>")

# Choices offered by each dropdown.
hair_options = [
    'Original/Auto', 'Black', 'Dark Brown', 'Blonde', 'Platinum Blonde',
    'Red', 'Grey', 'White', 'Bald', 'Pink', 'Blue',
]
eye_options = ['Original/Auto', 'Blue', 'Green', 'Brown', 'Hazel', 'Grey', 'Amber', 'Red']
skin_options = ['Original/Auto', 'Pale', 'Fair', 'Tan', 'Dark', 'Olive']
beard_options = [
    'Original/Auto', 'Clean Shaven', 'Light Stubble', 'Heavy Stubble',
    'Full Beard', 'Goatee', 'Mustache',
]
gender_options = ['Auto-Detect', 'Man', 'Woman']

# One dropdown per attribute, created in the same order as the names on the left.
w_gender, w_hair, w_eyes, w_skin, w_beard = (
    widgets.Dropdown(options=choices, description=label)
    for choices, label in (
        (gender_options, 'Gender:'),
        (hair_options, 'Hair:'),
        (eye_options, 'Eyes:'),
        (skin_options, 'Skin:'),
        (beard_options, 'Beard:'),
    )
)

# Step 2: render button and its output area.
btn_render = widgets.Button(
    description="2. Generate Final Photo",
    button_style='success',
    layout=widgets.Layout(width='300px'),
)
out_step2 = widgets.Output()

btn_reset = widgets.Button(
    description="🔄 Start Over / New Image",
    button_style='warning',
    layout=widgets.Layout(width='300px'),
)


# --- Logic ---

def on_click_upload(b):
    """Button callback: prompt for a sketch file, analyze it, show the wireframe.

    Only the first uploaded file is used. Errors raised while opening or
    analyzing the image are printed into the step-1 output area.
    """
    # A new upload invalidates whatever was rendered before.
    out_step2.clear_output()

    with out_step1:
        clear_output()
        print("📤 Select sketch file...")
        uploaded = files.upload()

        if not uploaded:
            print("❌ No file.")
            return

        fname = next(iter(uploaded))
        print(f"👁️ Analyzing {fname}...")

        try:
            img = Image.open(io.BytesIO(uploaded[fname])).convert("RGB")

            # Run Step 1 Analysis
            wireframe, caption = step1_analyze(img)

            # Original and wireframe side by side
            panels = (("Original", img), ("AI Wireframe", wireframe))
            plt.figure(figsize=(10, 5))
            for slot, (title, picture) in enumerate(panels, start=1):
                plt.subplot(1, 2, slot)
                plt.title(title)
                plt.imshow(picture)
                plt.axis("off")
            plt.show()

            print(f"📝 Auto-Caption: {caption}")
            print("✅ Wireframe generated! Now configure Step 2 below.")

        except Exception as e:
            print(f"Error: {e}")

def on_click_render(b):
    """Button callback: render the final photo using the current dropdown values.

    Output (figure, prompt, or error text) goes into the step-2 output area.
    """
    with out_step2:
        clear_output()
        print("🎨 Generating... please wait...")

        try:
            settings = {
                "hair": w_hair.value,
                "eyes": w_eyes.value,
                "skin": w_skin.value,
                "beard": w_beard.value,
                "gender": w_gender.value,
            }
            result, prompt = step2_render(**settings)

            # step2_render signals "nothing uploaded yet" with a None image.
            if result is None:
                print("❌ Error: No image loaded! Please upload one first.")
                return

            # Show Final
            plt.figure(figsize=(8, 8))
            plt.title("Final Result")
            plt.imshow(result)
            plt.axis("off")
            plt.show()
            print(f"✨ Prompt Used: {prompt}")

        except Exception as e:
            print(f"Error: {e}")

def on_click_reset(b):
    """Button callback: clear stored state and outputs, then start a new upload."""
    global CURRENT_SKETCH, CURRENT_CONTROL
    # Clear BOTH halves of the stored state. Resetting only the sketch would
    # leave the previous wireframe behind in CURRENT_CONTROL.
    CURRENT_SKETCH = None
    CURRENT_CONTROL = None
    out_step2.clear_output()
    out_step1.clear_output()

    # Trigger the upload function directly
    on_click_upload(b)

# --- Bind Events ---
# Wire each button to its callback.
for _button, _callback in (
    (btn_upload, on_click_upload),
    (btn_render, on_click_render),
    (btn_reset, on_click_reset),
):
    _button.on_click(_callback)

# --- Display Dashboard ---
# Vertical stack: header, step 1 controls, step 2 controls, then reset.
ui_box = widgets.VBox(children=[
    header,
    step1_label,
    btn_upload,
    out_step1,
    step2_label,
    widgets.HBox([w_gender, w_hair]),
    widgets.HBox([w_eyes, w_skin]),
    w_beard,
    btn_render,
    out_step2,
    widgets.HTML("<hr>"),
    btn_reset,
])

display(ui_box)

VBox(children=(HTML(value='<h2>Phase Two: Sketch Studio</h2>'), HTML(value='<b>Step 1: Input Analysis</b>'), B…

In [None]:

from transformers import CLIPProcessor, CLIPModel
from skimage.metrics import structural_similarity as ssim
import numpy as np
import pandas as pd
import torch

print("Loading Evaluation Models...")

# 1. Load CLIP (The judge of text accuracy)
# Placed on `device` (selected in the imports cell) instead of a hard-coded
# "cuda", so this cell does not crash on a CPU-only runtime.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def get_clip_score(image, prompt):
    """Score how well an image matches a text prompt using CLIP.

    Args:
        image: The image to score (callers in this notebook pass PIL images).
        prompt: The text to compare the image against.

    Returns:
        The single image-to-text logit as a Python float. Higher means a
        closer match. NOTE(review): this is CLIP's scaled similarity, so it is
        not strictly confined to 0-100 — confirm before treating it as a
        percentage.
    """
    # truncation=True keeps long generation prompts within the text encoder's
    # maximum sequence length instead of failing inside the model.
    inputs = clip_processor(
        text=[prompt],
        images=image,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(clip_model.device)  # follow the model's device rather than assuming "cuda"

    # Pure scoring: no gradients are needed.
    with torch.no_grad():
        outputs = clip_model(**inputs)

    return outputs.logits_per_image.item()

def get_structure_score(img1_pil, img2_pil):
    """Structural similarity (SSIM) between two images.

    Both images are converted to grayscale and resized to 512x512 before
    comparison, so this compares overall luminance structure, not extracted
    edges.

    Args:
        img1_pil: First PIL image.
        img2_pil: Second PIL image.

    Returns:
        The mean SSIM score; 1.0 means the two grayscale images are identical.
    """
    gray_a = np.array(img1_pil.convert("L").resize((512, 512)))
    gray_b = np.array(img2_pil.convert("L").resize((512, 512)))

    # Only the mean score is needed, so the per-pixel SSIM map (full=True) is
    # not requested. data_range is stated explicitly for the 8-bit arrays.
    return ssim(gray_a, gray_b, data_range=255)

print("✅ Scoring Engine Ready.")

Loading Evaluation Models...


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

✅ Scoring Engine Ready.


In [None]:

from google.colab import files
import io

print("📤 Upload Test Cases for Accuracy Report...")
uploaded = files.upload()

# One dict per graded image; consumed by the report cell.
report_data = []

if uploaded:
    print(f"\n📊 Starting Benchmark on {len(uploaded)} images...\n")

    for fname, payload in uploaded.items():
        # Load Input
        sketch = Image.open(io.BytesIO(payload)).convert("RGB")

        # 1. Generate (Using the Phase 2 Logic in 'Auto' mode)
        # The two-stage API replaces the old single-call helper, which no
        # longer exists in this notebook. step1_analyze stores the sketch and
        # its wireframe in the shared globals that step2_render reads, so the
        # dashboard state afterwards points at the last benchmarked image.
        step1_analyze(sketch)

        # We use a fixed seed for consistent testing
        result, prompt = step2_render(
            hair="Original/Auto",
            eyes="Original/Auto",
            skin="Original/Auto",
            beard="Original/Auto",
            gender="Auto-Detect",
            seed=1024,
        )

        # 2. Calculate Scores
        # Structural: compare the sketch to the generated photo
        struct_score = get_structure_score(sketch, result)

        # Textual: Compare Prompt to Photo content
        text_score = get_clip_score(result, prompt)

        # 3. Log Data
        report_data.append({
            "Filename": fname,
            "Structure (SSIM)": round(struct_score, 3), # Closer to 1.0 is better structure
            "Prompt Match (CLIP)": round(text_score, 2), # Higher is better content
            "Prompt Used": prompt[:50] + "..." # Truncate for display
        })

        print(f"   ✅ Graded {fname}: Struct={struct_score:.2f} | CLIP={text_score:.1f}")

    print("\n🏁 Benchmark Complete.")

📤 Upload Test Cases for Accuracy Report...


Saving download.png to download.png

📊 Starting Benchmark on 1 images...



NameError: name 'generate_interactive' is not defined

In [None]:
# Cell 11: Display Report Card
import pandas as pd
from IPython.display import display

# Create DataFrame
df = pd.DataFrame(report_data)

# Visual Settings: show the full text of every column
pd.set_option('display.max_colwidth', None)

print("=== 📊 ACCURACY REPORT ===")
print("1. Structure (SSIM): >0.30 is good for Sketch-to-Photo (it implies transformation happened).")
print("   If this is 1.0, the AI just copied the sketch exactly (bad).")
print("2. Prompt Match (CLIP): >25 is a strong match for this specific model type.\n")

if df.empty:
    # With no graded images the score columns do not exist, and looking them
    # up would raise KeyError. Say so instead.
    print("⚠️ No benchmark results to report. Run the benchmark cell with at least one image first.")
else:
    # Display Table
    display(df)

    # Calculate Averages
    avg_struct = df["Structure (SSIM)"].mean()
    avg_clip = df["Prompt Match (CLIP)"].mean()

    print(f"\n📈 AVERAGE SCORES:")
    print(f"   Structure Consistency: {avg_struct:.3f}")
    print(f"   Prompt Adherence:      {avg_clip:.2f}")

In [None]:
# Cell 13: Instant Accuracy Report Card
import pandas as pd
from IPython.display import display

def run_accuracy_check():
    """Re-render the current sketch and print/plot a structure + prompt score card.

    Uses the sketch loaded through the dashboard when there is one, otherwise
    falls back to 'test_sketch_davinci.jpg'. Dropdown values are taken from
    the dashboard widgets when they exist, otherwise the "auto" defaults are
    used. Returns None; all results are printed or plotted.
    """
    # 1. Grab the image data
    # If you used the dashboard, we grab that. If not, we load the Da Vinci file.
    try:
        if 'CURRENT_SKETCH' in globals() and CURRENT_SKETCH is not None:
            print("p Found loaded sketch from Dashboard...")
            target_sketch = CURRENT_SKETCH
        else:
            print("p No dashboard image found. Loading 'test_sketch_davinci.jpg'...")
            target_sketch = Image.open("test_sketch_davinci.jpg").convert("RGB")
    except Exception:
        # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit.
        print("❌ No image found. Please upload one first!")
        return

    # 2. Re-Generate the result to get the data for scoring
    # We grab the current settings from the widgets if they exist, otherwise default
    try:
        h_val = w_hair.value if 'w_hair' in globals() else "Original/Auto"
        e_val = w_eyes.value if 'w_eyes' in globals() else "Original/Auto"
        s_val = w_skin.value if 'w_skin' in globals() else "Original/Auto"
        b_val = w_beard.value if 'w_beard' in globals() else "Original/Auto"
        g_val = w_gender.value if 'w_gender' in globals() else "Auto-Detect"
    except Exception:
        h_val, e_val, s_val, b_val, g_val = "Original/Auto", "Original/Auto", "Original/Auto", "Original/Auto", "Auto-Detect"

    print(f"   ⚙️  Settings: Hair={h_val}, Eyes={e_val}, Beard={b_val}")
    print("   🤖 Generating & Scoring...")

    # The two-stage API replaces the old single-call helper, which no longer
    # exists in this notebook. step1_analyze (re)registers the sketch and its
    # wireframe in the shared globals that step2_render reads.
    step1_analyze(target_sketch)
    result, prompt = step2_render(
        hair=h_val,
        eyes=e_val,
        skin=s_val,
        beard=b_val,
        gender=g_val,
    )

    # 3. Calculate Scores
    # SSIM (Structural Similarity) - How much does it respect the lines?
    struct_score = get_structure_score(target_sketch, result)

    # CLIP (Semantic Similarity) - How much does it respect the prompt?
    text_score = get_clip_score(result, prompt)

    # 4. Generate Report
    print("\n" + "="*30)
    print(" 📋 ACCURACY REPORT CARD")
    print("="*30)

    # Visual Output
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 3, 1); plt.title("Reference Sketch"); plt.imshow(target_sketch); plt.axis("off")
    plt.subplot(1, 3, 2); plt.title("AI Interpretation"); plt.imshow(result); plt.axis("off")

    # Display Scores visually (third panel is text only)
    plt.subplot(1, 3, 3)
    plt.axis("off")
    plt.text(0.1, 0.8, f"Structure Match: {struct_score*100:.1f}%", fontsize=14, weight="bold")
    plt.text(0.1, 0.6, f"(Target: 35% - 55%)", fontsize=10, color="gray")

    plt.text(0.1, 0.4, f"Prompt Accuracy: {text_score:.1f}/100", fontsize=14, weight="bold")
    plt.text(0.1, 0.2, f"(Target: >30.0)", fontsize=10, color="gray")
    plt.show()

    # Interpretation
    print(f"ℹ️  ANALYSIS:")
    if struct_score < 0.25:
        print("⚠️  Structure Warning: The AI changed the face shape too much (Low Fidelity).")
    elif struct_score > 0.60:
        print("⚠️  Structure Warning: The image looks too much like a sketch (Low Realism).")
    else:
        print("✅ Structure Pass: Perfect balance between Sketch shape and Photo realism.")

    if text_score > 30:
        print("✅ Prompt Pass: The AI clearly understood the attributes (Hair/Eyes/etc).")
    else:
        print("⚠️  Prompt Warning: The attributes might be subtle or missing.")

# Run the report
run_accuracy_check()

p Found loaded sketch from Dashboard...
   ⚙️  Settings: Hair=Original/Auto, Eyes=Original/Auto, Beard=Original/Auto
   🤖 Generating & Scoring...


NameError: name 'generate_interactive' is not defined