In [None]:
import random
import os
import pandas as pd
from tqdm import tqdm

In [None]:

# Dermoscopic feature vocabularies, one table per lesion class. They live at
# module level so the per-prompt helpers below do not rebuild them on every call.
_BENIGN_FEATURES = {
    "asymmetry": ["fully symmetrical", "symmetric in one axis"],
    "pigment_network": ["typical pigment network"],
    "dots_globules": ["no dots/globules", "typical dots/globules"],
    "streaks": ["no streaks"],
    "regression": ["no regression"],
    "blueveil": ["no blue-whitish veil"],
}
_MALIGNANT_FEATURES = {
    "asymmetry": ["asymmetrical", "fully asymmetrical"],
    "pigment_network": ["atypical pigment network", "irregular pigment network"],
    "dots_globules": ["atypical dots/globules", "irregular dots/globules"],
    "streaks": ["streaks present", "irregular streaks"],
    "regression": ["regression areas"],
    "blueveil": ["blue-whitish veil"],
}

# Colour palettes with their sampling weights (same order as the palettes).
_BENIGN_COLORS = [
    ["light brown"],
    ["dark brown"],
    ["tan"],
    ["light brown", "dark brown"],
]
_BENIGN_COLOR_WEIGHTS = [0.4, 0.3, 0.2, 0.1]
_MALIGNANT_COLORS = [
    ["dark brown", "black"],
    ["dark brown", "blue gray"],
    ["dark brown", "black", "red"],
    ["dark brown", "black", "blue gray"],
    ["black", "blue gray", "red"],
]
_MALIGNANT_COLOR_WEIGHTS = [0.25, 0.25, 0.2, 0.2, 0.1]

_PROMPT_SUFFIX = "high-resolution clinical photograph at 20x magnification"
_RECORD_COLUMNS = ["id", "type", "subtype", "prompt"]


def _benign_record(index, rng):
    """Build the record dict (id, type, subtype, prompt) for one benign prompt."""
    subtype = rng.choices(["common nevus", "atypical nevus"], weights=[0.7, 0.3])[0]

    # Single-option features are still drawn through the RNG so the random
    # stream is consumed in a fixed, predictable order.
    asymmetry = rng.choice(_BENIGN_FEATURES["asymmetry"])
    pigment_network = rng.choice(_BENIGN_FEATURES["pigment_network"])
    dots_globules = rng.choice(_BENIGN_FEATURES["dots_globules"])
    streaks = rng.choice(_BENIGN_FEATURES["streaks"])
    regression = rng.choice(_BENIGN_FEATURES["regression"])
    blueveil = rng.choice(_BENIGN_FEATURES["blueveil"])

    colors = rng.choices(_BENIGN_COLORS, weights=_BENIGN_COLOR_WEIGHTS)[0]
    color_desc = f"uniform {colors[0]}" if len(colors) == 1 else " and ".join(colors)

    prompt = (
        f"SKINCANCER: BENIGN dermatoscopic image of {subtype}, {asymmetry}, {color_desc}, "
        f"{pigment_network}, {dots_globules}, {streaks}, {regression}, {blueveil}, "
        f"{_PROMPT_SUFFIX}"
    )
    return {
        "id": f"benign_{index:05d}",
        "type": "benign",
        "subtype": subtype,
        "prompt": prompt,
    }


def _malignant_feature(name, probability, rng):
    """Pick the malignant variant of `name` with `probability`, else the benign one."""
    # The probability draw happens before the choice draw, in that order.
    if rng.random() < probability:
        return rng.choice(_MALIGNANT_FEATURES[name])
    return rng.choice(_BENIGN_FEATURES[name])


def _malignant_record(index, rng):
    """Build the record dict (id, type, subtype, prompt) for one malignant prompt."""
    subtype = rng.choices(
        ["melanoma", "nodular melanoma", "lentigo maligna melanoma"],
        weights=[0.6, 0.3, 0.1],
    )[0]

    asymmetry = rng.choice(_MALIGNANT_FEATURES["asymmetry"])
    pigment_network = rng.choice(_MALIGNANT_FEATURES["pigment_network"])
    dots_globules = rng.choice(_MALIGNANT_FEATURES["dots_globules"])
    # These three features only show their malignant form part of the time.
    streaks = _malignant_feature("streaks", 0.7, rng)
    regression = _malignant_feature("regression", 0.6, rng)
    blueveil = _malignant_feature("blueveil", 0.7, rng)

    colors = rng.choices(_MALIGNANT_COLORS, weights=_MALIGNANT_COLOR_WEIGHTS)[0]
    color_desc = f"variegated {', '.join(colors)}"

    # The random draw is made for every subtype; the mention itself is only
    # added for plain "melanoma".
    mention_abcde = rng.random() < 0.4 and subtype == "melanoma"

    parts = [f"SKINCANCER: MALIGNANT dermatoscopic image of {subtype}"]
    if mention_abcde:
        parts.append("ABCDE criteria")
    parts.extend([
        asymmetry,
        color_desc,
        pigment_network,
        dots_globules,
        streaks,
        regression,
        blueveil,
        _PROMPT_SUFFIX,
    ])

    return {
        "id": f"malignant_{index:05d}",
        "type": "malignant",
        "subtype": subtype,
        "prompt": ", ".join(parts),
    }


def generate_inference_prompts(num_prompts=10000, malignant_percentage=50, output_dir="inference_prompts", seed=None):
    """
    Generate short inference prompts for skin cancer image generation with a configurable
    ratio of malignant to benign lesions and a 'SKINCANCER:' prefix.

    The prompts are shuffled, written to one CSV file and to one ``<id>.txt`` file
    per prompt inside ``output_dir``, and summary statistics are printed.

    Args:
        num_prompts (int): Total number of prompts to generate (must be >= 0)
        malignant_percentage (int): Percentage of malignant lesions (0-100)
        output_dir (str): Directory to save the prompts (created if missing)
        seed (int | None): Optional seed for reproducible output. When None, the
            global ``random`` state and pandas' default sampling state are used.

    Returns:
        pandas.DataFrame: Shuffled prompts with columns id, type, subtype, prompt.

    Raises:
        ValueError: If ``num_prompts`` is negative or ``malignant_percentage``
            is outside the range 0-100.
    """
    if num_prompts < 0:
        raise ValueError(f"num_prompts must be non-negative, got {num_prompts}")
    if not 0 <= malignant_percentage <= 100:
        raise ValueError(
            f"malignant_percentage must be between 0 and 100, got {malignant_percentage}"
        )

    # Multiply before dividing: int(100 * (29 / 100)) evaluates to 28 because of
    # floating-point rounding, whereas (100 * 29) // 100 is exactly 29.
    num_malignant = int(num_prompts * malignant_percentage // 100)
    num_benign = num_prompts - num_malignant

    print(f"Generating {num_malignant} malignant and {num_benign} benign prompts")

    os.makedirs(output_dir, exist_ok=True)

    # Without a seed, fall back to the module-level generator so that a prior
    # random.seed(...) made by the caller is still honoured.
    rng = random if seed is None else random.Random(seed)

    all_prompts = [
        _benign_record(index, rng)
        for index in tqdm(range(1, num_benign + 1), desc="Generating benign prompts")
    ]
    all_prompts.extend(
        _malignant_record(index, rng)
        for index in tqdm(range(1, num_malignant + 1), desc="Generating malignant prompts")
    )

    # Explicit columns keep the frame well-formed even when no prompt was generated.
    df = pd.DataFrame(all_prompts, columns=_RECORD_COLUMNS)
    if not df.empty:
        df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    csv_path = os.path.join(output_dir, f"inference_prompts_{num_prompts}_{malignant_percentage}pct_malignant.csv")
    df.to_csv(csv_path, index=False)

    # Zipping the two columns avoids building a Series per row as iterrows() does.
    for prompt_id, prompt_text in tqdm(
        zip(df["id"], df["prompt"]), total=len(df), desc="Saving individual prompt files"
    ):
        prompt_file = os.path.join(output_dir, f"{prompt_id}.txt")
        with open(prompt_file, "w", encoding="utf-8") as f:
            f.write(prompt_text)

    print(f"Generated {len(df)} prompts")
    prompt_lengths = [len(text) for text in df["prompt"]]
    if prompt_lengths:  # length statistics are undefined for an empty set
        avg_length = sum(prompt_lengths) / len(prompt_lengths)
        print(f"Average prompt length: {avg_length:.1f} characters")
        print(f"Minimum prompt length: {min(prompt_lengths)} characters")
        print(f"Maximum prompt length: {max(prompt_lengths)} characters")
    print(f"CSV file saved to: {csv_path}")
    print(f"Individual prompt files saved to: {output_dir}")

    return df



In [None]:
if __name__ == "__main__":
    # Run configuration: total prompt count and the malignant share in percent.
    num_prompts, malignant_percentage = 10000, 50
    # Destination folder; presumably a mounted Google Drive location — confirm
    # the mount exists before running.
    output_dir = "/content/drive/MyDrive/Datasets/inference_prompts"

    # Arguments are passed in signature order: count, percentage, directory.
    # The returned DataFrame is not kept here.
    generate_inference_prompts(num_prompts, malignant_percentage, output_dir)


Generating 5000 malignant and 5000 benign prompts


Generating benign prompts: 100%|██████████| 5000/5000 [00:00<00:00, 67998.17it/s]
Generating malignant prompts: 100%|██████████| 5000/5000 [00:00<00:00, 67048.57it/s]
Saving individual prompt files: 100%|██████████| 10000/10000 [02:04<00:00, 80.26it/s]

Generated 10000 prompts
Average prompt length: 261.2 characters
Minimum prompt length: 236 characters
Maximum prompt length: 299 characters
CSV file saved to: /content/drive/MyDrive/Datasets/inference_prompts/inference_prompts_10000_50pct_malignant.csv
Individual prompt files saved to: /content/drive/MyDrive/Datasets/inference_prompts



