In [1]:
print("--- Starting Captioned LoRA Training (Report Methodology) ---")

# --- 1. Library Installations ---
print("\n--- Installing/Updating necessary libraries ---")
!pip install -q datasets diffusers transformers accelerate bitsandbytes xformers peft
!pip install -q Pillow transformers accelerate bitsandbytes xformers peft
!pip uninstall -y -q diffusers
!pip install -q git+https://github.com/huggingface/diffusers
print("--- Library installation complete ---")

--- Starting Captioned LoRA Training (Report Methodology) ---

--- Installing/Updating necessary libraries ---
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/122.9 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m954.8/954.8 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:

# --- Download Text-to-Image LoRA Script ---
# The report approach requires the text_to_image script.
import os
import urllib.request

print("\n--- Downloading Text-to-Image LoRA training script ---")
script_url = "https://raw.githubusercontent.com/huggingface/diffusers/main/examples/text_to_image/train_text_to_image_lora.py"
script_name = "train_text_to_image_lora.py"

# urlretrieve raises on HTTP/network errors, so a failed download can no longer
# be reported as a success (the previous `wget -q` call was silent on failure).
urllib.request.urlretrieve(script_url, script_name)
if os.path.getsize(script_name) == 0:
    raise RuntimeError(f"Downloaded script '{script_name}' is empty; check {script_url}")
print(f"Script '{script_name}' downloaded.")



--- Downloading Text-to-Image LoRA training script ---
Script 'train_text_to_image_lora.py' downloaded.


In [14]:
import os
import json
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from tqdm import tqdm

# --- 1. Configurations ---
# Captioning checkpoint plus the dataset folders read from / written to below.
model_id = "Salesforce/blip-image-captioning-large"
images_dir = "/content/lora_dataset/images"
captions_dir = "/content/lora_dataset/captions"
os.makedirs(captions_dir, exist_ok=True)  # no error if the folder already exists

# --- 2. Initialize BLIP Model and Processor ---
print(f"--- Initializing {model_id} ---")
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor prepares the inputs; the model is moved to the selected device
# and switched to evaluation mode before use.
processor = BlipProcessor.from_pretrained(model_id)
model = BlipForConditionalGeneration.from_pretrained(model_id)
model = model.to(device)
model.eval()

# --- 3. Process Images ---
# Collect the image files (case-insensitive extension match). Sorting makes the
# processing order reproducible; os.listdir returns entries in arbitrary order.
image_files = sorted(
    name for name in os.listdir(images_dir)
    if name.lower().endswith(('.png', '.jpg', '.jpeg'))
)
print(f"--- Generating captions for {len(image_files)} images ---")

for img_name in tqdm(image_files, desc="Processing"):
    img_path = os.path.join(images_dir, img_name)
    base_name = os.path.splitext(img_name)[0]
    # One caption file per image, named after the image's base name.
    output_txt_path = os.path.join(captions_dir, f"{base_name}.txt")

    try:
        # Context manager closes the underlying file handle as soon as the
        # RGB copy has been made, instead of leaving it open until GC.
        with Image.open(img_path) as img_handle:
            raw_image = img_handle.convert("RGB")

        # BLIP Inference: Unconditional captioning (no text prompt is passed)
        inputs = processor(images=raw_image, return_tensors="pt").to(device)

        # Beam-search generation, capped at 50 new tokens
        out = model.generate(
            **inputs,
            max_new_tokens=50,
            num_beams=5,
            early_stopping=True
        )

        caption = processor.decode(out[0], skip_special_tokens=True)

        # Save to individual .txt file (explicit encoding, independent of locale)
        with open(output_txt_path, "w", encoding="utf-8") as f:
            f.write(caption.strip())

    except Exception as e:
        # Best effort: report the failing image and continue with the rest.
        print(f" - Error processing {img_name}: {e}")

# --- 4. Generate metadata.jsonl for LoRA Training ---
# Each line of metadata.jsonl is a JSON object with the image path (relative to
# the dataset root) and its caption text.
dataset_root = "/content/lora_dataset"
image_extensions = ('.png', '.jpg', '.jpeg')

metadata = []
for img_file in sorted(os.listdir(images_dir)):
    # Only real image files may become training entries; without this check any
    # other file sharing a base name with a caption would be listed as an image.
    if not img_file.lower().endswith(image_extensions):
        continue
    base = os.path.splitext(img_file)[0]
    txt_path = os.path.join(captions_dir, f"{base}.txt")
    # Images without a caption file (e.g. captioning failed) are skipped.
    if not os.path.exists(txt_path):
        continue
    with open(txt_path, "r", encoding="utf-8") as f:
        caption_text = f.read().strip()
    metadata.append({
        "file_name": f"images/{img_file}",
        "text": caption_text
    })

if not metadata:
    print("Warning: no image/caption pairs found; metadata.jsonl will be empty.")

with open(os.path.join(dataset_root, "metadata.jsonl"), "w", encoding="utf-8") as f:
    for entry in metadata:
        f.write(json.dumps(entry) + "\n")

print(f"\n--- Process Complete: metadata.jsonl generated in {dataset_root} ---")

--- Initializing Salesforce/blip-image-captioning-large ---


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

--- Generating captions for 12 images ---


Processing: 100%|██████████| 12/12 [00:06<00:00,  1.81it/s]


--- Process Complete: metadata.jsonl generated in /content/lora_dataset ---





In [None]:
# --- Create metadata.jsonl ---
# Requirement for the HuggingFace 'train_text_to_image_lora.py' script
# NOTE: this cell defines none of `os`, `json`, `images_dir`, `captions_dir` or
# `dataset_root`; it relies on the captioning cell having been run first, and
# it rewrites the same metadata.jsonl that cell produces.
metadata = []
for img_file in sorted(os.listdir(images_dir)):
    # Skip anything that is not an image so stray files that happen to share a
    # base name with a caption are not registered as training samples.
    if not img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue

    base = os.path.splitext(img_file)[0]
    txt_file = f"{base}.txt"

    txt_path = os.path.join(captions_dir, txt_file)
    if os.path.exists(txt_path):
        with open(txt_path, "r", encoding="utf-8") as f:
            saved_caption = f.read().strip()

        # Note: file_name should be relative to where metadata.jsonl is (the root)
        metadata.append({"file_name": f"images/{img_file}", "text": saved_caption})

if not metadata:
    print("Warning: no image/caption pairs found; metadata.jsonl will be empty.")

with open(os.path.join(dataset_root, "metadata.jsonl"), "w", encoding="utf-8") as f:
    for entry in metadata:
        f.write(json.dumps(entry) + "\n")

print(f"--- Dataset structured and metadata.jsonl generated in {dataset_root} ---")

--- Dataset structured and metadata.jsonl generated in /content/lora_dataset ---


In [16]:

# --- 5. Training Parameters (Aligned with Report) ---
print("\n--- Defining LoRA Training Parameters ---")
pretrained_model = "runwayml/stable-diffusion-v1-5"
output_dir = "lora_face_model"


# --- LoRA Hyperparameters (Report Aligned) ---
training_config = {
    "model_id": "runwayml/stable-diffusion-v1-5", #
    "dataset_path": "/content/lora_dataset",      # [cite: 219]
    "output_name": "lora_face_model",
    "resolution": 512,                            #
    "batch_size": 1,                              # Optimized for memory
    "gradient_accumulation": 4,                   # Effective batch of 4
    "learning_rate": 1e-4,                        # Typical for LoRA [cite: 434]
    "max_steps": 1000,                            # [cite: 220]
    "precision": "fp16",                          # Essential for 8GB VRAM
    "checkpoint_freq": 500,
    "seed": 42
}



# --- Build the Acceleration Command ---
accelerate_command = (
    f"accelerate launch train_text_to_image_lora.py "
    f"--pretrained_model_name_or_path='{training_config['model_id']}' "
    f"--train_data_dir='{training_config['dataset_path']}' "
    f"--caption_column='text' "
    f"--resolution={training_config['resolution']} "
    f"--random_flip "
    f"--train_batch_size={training_config['batch_size']} "
    f"--gradient_accumulation_steps={training_config['gradient_accumulation']} "
    f"--gradient_checkpointing "           # Critical for 8GB VRAM
    f"--use_8bit_adam "                    # Required for memory efficiency
    f"--max_train_steps={training_config['max_steps']} "
    f"--learning_rate={training_config['learning_rate']} "
    f"--lr_scheduler='constant' "
    f"--lr_warmup_steps=0 "
    f"--seed={training_config['seed']} "
    f"--output_dir='{training_config['output_name']}' "
    f"--mixed_precision='{training_config['precision']}' "
    f"--enable_xformers_memory_efficient_attention "
    f"--checkpointing_steps={training_config['checkpoint_freq']} "
)

print(f"--- Launching Training: {training_config['model_id']} ---")
!{accelerate_command}



--- Defining LoRA Training Parameters ---
--- Launching Training: runwayml/stable-diffusion-v1-5 ---
The following values were not passed to `accelerate launch` and had defaults used instead:
	`--num_processes` was set to a value of `1`
	`--num_machines` was set to a value of `1`
	`--mixed_precision` was set to a value of `'no'`
	`--dynamo_backend` was set to a value of `'no'`
2025-12-19 06:12:20.495595: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766124740.537027   16528 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766124740.554083   16528 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766124740.591972   16528 computation_placer

In [None]:
# --- 7. Inference ---
!pip install compel
print("\n--- Starting Inference with Trained LoRA ---")
from diffusers import StableDiffusionPipeline


pipe = StableDiffusionPipeline.from_pretrained(pretrained_model, torch_dtype=torch.float16).to("cuda")
lora_weight_path = os.path.join(output_dir, "pytorch_lora_weights.safetensors")

if os.path.exists(lora_weight_path):
    pipe.load_lora_weights(lora_weight_path)
    print("LoRA weights loaded.")



In [None]:
# Quick smoke test of the pipeline with a short prompt (30 denoising steps).
test_prompt = "a detailed mugshot photo of a person with dark hair and a serious expression"
image = pipe(test_prompt, num_inference_steps=30).images[0]
image.save("test_output.png")
# Image.show() hands the picture to whatever viewer Pillow can find on the
# host; ending the cell with the image itself renders it inline in the notebook.
image

In [None]:
from compel import Compel

# Detailed mugshot prompt for Indian prisoner (150+ tokens)
prompt = """Professional police mugshot photograph of an Indian male prisoner, front-facing neutral expression,
medium brown skin tone, age 28-35 years old, oval face shape with defined cheekbones and strong jawline.
Short cropped black hair, slightly messy, receding hairline at temples. Thick dark eyebrows, slightly arched.
Deep-set dark brown eyes with tired expression, slight dark circles underneath, medium-length eyelashes.
Straight nose with slightly wide nostrils, average bridge height. Full lips with defined cupid's bow,
slight stubble beard growth around chin and jawline. Slight mustache visible above upper lip. Small scar
on left cheek near ear. Visible tattoo on neck showing traditional Indian script. Wearing orange prison
jumpsuit with visible collar, prisoner identification number visible on chest. Plain grey concrete wall background,
harsh fluorescent lighting from above creating slight shadows under nose and chin. Standard police mugshot composition,
centered framing, eye-level camera angle, official documentation style, high detail, sharp focus on facial features,
realistic skin texture with visible pores, professional law enforcement photography, neutral color grading,
ID board visible at bottom with prisoner number and date, serious demeanor, direct eye contact with camera,
photorealistic, documentary photography style, 8k resolution, ultra detailed."""

# Generate the image
# Compel truncates prompts to the text encoder's token window by default, which
# would discard most of this prompt. With truncation disabled the prompt is
# split into chunks and padded instead, so the full text is encoded.
compel = Compel(
    tokenizer=pipe.tokenizer,
    text_encoder=pipe.text_encoder,
    truncate_long_prompts=False,
)
conditioning = compel.build_conditioning_tensor(prompt)
# The positive embedding is now longer than a default negative embedding, so an
# explicit (empty) negative prompt is encoded and both are padded to equal
# length before being passed to the pipeline.
negative_conditioning = compel.build_conditioning_tensor("")
[conditioning, negative_conditioning] = compel.pad_conditioning_tensors_to_same_length(
    [conditioning, negative_conditioning]
)
image = pipe(
    prompt_embeds=conditioning,
    negative_prompt_embeds=negative_conditioning,
).images[0]
image.save("indian_prisoner_mugshot.png")

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
import os
import shutil

# --- 1. Define Paths ---
# Training wrote its results into 'lora_face_model' (the 'output_dir' of the
# training parameters); the weights are copied from there to a fixed location.
source_path = "lora_face_model/pytorch_lora_weights.safetensors"
save_destination = "/content/final_lora_weights.safetensors"

# --- 2. Save the Weights ---
# Report the missing-file case first, otherwise copy and confirm.
if not os.path.exists(source_path):
    print("Error: Training output not found. Ensure training completed successfully.")
else:
    shutil.copy(source_path, save_destination)
    print(f"--- LoRA weights saved to: {save_destination} ---")

# --- (Optional) Zip the entire output folder ---
# This includes logs and checkpoints if you want to resume training later
# shutil.make_archive("lora_model_full_backup", 'zip', "lora_face_model")
# print("--- Full model directory zipped as lora_model_full_backup.zip ---")

In [18]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
