In [1]:
!nvidia-smi

Sat Dec 13 15:36:11 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# @title Step 1: Setup Environment (Corrected)
# We install from PyPI (stable) instead of GitHub to avoid version conflicts
!pip install -q transformers bitsandbytes accelerate pillow pandas

from google.colab import drive
drive.mount('/content/drive')

print("Environment ready & Drive mounted.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Environment ready & Drive mounted.


In [7]:
# @title Step 2: Load Model (Robust Manual Method)
import torch
from transformers import BitsAndBytesConfig, AutoProcessor, LlavaForConditionalGeneration

# 1. Configure Quantization (Same as before)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

model_id = "llava-hf/llava-1.5-7b-hf"

print(f"Loading {model_id}...")

# Load Processor (Handles the text/image formatting)
processor = AutoProcessor.from_pretrained(model_id)

# Load Model (The brain)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto"
)

print("Model loaded successfully!")

Loading llava-hf/llava-1.5-7b-hf...


chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/686 [00:00<?, ?it/s]

Model loaded successfully!


In [8]:
# @title Step 3: Define Logic (Updated)
from PIL import Image
import os
import pandas as pd
from tqdm import tqdm

def run_llava(image, text_prompt, max_tokens=75):
    # LLaVA 1.5 strictly requires this format
    # We must explicitly include the <image> token
    full_prompt = f"USER: <image>\n{text_prompt}\nASSISTANT:"

    # Process inputs (convert image and text to numbers)
    inputs = processor(text=full_prompt, images=image, return_tensors="pt").to("cuda")

    # Generate output
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False  # Deterministic (Greedy decoding) for consistency
    )

    # Decode back to text
    generated_text = processor.decode(output_ids[0], skip_special_tokens=True)

    # Clean up the output (remove the prompt part)
    # The model sometimes repeats the prompt, so we split by 'ASSISTANT:'
    if "ASSISTANT:" in generated_text:
        return generated_text.split("ASSISTANT:")[-1].strip()
    return generated_text

def get_detailed_annotation(image):
    prompt = """You are an expert forensic artist. Analyze this image and provide a detailed description for exactly these attributes. Do not refuse.
1. Skin color and gender:
2. Overall facial appearance:
3. Hair (include color, root color, length, texture, thickness):
4. Forehead (include shape, size, features):
5. Hairline (include shape, receding):
6. Beard (include shape, thickness):
7. Eyes (include size, shape, color, glasses, eyelashes):
8. Eyebrows (include size, shape, color, thickness):
9. Nose (include shape, size, width):
10. Lips (size, color, upper vs lower):
11. Chin and jawline (shape, definition, double chin):
12. Ears (visible, shape):
13. Scars (location, appearance):

Return ONLY the attribute list."""
    return run_llava(image, prompt, max_tokens=75)

def compress_for_sd(detailed_text):
    # For compression, we pass a dummy black image because LLaVA expects image input
    dummy_image = Image.new('RGB', (224, 224), color='black')

    prompt = f"""You are a professional compression assistant.
Compress the following detailed face description into a single natural paragraph.
Constraint: Keep it under 75 words.
Focus on: Face shape, expression, hair, eyes, nose, lips, jawline.
Drop: Less critical details like ears or 'no scars'.

DESCRIPTION TO COMPRESS:
{detailed_text}"""
    return run_llava(dummy_image, prompt, max_tokens=150)

def process_batch(folder_path):
    results = []

    # Get all valid image files
    valid_extensions = ('.jpg', '.jpeg', '.png', '.webp')
    if not os.path.exists(folder_path):
        print(f"Error: Folder {folder_path} does not exist.")
        return pd.DataFrame()

    image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(valid_extensions)]
    print(f"Found {len(image_files)} images in {folder_path}")

    # Process loop
    for filename in tqdm(image_files, desc="Processing Images"):
        img_path = os.path.join(folder_path, filename)

        try:
            image = Image.open(img_path).convert("RGB")

            # Phase 1: Annotation
            annotation = get_detailed_annotation(image)

            # Phase 2: Compression
            final_prompt = compress_for_sd(annotation)

            results.append({
                "Filename": filename,
                "Detailed_Annotation": annotation,
                "Final_Prompt": final_prompt
            })

        except Exception as e:
            print(f"Error processing {filename}: {e}")

    return pd.DataFrame(results)

In [9]:
# @title Step 4: Run on your Folder
# REPLACE THIS PATH with your actual folder path
# Example: "/content/drive/MyDrive/test_faces"
folder_path = "/content/drive/MyDrive/faces_dataset"

# Check if folder exists
if os.path.exists(folder_path):
    # Run the batch processing
    df_results = process_batch(folder_path)

    # Display the results
    from IPython.display import display
    print("\nProcessing Complete! Here are the results:")
    display(df_results)

    # Save to CSV
    save_path = os.path.join(folder_path, "results.csv")
    df_results.to_csv(save_path, index=False)
    print(f"\nResults saved to: {save_path}")
else:
    print(f"Error: Folder not found at {folder_path}. Please check the path.")

Found 8 images in /content/drive/MyDrive/faces_dataset


Processing Images: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [03:20<00:00, 25.05s/it]


Processing Complete! Here are the results:





Unnamed: 0,Filename,Detailed_Annotation,Final_Prompt
0,A20097.jpg,1. Skin color and gender: White male\n2. Overa...,"The face of an aged, wrinkled man with a reced..."
1,A15702.jpg,1. Skin color and gender: White male\n2. Overa...,"The face of an aged, wrinkled man with a reced..."
2,A15856.jpg,1. Skin color and gender: White male\n2. Overa...,The face of a balding white male with a recedi...
3,A50321.jpg,1. Skin color and gender: White male\n2. Overa...,"Face shape, expression, hair, eyes, nose, lips..."
4,A51233.jpg,1. Skin color and gender: White male\n2. Overa...,"Face shape, expression, hair, eyes, nose, lips..."
5,A15380.jpg,1. Skin color and gender: White male\n2. Overa...,"The face of an aged, wrinkled man with a reced..."
6,A60166.jpg,1. Skin color and gender: White male\n2. Overa...,"The face of an aged, wrinkled man with a reced..."
7,A10037.jpg,1. Skin color and gender: White male\n2. Overa...,"The man has a beard and mustache, and his skin..."



Results saved to: /content/drive/MyDrive/faces_dataset/results.csv


In [14]:
# @title Step 6: Save Results to Word Document (.docx)
# 1. Install the library
!pip install -q python-docx

import pandas as pd
import os
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH

# 2. Setup Paths (Using your existing folder_path)
csv_path = os.path.join(folder_path, "results.csv")
word_output_path = os.path.join(folder_path, "VLM_Pipeline_Results.docx")

if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
    print(f"‚úÖ Loaded data for {len(df)} entries.")

    # 3. Create the Word Document
    doc = Document()

    # Add a Title
    title = doc.add_heading('VLM Pipeline Results Report', 0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    doc.add_paragraph(f"Total Images Processed: {len(df)}")
    doc.add_paragraph("-" * 50)

    # 4. Iterate and Add Content
    for idx, row in df.iterrows():
        img_filename = row['Filename']
        img_path = os.path.join(folder_path, img_filename)

        # --- Header ---
        header = doc.add_heading(f"Image: {img_filename}", level=1)

        # --- Image ---
        if os.path.exists(img_path):
            try:
                # Add image and resize to 3 inches width (fits nicely on page)
                doc.add_picture(img_path, width=Inches(3.0))
                last_paragraph = doc.paragraphs[-1]
                last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
            except Exception as e:
                doc.add_paragraph(f"[Error loading image: {e}]")
        else:
            doc.add_paragraph("[Image file not found]")

        # --- Phase 1: Annotation ---
        doc.add_heading('Phase 1: Detailed Annotation', level=2)
        p_anno = doc.add_paragraph(str(row['Detailed_Annotation']))
        p_anno.style = 'Quote' # Makes it look distinct

        # --- Phase 2: Compressed Prompt ---
        doc.add_heading('Phase 2: Compressed Prompt (SDXL)', level=2)
        p_prompt = doc.add_paragraph(str(row['Final_Prompt']))

        # Calculate Word Count
        word_count = len(str(row['Final_Prompt']).split())

        # Add Stats
        stats = doc.add_paragraph()
        run = stats.add_run(f"Word Count: {word_count} words")
        run.bold = True
        run.font.color.rgb = RGBColor(0, 100, 0) # Dark Green

        # --- Separator ---
        doc.add_page_break()
        print(f"Added entry for {img_filename}")

    # 5. Save the Document
    doc.save(word_output_path)
    print(f"\nüéâ Success! Word Document saved at:\n{word_output_path}")

else:
    print("‚ùå results.csv not found. Please run the previous processing steps first.")

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m[90m‚îÅ[0m [32m245.8/253.0 kB[0m [31m10.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m253.0/253.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h‚úÖ Loaded data for 8 entries.
Added entry for A20097.jpg
Added entry for A15702.jpg
Added entry for A15856.jpg
Added entry for A50321.jpg
Added entry for A51233.jpg
Added entry for A15380.jpg
Added entry for A60166.jpg
Added entry for A10037.jpg

üéâ Success! Word Document saved at:
/content/drive/MyDrive/faces_dataset/VLM_Pipeline_Results.docx
