### Recognition Model: Data Labelling with Gemma3

In [1]:
import base64
import ollama
import re
import json
import os
import pandas as pd
from datetime import datetime
from tqdm import tqdm

# ---------- config ----------
# Folder that is scanned for input images, and folder that receives every output file.
IMG_DIR = r"D:\Mine\AI\BogoBeauty\img_align_celeba"
OUT_DIR = r"D:\Mine\AI\BogoBeauty"

# Timestamped snapshot pair, written once at the very end of a run.
# The timestamp is taken when this cell is evaluated.
_RUN_STAMP = datetime.now().strftime('%Y%m%d_%H%M%S')
OUT_XLSX = os.path.join(OUT_DIR, f"celeba_features_{_RUN_STAMP}.xlsx")
OUT_CSV = OUT_XLSX.replace(".xlsx", ".csv")

# Fixed-name checkpoint pair: overwritten by periodic saves and read back on resume.
CHECKPOINT_XLSX = os.path.join(OUT_DIR, "celeba_features.xlsx")
CHECKPOINT_CSV = os.path.join(OUT_DIR, "celeba_features.csv")

# Model tag passed to ollama for every request.
MODEL_NAME = "gemma3:latest"

# JSON skeleton shown to the model; the parsing code reads back the "caption"
# key and the keys under "features".
outputTemplate = """
    {
        "caption": "{caption}",
        "features": {
            "facing_direction": "front OR side or back or 3/4",
            "eye_color": "eye_color",
            "hair_color": "hair_color",
            "skin_tone": "skin_tone",
            "eyebrow_color": "eyebrow_color"
        }
    }
"""

# Instruction text sent with every image. Assembled line by line so the
# trailing space and the newlines are explicit.
prompt = (
    "\n"
    "You are a facial attribute feature extraction model. \n"
    "Generate me the features of the image of the person that I am about to give you in a json block:\n"
    f"{outputTemplate}\n"
    "Do not give me any other text, just the json block.\n"
)

# ---------- helpers ----------
# Captures the {...} payload of a ```json fenced block; DOTALL lets the
# payload span several lines.
fenced_json_re = re.compile(
    r"```json\s*(\{.*\})\s*```",
    flags=re.DOTALL,
)

# Column names, in output order, shared by the checkpoint and snapshot tables.
COLUMNS = [
    "file_name",
    "caption",
    "facing_direction",
    "eye_color",
    "hair_color",
    "eyebrow_color",
    "skin_tone",
]

# Matches a Markdown code fence with an optional, case-insensitive "json" tag
# and captures the {...} payload; DOTALL lets the payload span several lines.
_ANY_FENCE_RE = re.compile(r"```(?:json)?\s*(\{.*\})\s*```", re.DOTALL | re.IGNORECASE)


def extract_json_str(s: str) -> str:
    """Return the JSON text contained in a model reply.

    Handles, in order of preference:
      1. a fenced block (```json ... ``` or a bare ``` ... ``` fence);
      2. a JSON object surrounded by extra prose, by slicing from the first
         "{" to the last "}";
      3. anything else, returned stripped of surrounding whitespace.

    Args:
        s: Raw text returned by the model.

    Returns:
        The best-effort JSON substring. It is not validated here.
    """
    fenced = _ANY_FENCE_RE.search(s)
    if fenced:
        return fenced.group(1)

    text = s.strip()
    # A reply that is already a top-level array is passed through untouched so
    # the brace slicing below cannot cut it apart.
    if text.startswith("["):
        return text

    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        return text[start:end + 1]
    return text

def try_json_loads(s: str):
    """Decode *s* as JSON, retrying once with trailing commas removed.

    The first attempt parses the text unchanged. If that fails, every comma
    that directly precedes a closing "}" or "]" (optionally separated by
    whitespace) is dropped and the text is parsed again. A failure of the
    second attempt propagates as json.JSONDecodeError.
    """
    try:
        parsed = json.loads(s)
    except json.JSONDecodeError:
        without_trailing_commas = re.sub(r",\s*([}\]])", r"\1", s)
        parsed = json.loads(without_trailing_commas)
    return parsed

def load_checkpoint():
    """Load the existing checkpoint table, or an empty one if none exists.

    The XLSX checkpoint is preferred; the CSV checkpoint is used only when the
    XLSX file is absent. The result always has exactly the columns in COLUMNS,
    in that order; columns missing from the file are filled with "".

    Returns:
        pandas.DataFrame whose "file_name" column holds strings, so it can be
        used for set-membership checks against image file names.
    """
    # Read every cell as text and switch off pandas' default NA inference.
    # Otherwise stored answers such as "N/A", "None" or "null" would come back
    # as NaN on resume and be written out again as empty cells.
    read_opts = {"dtype": str, "keep_default_na": False}

    if os.path.exists(CHECKPOINT_XLSX):
        df = pd.read_excel(CHECKPOINT_XLSX, **read_opts)
    elif os.path.exists(CHECKPOINT_CSV):
        df = pd.read_csv(CHECKPOINT_CSV, **read_opts)
    else:
        df = pd.DataFrame(columns=COLUMNS)

    # Normalize columns/order in one step. reindex returns a new frame, so the
    # assignment below does not write into a slice of the loaded data.
    df = df.reindex(columns=COLUMNS, fill_value="")

    # Coerce file_name to string for set membership.
    df["file_name"] = df["file_name"].astype(str)
    return df

def save_checkpoint(df: pd.DataFrame):
    """Write *df* to the XLSX and CSV checkpoint files.

    Rows are de-duplicated on "file_name" (first occurrence wins). Both
    formats are first written to temporary files next to their targets and
    only then moved into place with os.replace, so each checkpoint file is
    swapped in a single step rather than rewritten in place.
    """
    deduped = df.drop_duplicates(subset=["file_name"], keep="first")

    # The temporary Excel file keeps an .xlsx suffix; the engine is named
    # explicitly as well.
    xlsx_staging = CHECKPOINT_XLSX + ".tmp.xlsx"
    csv_staging = CHECKPOINT_CSV + ".tmp"

    deduped.to_excel(xlsx_staging, index=False, engine="openpyxl")
    deduped.to_csv(csv_staging, index=False, encoding="utf-8")

    # Both temporary files exist at this point; move them over the checkpoints.
    for staged, target in (
        (xlsx_staging, CHECKPOINT_XLSX),
        (csv_staging, CHECKPOINT_CSV),
    ):
        os.replace(staged, target)

# ---------- gather files ----------
# Sorted so every run walks the directory in the same order; os.listdir
# returns entries in arbitrary order.
paths = [
    os.path.join(IMG_DIR, name)
    for name in sorted(os.listdir(IMG_DIR))
    if os.path.isfile(os.path.join(IMG_DIR, name))
]
print(len(paths), "images found in directory")

# ---------- resume state ----------
df_ckpt = load_checkpoint()
processed_files = set(df_ckpt["file_name"].tolist())

rows = df_ckpt.to_dict(orient="records")  # start with existing rows
processed_since_save = 0
SAVE_EVERY = 15  # write a checkpoint after this many newly labelled images


def _label_image(path: str) -> dict:
    """Send one image to the model and return its row keyed by COLUMNS.

    Raises:
        OSError: the image file cannot be read.
        ValueError: the reply is not valid JSON (json.JSONDecodeError is a
            ValueError), or is not a JSON object, or its "features" entry is
            not a JSON object.
        Exception: whatever ollama.generate raises when the request fails.
    """
    with open(path, "rb") as img_file:
        img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

    response = ollama.generate(
        model=MODEL_NAME,
        prompt=prompt,
        images=[img_base64],
        options={"temperature": 0.05},
    )

    raw = response.get('response', '').strip()
    data = try_json_loads(extract_json_str(raw))

    # Reject malformed replies with an explicit message instead of letting a
    # later .get() fail with an opaque AttributeError.
    if not isinstance(data, dict):
        raise ValueError(f"model reply is not a JSON object (got {type(data).__name__})")
    features = data.get("features", {})
    if not isinstance(features, dict):
        raise ValueError(f"'features' is not a JSON object (got {type(features).__name__})")

    return {
        "file_name": os.path.basename(path),
        "caption": data.get("caption", ""),
        "facing_direction": features.get("facing_direction", ""),
        "eye_color": features.get("eye_color", ""),
        "hair_color": features.get("hair_color", ""),
        "eyebrow_color": features.get("eyebrow_color", ""),
        "skin_tone": features.get("skin_tone", ""),
    }


# ---------- main loop ----------
try:
    for path in tqdm(paths, desc="Processing images", unit="img"):
        fname = os.path.basename(path)
        if fname in processed_files:
            # already done in checkpoint; skip
            continue

        # Best effort per image: a failure is reported and the image is left
        # unprocessed, so a later run will try it again.
        try:
            row = _label_image(path)
        except Exception as e:
            tqdm.write(f"Skipped {fname} due to error: {e}")
            continue

        rows.append(row)
        processed_files.add(fname)
        processed_since_save += 1

        # Periodic checkpoint. A failed write (for example, the file is open
        # in another program) must not abort the run; the counter is only
        # reset on success so the save is retried after the next image.
        if processed_since_save >= SAVE_EVERY:
            try:
                save_checkpoint(pd.DataFrame(rows, columns=COLUMNS))
                processed_since_save = 0
            except Exception as e:
                tqdm.write(f"Checkpoint save failed after {fname}: {e}")
finally:
    # ---------- final save (checkpoint) ----------
    # In a finally block so rows labelled since the last periodic save are
    # persisted even when the loop is interrupted (e.g. KeyboardInterrupt).
    df_final = pd.DataFrame(rows, columns=COLUMNS).drop_duplicates(subset=["file_name"], keep="first")
    save_checkpoint(df_final)
    print(f"Checkpoint saved to:\n  {CHECKPOINT_XLSX}\n  {CHECKPOINT_CSV}")

# ---------- timestamped snapshot for archival ----------
# Reached only when the loop ran to completion.
df_final.to_excel(OUT_XLSX, index=False)
df_final.to_csv(OUT_CSV, index=False, encoding="utf-8")
print(f"Final snapshot saved to:\n  {OUT_XLSX}\n  {OUT_CSV}")


202599 images found in directory


Processing images:   3%|▎         | 6759/202599 [4:40:45<203:06:01,  3.73s/img]

Skipped 006759.jpg due to error: an error was encountered while running the model: GGML_ASSERT(ctx->mem_buffer != NULL) failed (status code: 500)


Processing images:   7%|▋         | 13459/202599 [23:47:41<165:15:56,  3.15s/img]      

Skipped 013459.jpg due to error: an error was encountered while running the model: GGML_ASSERT(ctx->mem_buffer != NULL) failed (status code: 500)


Processing images:  10%|▉         | 20153/202599 [28:37:02<151:38:05,  2.99s/img]

Skipped 020153.jpg due to error: an error was encountered while running the model: GGML_ASSERT(ctx->mem_buffer != NULL) failed (status code: 500)


Processing images:  11%|█         | 22012/202599 [30:11:01<247:37:43,  4.94s/img]   


KeyboardInterrupt: 