In [4]:
# Final version: works best and is the most reliable.


import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import cv2
import mediapipe as mp
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

# ==========================================
# ============== CONFIGURATION =============
# ==========================================
# Input CSV. The main script reads it and later writes the updated
# "Face_Status" column back to this same path after every batch.
csv_path = r"E:\BAT eCRM - Main Folder\eCRM_Audit_Monthly Report\November\Check in Image Audit_November_2025_Raw- Main File.csv"
# Folder that receives a copy of every image in which no face was detected,
# so those images can be reviewed manually.
manual_folder = r"E:\BAT eCRM - Main Folder\eCRM_Audit_Monthly Report\November\Manual"
# Number of worker threads given to the ThreadPoolExecutor.
num_threads = 6 

# --- Tunable settings ---
DOWNLOAD_TIMEOUT = 20         # passed as `timeout` to requests.get (seconds)
MEDIAPIPE_CONF_THRESH = 0.80  # passed as min_detection_confidence to MediaPipe FaceDetection
DNN_CONF_THRESH = 0.70        # highest DNN detection confidence must exceed this to count as a face
BATCH_SIZE = 2000 # Number of rows to process before saving the CSV and printing a summary
# ==========================================

# Make sure the manual-review folder exists before any image is written to it.
os.makedirs(manual_folder, exist_ok=True)

# === Initialize MediaPipe & OpenCV DNN (module level so the functions below can use them) ===
mp_face_detection = mp.solutions.face_detection

# Caffe model files for the OpenCV DNN detector used as a fallback when
# MediaPipe finds no face.
model_dir = r"C:\Users\mdtai\Audit - BAT\Face Detection Report"
prototxt_path = os.path.join(model_dir, "deploy.prototxt.txt")
model_path = os.path.join(model_dir, "res10_300x300_ssd_iter_140000.caffemodel")
# One network instance, loaded once and shared by every call to detect_with_dnn.
net = cv2.dnn.readNetFromCaffe(prototxt_path, model_path)

# === Functions (No Change) ===
def download_image(url):
    """Download an image and decode it into an OpenCV colour array.

    Args:
        url: Address of the image to fetch.

    Returns:
        The result of ``cv2.imdecode(..., cv2.IMREAD_COLOR)`` on the response
        body, or ``None`` when the request fails, the server answers with an
        HTTP error status, or the body cannot be decoded as an image.
    """
    # Only network/HTTP failures are treated as "could not download". A bare
    # ``except`` would also swallow KeyboardInterrupt and genuine bugs.
    try:
        response = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
        response.raise_for_status()
        payload = response.content
    except requests.RequestException:
        return None

    img_array = np.frombuffer(payload, dtype=np.uint8)
    try:
        # imdecode returns None for undecodable data and raises cv2.error for
        # an empty buffer; both mean "no usable image".
        return cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    except cv2.error:
        return None

# The shared ``net`` holds its input between setInput() and forward(). When
# several worker threads call detect_with_dnn at once, those two calls must
# not interleave, otherwise a thread can run forward() on another thread's
# image. This lock makes the pair atomic.
_DNN_LOCK = threading.Lock()


def detect_with_dnn(image):
    """Report whether the OpenCV DNN face detector finds a face in *image*.

    Args:
        image: Image array as produced by ``cv2.imdecode`` (it is resized to
            300x300 before being fed to the network).

    Returns:
        ``True`` when the highest value in column 2 of the network's
        detection rows (used here as the confidence score) is greater than
        ``DNN_CONF_THRESH``; ``False`` otherwise, including when the network
        returns no detection rows at all.
    """
    blob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)), 1.0,
                                 (300, 300), (104.0, 177.0, 123.0))
    with _DNN_LOCK:
        net.setInput(blob)
        detections = net.forward()

    confidences = detections[0, 0, :, 2]
    # max() of an empty array raises ValueError; no rows simply means no face.
    if confidences.size == 0:
        return False
    return bool(confidences.max() > DNN_CONF_THRESH)

def process_row(row):
    """Classify one CSV row by whether its check-in photo shows a face.

    The photo is downloaded, checked with MediaPipe face detection, and, if
    MediaPipe finds nothing, checked again with the OpenCV DNN detector.
    Images in which neither detector finds a face are written to
    ``manual_folder`` for manual review.

    Args:
        row: A pandas Series for one CSV row. Reads the "Check-In Photo"
            column (the image URL) and, if present, the "id" column, which
            is used in the saved file name; ``row.name`` is the fallback.

    Returns:
        A ``(row.name, status, message)`` tuple where *status* is one of
        "Skipped (empty URL)", "DOWNLOAD_ERROR", "GOOD" or "NO FACE", and
        *message* is a short human-readable note (empty for "GOOD" and for
        skipped rows).
    """
    img_url = row.get("Check-In Photo")
    id_val = row.get("id", row.name)

    if pd.isna(img_url):
        return row.name, "Skipped (empty URL)", ""
    # Strip once and use the stripped value for the request as well, so a URL
    # padded with whitespace in the CSV is not sent with that padding.
    img_url = str(img_url).strip()
    if not img_url:
        return row.name, "Skipped (empty URL)", ""

    image = download_image(img_url)
    if image is None:
        return row.name, "DOWNLOAD_ERROR", "Image could not be downloaded"

    # MediaPipe expects RGB; OpenCV decodes to BGR.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # A fresh detector per call keeps each worker thread's detector private.
    with mp_face_detection.FaceDetection(min_detection_confidence=MEDIAPIPE_CONF_THRESH) as face_detector:
        results = face_detector.process(image_rgb)
        face_found = bool(results.detections)

    # Fall back to the DNN detector only when MediaPipe found nothing.
    if not face_found:
        face_found = detect_with_dnn(image)

    if face_found:
        return row.name, "GOOD", ""

    save_path = os.path.join(manual_folder, f"{id_val}_NOFACE.jpg")
    # imwrite signals failure through its return value, not an exception, so
    # check it before claiming the file was saved.
    if cv2.imwrite(save_path, image):
        return row.name, "NO FACE", f"Saved: {save_path}"
    return row.name, "NO FACE", f"Could not save: {save_path}"

# === Main Execution Logic with Batching ===
#
# Flow: load the CSV, select the rows whose "Face_Status" is empty or
# "DOWNLOAD_ERROR", process them in batches of BATCH_SIZE on a thread pool,
# write each result into the DataFrame, and save the CSV after every batch so
# an interrupted run can resume where it stopped.

# Load the entire DataFrame once
df = pd.read_csv(csv_path)
original_df_len = len(df)

# On a first run the status column may not exist yet; create it empty so every
# row is treated as pending instead of failing with a KeyError.
if "Face_Status" not in df.columns:
    df["Face_Status"] = ""
df["Face_Status"] = df["Face_Status"].astype(str).replace('nan', '', regex=False)

# Identify rows to process (Empty/NaN or DOWNLOAD_ERROR)
rows_to_process = df[
    (df["Face_Status"].isna()) |
    (df["Face_Status"].astype(str).str.strip() == '') |
    (df["Face_Status"] == 'DOWNLOAD_ERROR')
]

# We iterate over the indices of the rows that need processing
processing_indices = rows_to_process.index.tolist()
processing_df_len = len(processing_indices)

print(f"Total rows in CSV: {original_df_len}")
print(f"Rows already processed (skipped): {original_df_len - processing_df_len}")
print(f"Rows to be processed (Empty + 'DOWNLOAD_ERROR'): {processing_df_len}\n")

if processing_df_len == 0:
    print("✅ All rows have already been processed. Exiting.")
    # SystemExit stops a script exactly like exit() does, but does not shut
    # down the kernel when this runs as a notebook cell.
    raise SystemExit(0)

# Calculate the number of batches
num_batches = math.ceil(processing_df_len / BATCH_SIZE)

# Global counters for summary (These track the NEW results from the current run)
total_good_count = 0
total_noface_count = 0
total_download_err_count = 0
total_processed_in_run = 0

print(f"Dividing {processing_df_len} rows into {num_batches} batches of up to {BATCH_SIZE} rows each.")
print("-" * 60)

# Process the DataFrame in batches
for i in range(num_batches):
    start_index = i * BATCH_SIZE
    end_index = min((i + 1) * BATCH_SIZE, processing_df_len)

    # Get the indices for the current batch
    current_batch_indices = processing_indices[start_index:end_index]

    # Use .loc to get the actual rows from the original DataFrame
    current_batch_df = df.loc[current_batch_indices].copy()
    batch_size = len(current_batch_df)

    # Reset batch counters
    batch_good_count = 0
    batch_noface_count = 0
    batch_download_err_count = 0

    print(f"🚀 Starting Batch {i + 1}/{num_batches} (Size: {batch_size})")

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Map the future result back to the original index in df
        future_to_original_idx = {
            executor.submit(process_row, row): original_idx
            for original_idx, row in current_batch_df.iterrows()
        }

        for future in tqdm(as_completed(future_to_original_idx), total=batch_size, desc=f"Batch {i + 1} Progress"):
            original_idx = future_to_original_idx[future]
            total_processed_in_run += 1

            try:
                # Only the status is recorded; the index is already known
                # from the future mapping and the message is informational.
                _, status, _ = future.result()
            except Exception as e:
                # Top-level boundary: one bad row must not abort the batch.
                df.at[original_idx, "Face_Status"] = "SYSTEM_ERROR"
                print(f"\n[SYSTEM ERROR] Row Index {original_idx}: {e}")
                continue

            # --- CRITICAL: Update the original DataFrame directly ---
            df.at[original_idx, "Face_Status"] = status

            # Tally results
            if status == "GOOD":
                batch_good_count += 1
            elif status == "NO FACE":
                batch_noface_count += 1
            elif status == "DOWNLOAD_ERROR":
                batch_download_err_count += 1

    # === UPDATE TOTAL COUNTERS & BATCH SUMMARY ===
    total_good_count += batch_good_count
    total_noface_count += batch_noface_count
    total_download_err_count += batch_download_err_count

    print("\n" + "-" * 60)
    print(f"✅ Batch {i + 1} Summary (Processed {batch_size} rows)")
    print(f"Batch GOOD Faces        : {batch_good_count}")
    print(f"Batch NO FACE Images    : {batch_noface_count}")
    print(f"Batch DOWNLOAD FAILURES : {batch_download_err_count}")
    print("-" * 60)

    # === SAVE CHECKPOINT ===
    # Write to a sibling temp file and swap it in, so an interruption during
    # the write cannot leave the only copy of the CSV half-written.
    checkpoint_tmp_path = csv_path + ".tmp"
    df.to_csv(checkpoint_tmp_path, index=False)
    os.replace(checkpoint_tmp_path, csv_path)
    print(f"💾 **CSV checkpoint saved successfully** after Batch {i + 1}.")
    print("-" * 60 + "\n")


# === FINAL SUMMARY ===
print("\n" + "=" * 70)
print("✅ ALL BATCHES COMPLETE!")
print(f"Total Rows in CSV       : {original_df_len}")
print(f"Rows Processed in Run   : {total_processed_in_run}")
print("----------------------------------------------------------")
print("GRAND TOTALS (NEW RESULTS):")
print(f"Total GOOD Faces        : {total_good_count}")
print(f"Total NO FACE Images    : {total_noface_count}")
print(f"Total DOWNLOAD FAILURES : {total_download_err_count}")
print(f"NO FACE images saved to : {manual_folder}")
print("=" * 70 + "\n")

Total rows in CSV: 24909
Rows already processed (skipped): 24872
Rows to be processed (Empty + 'DOWNLOAD_ERROR'): 37

Dividing 37 rows into 1 batches of up to 2000 rows each.
------------------------------------------------------------
🚀 Starting Batch 1/1 (Size: 37)


Batch 1 Progress: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 37/37 [00:01<00:00, 30.41it/s]



------------------------------------------------------------
✅ Batch 1 Summary (Processed 37 rows)
Batch GOOD Faces        : 0
Batch NO FACE Images    : 0
Batch DOWNLOAD FAILURES : 37
------------------------------------------------------------
💾 **CSV checkpoint saved successfully** after Batch 1.
------------------------------------------------------------


✅ ALL BATCHES COMPLETE!
Total Rows in CSV       : 24909
Rows Processed in Run   : 37
----------------------------------------------------------
GRAND TOTALS (NEW RESULTS):
Total GOOD Faces        : 0
Total NO FACE Images    : 0
Total DOWNLOAD FAILURES : 37
NO FACE images saved to : E:\BAT eCRM - Main Folder\eCRM_Audit_Monthly Report\November\Manual

