In [None]:
# Note: Tesseract OCR engine must be installed separately
# Windows: https://github.com/UB-Mannheim/tesseract/wiki
# macOS: brew install tesseract
# Linux: sudo apt-get install tesseract-ocr

In [None]:
!pip install kagglehub opencv-python pytesseract numpy Pillow PyMuPDF python-docx

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.1/24.1 MB 51.2 MB/s eta 0:00:00
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 253.0/253.0 kB 21.9 MB/s eta 0:00:00
Installing collected packages: python-docx, pytesseract, PyMuPDF
Successfully installed PyMuPDF-1.26.5 pytesseract-0.3.13 python-docx-1.2.0


In [None]:
import cv2 as cv # image processing
import numpy as np # for math calculations
from PIL import Image # opening/verifying images
import pytesseract # OCR engine
import os # check if file exists, create directories
from pathlib import Path # handle file paths
import csv # reading/writing CSV files
import fitz  # PyMuPDF for PDF handling
from docx import Document  # python-docx for Word documents
import kagglehub


# Function to validate image file
def is_valid_image(file_path):
    """Return True when *file_path* exists, has a supported suffix and opens cleanly.

    Despite the name, PDF and Word documents are accepted as well as raster
    images. The check performed depends on the (lower-cased) suffix:

    * ``.pdf``           - opened and immediately closed with ``fitz``
    * ``.docx``/``.doc`` - opened with ``docx.Document``
    * anything else      - opened with Pillow and passed through ``verify()``

    Any exception raised by the opener is treated as "not valid". Only an
    unsupported extension prints a message; a missing or unreadable file fails
    silently.
    """
    if not os.path.exists(file_path):
        return False

    suffix = Path(file_path).suffix.lower()
    supported = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif', '.webp', '.pdf', '.docx', '.doc'}
    if suffix not in supported:
        print(f"Error: Invalid file extension - {suffix}")
        return False

    try:
        if suffix == '.pdf':
            # Opening is the validation; nothing is read from the document
            fitz.open(file_path).close()
        elif suffix in ('.docx', '.doc'):
            Document(file_path)
        else:
            with Image.open(file_path) as img:
                # verify() checks for corruption
                img.verify()
    except Exception:
        return False

    return True


# calculate text density in the image (Purpose: How much text vs empty space in the image?)
def calculate_text_density(gray):
    """Return the percentage of pixels classified as text, rounded to 2 decimals.

    The image is thresholded with Otsu's method using the inverted binary
    mode, so pixels at or below the threshold come out as 255 and the rest
    as 0. The density is the share of 255-valued pixels in the whole image.
    """
    mask = cv.threshold(gray, 0, 255, cv.THRESH_BINARY_INV + cv.THRESH_OTSU)[1]

    ink_pixels = np.sum(mask == 255)
    percent = (ink_pixels / mask.size) * 100
    return round(percent, 2)


# calculate noise level in the image (Purpose: How grainy/dirty is the image?)
def calculate_noise_level(gray, kernel_size=5):
    """Estimate image noise as the mean absolute change caused by a median blur.

    Args:
        gray: Single-channel image.
        kernel_size: Aperture passed to ``cv.medianBlur``.

    Returns:
        Mean of ``|gray - medianBlur(gray)|`` rounded to 2 decimals; larger
        values mean the median filter changed the image more.
    """
    smoothed = cv.medianBlur(gray, kernel_size)
    residual = cv.absdiff(gray, smoothed)
    return round(np.mean(residual), 2)


def calculate_skew_angle(gray):
    """Return the absolute median line angle of the image in degrees.

    Lines are detected with Canny edges followed by a standard Hough
    transform. Each line's ``theta`` is converted to degrees and shifted by
    -90; the absolute value of the median of those angles is returned,
    rounded to 2 decimals. ``0.0`` is returned when no line is detected.
    """
    edge_map = cv.Canny(gray, 50, 150, apertureSize=3)
    detected = cv.HoughLines(edge_map, 1, np.pi / 180, 200)

    if detected is None or len(detected) == 0:
        return 0.0

    degrees = [(theta * 180 / np.pi) - 90 for _rho, theta in detected[:, 0]]
    return round(abs(np.median(degrees)), 2)


# Purpose: Is the background clean or has shadows/stains?
def calculate_background_uniformity(gray):
    """Return the standard deviation of a heavily blurred copy of *gray*.

    A 51x51 Gaussian blur is used as an estimate of the background; the
    standard deviation of that estimate (rounded to 2 decimals) is the
    result, so a LOWER number means a more even background.
    """
    blurred = cv.GaussianBlur(gray, (51, 51), 0)
    return round(np.std(blurred), 2)


# Purpose: Does the image have normal document proportions?
def check_aspect_ratio(image):
    """Return ``(width / height rounded to 2 decimals, is_normal)`` for *image*.

    ``is_normal`` is True when the unrounded ratio lies strictly between
    0.65 and 1.7. For reference, an A4 page is about 0.707 in portrait and
    about 1.414 in landscape, so both orientations fall inside the range.
    """
    rows, cols = image.shape[:2]
    ratio = cols / rows

    within_bounds = ratio > 0.65 and ratio < 1.7
    return round(ratio, 2), within_bounds


# Purpose: The MAIN quality checker - combines all tests!
def calculate_image_quality(image):
    """Measure an image and decide which preprocessing steps it needs.

    Args:
        image: Image array; a 3-dimensional array is converted from BGR to
            grayscale, anything else is used as-is.

    Returns:
        dict with the individual metrics (``sharpness``, ``brightness``,
        ``contrast``, ``resolution``, ``text_density``, ``noise_level``,
        ``skew_angle``, ``background_uniformity``, ``aspect_ratio``,
        ``is_normal_aspect``), the list of failed checks under ``reasons``,
        ``needs_processing`` (True when any check failed) and the overall
        ``quality_score`` from ``calculate_quality_score``.
    """
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY) if len(image.shape) == 3 else image

    # Basic metrics
    laplacian_var = cv.Laplacian(gray, cv.CV_64F).var()  # sharpness: high = sharp, low = blurry
    brightness = np.mean(gray)                           # mean pixel value
    contrast = np.std(gray)                              # spread of pixel values
    rows, cols = gray.shape
    resolution = cols * rows                             # total pixel count

    # Advanced metrics
    text_density = calculate_text_density(gray)
    noise_level = calculate_noise_level(gray)
    skew_angle = calculate_skew_angle(gray)
    background_uniformity = calculate_background_uniformity(gray)
    aspect_ratio, is_normal_aspect = check_aspect_ratio(image)

    # Each entry pairs "this check failed" with the label recorded for it.
    # The order here is the order in which labels appear in `reasons`.
    checks = (
        (laplacian_var < 100, "low_sharpness"),
        (brightness < 50 or brightness > 200, "poor_brightness"),
        (contrast < 30, "low_contrast"),
        (resolution < 500000, "low_resolution"),
        (text_density < 8 or text_density > 50, "abnormal_text_density"),
        (noise_level > 10, "high_noise"),
        (skew_angle > 2, "skewed_document"),
        (background_uniformity > 25, "non_uniform_background"),
        (not is_normal_aspect, "abnormal_aspect_ratio"),
    )
    reasons = [label for failed, label in checks if failed]
    needs_processing = bool(reasons)

    quality_score = calculate_quality_score(
        laplacian_var, brightness, contrast, resolution,
        text_density, noise_level, skew_angle, background_uniformity
    )

    return {
        # Basic metrics
        'sharpness': round(laplacian_var, 2),
        'brightness': round(brightness, 2),
        'contrast': round(contrast, 2),
        'resolution': resolution,

        # Advanced metrics
        'text_density': text_density,
        'noise_level': noise_level,
        'skew_angle': skew_angle,
        'background_uniformity': background_uniformity,
        'aspect_ratio': aspect_ratio,
        'is_normal_aspect': is_normal_aspect,

        # Overall assessment
        'needs_processing': needs_processing,
        'reasons': reasons,
        'quality_score': quality_score
    }


# Purpose: Give the image a grade from 0-100
def calculate_quality_score(sharpness, brightness, contrast, resolution,
                            text_density=None, noise_level=None, skew_angle=None,
                            background_uniformity=None):
    """Combine the individual metrics into a single 0-100 grade.

    Basic metrics (always counted, 70 points):
        * sharpness  - 25 points, reached at 500 and above
        * brightness - 20 points at exactly 130, falling linearly to 0 at a
          distance of 130 from that value
        * contrast   - 15 points, reached at 100 and above
        * resolution - 10 points, reached at 2,000,000 pixels and above

    Advanced metrics (each counted only when not None, 30 points):
        * text_density          - 10 points for 15-35, 5 points for
          10-15 or 35-45, otherwise 0
        * noise_level           - 8 points at 0, falling to 0 at 20
        * skew_angle            - 7 points at 0, falling to 0 at 10
        * background_uniformity - 5 points at 0, falling to 0 at 50

    Returns:
        The total rounded to 2 decimals.
    """
    def capped(value, full_marks_at, points):
        # Linear ramp from 0 up to `points`, saturating at `full_marks_at`
        return min(value / full_marks_at, 1.0) * points

    def penalised(value, zero_at, points):
        # `points` at 0, shrinking linearly and never going below 0
        return max(0, (1 - value / zero_at)) * points

    score = (
        capped(sharpness, 500, 25)
        + penalised(abs(brightness - 130), 130, 20)
        + capped(contrast, 100, 15)
        + capped(resolution, 2000000, 10)
    )

    if text_density is not None:
        if 15 <= text_density <= 35:
            density_points = 10
        elif 10 <= text_density < 15 or 35 < text_density <= 45:
            density_points = 5
        else:
            density_points = 0
        score += density_points

    optional_penalties = (
        (noise_level, 20, 8),
        (skew_angle, 10, 7),
        (background_uniformity, 50, 5),
    )
    for value, zero_at, points in optional_penalties:
        if value is not None:
            score += penalised(value, zero_at, points)

    return round(score, 2)


# Purpose: Fix images that are too dark or too bright
def enhance_brightness_contrast(image):
    """Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) to *image*.

    A 3-dimensional image is converted BGR -> LAB, only the L channel is
    equalised, and the result is converted back to BGR. Any other image is
    equalised directly.
    """
    clahe = cv.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))

    if len(image.shape) != 3:
        return clahe.apply(image)

    lightness, chan_a, chan_b = cv.split(cv.cvtColor(image, cv.COLOR_BGR2LAB))
    merged = cv.merge([clahe.apply(lightness), chan_a, chan_b])
    return cv.cvtColor(merged, cv.COLOR_LAB2BGR)


# Purpose: Reduce noise/graininess from the image
def denoise_image(image):
    """Run OpenCV non-local-means denoising on *image*.

    The coloured variant is used for 3-dimensional arrays and the
    single-channel variant otherwise.
    """
    is_colour = len(image.shape) == 3
    if is_colour:
        return cv.fastNlMeansDenoisingColored(image, None, 10, 10, 7, 21)
    return cv.fastNlMeansDenoising(image, None, 10, 7, 21)


# Purpose: Make blurry text sharper
def sharpen_image(image):
    """Sharpen *image* by convolving it with a 3x3 kernel (centre 9, neighbours -1).

    The output keeps the input's depth (``ddepth=-1``).
    """
    kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
    return cv.filter2D(image, -1, kernel)


def binarize_image(image):
    """Return a black-and-white version of *image* using Otsu's threshold.

    A 3-dimensional image is first converted from BGR to grayscale. Pixels
    above the threshold become 255 and the rest 0, so dark text on a light
    page ends up black on white.
    """
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY) if len(image.shape) == 3 else image
    return cv.threshold(gray, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)[1]


# Purpose: Rotate tilted documents to be straight
def deskew_image(image):
    """Rotate *image* by the median angle of its detected Hough lines.

    Lines are detected on the grayscale image with Canny + ``cv.HoughLines``.
    Each line's ``theta`` is converted to degrees and shifted by -90, and the
    median of those values is used as the rotation angle. The input is
    returned unchanged when no lines are found or when the absolute median
    angle is not greater than 0.5 degrees.

    The rotation is about the image centre, keeps the original canvas size,
    uses cubic interpolation and replicates border pixels.
    """
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY) if len(image.shape) == 3 else image

    edge_map = cv.Canny(gray, 50, 150, apertureSize=3)
    detected = cv.HoughLines(edge_map, 1, np.pi / 180, 200)
    if detected is None or len(detected) == 0:
        return image

    tilt = np.median([(theta * 180 / np.pi) - 90 for _rho, theta in detected[:, 0]])

    # Tiny angles are not worth the resampling
    if not abs(tilt) > 0.5:
        return image

    rows, cols = image.shape[:2]
    pivot = (cols // 2, rows // 2)
    transform = cv.getRotationMatrix2D(pivot, tilt, 1.0)
    return cv.warpAffine(
        image,
        transform,
        (cols, rows),
        flags=cv.INTER_CUBIC,
        borderMode=cv.BORDER_REPLICATE,
    )


# Purpose: Make small images bigger for better OCR
def upscale_image(image, scale_factor=2):
    """Resize *image* by *scale_factor* using cubic interpolation.

    Args:
        image: Image array; only its first two dimensions (height, width)
            are read here.
        scale_factor: Positive multiplier applied to both dimensions. It may
            be fractional (e.g. ``1.5``); the target size is rounded to whole
            pixels because ``cv.resize`` requires an integer ``dsize``.

    Returns:
        The resized image.

    Raises:
        ValueError: If *scale_factor* is not greater than zero.
    """
    if scale_factor <= 0:
        raise ValueError(f"scale_factor must be positive, got {scale_factor!r}")

    height, width = image.shape[:2]
    # Never let rounding collapse a dimension to zero pixels
    new_dimensions = (
        max(1, int(round(width * scale_factor))),
        max(1, int(round(height * scale_factor))),
    )
    return cv.resize(image, new_dimensions, interpolation=cv.INTER_CUBIC)


def preprocess_image(image, quality_metrics, show_details=False):
    """Apply the fixes named in ``quality_metrics['reasons']`` and binarize.

    Steps run in a fixed order - upscale, deskew, denoise, contrast
    enhancement, sharpen - and each one runs only when one of its trigger
    labels is present in the reasons list. Binarization always runs last.
    The input array is copied first, so *image* itself is not modified.

    Args:
        image: Image array to process.
        quality_metrics: Mapping with a ``'reasons'`` list, as produced by
            ``calculate_image_quality``.
        show_details: When True, print one progress line per step.

    Returns:
        The binarized, processed image.
    """
    result = image.copy()
    issues = quality_metrics['reasons']

    def announce(message):
        if show_details:
            print(message)

    if issues:
        announce(f"Applying preprocessing for: {', '.join(issues)}")
    else:
        announce("Image quality is good, applying minimal processing...")

    # (labels that trigger the step, progress message, step function)
    pipeline = (
        (('low_resolution',),
         "  - Upscaling image...",
         lambda img: upscale_image(img, scale_factor=2)),
        (('skewed_document',),
         "  - Deskewing image...",
         lambda img: deskew_image(img)),
        (('high_noise',),
         "  - Denoising image...",
         lambda img: denoise_image(img)),
        (('poor_brightness', 'low_contrast', 'non_uniform_background'),
         "  - Enhancing brightness and contrast...",
         lambda img: enhance_brightness_contrast(img)),
        (('low_sharpness',),
         "  - Sharpening image...",
         lambda img: sharpen_image(img)),
    )
    for triggers, message, step in pipeline:
        if any(label in issues for label in triggers):
            announce(message)
            result = step(result)

    # Binarize for better OCR (always done at the end)
    announce("  - Binarizing image...")
    return binarize_image(result)


# Purpose: Extract text(paragraphs & tables) from DOCX files
def extract_text_from_docx(docx_path, show_details=False):
    """Read the paragraph and table text of a Word document.

    Non-empty paragraphs are emitted one per line, followed by one line per
    table containing that table's non-empty cell texts, each followed by a
    space. The combined text is stripped of leading/trailing whitespace.

    Args:
        docx_path: Path handed to ``docx.Document``.
        show_details: When True, print progress and summary lines.

    Returns:
        dict with ``text``, ``paragraph_count`` (non-empty paragraphs),
        ``table_count`` and ``success``. If anything raises, ``success`` is
        False, the counts are 0, ``text`` is empty and ``error`` holds the
        exception message.
    """
    try:
        if show_details:
            print("  - Extracting text from DOCX using python-docx...")

        document = Document(docx_path)

        # One "text\n" entry per paragraph that has visible content
        body_lines = [para.text + "\n" for para in document.paragraphs if para.text.strip()]
        paragraph_count = len(body_lines)

        # One line per table: its non-empty cells, each followed by a space
        table_count = len(document.tables)
        table_lines = []
        for table in document.tables:
            cells = [
                cell.text + " "
                for row in table.rows
                for cell in row.cells
                if cell.text.strip()
            ]
            table_lines.append("".join(cells) + "\n")

        full_text = "".join(body_lines + table_lines).strip()

        if show_details:
            print(f"  - Extracted text from {paragraph_count} paragraph(s)")
            if table_count > 0:
                print(f"  - Extracted text from {table_count} table(s)")
            print(f"  - Total characters: {len(full_text)}")

        return {
            'text': full_text,
            'paragraph_count': paragraph_count,
            'table_count': table_count,
            'success': True
        }

    except Exception as exc:
        if show_details:
            print(f"  - Error extracting from DOCX: {str(exc)}")
        return {
            'text': '',
            'paragraph_count': 0,
            'table_count': 0,
            'success': False,
            'error': str(exc)
        }


# Purpose: Check if file is a DOCX or DOC
def is_docx_file(file_path):
    """Return True when *file_path* ends in ``.docx`` or ``.doc`` (case-insensitive)."""
    suffix = Path(file_path).suffix.lower()
    return suffix == '.docx' or suffix == '.doc'


# Purpose: Extract text from PDF files
def extract_text_from_pdf(pdf_path, show_details=False):
    """Read the embedded text layer of every page in a PDF.

    Each page's ``get_text()`` output is followed by a newline; the combined
    text is then stripped of leading/trailing whitespace. No OCR is done
    here, so a PDF without a text layer yields an empty string with
    ``success`` still True.

    Args:
        pdf_path: Path handed to ``fitz.open``.
        show_details: When True, print progress and summary lines.

    Returns:
        dict with ``text``, ``page_count`` and ``success``. If anything
        raises, ``success`` is False, ``page_count`` is 0, ``text`` is empty
        and ``error`` holds the exception message.
    """
    try:
        if show_details:
            print("  - Extracting text from PDF using PyMuPDF...")

        doc = fitz.open(pdf_path)
        try:
            page_count = len(doc)
            full_text = "".join(
                doc[page_num].get_text() + "\n" for page_num in range(page_count)
            ).strip()
        finally:
            # Release the file handle even when a page fails to extract
            doc.close()

        if show_details:
            print(f"  - Extracted text from {page_count} page(s)")
            print(f"  - Total characters: {len(full_text)}")

        return {
            'text': full_text,
            'page_count': page_count,
            'success': True
        }

    except Exception as e:
        if show_details:
            print(f"  - Error extracting from PDF: {str(e)}")
        return {
            'text': '',
            'page_count': 0,
            'success': False,
            'error': str(e)
        }


# Purpose: Check if file is a PDF
def is_pdf_file(file_path):
    """Return True when *file_path* ends in ``.pdf`` (case-insensitive)."""
    suffix = Path(file_path).suffix
    return suffix.lower() == '.pdf'



def extract_text_from_image(image, config='--oem 3 --psm 6'):
    """Run Tesseract on *image* and return the recognised text, stripped.

    Args:
        image: Image passed straight to ``pytesseract.image_to_string``.
        config: Extra Tesseract command-line options.

    Returns:
        The recognised text without surrounding whitespace, or ``""`` when
        the OCR call raises for any reason (the error is not reported).
    """
    try:
        recognised = pytesseract.image_to_string(image, config=config)
    except Exception:
        return ""
    return recognised.strip()


# Purpose: Extract text along with confidence scores and bounding boxes
def extract_text_with_details(image):
    """Run Tesseract and return text together with per-entry confidence data.

    Only entries whose confidence, converted with ``int()``, is greater than
    0 are kept. ``text`` joins the kept entries that are not blank with
    single spaces; ``average_confidence`` is the mean over ALL kept entries
    (blank or not), rounded to 2 decimals, or 0 when nothing was kept.

    Returns:
        dict with ``text``, ``average_confidence`` and ``blocks`` (a list of
        dicts with ``text``, ``confidence`` and ``bbox`` as
        ``(left, top, width, height)``). If anything raises, an empty result
        ``{'text': '', 'average_confidence': 0, 'blocks': []}`` is returned.
    """
    empty_result = {'text': '', 'average_confidence': 0, 'blocks': []}
    try:
        ocr = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

        kept = []
        for i, word in enumerate(ocr['text']):
            score = ocr['conf'][i]
            if int(score) <= 0:
                # Skip entries Tesseract reports with no positive confidence
                continue
            kept.append({
                'text': word,
                'confidence': score,
                'bbox': (ocr['left'][i], ocr['top'][i],
                         ocr['width'][i], ocr['height'][i])
            })

        visible_words = [entry['text'] for entry in kept if entry['text'].strip()]
        if kept:
            mean_conf = np.mean([entry['confidence'] for entry in kept])
        else:
            mean_conf = 0

        return {
            'text': ' '.join(visible_words),
            'average_confidence': round(mean_conf, 2),
            'blocks': kept
        }
    except Exception:
        return empty_result


# Purpose: The COMPLETE pipeline - from image to text!
def process_cv_image(file_path, save_processed=False, output_dir='processed_images', show_details=False):
    """Validate one CV file and extract its text.

    The file is routed by extension:

    * ``.docx``/``.doc`` - text is read with ``extract_text_from_docx``
    * ``.pdf``           - text is read with ``extract_text_from_pdf``
    * anything else      - loaded with ``cv.imread``, quality-assessed,
      preprocessed (or only binarized when no issue was found) and OCR'd

    Args:
        file_path: Path of the file to process.
        save_processed: Accepted but not used anywhere in this function.
        output_dir: Accepted but not used anywhere in this function.
        show_details: When True, print a step-by-step log.

    Returns:
        On failure, a dict containing ``error`` and ``file_path`` (plus
        ``details`` for DOCX/PDF extraction failures).
        On success, a dict with ``file_path``, ``quality_metrics``,
        ``preprocessing_applied``, ``extracted_text``, ``text_length``,
        ``average_confidence`` and ``word_count``. DOCX/PDF results also
        carry ``file_type`` and their counts, and report a fixed quality
        score of 100 and confidence of 100.0 regardless of how much text was
        found. Image results also carry ``detailed_results``.
    """
    if show_details:
        print(f"\n{'='*60}")
        print(f"Processing CV: {file_path}")
        print(f"{'='*60}\n")

    # Step 1: Validate file
    if show_details:
        print("Step 1: Validating file...")
    if not is_valid_image(file_path):
        return {'error': 'Invalid file', 'file_path': file_path}
    if show_details:
        print("file is valid\n")

    # Check if file is DOCX
    if is_docx_file(file_path):
        if show_details:
            print("Step 2: Detected DOCX file")
            print("Step 3: Extracting text from DOCX...\n")

        # Extract text from DOCX
        docx_results = extract_text_from_docx(file_path, show_details=show_details)

        if not docx_results['success']:
            return {
                'error': 'Failed to extract from DOCX',
                'file_path': file_path,
                'details': docx_results.get('error', 'Unknown error')
            }

        extracted_text = docx_results['text']

        if show_details:
            if extracted_text:
                print(f"Text extracted successfully ({len(extracted_text)} characters)")
                print(f"  Paragraphs: {docx_results['paragraph_count']}")
                print(f"  Tables: {docx_results['table_count']}\n")
            else:
                print("No text extracted (DOCX might be empty)\n")

        # Prepare results for DOCX
        results = {
            'file_path': file_path,
            'file_type': 'DOCX',
            'paragraph_count': docx_results['paragraph_count'],
            'table_count': docx_results['table_count'],
            'quality_metrics': {
                'quality_score': 100,  # DOCX files are high quality
                'extraction_method': 'direct_text'
            },
            'preprocessing_applied': False,
            'extracted_text': extracted_text,
            'text_length': len(extracted_text),
            'average_confidence': 100.0,  # Direct text extraction = 100% confidence
            'word_count': len(extracted_text.split())
        }

        return results


    # Check if file is PDF
    if is_pdf_file(file_path):
        if show_details:
            print("Step 2: Detected PDF file")
            print("Step 3: Extracting text from PDF...\n")

        # Extract text directly from PDF
        pdf_results = extract_text_from_pdf(file_path, show_details=show_details)

        if not pdf_results['success']:
            return {
                'error': 'Failed to extract from PDF',
                'file_path': file_path,
                'details': pdf_results.get('error', 'Unknown error')
            }

        extracted_text = pdf_results['text']

        if show_details:
            if extracted_text:
                print(f"Text extracted successfully ({len(extracted_text)} characters)")
                print(f"  Pages processed: {pdf_results['page_count']}\n")
            else:
                print("No text extracted (PDF might be image-based)\n")

        # Prepare results for PDF
        # NOTE(review): the score/confidence below are reported as 100 even
        # when extracted_text is empty (e.g. an image-only PDF).
        results = {
            'file_path': file_path,
            'file_type': 'PDF',
            'page_count': pdf_results['page_count'],
            'quality_metrics': {
                'quality_score': 100,  # PDFs with text are high quality
                'extraction_method': 'direct_text'
            },
            'preprocessing_applied': False,
            'extracted_text': extracted_text,
            'text_length': len(extracted_text),
            'average_confidence': 100.0,  # Direct text extraction = 100% confidence
            'word_count': len(extracted_text.split())
        }

        return results

    # Step 2: Load image (if not PDF or DOCX, process as image)
    if show_details:
        print("Step 2: Loading image...")
    image = cv.imread(file_path)
    if image is None:
        return {'error': 'Failed to load image', 'file_path': file_path}
    if show_details:
        print(f"Image loaded: {image.shape[1]}x{image.shape[0]} pixels\n")

    # Step 3: Assess quality
    if show_details:
        print("Step 3: Assessing image quality...")
    quality_metrics = calculate_image_quality(image)
    if show_details:
        print(f"  Basic Metrics:")
        print(f"    Sharpness: {quality_metrics['sharpness']:.2f}")
        print(f"    Brightness: {quality_metrics['brightness']:.2f}")
        print(f"    Contrast: {quality_metrics['contrast']:.2f}")
        print(f"    Resolution: {quality_metrics['resolution']} pixels")
        print(f"  Advanced Metrics:")
        print(f"    Text Density: {quality_metrics['text_density']:.2f}%")
        print(f"    Noise Level: {quality_metrics['noise_level']:.2f}")
        print(f"    Skew Angle: {quality_metrics['skew_angle']:.2f}°")
        print(f"    Background Uniformity: {quality_metrics['background_uniformity']:.2f}")
        print(f"    Aspect Ratio: {quality_metrics['aspect_ratio']} ({'Normal' if quality_metrics['is_normal_aspect'] else 'Abnormal'})")
        print(f"  Overall Quality Score: {quality_metrics['quality_score']}/100")
        print(f"  Needs Processing: {quality_metrics['needs_processing']}")
        if quality_metrics['reasons']:
            print(f"  Issues Found: {', '.join(quality_metrics['reasons'])}")
        print()

    # Step 4: Preprocess if needed
    if quality_metrics['needs_processing']:
        if show_details:
            print("Step 4: Preprocessing image (quality issues detected)...")
        processed_image = preprocess_image(image, quality_metrics, show_details=show_details)
        if show_details:
            print("✓ Preprocessing complete\n")
    else:
        if show_details:
            print("Step 4: Skipping preprocessing (image quality is good)...")
        # Still apply basic binarization for better OCR
        processed_image = binarize_image(image)
        if show_details:
            print("✓ Applied basic binarization\n")

    # Step 5: Extract text
    # OCR runs twice on the same image: once for the plain text, once for
    # the per-entry confidence data used for average_confidence.
    if show_details:
        print("Step 5: Extracting text...")
    extracted_text = extract_text_from_image(processed_image)
    detailed_results = extract_text_with_details(processed_image)

    if show_details:
        if extracted_text:
            print(f"Text extracted successfully ({len(extracted_text)} characters)")
            print(f"  Average confidence: {detailed_results['average_confidence']:.2f}%\n")
        else:
            print("No text extracted\n")

    # Prepare results
    results = {
        'file_path': file_path,
        'quality_metrics': quality_metrics,
        'preprocessing_applied': quality_metrics['needs_processing'],
        'extracted_text': extracted_text,
        'text_length': len(extracted_text),
        'average_confidence': detailed_results['average_confidence'],
        'word_count': len(extracted_text.split()),
        'detailed_results': detailed_results
    }

    return results


# Purpose: Save FULL results with all metadata
def save_to_csv(results, csv_path='cv_dataset.csv', append=True):
    """Write one processing result, with its metadata, as a CSV row.

    Args:
        results: Result dict from ``process_cv_image``. Results containing
            an ``error`` key are skipped.
        csv_path: Destination file.
        append: When True and the file already exists, the row is appended;
            otherwise the file is (re)created and a header row is written
            first.

    Returns:
        True when a row was written; False for error results or when
        anything raised (the exception message is printed).
    """
    try:
        if 'error' in results:
            return False

        metrics = results.get('quality_metrics', {})
        record = {
            'filename': Path(results['file_path']).name,
            'full_path': results['file_path'],
            'file_type': results.get('file_type', 'Image'),
            'category': results.get('category', 'unknown'),
            'main_folder': results.get('main_folder', 'unknown'),
            'subfolder': results.get('subfolder', 'unknown'),
            'extracted_text': results['extracted_text'],
            'quality_score': metrics.get('quality_score', 0),
            'sharpness': metrics.get('sharpness', 0),
            'brightness': metrics.get('brightness', 0),
            'contrast': metrics.get('contrast', 0),
            'resolution': metrics.get('resolution', 0),
            'preprocessing_applied': results.get('preprocessing_applied', False),
            'text_length': results['text_length'],
            'word_count': results['word_count'],
            'average_confidence': results.get('average_confidence', 0)
        }

        # Appending only makes sense when there is already a file to extend
        appending = bool(append and os.path.exists(csv_path))

        with open(csv_path, 'a' if appending else 'w', newline='', encoding='utf-8') as handle:
            writer = csv.DictWriter(handle, fieldnames=list(record))
            if not appending:
                writer.writeheader()
            writer.writerow(record)

        return True

    except Exception as exc:
        print(f"Error saving to CSV: {str(exc)}")
        return False



# Purpose: Save minimal text data to CSV for model training
def save_text_only_csv(results, csv_path='cv_text_dataset.csv', append=True):
    """Write one result as a minimal ``id, text, category`` CSV row.

    ``id`` is the file name without its extension and ``category`` defaults
    to ``'unknown'`` when the result carries none.

    Args:
        results: Result dict from ``process_cv_image``. Results containing
            an ``error`` key are skipped.
        csv_path: Destination file.
        append: When True and the file already exists, the row is appended;
            otherwise the file is (re)created and a header row is written
            first.

    Returns:
        True when a row was written; False for error results or when
        anything raised (the exception message is printed).
    """
    try:
        if 'error' in results:
            return False

        record = {
            'id': Path(results['file_path']).stem,
            'text': results['extracted_text'],
            'category': results.get('category', 'unknown')
        }

        appending = bool(append and os.path.exists(csv_path))

        with open(csv_path, 'a' if appending else 'w', newline='', encoding='utf-8') as handle:
            # Column order is fixed: id, text, category (for model training)
            writer = csv.DictWriter(handle, fieldnames=['id', 'text', 'category'])
            if not appending:
                writer.writeheader()
            writer.writerow(record)

        return True

    except Exception as exc:
        print(f"Error saving to CSV: {str(exc)}")
        return False


# Purpose: Process multiple CV images and save results
def process_multiple_cvs(file_paths, csv_path='cv_dataset.csv', save_processed=False, show_details=False):
    """Process a batch of CV files and write the results to two CSV files.

    Every file goes through ``process_cv_image``. Successful results are
    written to *csv_path* (full metadata) and to a sibling file whose name
    has ``_text_only`` inserted before the extension (id/text/category).
    Both CSV files are started fresh by the first result that is actually
    written and appended to afterwards, so rows left over from an earlier
    run are never mixed in - even when the first files of the batch fail.

    Args:
        file_paths: An iterable of file paths, a single file path, or a
            directory path. For a directory, the image files directly inside
            it (not sub-folders) are processed.
        csv_path: Destination of the full-metadata CSV.
        save_processed: Forwarded to ``process_cv_image``.
        show_details: Forwarded to ``process_cv_image``; also switches the
            per-file progress output to the verbose banner form.

    Returns:
        List with one result dict per input file, in processing order.
    """
    all_results = []

    if isinstance(file_paths, (str, os.PathLike)):
        if os.path.isdir(file_paths):
            # A directory: collect the image files directly inside it
            directory = file_paths
            valid_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif', '.webp'}
            file_paths = [
                os.path.join(directory, file)
                for file in os.listdir(directory)
                if Path(file).suffix.lower() in valid_extensions
            ]
            print(f"Found {len(file_paths)} images in directory: {directory}\n")
        else:
            # A single path: iterating it would walk its characters
            file_paths = [file_paths]
    else:
        # Materialize so len() works for generators as well
        file_paths = list(file_paths)

    # Derived up front so the summary below can always report it, and built
    # from the file name parts so it can never collide with csv_path.
    csv_file = Path(csv_path)
    text_csv_path = str(csv_file.with_name(f"{csv_file.stem}_text_only{csv_file.suffix}"))

    # Each CSV switches to append mode only once a row has really been written
    full_csv_started = False
    text_csv_started = False

    # Process each CV
    total = len(file_paths)
    for idx, file_path in enumerate(file_paths, 1):
        if show_details:
            print(f"\n{'#'*60}")
            print(f"Processing {idx}/{total}")
            print(f"{'#'*60}")
        else:
            # Show progress without show_details output
            print(f"Processing {idx}/{total}: {Path(file_path).name}")

        # Process the CV
        results = process_cv_image(file_path, save_processed=save_processed, show_details=show_details)
        all_results.append(results)

        if save_to_csv(results, csv_path=csv_path, append=full_csv_started):
            full_csv_started = True

        if save_text_only_csv(results, csv_path=text_csv_path, append=text_csv_started):
            text_csv_started = True

    print(f"\n{'='*60}")
    print(f"BATCH PROCESSING COMPLETE")
    print(f"{'='*60}")
    print(f"Total CVs processed: {total}")
    print(f"Successful extractions: {sum(1 for r in all_results if 'extracted_text' in r and r['extracted_text'])}")
    print(f"Results saved to: {csv_path}")
    print(f"Text-only dataset: {text_csv_path}")
    print(f"{'='*60}\n")

    return all_results


# Purpose: Get all files from nested folders with category info
def get_all_images_from_folders(root_dir):
    """Collect every supported file under *root_dir*, tagged with its folder names.

    The tree is walked recursively. For each file with a supported extension
    (images, PDF, DOC/DOCX; case-insensitive) one dict is produced:

    * ``full_path``   - directory path joined with the file name
    * ``filename``    - the bare file name
    * ``main_folder`` - first path component below *root_dir*
    * ``subfolder``   - last path component below *root_dir*
    * ``category``    - same as ``subfolder`` (e.g. "Accountant")

    Files that sit directly in *root_dir* have no folder to take a category
    from, so all three folder fields are ``'unknown'`` for them.

    Returns:
        List of the dicts described above, in ``os.walk`` order.
    """
    valid_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif', '.webp', '.pdf', '.docx', '.doc'}
    image_files = []

    # Walk through all directories and subdirectories
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        # The folder information is the same for every file in this directory
        rel_path = os.path.relpath(dirpath, root_dir)
        if rel_path == os.curdir:
            # The root itself: relpath yields '.', which is not a category
            main_folder = subfolder = 'unknown'
        else:
            path_parts = rel_path.split(os.sep)
            main_folder = path_parts[0]
            subfolder = path_parts[-1]

        for filename in filenames:
            if Path(filename).suffix.lower() not in valid_extensions:
                continue

            image_files.append({
                'full_path': os.path.join(dirpath, filename),
                'filename': filename,
                'main_folder': main_folder,
                'subfolder': subfolder,
                'category': subfolder  # Use subfolder as category (e.g., "Accountant")
            })

    return image_files


# Purpose: Process all images in nested folders and compile results
def process_nested_folders(root_dir, csv_path='cv_dataset_all.csv', overwrite=False, save_processed=False, show_details=False):
    """Process every supported file under ``root_dir`` and record the results.

    Files are discovered with ``get_all_images_from_folders``. Each file is
    passed to ``process_cv_image``; the returned dict is tagged with the
    file's folder information and handed to ``save_to_csv`` and
    ``save_text_only_csv`` right away, one file at a time, instead of being
    collected and written at the end.

    Args:
        root_dir: Root directory whose nested folders are scanned.
        csv_path: Path of the full dataset CSV. The text-only CSV path is
            derived from it by inserting ``_text_only`` before the extension.
        overwrite: When True, the first file is saved with ``append=False``
            and every later file with ``append=True``. When False, every
            file is saved with ``append=True``.
        save_processed: Forwarded unchanged to ``process_cv_image``.
        show_details: Forwarded to ``process_cv_image``; also selects the
            verbose per-file banner instead of the one-line progress message.

    Returns:
        A dict with the keys 'total', 'processed', 'successful', 'failed' and
        'category_stats' (category -> {'total': int, 'success': int}).
        The same keys are returned, zeroed/empty, when no files are found.
    """
    banner = '=' * 60

    print(f"\n{banner}")
    print("SCANNING NESTED FOLDERS")
    print(banner)
    print(f"Root directory: {root_dir}\n")

    # Get all image files
    image_files = get_all_images_from_folders(root_dir)

    if not image_files:
        print("No image files found!")
        # Same keys as the normal return so callers can read them blindly.
        return {
            'total': 0,
            'processed': 0,
            'successful': 0,
            'failed': 0,
            'category_stats': {}
        }

    total = len(image_files)
    print(f"Found {total} total images")

    # Count by category
    category_counts = {}
    for img in image_files:
        cat = img['category']
        category_counts[cat] = category_counts.get(cat, 0) + 1

    print(f"Found {len(category_counts)} categories:")
    for cat, count in sorted(category_counts.items()):
        print(f"  - {cat}: {count} images")

    print(f"\n{banner}")
    print("STARTING PROCESSING")
    print(f"{banner}\n")

    # Derive the text-only path from the real extension. A plain
    # str.replace('.csv', ...) would rewrite every '.csv' occurrence in the
    # path and would return csv_path unchanged (so both writers would share
    # one file) whenever the path has no lowercase '.csv' in it.
    csv_root, csv_ext = os.path.splitext(csv_path)
    text_csv_path = f"{csv_root}_text_only{csv_ext}"

    processed = 0
    successful = 0
    failed = 0
    category_stats = {}

    for idx, img_info in enumerate(image_files, 1):
        file_path = img_info['full_path']
        category = img_info['category']

        if show_details:
            print(f"\n{'#'*60}")
            print(f"Processing {idx}/{total}")
            print(f"Category: {category}")
            print(f"{'#'*60}")
        else:
            # Show progress with category
            print(f"Processing {idx}/{total}: [{category}] {img_info['filename']}")

        # Process the CV
        results = process_cv_image(file_path, save_processed=save_processed, show_details=show_details)

        # Add category to results
        results['category'] = category
        results['main_folder'] = img_info['main_folder']
        results['subfolder'] = img_info['subfolder']

        processed += 1

        # A result counts as a success only if it carries no error and has
        # non-empty text. .get() is used because a result dict is not
        # guaranteed here to contain 'extracted_text'.
        succeeded = 'error' not in results and bool(results.get('extracted_text'))

        stats = category_stats.setdefault(category, {'total': 0, 'success': 0})
        stats['total'] += 1
        if succeeded:
            stats['success'] += 1
            successful += 1
        else:
            failed += 1

        # Only the very first write of an overwrite run replaces the file;
        # every other write appends.
        current_append_mode = not (overwrite and idx == 1)
        save_to_csv(results, csv_path=csv_path, append=current_append_mode)

        # Also save to text-only CSV
        save_text_only_csv(results, csv_path=text_csv_path, append=current_append_mode)

    # Print final statistics
    print(f"\n{banner}")
    print("PROCESSING COMPLETE")
    print(banner)
    print(f"Total images: {total}")
    print(f"Successfully extracted: {successful}")
    print(f"Failed/Empty: {failed}")
    print(f"Success rate: {(successful/total*100):.2f}%")
    print("\nResults saved to:")
    print(f"  - Full dataset: {csv_path}")
    print(f"  - Text only: {text_csv_path}")

    # Statistics by category
    print(f"\n{banner}")
    print("STATISTICS BY CATEGORY")
    print(banner)

    for cat in sorted(category_stats):
        stats = category_stats[cat]
        success_rate = (stats['success'] / stats['total'] * 100) if stats['total'] > 0 else 0
        print(f"{cat:30s}: {stats['success']:4d}/{stats['total']:4d} ({success_rate:5.1f}%)")

    print(f"{banner}\n")

    return {
        'total': total,
        'processed': processed,
        'successful': successful,
        'failed': failed,
        'category_stats': category_stats
    }


# Purpose: Load dataset from CSV for analysis
def load_dataset_from_csv(csv_path='cv_dataset.csv'):
    """Read a CSV file into a list of row dicts.

    Args:
        csv_path: Path of the UTF-8 encoded CSV file to read. The first row
            is used as the header (``csv.DictReader`` default).

    Returns:
        A list with one dict per data row, keyed by the header names.
        If the file cannot be opened or parsed, the error is printed and an
        empty list is returned instead of raising.
    """
    try:
        # newline='' is what the csv module expects: it lets the reader do
        # its own line-ending handling, so line breaks embedded in quoted
        # fields (multi-line extracted text) come back unchanged.
        with open(csv_path, 'r', encoding='utf-8', newline='') as csvfile:
            dataset = list(csv.DictReader(csvfile))
    except Exception as e:
        # Deliberate best-effort: callers get an empty dataset on any failure.
        print(f"Error loading CSV: {str(e)}")
        return []
    else:
        print(f"Loaded {len(dataset)} CV records from {csv_path}")
        return dataset


# Purpose: Export all extracted texts to a single text file for NLP tasks
def export_texts_for_nlp(csv_path='cv_dataset.csv', output_path='cv_texts.txt'):
    """Write the 'extracted_text' column of a dataset CSV to a text file.

    The dataset is loaded with ``load_dataset_from_csv``. Each record becomes
    exactly one line of ``output_path``: line breaks inside the text are
    replaced by spaces and surrounding whitespace is stripped.

    Args:
        csv_path: Dataset CSV to read; it must have an 'extracted_text' column.
        output_path: UTF-8 text file to create or overwrite.

    Returns:
        True when the file was written, False when any error occurred (the
        error is printed, not raised).
    """
    try:
        dataset = load_dataset_from_csv(csv_path)

        with open(output_path, 'w', encoding='utf-8') as f:
            for record in dataset:
                # A row shorter than the header yields None for the missing
                # field; treat it as empty text instead of failing halfway
                # through the file. A missing column still raises KeyError.
                raw_text = record['extracted_text'] or ''

                # Replace every kind of line break with a space so a record
                # can never spill over onto a second output line.
                text = (
                    raw_text.replace('\r\n', ' ')
                    .replace('\r', ' ')
                    .replace('\n', ' ')
                    .strip()
                )
                f.write(text + '\n')

        print(f"Exported {len(dataset)} texts to {output_path}")
        return True

    except Exception as e:
        print(f"Error exporting texts: {str(e)}")
        return False


if __name__ == "__main__":
    # ==================== Main OPTION ====================
    # Example usage with placeholder paths: replace them with real files.

    # Single file. .get() matches the lookup used in the loop below, so a
    # result dict without an 'extracted_text' entry yields '' instead of
    # raising KeyError.
    results = process_cv_image("path/to/file.png", show_details=False)
    text = results.get('extracted_text', '')

    # Multiple files
    for file_path in ["cv1.png", "resume.pdf", "doc.docx"]:
        results = process_cv_image(file_path, show_details=False)
        text = results.get('extracted_text', '')
        print(text)




OPTION 4: Process ALL Nested Folders
------------------------------------------------------------
Downloading from https://www.kaggle.com/api/v1/datasets/download/youssefkhalil/resumes-images-datasets?dataset_version_number=1...


100%|██████████| 2.62G/2.62G [00:19<00:00, 147MB/s]

Extracting files...






SCANNING NESTED FOLDERS
Root directory: /root/.cache/kagglehub/datasets/youssefkhalil/resumes-images-datasets/versions/1

Found 12646 total images
Found 99 categories:
  - Accountant: 492 images
  - Accountant resumes: 67 images
  - Advocate: 453 images
  - Advocate resumes: 94 images
  - Agricultural: 397 images
  - Agricultural resumes: 68 images
  - Agriculture: 167 images
  - Apparel: 145 images
  - Apparel resumes: 51 images
  - Architect: 176 images
  - Architects resumes: 60 images
  - Arts: 463 images
  - Arts resumes: 83 images
  - Automobile: 156 images
  - Automobile resumes: 40 images
  - Avian: 72 images
  - Aviation: 251 images
  - Aviation resumes: 51 images
  - BPO resumes: 30 images
  - Banking: 307 images
  - Banking resumes: 41 images
  - Blockchain: 49 images
  - Blockchain resumes: 9 images
  - Building: 7 images
  - Building _Construction resumes: 55 images
  - Business Analyst resumes: 64 images
  - BusinessAnalyst: 80 images
  - Civil Engineer resumes: 89 image

KeyboardInterrupt: 