<a href="https://colab.research.google.com/github/SunSlick2/booktrade/blob/main/the_image_is_correct%2C_just_that_it_can't_extract_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Okay, I've updated the coordinates and added some common image pre-processing steps (grayscale and binarization) which often significantly improve OCR accuracy, especially on screenshots. I've also slightly relaxed the date regular expression to handle different delimiters and single-digit days/months more gracefully.

**Key Changes:**

1.  **New Coordinates:** `top_left_x = 331`, `top_left_y = 383`, `bottom_right_x = 418`, `bottom_right_y = 406`.
2.  **Image Pre-processing:**
      * Convert to `grayscale`.
      * Apply `Otsu's thresholding` to convert the image to pure black and white, which helps EasyOCR distinguish text from background.
3.  **Regex Refinement:** The date pattern now explicitly allows `/`, `-`, or `.` as separators.

-----

### Revised Python Code (`extract_date_easyocr.py`)

In [None]:
import mss
import mss.tools
from PIL import Image
import easyocr
import re
import os
import sys
import numpy as np
import cv2 # <--- NEW: Import OpenCV for image processing

# --- Set OpenMP environment variable to suppress duplicate-library warnings ---
# NOTE(review): this assignment runs after the imports at the top of the file.
# If the OpenMP duplicate-library message still appears, the assignment
# probably needs to move above those imports -- confirm on the target machine.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# --- Configuration ---
# Screen rectangle to capture, given as its top-left and bottom-right corners
# in screen pixels.
top_left_x = 331
top_left_y = 383
bottom_right_x = 418
bottom_right_y = 406

# Region dictionary handed to capture_screen_region(); width and height are
# derived from the two corners above.
capture_region = {
    "left": top_left_x,
    "top": top_left_y,
    "width": bottom_right_x - top_left_x,
    "height": bottom_right_y - top_left_y,
}

# Directory the result file is written to: the folder containing this script.
# __file__ is not defined when the code runs in a notebook cell or an
# interactive session, so fall back to the current working directory there
# instead of failing with NameError.
try:
    _script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _script_dir = os.getcwd()

# Path of the text file that receives the extracted date.
output_file_path = os.path.join(_script_dir, "extracted_date_easyocr.txt")

# Directory where EasyOCR models are stored locally (e.g., after manual download).
easyocr_model_dir = os.path.join(os.path.expanduser('~'), '.EasyOCR', 'model')

# Build the shared EasyOCR reader a single time at module load so every OCR
# call reuses the same instance. Models are read from the local model
# directory only; the script exits with status 1 if the reader cannot be built.
_reader_options = {
    "model_storage_directory": easyocr_model_dir,
    "download_enabled": False,  # never try to fetch models from the internet
}
try:
    sys.stdout.write("Attempting to initialize EasyOCR reader...\n")
    sys.stdout.flush()
    reader = easyocr.Reader(['en'], **_reader_options)  # 'en' = English
    sys.stdout.write("✅ EasyOCR reader initialized successfully from local models.\n")
    sys.stdout.flush()
except Exception as init_error:
    sys.stderr.write(f"❌ Error initializing EasyOCR reader. Please ensure models are in '{easyocr_model_dir}' and are correct: {init_error}\n")
    sys.stderr.flush()
    sys.exit(1)  # nothing else in the script can work without a reader


def capture_screen_region(region):
    """Grab one rectangle of the screen and return it as a PIL image.

    Args:
        region: Region description passed straight to the mss ``grab`` call
            (the caller in this file supplies a dict with ``left``, ``top``,
            ``width`` and ``height`` keys).

    Returns:
        An RGB ``PIL.Image`` built from the grabbed pixels, or ``None`` if
        anything goes wrong (the error is reported on stderr).
    """
    try:
        sys.stdout.write("Capturing screen region...\n")
        sys.stdout.flush()
        with mss.mss() as grabber:
            raw_shot = grabber.grab(region)
            # Build the PIL image from the raw RGB bytes of the screenshot.
            pil_image = Image.frombytes("RGB", raw_shot.size, raw_shot.rgb)
            return pil_image
    except Exception as capture_error:
        sys.stderr.write(f"❌ Error capturing screen: {capture_error}\n")
        sys.stderr.flush()
        return None

# Day (1-31), month (1-12) and a four-digit 19xx/20xx year, separated by "/",
# "-" or ".". Leading zeros on day and month are optional, and whitespace
# around a separator is tolerated so that a date which OCR split into several
# fragments can still be found once the fragments are joined with spaces.
_DATE_PATTERN = re.compile(
    r'\b(0?[1-9]|[12]\d|3[01])\s*[/\-.]\s*(1[0-2]|0?[1-9])\s*[/\-.]\s*((?:19|20)\d{2})\b'
)


def find_date_in_text(text):
    """Return the first date found in *text*, normalised to ``dd/mm/yyyy``.

    Args:
        text: Any string, typically raw OCR output.

    Returns:
        The date with zero-padded day and month and "/" separators, or
        ``None`` when *text* contains no day/month/year match. Only the
        ranges are checked (day 1-31, month 1-12); calendar validity such as
        "31/02" is not verified.
    """
    match = _DATE_PATTERN.search(text)
    if match is None:
        return None
    day, month, year = match.groups()
    return f"{day.zfill(2)}/{month.zfill(2)}/{year}"


def ocr_image_for_date(image):
    """Run EasyOCR on *image* and try to extract a date from the result.

    The image is converted to grayscale and binarised with Otsu's threshold
    before being handed to the module-level ``reader``.

    Args:
        image: A PIL image (anything ``np.array`` turns into an RGB array),
            or ``None``.

    Returns:
        The first date found, formatted ``dd/mm/yyyy``, or ``None`` if
        *image* is ``None``, no date is recognised, or OCR fails (the error
        is reported on stderr).
    """
    if image is None:
        return None

    try:
        sys.stdout.write("Performing OCR with EasyOCR...\n")
        sys.stdout.flush()

        # Convert PIL Image to NumPy array
        image_np = np.array(image)

        # --- Image pre-processing for better OCR accuracy ---
        # 1. Convert to grayscale
        gray_image = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)

        # 2. Otsu's thresholding picks the black/white cut-off automatically,
        #    which helps separate text from background.
        _, binary_image = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        results = reader.readtext(binary_image)

        # Each result is unpacked as (bounding box, text, confidence); only
        # the text is needed.
        texts = [text for (_bbox, text, _prob) in results]
        full_ocr_text = " ".join(texts)

        sys.stdout.write(f"DEBUG: EasyOCR Raw Text (concatenated): '{full_ocr_text.strip()}'\n")
        sys.stdout.flush()

        # Prefer a date contained in a single OCR fragment for precision.
        for text in texts:
            found = find_date_in_text(text)
            if found is not None:
                return found

        # Otherwise look across fragments: the pattern tolerates the spaces
        # used to join them, so a date split at a separator is still matched.
        return find_date_in_text(full_ocr_text)

    except Exception as e:
        sys.stderr.write(f"❌ Error during EasyOCR processing or date parsing: {e}\n")
        sys.stderr.flush()
        return None

def write_result_to_file(value, file_path):
    """Write *value* to *file_path*, replacing any existing content.

    Args:
        value: Text to write; ``None`` is written as an empty string.
        file_path: Destination path. Missing parent directories are created.
            A bare filename (no directory part) is written relative to the
            current working directory.

    Errors are reported on stderr and not raised, so a failed write never
    aborts the caller.
    """
    try:
        sys.stdout.write(f"Writing result to file: {file_path}\n")
        sys.stdout.flush()
        # os.path.dirname() returns "" for a bare filename and os.makedirs("")
        # raises, so only create the directory when there is one to create.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(value if value is not None else "")
    except Exception as e:
        sys.stderr.write(f"❌ Error writing to file {file_path}: {e}\n")
        sys.stderr.flush()

def main():
    """Capture the configured screen region, OCR it, and record the date.

    A debug copy of the captured region is saved as a PNG in the current
    working directory. Whatever happens, the result file at
    ``output_file_path`` is rewritten at the end: with the date when one was
    found, otherwise with an empty string.
    """
    sys.stdout.write("Entering main function...\n")
    sys.stdout.flush()
    found_date = None
    try:
        shot = capture_screen_region(capture_region)
        if shot:
            # Keep a copy of exactly what was captured for troubleshooting.
            debug_png = "captured_date_region_debug.png"
            shot.save(debug_png)
            sys.stdout.write(f"Captured region saved to {debug_png}\n")
            sys.stdout.flush()

            found_date = ocr_image_for_date(shot)

    except Exception as unexpected:
        sys.stderr.write(f"❌ An unexpected error occurred in main: {unexpected}\n")
        sys.stderr.flush()
    finally:
        sys.stdout.write("Exiting main function (finally block)...\n")
        sys.stdout.flush()
        # Always refresh the output file so a stale date is never left behind.
        write_result_to_file(found_date, output_file_path)
        summary = (
            f"✔️ Successfully wrote '{found_date}' to {output_file_path}\n"
            if found_date
            else f"⚠️ No date found or error occurred. Wrote empty string to {output_file_path}\n"
        )
        sys.stdout.write(summary)
        sys.stdout.flush()


# Script entry point: run the capture/OCR pipeline only when executed directly
# (not when imported), then print a final marker line on stdout.
if __name__ == "__main__":
    main()
    sys.stdout.write("--- Script finished ---\n")
    sys.stdout.flush()

-----

### **Crucial Step for Debugging OCR Accuracy:**

After running this updated code, please provide the output from the console, specifically the line that starts with:

**`DEBUG: EasyOCR Raw Text (concatenated): '...'`**

This line will show us exactly what text EasyOCR managed to extract from the image. Knowing this will help us determine if:

1.  EasyOCR is seeing *nothing* at all (meaning the image quality is too low for it).
2.  EasyOCR is seeing *some* text, but it's garbled or incorrect.
3.  EasyOCR is seeing the text correctly, but our date regular expression is failing to match it.

Based on that `DEBUG` output, we can decide if we need to refine the image processing further or adjust the regex.

**Remember to install `opencv-python` for the new pre-processing steps:**

```bash
conda activate your_easyocr_env # If in an environment
pip install opencv-python
```