# Introduction

In this notebook, we use Amazon Textract and Google Vision to provide a quick way of extracting text/tables from an image of a page.

Intended use: This notebook is meant for quick prototyping. You should expect to modify the code in this notebook to suit your use case.

Preparation: At a minimum, set a working folder, and make sure to add your API keys for both Textract and Google Vision. To do so, please follow the steps outlined here: https://github.com/MikeJGiordano/OCR_History/blob/main/ReadMe.md

This notebook contains four parts:

    1. Unmodified image OCR. This is intended to quickly detect text from a single image.
        a. There is then an option to run one or both OCR tools on a whole folder.
        
    2. Image preprocessing. This routine helps you to quickly preprocess a single image (adjust contrast, split image, etc). 
        a. If you are satisfied with the preprocessing routine, it will give you the option to preprocess a whole folder.
        
    3. Image preprocessing with text extraction. This feeds the preprocessed images from part 2 into the text detection from part 1.
    
    4. Image preprocessing with table extraction from Textract. This uses the image modification from part 2 to extract a table using Textract.

# Program Setup

## There are 5 steps, marked A-E.

### A: Import packages

In [None]:
import io
import json
import os

# if you don't have these packages use any package manager to install
# you can install all packages at once using the provided requirements.txt file
import cv2
import boto3
from google.cloud import vision

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm as tq

from PIL import Image, ImageDraw
from textractor import Textractor
from textractor.visualizers.entitylist import EntityList
from textractor.data.constants import TextractFeatures, Direction, DirectionalFinderType
import math 

# note: the following py file, you'll have to download
import preprocess as pp 
import logging
import sys

# Set up logging: show INFO and above, each record prefixed with a timestamp
# and its level. `logger` is the module-level logger that the helper
# functions defined later in this notebook write to.
logging.basicConfig(level=logging.INFO,
                   format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Disable PIL max image size limit so that very large scans can be opened.
# NOTE(review): this limit is Pillow's safeguard against decompression-bomb
# images; with it disabled, only open images from sources you trust.
Image.MAX_IMAGE_PIXELS = None

### B: Please set your working directories here

In [None]:
# please set the path to the folder containing your images here
# input_folder:  folder holding the source images.
# output_folder: folder that receives the results.
# Both values are machine-specific examples; replace them with your own paths.
# They are passed to the preprocessing and batch-extraction calls further down.
input_folder = "/mnt/c/Users/WATLINGS/Documents/OCR Files/Census Processing/Documents/1920/Output"
output_folder = "/mnt/c/Users/WATLINGS/Documents/OCR Files/Census Processing/Documents/1920/Output_OCR"

In [None]:
#Authenticate Google Cloud here
# Replace the path below with the location of your own service-account JSON key.
# The environment variable is set before the client is created so that the
# client can pick the credentials up from it.

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/mnt/c/Users/WATLINGS/Documents/GitHub/OCR_History/OCR_Python/ServiceAccountToken.json'
client = vision.ImageAnnotatorClient()

### E: Please authenticate Amazon Textract

For help with Amazon Textract, see https://github.com/MikeJGiordano/OCR_History/blob/main/Setup_AWS_Root.md

In [None]:
#Authenticate AWS Textract in the console/terminal

# Part 1: Basic text extraction

In [None]:

def resize_image_if_needed(img, max_size=8000, quality=85):
    """
    Downscale a PIL image so that neither side exceeds ``max_size`` pixels.

    The aspect ratio is preserved. An image that already fits is returned
    untouched (same object, same mode). An image that had to be resized is
    additionally converted to RGB before it is returned.

    Args:
        img: An opened ``PIL.Image.Image``.
        max_size: Largest allowed width/height in pixels. Must be positive.
        quality: Unused by this function. It is kept only so that existing
            callers passing it keep working; the JPEG quality is decided by
            whichever code saves the returned image.

    Returns:
        The original image when no resize was needed, otherwise a new,
        resized RGB image.

    Raises:
        ValueError: If ``max_size`` is not positive.
        Exception: Any error raised while resizing or converting is logged
            and re-raised unchanged.
    """
    if max_size <= 0:
        raise ValueError(
            f"max_size must be a positive number of pixels, got {max_size!r}"
        )

    width, height = img.size
    logger.info(f"Processing image of size {width}x{height}")

    # Nothing to do when both sides already fit.
    if width <= max_size and height <= max_size:
        return img

    scale = max_size / max(width, height)
    # Clamp to 1 pixel: for a very elongated image, flooring the short side
    # can yield 0, and PIL refuses to resize to a zero-sized dimension.
    new_width = max(1, math.floor(width * scale))
    new_height = max(1, math.floor(height * scale))

    try:
        logger.info(f"Resizing image from {width}x{height} to {new_width}x{new_height}")

        # Use LANCZOS for better quality, but fall back to NEAREST if memory error
        try:
            img = img.resize((new_width, new_height), Image.LANCZOS)
        except MemoryError:
            logger.warning("Memory error with LANCZOS, falling back to NEAREST")
            img = img.resize((new_width, new_height), Image.NEAREST)

        logger.info("Resize successful")

        # Convert to RGB if needed
        if img.mode != 'RGB':
            img = img.convert('RGB')
            logger.info("Converted to RGB mode")

        return img

    except Exception as e:
        logger.error(f"Error during resize: {str(e)}")
        raise

# First, let's create a function to process images and save results
def process_and_save_text(input_folder, output_folder, filename):
    """
    Run Textract text detection on one image and save the results.

    The image is opened, downscaled if necessary, converted to RGB and
    encoded as JPEG in memory. The bytes are sent to Textract's
    ``detect_document_text``; the full response is written as JSON and the
    text of every LINE block is written to a text file, one line per block.

    Args:
        input_folder: Folder that contains ``filename``.
        output_folder: Folder that receives ``<name>_Textract.txt`` and
            ``<name>_Textract.json``. It is created if it does not exist.
        filename: File name of the image inside ``input_folder``.

    Returns:
        True when both output files were written, False when any step
        failed. Failures are printed rather than raised so that a caller
        looping over many files can continue.
    """
    print(f"\nProcessing {filename}...")

    # Setup paths
    input_path = os.path.join(input_folder, filename)
    base_name = os.path.splitext(filename)[0]
    output_text = os.path.join(output_folder, f"{base_name}_Textract.txt")
    output_json = os.path.join(output_folder, f"{base_name}_Textract.json")

    try:
        # Make sure the destination exists; otherwise opening the output
        # files below fails on a fresh output folder.
        os.makedirs(output_folder, exist_ok=True)

        # Process image
        with Image.open(input_path) as img:
            # Resize if needed
            img = resize_image_if_needed(img)
            # JPEG cannot store alpha/palette modes, so normalise to RGB
            if img.mode != 'RGB':
                img = img.convert('RGB')
            # Save as JPEG in memory
            # NOTE(review): Textract limits the size of inline image bytes;
            # lower `quality` if Textract rejects very large pages.
            buffer = io.BytesIO()
            img.save(buffer, format='JPEG', quality=95)
            image_content = buffer.getvalue()

        # Process with Textract
        textract = boto3.client('textract')
        response = textract.detect_document_text(
            Document={'Bytes': image_content}
        )

        # Save JSON response
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(response, f, indent=2)

        # Save extracted text: one output line per LINE block
        with open(output_text, 'w', encoding='utf-8') as f:
            f.writelines(
                block.get('Text', '') + '\n'
                for block in response['Blocks']
                if block['BlockType'] == 'LINE'
            )

        print(f"Successfully processed {filename}")
        print(f"Text saved to: {output_text}")
        print(f"JSON saved to: {output_json}")

        return True

    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure.
        print(f"Error processing {filename}: {e}")
        return False





# Part 2: Preprocess images
Often, it helps to preprocess an image. Common routines are:
    
    1. Adjusting contrast or brightness
    2. Converting to grayscale
    3. Cropping
    4. Erasing margins
    5. Splitting images
    
We now provide two examples:
    
    1. Applying points 1-4 
    2. Preprocessing and splitting the image

### Example 1: Full image

In [None]:
# set the filename to your image here
# (file name only; it is passed together with input_folder to pp.preprocess_image below)
railroad_table = "1888_Page_161.png"

In [None]:
#The next cell will apply the default preprocess settings to your image.
#If you are unsatisfied with those settings, it will instruct you on how to make changes.
#Those changes should be inserted in this cell.



In [None]:
#Preprocess a single image.
# pp.default is unpacked into keyword arguments, so any setting changed on it
# beforehand is applied here. The trailing semicolon keeps the notebook from
# echoing the call's return value.
pp.preprocess_image(railroad_table,
                       input_folder,
                       output_folder,
                       **pp.default);

### Example 2: Split image

In [None]:
# set the filename to your split image here
# (file name only; it is passed together with input_folder to pp.preprocess_image below)
korean_image = "126.png"

In [None]:
#The next cell will apply the default preprocess settings to your image.
#If you are unsatisfied with those settings, it will provide instructions on how to make changes.

# These assignments modify pp.default in place, so the new values stay in
# effect for every later call that passes **pp.default (including Example 1
# if you re-run it) until they are changed again or pp is reloaded.
pp.default['left_margin_percent'] = 30
pp.default['top_margin_percent'] = 5

In [None]:
#Preprocess a split image.
# Uses the margin settings written to pp.default in the previous cell.
# The trailing semicolon keeps the notebook from echoing the return value.
pp.preprocess_image(korean_image,
                       input_folder,
                       output_folder,
                       **pp.default);

# Part 3: Preprocessed Text Extraction

### Example 1: Full image

In [None]:
# using the above processing, the folder of modified images is located at:
# NOTE(review): this is a relative path, so it is resolved against the
# notebook's current working directory. Confirm it matches the folder that
# pp.preprocess_image actually wrote to for the output_folder set in step B.

modified_images = "output/modified_images/"

# Modification alters the name of the file to be:
# (the original file name with a 'modified_' prefix)

modified_railroad = 'modified_' + railroad_table

In [None]:
# plot the image, save .json outputs
# Reads modified_railroad from the modified_images folder and writes to
# output_folder. Only Textract is enabled in this example; Google Vision is off.
pp.process_content(modified_railroad, 
                   modified_images,
                   output_folder,
                   show_image = True,
                   use_google_vision=False, 
                   use_textract=True, 
                   verbose=True)

### Example 2: Split image

In [None]:
# Modification splits the file into two and renames them:
# the two parts carry the prefixes 'modified_1_' and 'modified_2_' in front
# of the original file name.

modified_1_split = 'modified_1_' + korean_image
modified_2_split = 'modified_2_' + korean_image

In [None]:
# plot the images, save .json and .txt outputs
# First part: displayed and sent to Google Vision only (Textract is off).
pp.process_content(modified_1_split, 
                   modified_images,
                   output_folder,
                   show_image = True,
                   use_google_vision=True, 
                   use_textract=False, 
                   verbose=True)

# Second part.
# NOTE(review): both use_google_vision and use_textract are False here, so
# presumably no OCR is run for this part. Set one of them to True if you
# also want text for the second half of the page.
pp.process_content(modified_2_split, 
                   modified_images,
                   output_folder,
                   show_image = False,
                   use_google_vision=False, 
                   use_textract=False, 
                   verbose=False)

### You can use the next cell to get text and JSON files for the entire folder of modified images through Google Vision, Textract, or both.

In [None]:
# Batch process all images in the modified folder, save .json outputs to the output folder
# NOTE(review): both engines are switched off below, so as written this call
# presumably performs no OCR. Set use_google_vision and/or use_textract to
# True for the engine(s) you want before running the cell.

pp.batch_ocr(modified_images, 
                 output_folder, 
                 use_google_vision=False, 
                 use_textract=False)

# Part 4: Textract Table Extraction

### Setup

Initialize Textractor client, modify region if required

In [None]:
# Create the Textractor client from the AWS credentials profile named "default".
# Change profile_name if your credentials are stored under a different profile.
extractor = Textractor(profile_name="default")

Please specify the image you want to extract a table from.

In [None]:
# using the above processing, the folder of modified images is located at:
# NOTE(review): relative path, resolved against the notebook's current
# working directory; confirm it matches where the preprocessed images were written.

modified_images = "output/modified_images/"

# Name of the image you want to extract a table from.
# NOTE(review): unlike the Part 3 examples, this name carries no 'modified_'
# prefix, and none of the cells shown below read file_name; the batch cell
# at the end works on whole folders instead.

file_name = "Volume 1. Population, General Report and Analysis_8.png"

## Extract the tables

In [None]:
import os
from PIL import Image
import math
import io
from tqdm import tqdm 

def process_image(input_path, max_size=8000):
    """
    Load an image from disk and return it as in-memory JPEG bytes.

    The image is downscaled through ``resize_image_if_needed`` when a side
    exceeds ``max_size``, converted to RGB when it is in another mode, and
    encoded as an optimized JPEG. Quality 85 is used when the longest side
    is at most 4000 pixels, quality 75 otherwise.

    Args:
        input_path: Path of the image file to read.
        max_size: Maximum width/height in pixels handed to the resize helper.

    Returns:
        The encoded JPEG as ``bytes``.

    Raises:
        MemoryError: Logged with a hint to lower ``max_size``, then re-raised.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        logger.info(f"Opening image: {input_path}")

        # The context manager closes the underlying file once we are done.
        with Image.open(input_path) as source:
            original_dims = source.size
            logger.info(f"Original image size: {original_dims}, Mode: {source.mode}")

            # Shrink first, then normalise the mode so JPEG encoding works.
            prepared = resize_image_if_needed(source, max_size)
            if prepared.mode != 'RGB':
                prepared = prepared.convert('RGB')

            # Larger results are compressed a little harder.
            if max(prepared.size) <= 4000:
                jpeg_quality = 85
            else:
                jpeg_quality = 75

            jpeg_buffer = io.BytesIO()
            prepared.save(jpeg_buffer, format='JPEG', quality=jpeg_quality, optimize=True)

            logger.info(f"Successfully processed image. Original size: {original_dims}, Final size: {prepared.size}")
            return jpeg_buffer.getvalue()

    except MemoryError:
        logger.error(f"Memory error processing {input_path}. Try reducing max_size parameter.")
        raise
    except Exception as e:
        logger.error(f"Error processing image {input_path}: {str(e)}")
        raise

def batch_resize_and_extract(extractor, input_folder, output_folder, max_size=8000):
    """
    Extract tables from every image in a folder and save them as Excel files.

    Each image is converted to JPEG bytes by ``process_image`` (which also
    downsizes it when needed) and analysed with the TABLES feature. Every
    table found is written to ``<image name>_table_<n>.xlsx`` in
    ``output_folder``. A failure on one image is logged and recorded, and
    processing continues with the next image.

    Args:
        extractor: Object providing ``analyze_document`` (a Textractor client).
        input_folder: Folder to scan for images.
        output_folder: Folder for the Excel files; created if missing.
        max_size: Maximum image side in pixels, forwarded to ``process_image``.

    Returns:
        A tuple ``(successful, failed, tables_found)``:
        ``successful`` is the list of file names with at least one saved
        table, ``failed`` is a list of ``(file name, reason)`` pairs, and
        ``tables_found`` is the number of tables saved. The same tuple shape
        (with empty lists and 0) is returned when the folder holds no images,
        so callers can always unpack three values.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Collect image files by extension (case-insensitive); sorted so that the
    # processing order and the returned lists are deterministic.
    valid_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.tif', '.tiff', '.bmp')
    image_files = sorted(
        f for f in os.listdir(input_folder)
        if f.lower().endswith(valid_extensions)
    )

    successful = []
    failed = []
    tables_found = 0

    if not image_files:
        logger.warning(f"No image files found in {input_folder}")
        return successful, failed, tables_found

    logger.info(f"\nProcessing {len(image_files)} images...")

    for filename in tqdm(image_files, desc="Processing images"):
        try:
            # Process image first
            input_path = os.path.join(input_folder, filename)
            image_content = process_image(input_path, max_size)

            # Extract tables using Textract
            # NOTE(review): file_source is raw JPEG bytes here; confirm the
            # installed textractor version accepts bytes for analyze_document.
            document = extractor.analyze_document(
                file_source=image_content,
                features=[TextractFeatures.TABLES],
                save_image=True
            )

            if document and document.tables:
                base_name = os.path.splitext(filename)[0]
                # Save each table; count it only once it has been written so
                # the total is not inflated when a save fails part-way.
                for i, table in enumerate(document.tables):
                    excel_filename = f"{base_name}_table_{i+1}.xlsx"
                    output_path = os.path.join(output_folder, excel_filename)
                    table.to_excel(output_path)
                    tables_found += 1
                successful.append(filename)
                logger.info(f"Successfully extracted {len(document.tables)} tables from {filename}")
            else:
                failed.append((filename, "No tables found"))
                logger.warning(f"No tables found in {filename}")

        except Exception as e:
            # Best-effort batch: record the failure and move on.
            logger.error(f"Error processing {filename}: {str(e)}")
            failed.append((filename, str(e)))

    # Print summary
    logger.info("\nProcessing complete!")
    logger.info(f"Successfully processed: {len(successful)} images")
    logger.info(f"Total tables extracted: {tables_found}")
    if failed:
        logger.error(f"\nFailed to process {len(failed)} images:")
        for filename, error in failed:
            logger.error(f"- {filename}: {error}")

    return successful, failed, tables_found

In [None]:
# Create the Textractor client from the "default" AWS profile and run table
# extraction over every image in input_folder, writing Excel files to output_folder.
# NOTE(review): this reads from input_folder (the folder set in step B), not
# from modified_images; point it at the preprocessed folder if that is what you want.
extractor = Textractor(profile_name="default")
successful, failed, tables = batch_resize_and_extract(extractor, input_folder, output_folder)