# In [None]:
import cv2
import pytesseract
import numpy as np
import os
import re
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils import get_column_letter

# Function to enhance image quality for OCR
def enhance_image(image_path, debug_output_path='processed_image.png'):
    """Load an image and return a contrast-enhanced, binarised copy of it.

    Processing steps, in order: BGR -> grayscale, CLAHE contrast
    equalisation, ``cv2.fastNlMeansDenoising``, Gaussian adaptive
    thresholding, and one pass of dilation with a 2x2 kernel.

    Args:
        image_path: Path of the image file to load.
        debug_output_path: Where to save a copy of the processed image for
            inspection. Defaults to ``'processed_image.png'`` (relative to
            the current working directory). Pass ``None`` or an empty string
            to skip writing the file.

    Returns:
        The processed single-channel image returned by ``cv2.dilate``.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
        ValueError: If OpenCV cannot decode the file at ``image_path``.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"The image file does not exist at: {image_path}")

    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Unable to read the image file: {image_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    gray = clahe.apply(gray)

    # Denoise the image.
    # Positional arguments: dst=None, h=10 (filter strength),
    # templateWindowSize=7, searchWindowSize=21.
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

    # Adaptive thresholding: 11x11 neighbourhood, constant 2 subtracted from
    # the Gaussian-weighted local mean; pixels above the threshold become 255.
    thresh = cv2.adaptiveThreshold(
        denoised,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )

    # Slight dilation with a 2x2 kernel.
    # NOTE(review): dilation grows the white (255) regions. With THRESH_BINARY
    # on dark-on-light input the text is black, so this presumably thins text
    # strokes rather than connecting them -- confirm the intended polarity.
    kernel = np.ones((2, 2), np.uint8)
    dilated = cv2.dilate(thresh, kernel, iterations=1)

    # Keep a copy on disk for visual debugging unless the caller opted out.
    if debug_output_path:
        cv2.imwrite(debug_output_path, dilated)
    return dilated

# Function to clean up OCR-extracted text
def clean_text(text):
    """Return ``text`` reduced to a tidy, single-spaced string.

    Every character other than ASCII letters, digits, whitespace, ``.`` and
    ``$`` is removed. Remaining runs of whitespace (including newlines and
    tabs) are then collapsed to single spaces and the ends are trimmed.
    """
    # Character class of everything that must be dropped.
    disallowed = r'[^a-zA-Z0-9\s\.$]'
    kept = re.sub(disallowed, '', text)

    # split() with no argument discards leading/trailing/repeated whitespace.
    tokens = kept.split()
    return ' '.join(tokens)

# Helper: bounding boxes of candidate table cells in a binarised image
def _find_cell_boxes(binary_image, min_size=20):
    """Return ``(x, y, w, h)`` boxes of external contours at least ``min_size`` wide and tall."""
    contours, _ = cv2.findContours(binary_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    boxes = (cv2.boundingRect(contour) for contour in contours)
    return [(x, y, w, h) for x, y, w, h in boxes if w >= min_size and h >= min_size]


# Helper: arrange cell boxes into table rows
def _group_into_rows(cells, y_tolerance=10):
    """Group ``(x, y, w, h)`` boxes into rows (top to bottom), each sorted left to right.

    Boxes are visited in ``(y, x)`` order; a box starts a new row when its
    ``y`` differs from the previously visited box's ``y`` by ``y_tolerance``
    or more.
    """
    rows = []
    current_row = []
    last_y = None

    for cell in sorted(cells, key=lambda c: (c[1], c[0])):
        y = cell[1]
        if last_y is not None and abs(y - last_y) >= y_tolerance:
            rows.append(sorted(current_row, key=lambda c: c[0]))
            current_row = []
        current_row.append(cell)
        # Compare against the previous box, not the first box of the row.
        last_y = y

    if current_row:
        rows.append(sorted(current_row, key=lambda c: c[0]))
    return rows


# Function to extract and save the table with structure
def extract_table(image_path, output_excel_path):
    """OCR the cells of a table image and write them to an Excel workbook.

    Cell regions are located on the output of ``enhance_image`` and grouped
    into rows; each region is then cropped from the original (unprocessed)
    image, read with Tesseract (``--psm 6``), cleaned with ``clean_text`` and
    written to the sheet "Extracted Table" at the matching row/column.

    Args:
        image_path: Path of the table image to read.
        output_excel_path: Path of the ``.xlsx`` file to create.

    Returns:
        None. Any exception raised along the way is caught and reported with
        ``print`` rather than propagated, so a failure does not abort the
        caller.
    """
    try:
        # Validate and preprocess first so a bad path fails before the
        # second, unchecked read of the same file below.
        enhanced_image = enhance_image(image_path)
        original_image = cv2.imread(image_path)

        rows = _group_into_rows(_find_cell_boxes(enhanced_image))

        # Create an Excel workbook
        wb = Workbook()
        ws = wb.active
        ws.title = "Extracted Table"

        # Excel rows and columns are 1-based.
        for row_num, row_cells in enumerate(rows, start=1):
            for col_num, (x, y, w, h) in enumerate(row_cells, start=1):
                roi = original_image[y:y+h, x:x+w]

                # Apply Tesseract OCR
                text = pytesseract.image_to_string(roi, config='--psm 6').strip()
                text = clean_text(text)

                # Debug print
                print(f"Extracted text (Row {row_num}, Col {col_num}): {text}")

                # Write the text to the corresponding cell in the Excel sheet
                col_letter = get_column_letter(col_num)
                ws[f"{col_letter}{row_num}"] = text

        # Save the workbook
        wb.save(output_excel_path)
        print(f"Table data extracted and saved to {output_excel_path}")

    except Exception as e:
        # Deliberate best-effort: report the problem instead of raising.
        print(f"An error occurred: {e}")

# Example usage
image_path = 'image_1.jpeg'
output_excel_path = 'extracted_table_structured.xlsx'

# Run the extraction only when this file is executed directly (script or
# notebook cell), not as a side effect of importing it from other code.
if __name__ == "__main__":
    extract_table(image_path, output_excel_path)
