In [1]:
import pytesseract

# Tell pytesseract which Tesseract executable to launch.
# The value below is an example path for a Windows install; change it to
# wherever tesseract.exe actually lives on this machine.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


In [3]:
import cv2
import pytesseract
import re
import os
import csv

# Preprocessing function
def preprocess_image(image_path, *, blur_kernel=(5, 5), threshold=150):
    """Load an image and return a blurred, binarised version for OCR.

    The image is read with OpenCV, smoothed with a Gaussian blur, converted
    to grayscale and finally thresholded to a black/white image.

    Args:
        image_path: Path of the image file handed to ``cv2.imread``.
        blur_kernel: ``(width, height)`` of the Gaussian kernel passed to
            ``cv2.GaussianBlur``. Defaults to ``(5, 5)``.
        threshold: Gray level passed to ``cv2.threshold`` as the cut-off;
            with ``THRESH_BINARY`` pixels above it become 255 and the rest 0.
            Defaults to ``150``.

    Returns:
        The thresholded single-channel image produced by ``cv2.threshold``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for
            ``image_path`` (i.e. OpenCV could not read the file).
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising.
    if img is None:
        raise FileNotFoundError(f"Could not read the image at: {image_path}")

    # Smooth first to suppress noise that would otherwise survive thresholding.
    blurred = cv2.GaussianBlur(img, blur_kernel, 0)

    # Collapse the BGR channels to a single intensity channel.
    gray = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)

    # Fixed-level binarisation; the first return value (the threshold used)
    # is not needed here.
    _, binary = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)

    return binary

# Extract numbers using Tesseract
def extract_numbers(image_path):
    """Run OCR on an image and return the numeric tokens found in its text.

    The image is preprocessed with ``preprocess_image`` and passed to
    ``pytesseract.image_to_string``; the recognised text is then scanned for
    integers and decimals.

    Args:
        image_path: Path of the image file to process.

    Returns:
        A list of strings, one per number found, in order of appearance
        (e.g. ``['12', '3.5']``). Empty if the text contains no digits.

    Raises:
        FileNotFoundError: Propagated from ``preprocess_image`` when the
            image cannot be read.
    """
    # Preprocess the image
    img = preprocess_image(image_path)

    # Use Tesseract to extract text
    text = pytesseract.image_to_string(img)

    # A decimal point only counts as part of the number when digits follow
    # it, so sentence punctuation such as "Total 123." yields "123" rather
    # than "123.".
    return re.findall(r'\d+(?:\.\d+)?', text)

# Save results to CSV
def save_to_csv(output_file, results):
    """Write the per-image extraction results to a CSV file.

    The file is created (or overwritten) with a header row followed by one
    row per entry of ``results``.

    Args:
        output_file: Path of the CSV file to write.
        results: Mapping of image file name to a list of number strings.
            Each list is joined with ``', '`` into a single cell.
    """
    # newline='' lets the csv module control line endings; the explicit
    # UTF-8 encoding keeps the output independent of the platform's default
    # code page, so non-ASCII file names cannot fail to encode.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Image File', 'Extracted Numbers'])
        writer.writerows(
            [filename, ', '.join(numbers)]
            for filename, numbers in results.items()
        )

# Main processing function
def process_images(image_files, output_file):
    """Extract numbers from every image and store them in one CSV file.

    Each path is passed to ``extract_numbers``. Successful results are keyed
    by the file's base name; failures are reported on stdout and the image
    is left out of the output. The collected results are then written with
    ``save_to_csv`` and a confirmation line is printed.

    Args:
        image_files: Iterable of image file paths to process.
        output_file: Path of the CSV file to write.
    """
    collected = {}

    for path in image_files:
        try:
            found = extract_numbers(path)
            key = os.path.basename(path)
            collected[key] = found
        except FileNotFoundError as err:
            # Unreadable image: report it and move on to the next file.
            print(f"File error: {err}")
        except Exception as err:
            # Any other failure is also non-fatal for the batch.
            print(f"Error processing {path}: {err}")

    save_to_csv(output_file, collected)
    print(f"Results saved to {output_file}")

# Image paths to process (add your images here). These are relative paths,
# so they are resolved against the current working directory.
image_files = [
    'VW_DTN01_L_00007_101000000000000000002038853600000.front.JPG',
    'VW_DTN01_A_00002_101000000000000000002051033000000.front.JPG',
]

# Output CSV file; it is overwritten on every run.
output_csv = 'extracted_numbers.csv'

# Run the processing: OCR each image above and write the results to output_csv.
process_images(image_files, output_csv)


Results saved to extracted_numbers.csv
