# In [2]:
import cv2
import pytesseract
from pytesseract import Output
import re

# Tell pytesseract where the Tesseract executable lives. The value below is a
# hard-coded Windows install path; update it to match the local installation.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Load and preprocess the image
def preprocess_image(image_path):
    """Load an image from disk and binarise it for OCR.

    The image is read with OpenCV, converted from BGR to grayscale and then
    thresholded with Otsu's method (``THRESH_BINARY | THRESH_OTSU``).

    Args:
        image_path: Path of the image file to load.

    Returns:
        The thresholded single-channel image produced by ``cv2.threshold``.

    Raises:
        OSError: If OpenCV could not read the file (``cv2.imread`` returned
            ``None``), e.g. the path is wrong or the format is unsupported.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with a clear message rather than with an opaque error in cvtColor.
    if image is None:
        raise OSError(
            f"Unable to read image {image_path!r} "
            "(missing file or unsupported format)"
        )

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # The threshold value 0 is a placeholder: THRESH_OTSU picks the actual one.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    return binary

# Extract raw text using Tesseract
def extract_text(image):
    """Run Tesseract on *image* and return the recognised text.

    Tesseract is invoked through ``pytesseract.image_to_string`` with the
    fixed options ``--oem 3 --psm 6``.

    Args:
        image: The image to recognise, as accepted by pytesseract.

    Returns:
        The raw text returned by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(image, config=r'--oem 3 --psm 6')

# Patterns used by parse_text, all matched case-insensitively.
# The letter lookarounds stop the labels from firing inside unrelated words
# such as "hotel", "excellent", "Contact:" or "example.com:8080".
_PHONE_LABEL = re.compile(
    r'(?<![a-z])(?:telephone|phone|tel)(?![a-z])|(?<![a-z])t:', re.I)
_MOBILE_LABEL = re.compile(
    r'(?<![a-z])(?:mobile|cell(?:ular|phone)?)(?![a-z])|(?<![a-z])m:', re.I)
_EMAIL = re.compile(r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b', re.I)
# Accept https:// as well as http:// and bare www. addresses.
_WEB = re.compile(r'\b(?:https?://|www\.)\S+\b', re.I)


def parse_text(text):
    """Sort OCR output lines into business-card fields.

    Each non-blank line is stripped and classified by the first rule that
    matches, in this order: phone label, mobile label, e-mail address, web
    address. The first line matching none of them becomes ``'Name'``; every
    later unmatched line is appended (space-separated) to ``'Company'``.
    When several lines match the same field, the last one wins.

    ``'Address'`` and ``'Job'`` are part of the result for callers' benefit
    but are never filled in by this function.

    Args:
        text: Raw OCR text; may be empty or ``None``.

    Returns:
        A dict with the keys ``Name``, ``Address``, ``Phone``, ``Mobile``,
        ``Company``, ``Job``, ``Email`` and ``Web``. Fields that were not
        found are ``None``; found fields hold the whole stripped line.
    """
    result = {
        'Name': None,
        'Address': None,
        'Phone': None,
        'Mobile': None,
        'Company': None,
        'Job': None,
        'Email': None,
        'Web': None,
    }

    # Checked in order; the first matching rule claims the line.
    matchers = (
        ('Phone', _PHONE_LABEL),
        ('Mobile', _MOBILE_LABEL),
        ('Email', _EMAIL),
        ('Web', _WEB),
    )

    company_parts = []
    # splitlines() also handles '\r\n' and form-feed page separators.
    for raw_line in (text or '').splitlines():
        line = raw_line.strip()
        if not line:
            # Blank lines must not become the name or pad the company.
            continue

        for field, pattern in matchers:
            if pattern.search(line):
                result[field] = line
                break
        else:
            if result['Name'] is None:
                # Assume the first unclassified line is the name.
                result['Name'] = line
            else:
                company_parts.append(line)

    if company_parts:
        result['Company'] = ' '.join(company_parts)

    return result

def ocr_business_card(image_path):
    """Run the full pipeline on one image: preprocess, OCR, then parse.

    Args:
        image_path: Path of the business-card image.

    Returns:
        The field dict produced by ``parse_text`` for the recognised text.
    """
    return parse_text(extract_text(preprocess_image(image_path)))

# Example usage. Guarded so that importing this file does not trigger OCR;
# it still runs when executed as a script or as a notebook cell.
if __name__ == "__main__":
    image_path = './download.png'
    result = ocr_business_card(image_path)
    print(result)


# Example output: {'Name': 'Payee v', 'Address': None, 'Phone': None, 'Mobile': None, 'Company': 'fad company  DANI MARTINEZ  FINANCE MANAGER  922-456-7090  Qin coaigroatste com  Gretisireatyereastecamn  125 pero 8 Any Gy, ST 1245 ', 'Job': None, 'Email': None, 'Web': None}
