In [None]:
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 19 06:22:04 2024

@author: Vishnu Parappulakkal
https://vishnu.framer.media/
"""

import os
import tkinter as tk
from tkinter import filedialog, messagebox
from PIL import Image
import pytesseract
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics

# Export TESSDATA_PREFIX for this process before any OCR call is made.
# NOTE(review): the directory below is specific to one machine's Anaconda
# install - confirm it exists wherever this script is run.
os.environ.update(TESSDATA_PREFIX=r'D:\Programs\Anaconda\share\tessdata')

class OCRApp:
    """Small Tkinter front end that OCRs one image and writes it out as a PDF.

    The generated PDF has a single page sized to the image's pixel dimensions,
    with the image drawn as the background and every recognised word drawn on
    top of it at the position reported by the OCR engine.
    """

    # Font names used on the canvas, mapped to the TrueType files they load.
    _REGULAR_FONT = 'Arial-Regular'
    _BOLD_FONT = 'Arial-Bold'
    _FONT_FILES = {
        _REGULAR_FONT: 'arial.ttf',
        _BOLD_FONT: 'arialbd.ttf',
    }
    # Font size used when the OCR engine reports a non-positive box height.
    _DEFAULT_FONT_SIZE = 12

    def __init__(self, root):
        """Build the two-button UI inside *root* (the Tk root window)."""
        self.root = root
        self.root.title("OCR to PDF Converter")

        # Create and place buttons
        self.btn_open = tk.Button(root, text="Open Image", command=self.open_image)
        self.btn_open.pack(pady=10)

        # "Save PDF" stays disabled until an image has been chosen.
        self.btn_save = tk.Button(root, text="Save PDF", command=self.save_pdf, state=tk.DISABLED)
        self.btn_save.pack(pady=10)

        self.image_path = None
        self.pdf_path = None

    def open_image(self):
        """Ask the user for an image file and enable/disable "Save PDF" to match.

        A cancelled dialog clears the current selection and disables the save
        button, so the button state always reflects whether ``image_path`` is
        usable.
        """
        # A tuple of patterns is understood by Tk on every platform; a single
        # semicolon-joined string is only interpreted by the Windows dialog.
        selected = filedialog.askopenfilename(
            filetypes=[("Image Files", ("*.png", "*.jpg", "*.jpeg"))]
        )
        # The dialog returns an empty value on cancel; normalise it to None.
        self.image_path = selected or None

        if self.image_path:
            messagebox.showinfo("Image Selected", f"Image selected: {self.image_path}")
            self.btn_save.config(state=tk.NORMAL)
        else:
            self.btn_save.config(state=tk.DISABLED)
            messagebox.showwarning("No Image Selected", "Please select an image file.")

    def save_pdf(self):
        """OCR the selected image and write the image plus recognised text to a PDF.

        Any exception raised while reading the image, running OCR, or writing
        the PDF is reported to the user in an error dialog instead of
        propagating.
        """
        if not self.image_path:
            messagebox.showwarning("No Image Selected", "Please select an image file.")
            return

        # Ask for the destination first so that cancelling the dialog does not
        # cost a full OCR pass.
        self.pdf_path = filedialog.asksaveasfilename(defaultextension=".pdf", filetypes=[("PDF Files", "*.pdf")])
        if not self.pdf_path:
            return

        try:
            # The context manager closes the image file once OCR is done.
            with Image.open(self.image_path) as image:
                image_width, image_height = image.size
                data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

            self._register_fonts()

            # One page, sized to the image, with the image as the background.
            c = canvas.Canvas(self.pdf_path, pagesize=(image_width, image_height))
            c.drawImage(self.image_path, 0, 0, width=image_width, height=image_height)

            # Overlay extracted text on the image
            for text, x, y, font_size, font_name, rotation in self._word_placements(data, image_height):
                c.setFont(font_name, font_size)
                if rotation:
                    # Rotate about a point half a font size in from the text
                    # origin, then restore the untransformed state.
                    pivot_x = x + font_size / 2
                    pivot_y = y + font_size / 2
                    c.saveState()
                    c.translate(pivot_x, pivot_y)
                    c.rotate(rotation)
                    c.translate(-pivot_x, -pivot_y)
                    c.drawString(x, y, text)
                    c.restoreState()
                else:
                    c.drawString(x, y, text)

            # Save the canvas as a PDF
            c.save()

            messagebox.showinfo("Process Complete", f"PDF file saved at: {self.pdf_path}")

        except Exception as e:
            # Top-level GUI boundary: surface the failure to the user.
            messagebox.showerror("Error", str(e))

    @classmethod
    def _register_fonts(cls):
        """Register the TrueType fonts used for the overlay, once per process."""
        already_registered = pdfmetrics.getRegisteredFontNames()
        for font_name, font_file in cls._FONT_FILES.items():
            if font_name not in already_registered:
                pdfmetrics.registerFont(TTFont(font_name, font_file))

    @classmethod
    def _word_placements(cls, data, image_height):
        """Yield ``(text, x, y, font_size, font_name, rotation)`` per non-blank word.

        *data* is the dict of parallel lists returned by the OCR call; the
        ``'text'``, ``'left'``, ``'top'`` and ``'height'`` lists are read, plus
        ``'rotate'`` when that key is present.  *image_height* is used to flip
        the top-down image Y coordinate into the bottom-up PDF one.
        """
        rotations = data.get('rotate')
        columns = zip(data['text'], data['left'], data['top'], data['height'])
        for index, (text, left, top, height) in enumerate(columns):
            if not text.strip():  # Only draw non-empty text
                continue

            # The box height doubles as the font size; fall back when unusable.
            font_size = height if height > 0 else cls._DEFAULT_FONT_SIZE

            # Placeholder heuristic kept from the original: a word containing
            # the substring "bold" is drawn in the bold face.
            font_name = cls._BOLD_FONT if 'bold' in text.lower() else cls._REGULAR_FONT

            # Missing or falsy rotation values are treated as "no rotation".
            rotation = (rotations[index] if rotations else 0) or 0

            # Flip Y so the bottom edge of the box becomes the text baseline.
            baseline_y = image_height - (top + height)
            yield text, left, baseline_y, font_size, font_name, rotation

if __name__ == "__main__":
    # Script entry point: create the Tk root window, attach the OCR UI to it,
    # and hand control to the Tk event loop until the window is closed.
    root = tk.Tk()
    app = OCRApp(root)  # keep a reference for the lifetime of the event loop
    root.mainloop()
