# In [None]:
import os
import re
from google.cloud import vision
from fpdf import FPDF
from PIL import Image

# Name the service-account key file through the environment variable that the
# Google Cloud client libraries consult for credentials.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS='vision_key.json')

# Pre-compiled pattern matching a run of one or more word characters.
WORD = re.compile(r"\w+")

def detect_text(path):
    """Run dense-text OCR on an image file with the Google Cloud Vision API.

    Args:
        path: Path of the image file; its bytes are read in binary mode and
            sent to the API.

    Returns:
        The ``description`` of the first text annotation in the response, or
        an empty string when the response contains no text annotations.

    Raises:
        Exception: If the response carries a non-empty error message.
    """
    client = vision.ImageAnnotatorClient()

    with open(path, "rb") as image_file:
        image = vision.Image(content=image_file.read())

    # Use the document variant of text detection, intended for dense text.
    response = client.document_text_detection(image=image)

    # Guard clause: surface an API-reported error before looking at results.
    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )

    annotations = response.text_annotations
    # The first annotation is taken as the full detected text.
    return annotations[0].description if annotations else ""

def format_text(ocr_text):
    """Break OCR output into sentence-like lines.

    The stripped text is cut after every full stop that is followed by
    whitespace. Newlines remaining inside a piece are replaced with single
    spaces (so a sentence wrapped over several OCR lines becomes one line),
    each piece is trimmed, and pieces that are blank are dropped.

    Args:
        ocr_text: Raw text string as returned by the OCR step.

    Returns:
        A list of cleaned, non-empty strings in their original order.
    """
    # The lookbehind keeps the full stop attached to the sentence it ends.
    pieces = re.split(r'(?<=[.])\s+', ocr_text.strip())

    cleaned = []
    for piece in pieces:
        if not piece.strip():
            continue  # nothing but whitespace; skip it
        cleaned.append(piece.replace('\n', ' ').strip())

    return cleaned

def convert_text_to_pdf(text_lines, output_pdf_path, font_path):
    """Write lines of text to a PDF file using a Unicode TrueType font.

    Args:
        text_lines: Iterable of strings; each one is written as its own
            wrapped paragraph, in order.
        output_pdf_path: Destination path of the generated PDF.
        font_path: Path to a Unicode-capable TrueType font file
            (e.g. DejaVuSans.ttf), registered under the family name "DejaVu".

    Side effects:
        Creates/overwrites ``output_pdf_path`` and prints a confirmation line.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Register and select a Unicode-compliant font (DejaVuSans).
    pdf.add_font("DejaVu", "", font_path, uni=True)
    pdf.set_font("DejaVu", size=12)

    for line in text_lines:
        # Start every paragraph at the left margin. fpdf2's multi_cell leaves
        # the cursor at the right edge by default (new_x=RIGHT), so a second
        # full-width multi_cell would have no horizontal space and raise.
        # On legacy PyFPDF the cursor is already at the left margin, so this
        # reset changes nothing there.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, line)

    pdf.output(output_pdf_path)
    print(f"PDF saved as {output_pdf_path}")

# --- Configuration -----------------------------------------------------------
# Input image to run OCR on.
image_path = "/home/niyati/PBL_MAJOR/test3.jpeg"
# Where the generated PDF is written (relative path).
output_pdf_path = "output_text.pdf"
# Unicode TrueType font used for the PDF; this absolute path must exist.
font_path = "/home/niyati/PBL_MAJOR/dejavu-sans-fonts/DejaVuSans.ttf"

# --- Pipeline: OCR -> sentence-like lines -> PDF -----------------------------
raw_text = detect_text(image_path)
formatted_text = format_text(raw_text)
convert_text_to_pdf(formatted_text, output_pdf_path, font_path)

# --- Optional: open the source image in the default viewer -------------------
image = Image.open(image_path)
image.show()
