<a href="https://colab.research.google.com/github/PhoenixAlpha23/Pytesseract-Streamlit-App/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install Google's Tesseract engine

In [None]:
# Refresh the package index first so the install does not hit stale mirrors, and pass -y
# so apt's confirmation prompt is answered automatically and the cell runs unattended.
!sudo apt-get update -qq
!sudo apt-get install -y tesseract-ocr

Install the Python wrapper pytesseract and the other libraries this project uses:
* NumPy – array operations.
* pdf2image – handling PDF input.
* Streamlit – creates a temporary web app for using this project.
* PyMuPDF (imported as `fitz`) – opening PDFs and rendering their pages to images.

In [None]:
#install necessary packages , opencv- image preprocessing, pytesseract pillow- OCR components,streamlit for demonstration purposes,
!pip install streamlit opencv-python-headless pytesseract Pillow numpy pdf2image pyMuPDF fitz

Write the Streamlit App

In [None]:
%%writefile app.py
import streamlit as st
import cv2
import numpy as np
import pytesseract
from PIL import Image
import io
import fitz
from pdf2image import convert_from_bytes
import zipfile

def ensure_gray(image):
    """
    Return a single-channel (grayscale) version of ``image``.

    Args:
        image: NumPy image array. A 2-D array is treated as already grayscale;
            anything else is handed to OpenCV as an RGB-ordered colour image,
            which is the channel order of the arrays this app builds from PIL
            images.

    Returns:
        The input array itself when it is 2-D, otherwise a new grayscale array.
    """
    if len(image.shape) == 2:
        return image
    # The arrays in this app come from PIL (np.array(Image)), i.e. RGB order, so
    # use the RGB conversion; COLOR_BGR2GRAY would swap the red and blue weights.
    return cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

def deskew_hough(image):
    """
    Estimate the skew of the text with a Hough line transform and rotate it away.

    Edges are detected with Canny, straight lines are found with
    ``cv2.HoughLines``, and the median angle of the near-horizontal lines
    (the direction text lines and rules run in) is taken as the page skew.

    Args:
        image: NumPy image array (grayscale or colour).

    Returns:
        The rotated image, or ``image`` unchanged when no near-horizontal line
        is found or the estimated skew is zero.
    """
    gray = ensure_gray(image)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLines(edges, 1, np.pi / 180, 100)
    if lines is None:
        return image

    # HoughLines yields (rho, theta) pairs in an (N, 1, 2) array. theta is the
    # angle of the line's *normal*, so a perfectly horizontal line has
    # theta == pi/2 and a vertical one has theta == 0 (or pi).
    thetas = lines.reshape(-1, 2)[:, 1]

    # Keep only lines within 45 degrees of horizontal: vertical lines (page
    # borders, table rules) would otherwise yield a ~90 degree "correction".
    near_horizontal = thetas[np.abs(thetas - np.pi / 2) < np.pi / 4]
    if near_horizontal.size == 0:
        return image

    # The median is robust against a few stray lines at odd angles.
    skew_deg = float(np.degrees(np.median(near_horizontal))) - 90.0
    if skew_deg == 0:
        return image

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    # A positive angle rotates counter-clockwise, which undoes a clockwise tilt
    # (theta > 90 degrees) in image coordinates.
    M = cv2.getRotationMatrix2D(center, skew_deg, 1.0)
    return cv2.warpAffine(
        image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
    )

def preprocess_image(image, options):
    """
    Prepare an image for OCR.

    The image is always converted to grayscale first. The remaining steps run
    in the order below, each only when its flag in ``options`` is truthy
    (a missing flag counts as off):

    1. ``apply_denoise``   - non-local-means denoising.
    2. ``apply_contrast``  - histogram equalisation.
    3. ``apply_deskew``    - skew correction via ``deskew_hough``.
    4. ``apply_threshold`` - Otsu binarisation.

    Binarisation runs last on purpose: denoising and contrast enhancement need
    the full gray-level range to work with, and rotating an already binarised
    image with cubic interpolation would reintroduce gray pixels.

    Args:
        image: NumPy image array (grayscale or colour).
        options: Mapping holding the boolean flags listed above.

    Returns:
        The preprocessed single-channel image.
    """
    gray = ensure_gray(image)

    if options.get('apply_denoise'):
        # Arguments: dst=None, filter strength h=10, template window 7, search window 21.
        gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

    if options.get('apply_contrast'):
        gray = cv2.equalizeHist(gray)

    if options.get('apply_deskew'):
        gray = deskew_hough(gray)

    if options.get('apply_threshold'):
        # With THRESH_OTSU the threshold value (0 here) is ignored and computed
        # from the image histogram.
        _, gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    return gray

def extract_text(image, options):
    """
    Run Tesseract (through pytesseract) on a preprocessed image.

    Args:
        image: Image to recognise, in any form ``pytesseract.image_to_string``
            accepts (this app passes a NumPy array).
        options: Mapping of OCR settings. ``psm`` selects the page segmentation
            mode (default 3) and the optional ``lang`` selects the Tesseract
            language (default ``"eng"``).

    Returns:
        The recognised text as a string.
    """
    psm = options.get('psm', 3)
    lang = options.get('lang', 'eng')
    # Tesseract variables must be introduced with "-c"; a bare NAME=VALUE token
    # is read as the name of a config file and the setting is silently lost.
    config = f"--oem 3 --psm {psm} -c preserve_interword_spaces=1"
    return pytesseract.image_to_string(image, config=config, lang=lang)

def process_image(uploaded_file, options):
    """
    Run OCR on one uploaded image file.

    Steps:
    1. Open the image with PIL.
    2. Flatten any transparency onto a white background.
    3. Convert to 8-bit grayscale and then to a NumPy array.
    4. Preprocess the array and extract its text.

    Args:
        uploaded_file: File-like object holding the image data.
        options: Preprocessing/OCR options, passed through to
            ``preprocess_image`` and ``extract_text``.

    Returns:
        The text recognised in the image.
    """
    image = Image.open(uploaded_file)

    # Transparent pixels usually carry black colour data, which would turn a
    # transparent background into black-on-black text once alpha is dropped.
    if image.mode in ("RGBA", "LA") or "transparency" in image.info:
        rgba = image.convert("RGBA")
        white = Image.new("RGBA", rgba.size, (255, 255, 255, 255))
        image = Image.alpha_composite(white, rgba)

    # Normalise every PIL mode (palette, 1-bit, CMYK, RGB, ...) to mode "L",
    # 8-bit grayscale, so the array is always a 2-D uint8 image whose values
    # are real intensities rather than palette indices or booleans.
    image_np = np.array(image.convert("L"))

    processed_image = preprocess_image(image_np, options)
    return extract_text(processed_image, options)

def process_pdf(uploaded_file, options, dpi=300):
    """
    Run OCR on every page of an uploaded PDF.

    Steps:
    1. Read the PDF bytes and open them with PyMuPDF (``fitz``).
    2. Render each page to a raster image at ``dpi`` dots per inch.
    3. Preprocess each page image and extract its text.
    4. Join the page texts with blank lines.

    Args:
        uploaded_file: File-like object holding the PDF data.
        options: Preprocessing/OCR options, passed through to
            ``preprocess_image`` and ``extract_text``.
        dpi: Rendering resolution. PyMuPDF's default of 72 dpi makes normal
            body text only a few pixels tall, which is too coarse for OCR.

    Returns:
        The text of all pages, separated by blank lines.

    Raises:
        ValueError: If ``dpi`` is not positive.
    """
    if dpi <= 0:
        raise ValueError("dpi must be positive")

    pdf_bytes = uploaded_file.read()
    # PDF page coordinates are 72 units per inch, so scale by dpi / 72.
    zoom = dpi / 72
    matrix = fitz.Matrix(zoom, zoom)

    texts = []
    # The context manager closes the document even if a page fails to process.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=matrix, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            # Convert to 8-bit grayscale up front so the array is a plain 2-D image.
            img_np = np.array(img.convert("L"))
            processed_image = preprocess_image(img_np, options)
            texts.append(extract_text(processed_image, options))
    return "\n\n".join(texts)

def main():
    """
    Build and run the Streamlit user interface.

    1. Shows the title, the file uploader and the OCR options sidebar.
    2. Runs OCR on every uploaded file (PDFs via ``process_pdf``, everything
       else via ``process_image``), reporting per-file errors in the page.
    3. Displays the combined text of the files that succeeded.
    4. Offers the combined text as a .txt download and the per-file texts as
       a .zip download.
    """
    st.title("Enhanced OCR Text Extraction from Images and PDFs")
    st.write("Upload multiple images or a PDF file to extract text.")

    uploaded_files = st.file_uploader(
        "Choose files",
        accept_multiple_files=True,
        type=["png", "jpg", "jpeg", "pdf"],
    )

    st.sidebar.header("OCR Options")
    options = {
        'apply_threshold': st.sidebar.checkbox("Apply Thresholding", value=True),
        'apply_deskew': st.sidebar.checkbox("Apply Deskewing", value=True),
        'apply_denoise': st.sidebar.checkbox("Apply Denoising", value=True),
        'apply_contrast': st.sidebar.checkbox("Apply Contrast Enhancement", value=False),
        'psm': st.sidebar.selectbox(
            "Page Segmentation Mode",
            options=[3, 4, 6, 11, 12],
            format_func=lambda x: f"PSM {x}",
            # Descriptions follow Tesseract's own list of page segmentation modes.
            help="3: Fully automatic, 4: Single column, 6: Single uniform block of text, "
                 "11: Sparse text, 12: Sparse text with OSD",
        ),
    }

    if not uploaded_files:
        return

    all_text = []
    individual_texts = {}
    for uploaded_file in uploaded_files:
        try:
            if uploaded_file.type == "application/pdf":
                text = process_pdf(uploaded_file, options)
            else:
                text = process_image(uploaded_file, options)
            all_text.append(f"File: {uploaded_file.name}\n\n{text}\n\n{'='*50}\n")
            individual_texts[uploaded_file.name] = text
        except Exception as e:
            # Deliberately broad: one unreadable file must not stop the others.
            st.error(f"Error processing {uploaded_file.name}: {str(e)}")

    if not individual_texts:
        # Every file failed; the errors are already shown, so there is nothing
        # to display or to offer for download.
        return

    combined_text = "\n".join(all_text)
    st.text_area("Extracted Text", value=combined_text, height=300)

    # Download button for the combined extracted text.
    st.download_button(
        label="Download Combined Extracted Text",
        data=combined_text.encode('utf-8'),
        file_name="combined_extracted_text.txt",
        mime="text/plain"
    )

    # Download button for a zip archive with one text file per uploaded file.
    zip_buffer = io.BytesIO()
    # "w" because the archive is always built from scratch in a fresh buffer.
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
        for file_name, text in individual_texts.items():
            zip_file.writestr(f"{file_name}_extracted.txt", text)

    st.download_button(
        label="Download Individual Extracted Texts",
        data=zip_buffer.getvalue(),
        file_name="individual_extracted_texts.zip",
        mime="application/zip"
    )

# Script entry point: build the UI only when this file is executed directly
# (the notebook launches it with `streamlit run app.py`), not when it is imported.
if __name__ == "__main__":
    main()

Overwriting app.py


In [None]:
# Print this machine's public IPv4 address, which localtunnel uses as the
# password/endpoint IP when the tunnel page is opened.
import urllib.request  # `import urllib` alone does not guarantee the `request` submodule is loaded

# The timeout keeps the cell from hanging if the lookup service is unreachable,
# and the context manager closes the HTTP response.
with urllib.request.urlopen('https://ipv4.icanhazip.com', timeout=10) as response:
    public_ip = response.read().decode('utf8').strip("\n")
print("Password/Enpoint IP for localtunnel is:", public_ip)

Password/Enpoint IP for localtunnel is: 34.125.254.51


In [None]:
!streamlit run app.py &>/content/logs.txt &
!npx localtunnel --port 8501 &

import time
from google.colab import files
import IPython

# Wait for Streamlit and localtunnel to start
time.sleep(10)

# Get the localtunnel URL
localtunnel_url = !curl -s http://localhost:4040/api/tunnels | python -c "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

print(f"Your Streamlit app is running at: {localtunnel_url[0]}")
IPython.display.HTML(f'<a href="{localtunnel_url[0]}" target="_blank">Click here to open the Streamlit app</a>')