In [1]:
# Install required packages
!pip install streamlit PyMuPDF pytesseract pdf2image Pillow pandas openpyxl opencv-python-headless googletrans==4.0.0-rc1
!apt-get install poppler-utils tesseract-ocr
!pip install pyngrok

# Ensure the necessary Tesseract language data files are installed
!apt-get install tesseract-ocr-eng tesseract-ocr-spa tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-chi-sim tesseract-ocr-msa tesseract-ocr-tam tesseract-ocr-hin

from PIL import Image
import os

# Save the rest of the code to a file
code = """
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import pandas as pd
import numpy as np
import cv2
import os
import re
import openpyxl
from openpyxl.styles import Font, Alignment
import streamlit as st
from googletrans import Translator

# Function to detect checkboxes
def detect_checkboxes(image_path):
    '''Find small box-like regions in a page image and classify their fill.

    The image is loaded as grayscale, smoothed with a 5x5 Gaussian blur and
    inverse-thresholded at 240, so every pixel that is not brighter than 240
    becomes foreground. Only outermost contours are considered.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A list of (x, y, w, h, state) tuples, one per contour whose bounding
        box is strictly between 10 and 50 pixels in both width and height.
        state is "Checked" when more than half of the bounding box is
        foreground in the thresholded image, otherwise "Unchecked".
    '''
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    smoothed = cv2.GaussianBlur(gray, (5, 5), 0)
    _level, binary = cv2.threshold(smoothed, 240, 255, cv2.THRESH_BINARY_INV)
    outlines, _hierarchy = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    found = []
    for outline in outlines:
        x, y, w, h = cv2.boundingRect(outline)
        # Size gate: anything outside the 10..50 px window is not treated as a checkbox.
        if not (10 < w < 50 and 10 < h < 50):
            continue
        ink = cv2.countNonZero(binary[y:y+h, x:x+w])
        state = "Checked" if ink > (w * h) // 2 else "Unchecked"
        found.append((x, y, w, h, state))

    return found

# Function to extract key-value pairs from text
def extract_key_value_pairs(text, checkboxes):
    '''Parse "key: value" lines of OCR text into a dict and add checkbox states.

    A line containing a colon is split at the first colon into key and value.
    Lines without a colon are treated as a continuation of the most recent
    non-empty key and are appended to its value, separated by one space.
    Repeated keys are concatenated the same way. Blank or whitespace-only
    lines are ignored, so they no longer leave stray spaces in the values.

    Args:
        text: The text to parse, one record line per newline-separated line.
        checkboxes: Iterable of (x, y, w, h, state) tuples.

    Returns:
        A dict mapping each key to its accumulated value, plus one
        "Checkbox at (x, y)" entry per checkbox holding its state.
    '''
    data = {}
    key = None
    for raw_line in text.split('\\n'):
        line = raw_line.strip()
        if not line:
            # Nothing to record; appending it would only add a trailing space.
            continue

        if ':' in line:
            before, _, after = line.partition(':')
            # The key is remembered even when the value is empty, so that
            # following lines can supply the value as a continuation.
            key = before.strip()
            addition = after.strip()
            if not (key and addition):
                continue
        elif key:
            addition = line
        else:
            continue

        data[key] = f"{data[key]} {addition}" if key in data else addition

    # Include checkbox states in data
    for (x, y, w, h, state) in checkboxes:
        data[f"Checkbox at ({x}, {y})"] = state

    return data

# Function to translate text
def translate_text(text, src_lang, dest_lang):
    '''Translate text from src_lang to dest_lang with googletrans.

    No request is made when there is nothing to translate: the text is empty
    or whitespace-only, or the source and destination languages are the same.
    In those cases the input is returned unchanged.

    Args:
        text: The text to translate.
        src_lang: Language code of the input text.
        dest_lang: Language code to translate into.

    Returns:
        The translated text, or the original text when no translation is needed.
    '''
    if not text or not text.strip() or src_lang == dest_lang:
        # Avoids a pointless network round trip per page (both language
        # selectors start on the same entry) and avoids sending blank OCR
        # output to the translation service.
        return text

    translator = Translator()
    translated = translator.translate(text, src=src_lang, dest=dest_lang)
    return translated.text

# Function to convert numeric columns to correct types
def convert_numeric(df):
    '''Convert every column that is entirely numeric to a numeric dtype.

    Columns that cannot be parsed as numbers are left untouched. The frame is
    modified in place and also returned.

    Args:
        df: The DataFrame to convert.

    Returns:
        The same DataFrame object, with convertible columns replaced.
    '''
    for col in df.columns:
        try:
            converted = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # Not a numeric column: keep the original values. This replaces
            # the deprecated errors='ignore' option of pd.to_numeric.
            continue
        df[col] = converted
    return df

# Streamlit app: page header and upload widget
st.title("PDF to Excel Converter")
st.write("Upload PDF files to convert them to a single Excel file.")

uploaded_files = st.file_uploader(
    "Choose PDF files",
    type="pdf",
    accept_multiple_files=True,
)

# One row per supported language: (display name, OCR code, translation code).
# Both lookup tables below are derived from it, so they always share the same
# display names in the same order.
_LANGUAGES = [
    ("English", "eng", "en"),
    ("Spanish", "spa", "es"),
    ("French", "fra", "fr"),
    ("German", "deu", "de"),
    ("Chinese (Simplified)", "chi_sim", "zh-cn"),
    ("Malay", "msa", "ms"),
    ("Tamil", "tam", "ta"),
    ("Hindi", "hin", "hi"),
]

# Language selection for OCR
ocr_languages = {name: ocr_code for name, ocr_code, _ in _LANGUAGES}
ocr_language = st.selectbox("Select the language for OCR", list(ocr_languages))

# Language selection for translation
translate_languages = {name: tr_code for name, _, tr_code in _LANGUAGES}
translate_language = st.selectbox("Select the language for Translation", list(translate_languages))

if uploaded_files:
    # One dict of extracted fields per PDF page, across all uploaded files.
    combined_data_list = []

    # The language codes depend only on the two selections, so resolve them
    # once instead of once per page. The OCR selection doubles as the source
    # language for translation.
    lang_code = ocr_languages[ocr_language]
    src_lang_code = translate_languages[ocr_language]
    dest_lang_code = translate_languages[translate_language]

    for uploaded_file in uploaded_files:
        # Save uploaded file to disk. Only the last path component of the
        # client-supplied name is used, so the name cannot redirect the write
        # into another directory.
        pdf_path = f"uploaded_{os.path.basename(uploaded_file.name)}"
        with open(pdf_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        # Convert PDF pages to images
        pages = convert_from_path(pdf_path, 300)
        for i, page in enumerate(pages):
            page_image_path = f'page_{i + 1}.jpg'
            page.save(page_image_path, 'JPEG')

            # Extract text from the page image; the context manager closes
            # the image file as soon as OCR is done.
            with Image.open(page_image_path) as page_image:
                text = pytesseract.image_to_string(page_image, lang=lang_code)

            # Translate text if necessary
            translated_text = translate_text(text, src_lang=src_lang_code, dest_lang=dest_lang_code)

            # Detect checkboxes
            checkboxes = detect_checkboxes(page_image_path)

            # Extract key-value pairs from translated text and collect them
            combined_data_list.append(extract_key_value_pairs(translated_text, checkboxes))

    # Convert combined data to DataFrame
    df = pd.DataFrame(combined_data_list)
    df = convert_numeric(df)  # Convert numeric columns to correct types

    # Save data to Excel
    excel_path = 'extracted_customer_data.xlsx'
    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Data')

    # Read the workbook back through a context manager so the handle is closed.
    with open(excel_path, "rb") as excel_file:
        excel_bytes = excel_file.read()

    st.success("PDFs have been successfully converted to a single Excel file.")
    st.download_button(label="Download Excel file", data=excel_bytes, file_name="extracted_customer_data.xlsx")
"""

with open("app.py", "w") as file:
    file.write(code)

# Run the Streamlit app with ngrok
from pyngrok import ngrok

# Set the authtoken
!ngrok authtoken 2k5QrK9IDU2SWnmmtJiBxIGT547_2u3W4dMSG7byC5gmkLSmr

# Connect to ngrok
public_url = ngrok.connect(8501)
print(f"Streamlit app is live at: {public_url}")

# Run the Streamlit app
!streamlit run app.py

Collecting streamlit
  Downloading streamlit-1.37.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting PyMuPDF
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.8.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-

In [None]:
# Install required packages
!pip install streamlit PyMuPDF pytesseract pdf2image Pillow pandas openpyxl opencv-python-headless googletrans==4.0.0-rc1
!apt-get install poppler-utils tesseract-ocr
!pip install pyngrok

# Ensure the necessary Tesseract language data files are installed
!apt-get install tesseract-ocr-eng tesseract-ocr-spa tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-chi-sim tesseract-ocr-msa tesseract-ocr-tam tesseract-ocr-hin

from PIL import Image
import os

# Save the rest of the code to a file
code = """
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import pandas as pd
import numpy as np
import cv2
import os
import re
import openpyxl
from openpyxl.styles import Font, Alignment
import streamlit as st
from googletrans import Translator

# Function to detect checkboxes
def detect_checkboxes(image_path):
    '''Find small box-like regions in a page image and classify their fill.

    The image is loaded as grayscale, smoothed with a 5x5 Gaussian blur and
    inverse-thresholded at 240, so every pixel that is not brighter than 240
    becomes foreground. Only outermost contours are considered.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A list of state strings, one per contour whose bounding box is
        strictly between 10 and 50 pixels in both width and height. A state
        is "Checked" when more than half of the bounding box is foreground in
        the thresholded image, otherwise "Unchecked".
    '''
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    smoothed = cv2.GaussianBlur(gray, (5, 5), 0)
    _level, binary = cv2.threshold(smoothed, 240, 255, cv2.THRESH_BINARY_INV)
    outlines, _hierarchy = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    states = []
    for outline in outlines:
        x, y, w, h = cv2.boundingRect(outline)
        # Size gate: anything outside the 10..50 px window is not treated as a checkbox.
        if not (10 < w < 50 and 10 < h < 50):
            continue
        ink = cv2.countNonZero(binary[y:y+h, x:x+w])
        states.append("Checked" if ink > (w * h) // 2 else "Unchecked")

    return states

# Function to extract key-value pairs from text
def extract_key_value_pairs(text, checkboxes):
    '''Parse "key: value" lines of OCR text into a dict and add checkbox states.

    A line containing a colon is split at the first colon into key and value.
    Lines without a colon are treated as a continuation of the most recent
    non-empty key and are appended to its value, separated by one space.
    Repeated keys are concatenated the same way. Blank or whitespace-only
    lines are ignored, so they no longer leave stray spaces in the values.

    Args:
        text: The text to parse, one record line per newline-separated line.
        checkboxes: Iterable of checkbox state values, in detection order.

    Returns:
        A dict mapping each key to its accumulated value, plus one
        "Checkbox N" entry per checkbox (numbered from 1) holding its state.
    '''
    data = {}
    key = None
    for raw_line in text.split('\\n'):
        line = raw_line.strip()
        if not line:
            # Nothing to record; appending it would only add a trailing space.
            continue

        if ':' in line:
            before, _, after = line.partition(':')
            # The key is remembered even when the value is empty, so that
            # following lines can supply the value as a continuation.
            key = before.strip()
            addition = after.strip()
            if not (key and addition):
                continue
        elif key:
            addition = line
        else:
            continue

        data[key] = f"{data[key]} {addition}" if key in data else addition

    # Include checkbox states in data
    for i, state in enumerate(checkboxes):
        data[f"Checkbox {i+1}"] = state

    return data

# Function to translate text
def translate_text(text, src_lang, dest_lang):
    '''Translate text from src_lang to dest_lang with googletrans.

    No request is made when there is nothing to translate: the text is empty
    or whitespace-only, or the source and destination languages are the same.
    In those cases the input is returned unchanged.

    Args:
        text: The text to translate.
        src_lang: Language code of the input text.
        dest_lang: Language code to translate into.

    Returns:
        The translated text, or the original text when no translation is needed.
    '''
    if not text or not text.strip() or src_lang == dest_lang:
        # Avoids a pointless network round trip per page (both language
        # selectors start on the same entry) and avoids sending blank OCR
        # output to the translation service.
        return text

    translator = Translator()
    translated = translator.translate(text, src=src_lang, dest=dest_lang)
    return translated.text

# Function to convert numeric columns to correct types
def convert_numeric(df):
    '''Convert every column that is entirely numeric to a numeric dtype.

    Columns that cannot be parsed as numbers are left untouched. The frame is
    modified in place and also returned.

    Args:
        df: The DataFrame to convert.

    Returns:
        The same DataFrame object, with convertible columns replaced.
    '''
    for col in df.columns:
        try:
            converted = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # Not a numeric column: keep the original values. This replaces
            # the deprecated errors='ignore' option of pd.to_numeric.
            continue
        df[col] = converted
    return df

# Streamlit app: page header and upload widget
st.title("PDF to Excel Converter")
st.write("Upload PDF files to convert them to a single Excel file.")

uploaded_files = st.file_uploader(
    "Choose PDF files",
    type="pdf",
    accept_multiple_files=True,
)

# One row per supported language: (display name, OCR code, translation code).
# Both lookup tables below are derived from it, so they always share the same
# display names in the same order.
_LANGUAGES = [
    ("English", "eng", "en"),
    ("Spanish", "spa", "es"),
    ("French", "fra", "fr"),
    ("German", "deu", "de"),
    ("Chinese (Simplified)", "chi_sim", "zh-cn"),
    ("Malay", "msa", "ms"),
    ("Tamil", "tam", "ta"),
    ("Hindi", "hin", "hi"),
]

# Language selection for OCR
ocr_languages = {name: ocr_code for name, ocr_code, _ in _LANGUAGES}
ocr_language = st.selectbox("Select the language for OCR", list(ocr_languages))

# Language selection for translation
translate_languages = {name: tr_code for name, _, tr_code in _LANGUAGES}
translate_language = st.selectbox("Select the language for Translation", list(translate_languages))

if uploaded_files:
    # One dict of extracted fields per PDF page, across all uploaded files.
    combined_data_list = []

    # The language codes depend only on the two selections, so resolve them
    # once instead of once per page. The OCR selection doubles as the source
    # language for translation.
    lang_code = ocr_languages[ocr_language]
    src_lang_code = translate_languages[ocr_language]
    dest_lang_code = translate_languages[translate_language]

    for uploaded_file in uploaded_files:
        # Save uploaded file to disk. Only the last path component of the
        # client-supplied name is used, so the name cannot redirect the write
        # into another directory.
        pdf_path = f"uploaded_{os.path.basename(uploaded_file.name)}"
        with open(pdf_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        # Convert PDF pages to images
        pages = convert_from_path(pdf_path, 300)
        for i, page in enumerate(pages):
            page_image_path = f'page_{i + 1}.jpg'
            page.save(page_image_path, 'JPEG')

            # Extract text from the page image; the context manager closes
            # the image file as soon as OCR is done.
            with Image.open(page_image_path) as page_image:
                text = pytesseract.image_to_string(page_image, lang=lang_code)

            # Translate text if necessary
            translated_text = translate_text(text, src_lang=src_lang_code, dest_lang=dest_lang_code)

            # Detect checkboxes
            checkboxes = detect_checkboxes(page_image_path)

            # Extract key-value pairs from translated text and collect them
            combined_data_list.append(extract_key_value_pairs(translated_text, checkboxes))

    # Convert combined data to DataFrame
    df = pd.DataFrame(combined_data_list)
    df = convert_numeric(df)  # Convert numeric columns to correct types

    # Save data to Excel
    excel_path = 'extracted_customer_data.xlsx'
    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Data')

    # Read the workbook back through a context manager so the handle is closed.
    with open(excel_path, "rb") as excel_file:
        excel_bytes = excel_file.read()

    st.success("PDFs have been successfully converted to a single Excel file.")
    st.download_button(label="Download Excel file", data=excel_bytes, file_name="extracted_customer_data.xlsx")
"""

with open("app.py", "w") as file:
    file.write(code)

# Run the Streamlit app with ngrok
from pyngrok import ngrok

# Set the authtoken
!ngrok authtoken 2k5QrK9IDU2SWnmmtJiBxIGT547_2u3W4dMSG7byC5gmkLSmr

# Connect to ngrok
public_url = ngrok.connect(8501)
print(f"Streamlit app is live at: {public_url}")

# Run the Streamlit app
!streamlit run app.py

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr-chi-sim is already the newest version (1:4.00~git30-7274cfa-1.1).
tesseract-ocr-deu is already the newest version (1:4.00~git30-7274cfa-1.1).
tesseract-ocr-eng is already the newest version (1:4.00~git30-7274cfa-1.1).
tesseract-ocr-fra is already the newest version (1:4.00~git30-7274cfa-1.1).
tesseract-ocr-hin is already the newest version (1:4.00~git30-7274cfa-1.1).
tesseract-ocr-msa is already the newest version (1:4.00~git30-7274cfa-1.1).
tesseract-ocr-spa is already the newest version (1:4.00~git30-7274cfa-1.1).
tesseract-ocr-tam is already the newest version (1:4.00~git30-7274cfa-1.1).
0