In [None]:
# Multiple PDFs

# Install required packages
!pip install streamlit PyMuPDF pytesseract pdf2image Pillow pandas openpyxl opencv-python-headless googletrans==4.0.0-rc1
!apt-get install poppler-utils tesseract-ocr
!pip install pyngrok

# Ensure the necessary Tesseract language data files are installed
!apt-get install tesseract-ocr-eng tesseract-ocr-spa tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-chi-sim tesseract-ocr-msa tesseract-ocr-tam tesseract-ocr-hin

from PIL import Image
import os

# Save the rest of the code to a file
code = """
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import pandas as pd
import numpy as np
import cv2
import os
import re
import openpyxl
from openpyxl.styles import Font, Alignment
import streamlit as st
from googletrans import Translator

# Function to detect images
def detect_images(image_path, min_size=50):
    '''Crop bright rectangular regions out of a page image and save them.

    The page is converted to grayscale and thresholded at 240, external
    contours are located, and every contour whose bounding box is larger
    than min_size pixels in both width and height is written to the
    images/ directory as detected_image_<x>_<y>.jpg.

    Args:
        image_path: Path of the page image to scan.
        min_size: Bounding boxes must exceed this width and height (in
            pixels) to be kept; smaller boxes are treated as noise.

    Returns:
        List of paths of the cropped image files that were written.

    Raises:
        FileNotFoundError: If the page image cannot be read.
    '''
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising,
        # which would otherwise surface as an opaque cvtColor error below.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Create the output directory once instead of re-checking per contour.
    os.makedirs('images', exist_ok=True)

    images = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w <= min_size or h <= min_size:
            # Filter out small boxes that are likely not images
            continue
        output_path = f'images/detected_image_{x}_{y}.jpg'
        cv2.imwrite(output_path, image[y:y+h, x:x+w])
        images.append(output_path)

    return images

# Function to extract key-value pairs from text
def extract_key_value_pairs(text):
    '''Parse "key: value" lines of OCR text into a dictionary.

    A line containing a colon is split at the first colon into a key and a
    value. A following non-empty line without a colon is treated as a
    continuation and is appended, space-separated, to the value of the most
    recent key. If that key had no value on its own line, the continuation
    line becomes its value.

    Args:
        text: Multi-line text to parse.

    Returns:
        Dict mapping each key to its (possibly multi-line) value. Keys that
        never receive a value are omitted.
    '''
    data = {}
    key = None
    for line in text.splitlines():
        if ':' in line:
            raw_key, _, raw_value = line.partition(':')
            key = raw_key.strip()
            value = raw_value.strip()
            if key and value:
                data[key] = value
            continue

        continuation = line.strip()
        if not key or not continuation:
            # Nothing to attach to, or a blank line that would only add
            # stray trailing whitespace to the current value.
            continue

        # Handling multiline values. The key may not be stored yet when its
        # own line had an empty value (for example "Name:" on one line and
        # the name on the next), so start the value here in that case.
        if key in data:
            data[key] += ' ' + continuation
        else:
            data[key] = continuation
    return data

# Function to translate text
def translate_text(text, src_lang, dest_lang):
    '''Translate text from src_lang to dest_lang using googletrans.

    Args:
        text: Text to translate.
        src_lang: Language code of the input text (for example "en").
        dest_lang: Language code to translate into.

    Returns:
        The translated text. The input is returned unchanged when the two
        language codes are equal or when the text is empty or whitespace
        only, since there is nothing to translate in either case.
    '''
    if src_lang == dest_lang or not text.strip():
        # Skip the network round trip; blank pages are common in OCR output.
        # NOTE(review): the translation service may also reject empty
        # input - confirm against the googletrans version in use.
        return text

    translator = Translator()
    translated = translator.translate(text, src=src_lang, dest=dest_lang)
    return translated.text

# Function to convert numeric columns to correct types
def convert_numeric(df):
    '''Convert every fully numeric column of df to a numeric dtype in place.

    Columns containing any value that cannot be parsed as a number are left
    exactly as they are.

    Args:
        df: DataFrame whose columns should be converted.

    Returns:
        The same DataFrame object, for call chaining.
    '''
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # Explicit replacement for errors='ignore', which pandas has
            # deprecated: a non-numeric column is simply kept as it is.
            continue
    return df

# Streamlit app: page header and PDF upload widget
st.title("PDF to Excel Converter")
st.write("Upload PDF files to convert them to a single Excel file.")

uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)

# One row per supported language:
# (display name, Tesseract language code, translation language code)
_supported_languages = [
    ("English", "eng", "en"),
    ("Spanish", "spa", "es"),
    ("French", "fra", "fr"),
    ("German", "deu", "de"),
    ("Chinese (Simplified)", "chi_sim", "zh-cn"),
    ("Malay", "msa", "ms"),
    ("Tamil", "tam", "ta"),
    ("Hindi", "hin", "hi"),
]

# Display name -> code lookups; both share the same keys in the same order
ocr_languages = {label: ocr_code for label, ocr_code, _ in _supported_languages}
translate_languages = {label: tr_code for label, _, tr_code in _supported_languages}

# Language selection for OCR
ocr_language = st.selectbox("Select the language for OCR", list(ocr_languages))

# Language selection for translation
translate_language = st.selectbox("Select the language for Translation", list(translate_languages))

# Convert every uploaded PDF page into one spreadsheet row: OCR the page,
# translate the text, crop embedded images and parse key-value pairs.
if uploaded_files:
    combined_data_list = []
    image_counter = 1

    # The language choices are the same for every file, so resolve them once.
    # Both lookup tables share their keys, so the OCR language name also
    # selects the source language for translation.
    lang_code = ocr_languages[ocr_language]
    src_lang_code = translate_languages[ocr_language]
    dest_lang_code = translate_languages[translate_language]

    for uploaded_file in uploaded_files:
        # Save uploaded file to disk. basename() keeps a name that contains
        # directory components from writing outside the working directory.
        pdf_path = f"uploaded_{os.path.basename(uploaded_file.name)}"
        with open(pdf_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        # Convert PDF pages to images
        pages = convert_from_path(pdf_path, 300)

        # Each page is handled on its own, so images can never be paired
        # with a page of a different PDF (the shared cross-file list used
        # previously was indexed by per-file page number).
        for i, page in enumerate(pages):
            page_image_path = f'page_{i + 1}.jpg'
            page.save(page_image_path, 'JPEG')

            # Extract text from the page image
            with Image.open(page_image_path) as page_image:
                text = pytesseract.image_to_string(page_image, lang=lang_code)

            # Translate text if necessary
            translated_text = translate_text(text, src_lang=src_lang_code, dest_lang=dest_lang_code)

            # Detect images and give them run-wide sequential names
            image_names = []
            for detected_path in detect_images(page_image_path):
                image_name = f'image_{image_counter}'
                # os.replace overwrites leftovers from an earlier run on
                # every platform; os.rename fails on Windows in that case.
                os.replace(detected_path, f'images/{image_name}.jpg')
                image_names.append(image_name)
                image_counter += 1

            # Combine extracted key-value pairs and image names into one row
            page_data = extract_key_value_pairs(translated_text)
            page_data['Images'] = ", ".join(image_names) if image_names else 'None'
            combined_data_list.append(page_data)

    # Convert combined data to DataFrame
    df = pd.DataFrame(combined_data_list)
    df = convert_numeric(df)  # Convert numeric columns to correct types

    # Save data to Excel
    excel_path = 'extracted_customer_data.xlsx'
    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Data')

        # Access the sheet that was just written
        worksheet = writer.book['Data']

        # Add hyperlinks to images in the "Images" column. The column is
        # located by name: pages that introduce new keys add columns after
        # it, so it is not necessarily the last one. The sheet has no index
        # column, hence the plain +1 to get a 1-based sheet column.
        if 'Images' in df.columns:
            images_col = df.columns.get_loc('Images') + 1
            for row in worksheet.iter_rows(min_row=2, max_row=worksheet.max_row, min_col=images_col, max_col=images_col):
                for cell in row:
                    if cell.value and cell.value != 'None':
                        # A cell holds a single hyperlink, so a page with
                        # several images links to the first one instead of
                        # building a path out of the whole comma-joined list.
                        first_image = cell.value.split(', ')[0]
                        cell.value = f'=HYPERLINK("{os.getcwd()}/images/{first_image}.jpg", "View Image")'
                        cell.font = Font(color="0000FF", underline="single")
                        cell.alignment = Alignment(wrap_text=True)
    # Leaving the with-block saves the workbook; no second save is needed.

    st.success("PDFs have been successfully converted to a single Excel file.")
    with open(excel_path, "rb") as excel_file:
        excel_bytes = excel_file.read()
    st.download_button(label="Download Excel file", data=excel_bytes, file_name="extracted_customer_data.xlsx")
"""

with open("app.py", "w") as file:
    file.write(code)

# Run the Streamlit app with ngrok
from pyngrok import ngrok

# Set the authtoken
!ngrok authtoken 2k5QrK9IDU2SWnmmtJiBxIGT547_2u3W4dMSG7byC5gmkLSmr

# Connect to ngrok
public_url = ngrok.connect(8501)
print(f"Streamlit app is live at: {public_url}")

# Run the Streamlit app
!streamlit run app.py