In [4]:
import os
import re
from pdf2image import convert_from_path
import pytesseract
from langdetect import detect, LangDetectException

def process_pdf(pdf_path):
    """Return the OCR text of every page of the PDF at *pdf_path*.

    The PDF is rendered to one image per page with
    ``pdf2image.convert_from_path``; each image is passed to
    ``pytesseract.image_to_string`` with its default settings, and the
    per-page results are joined with a single space.
    """
    pages = convert_from_path(pdf_path)
    # Pages are OCR'd lazily, in document order, as the join consumes them.
    return ' '.join(pytesseract.image_to_string(page) for page in pages)

def save_text(file_path, text):
    """Write *text* to *file_path* as UTF-8, replacing any existing content."""
    with open(file_path, mode='w', encoding='utf-8') as out:
        out.write(text)

def remove_hindi_text(text):
    """Return *text* with the sentences detected as Hindi removed.

    The text is split on a whitespace character that follows ``.`` or
    ``?`` (the look-behinds skip abbreviation-like patterns). A fragment
    is kept only when its stripped length exceeds 20 characters and
    ``langdetect.detect`` reports a language other than ``'hi'``.
    Fragments that are shorter, or for which detection raises
    ``LangDetectException``, are dropped. The kept fragments are joined
    with a single space.
    """
    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'
    kept = []

    for fragment in re.split(boundary, text):
        # Empty or very short fragments are discarded without running detection.
        if len(fragment.strip()) <= 20:
            continue

        try:
            language = detect(fragment)
        except LangDetectException:
            # Detection failed for this fragment; leave it out.
            continue

        if language != 'hi':
            kept.append(fragment)

    return ' '.join(kept)

def main(folder_path):
    """OCR every ``.pdf`` file in *folder_path* and save the filtered text.

    For each directory entry whose name ends with ``.pdf``, the text is
    extracted with ``process_pdf``, passed through ``remove_hindi_text``,
    and written next to the PDF under the same base name with a ``.txt``
    extension. A confirmation line is printed for each file written.
    """
    for entry in os.listdir(folder_path):
        if not entry.endswith('.pdf'):
            continue

        source = os.path.join(folder_path, entry)
        filtered = remove_hindi_text(process_pdf(source))

        # Same base name as the PDF, written into the same folder.
        txt_filename = f"{os.path.splitext(entry)[0]}.txt"
        save_text(os.path.join(folder_path, txt_filename), filtered)
        print(f"Processed and saved: {txt_filename}")

# Third-party requirements: this script imports `pdf2image`, `pytesseract`
# and `langdetect`; install them with
# `pip install pdf2image pytesseract langdetect`.
# NOTE(review): pdf2image and pytesseract wrap the external Poppler and
# Tesseract OCR programs, which have to be installed separately -- confirm
# they are available on the machine running this.

# Folder scanned for PDFs; the generated .txt files are written into the
# same folder. The path is relative, so it depends on the working directory.
folder_path = 'tester'

# Runs immediately when this cell/module executes: process every PDF found.
main(folder_path)


Processed and saved: 2024-05-25_1710233226315.txt
Processed and saved: 2024-05-25_1710844257374.txt
