In [2]:
import os 
import pdfplumber
import pytesseract
import re
import spacy

In [4]:
from PIL import Image
# Load the spaCy "en_core_web_sm" pipeline once at module level;
# segment_into_sentences() calls it to split text into sentences.
nlp = spacy.load("en_core_web_sm")

In [5]:
def extract_text(file_path:str)->str:
    """
    Extract text from a file (PDF or image).

    PDFs are read page by page with pdfplumber; images are passed to
    Tesseract OCR through pytesseract.

    Parameters:
        file_path(str): Path to file. Supported formats: "pdf", "jpg", "jpeg", "png".
            The extension check is case-insensitive.
    Returns:
        str: Extracted text with leading/trailing whitespace removed. If the
            file cannot be parsed, an error message string (beginning with
            "Error in parsing") is returned instead of an exception being raised.
    Raises:
        ValueError: when the file type is unsupported.
    """
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    if file_extension == ".pdf":
        try:
            with pdfplumber.open(file_path) as pdf:
                # A page without extractable text may yield None; fall back to
                # "" so one blank/scanned page does not abort the whole file.
                page_texts = [page.extract_text() or "" for page in pdf.pages]
        except Exception as e:
            return f"Error in parsing the pdf: {e}"
        # Separate pages with a newline so the last word of one page is not
        # fused with the first word of the next.
        text = "\n".join(page_texts)
    elif file_extension in (".jpg", ".jpeg", ".png"):
        try:
            # The context manager closes the underlying file handle after OCR.
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image)
        except Exception as e:
            return f"Error in parsing in image: {e}"
    else:
        raise ValueError("unsupported file type. Please use 'pdf', 'jpg', 'jpeg' or 'png'")

    return text.strip()

In [16]:
# Single characters commonly produced by OCR / PDF extraction, mapped to their
# ASCII equivalents. Applied BEFORE non-ASCII characters are stripped so the
# information they carry is kept rather than deleted.
_ASCII_EQUIVALENTS = str.maketrans({
    "\ufb01": "fi",   # 'fi' ligature
    "\ufb02": "fl",   # 'fl' ligature
    "\u2018": "'",    # left single quotation mark
    "\u2019": "'",    # right single quotation mark / apostrophe
    "\u201c": '"',    # left double quotation mark
    "\u201d": '"',    # right double quotation mark
    "\u2013": "-",    # en dash
    "\u2014": "-",    # em dash
    "\u2026": "...",  # horizontal ellipsis
})


def clean_text(text:str)->str:
    """
    Cleans the extracted text including:
     -Handling common ocr errors (e.g., the 'fi' ligature to 'fi').
     -Normalizing punctuation (curly quotes, dashes, ellipsis) to ASCII.
     -Removing extra spaces and new lines.
     -Removing any remaining non-ASCII characters.
    Parameters:
        text(str): Text to be cleaned.
    Returns:
        str: Cleaned text. Leading/trailing spaces are collapsed to a single
            space but not stripped.
    """
    # Map ligatures and typographic punctuation first; doing this after the
    # non-ASCII strip would be a no-op because they would already be gone.
    text = text.translate(_ASCII_EQUIVALENTS)
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace runs (incl. newlines) to one space
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # remove remaining non-ASCII characters
    # Dropping a non-ASCII token that sat between two spaces leaves a double
    # space behind; collapse those again.
    text = re.sub(r' {2,}', ' ', text)
    return text
def segment_into_sentences(text: str)->list[str]:
    """
    Split cleaned text into sentences using the module-level spaCy pipeline.
    Parameters:
        text(str): Cleaned text to be segmented
    Returns:
        list[str]: the sentences found in the text, in order, each with
            surrounding whitespace stripped.
    """
    parsed = nlp(text)
    result = []
    # Walk the sentence spans reported by the pipeline and keep their text.
    for span in parsed.sents:
        result.append(span.text.strip())
    return result
def preProcess_chunk_text(text:str, chunk_size: int = 500)->list[str]:
    """
    Clean the text, split it into sentences and group the sentences into chunks.

    Sentences are never split: each chunk is a run of consecutive sentences
    joined by single spaces, and a new chunk is started whenever adding the
    next sentence would push the word count above chunk_size. A single
    sentence that is itself longer than chunk_size becomes a chunk of its own
    (and is therefore the only case in which a chunk exceeds chunk_size).

    Parameters:
        text(str): parsed text obtained from the file that is to be chunked.
        chunk_size(int): Maximum number of words per chunk (Default: 500).
    Returns:
        list[str]: list of chunks, each chunk being one string of
            space-joined sentences. Empty when the text has no sentences.
    """
    text = clean_text(text)
    sentences = segment_into_sentences(text)

    chunks = []
    chunk = []
    word_count = 0
    for sentence in sentences:
        sentence_words = len(sentence.split())
        # Only flush when there is something buffered; otherwise an oversized
        # first sentence would emit a spurious empty chunk.
        if chunk and word_count + sentence_words > chunk_size:
            chunks.append(" ".join(chunk))
            chunk = []
            word_count = 0
        chunk.append(sentence)
        word_count += sentence_words
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks
    

In [15]:
# Run OCR on a sample screenshot, then report how many chunks the text yields.
text = extract_text(
    "C:\\Users\\HP\\OneDrive\\Pictures\\Screenshots\\Screenshot 2025-01-06 114334.png"
)
chunks = preProcess_chunk_text(text)
print(len(chunks))

1
