In [49]:
# standard library
import io
import os
import queue
import shutil
import sys

# PyMuPDF for text extraction from pdf
import fitz

# for text extraction from word doc
import docx

# for text extraction from image
from PIL import Image
import pytesseract

# import cv2
# import numpy as np

# for text extraction from excel/csv
import pandas as pd

# for extracting image from word doc
# (wildcard imports stay last, as before, so any name shadowing is unchanged)
from spire.doc import *
from spire.doc.common import *

In [28]:
# Try to automatically find the path to Tesseract.
# Prefer an executable found on PATH; otherwise fall back to the usual
# install location for the current platform.
tesseract_path = shutil.which('tesseract')
if tesseract_path is None:
    if os.name == 'nt':  # Windows
        tesseract_path = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    elif sys.platform == 'darwin':  # macOS (os.name is 'posix' on Linux too)
        tesseract_path = '/usr/local/bin/tesseract'
    else:
        tesseract_path = '/usr/bin/tesseract'  # Default for Linux

# Hand the resolved path to pytesseract, but only when the binary really
# exists, so a wrong guess never replaces a configuration that already works.
if os.path.isfile(tesseract_path):
    pytesseract.pytesseract.tesseract_cmd = tesseract_path

In [50]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        str: The ``page.get_text()`` output of each page joined with no extra
        separator; an empty string when the document has no pages.
    """
    # The context manager closes the document even if extraction fails
    # part-way through, so the file handle is never leaked.
    with fitz.open(file_path) as document:
        return "".join(page.get_text() for page in document)

def extract_text_from_word(file_path):
    """Return the paragraph text of a Word document, one paragraph per line.

    Args:
        file_path: Path of the document, passed to ``docx.Document``.

    Returns:
        str: The ``text`` of every entry in the document's ``paragraphs``,
        each followed by a newline; an empty string when there are none.
    """
    word_document = docx.Document(file_path)
    lines = [para.text + "\n" for para in word_document.paragraphs]
    return "".join(lines)

def extract_text_from_image(file_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        file_path: Path of the image, passed to ``Image.open``.

    Returns:
        str: Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    # Opening the image in a ``with`` block releases the underlying file as
    # soon as OCR is done, including when pytesseract raises.
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)

def extract_text_from_excel(file_path):
    """Render a spreadsheet (Excel workbook or CSV file) as plain text.

    Each sheet contributes its name on one line, followed by the sheet
    rendered with ``DataFrame.to_string(index=False)`` and a newline.

    Args:
        file_path: Path of the workbook or CSV file. Anything that is not a
            path ending in ``.csv`` is handed to ``pd.read_excel`` unchanged.

    Returns:
        str: The text of all sheets, concatenated in the order pandas
        returns them.
    """
    is_path = isinstance(file_path, (str, os.PathLike))
    if is_path and os.path.splitext(file_path)[1].lower() == ".csv":
        # read_excel cannot parse CSV, yet ".csv" is routed to this function.
        # Treat the file as a single sheet named after the file itself.
        sheet_label = os.path.splitext(os.path.basename(file_path))[0]
        sheets = {sheet_label: pd.read_csv(file_path)}
    else:
        # sheet_name=None loads every sheet as {sheet name: DataFrame}.
        sheets = pd.read_excel(file_path, sheet_name=None)

    return "".join(
        f"{sheet_name}\n{sheet_data.to_string(index=False)}\n"
        for sheet_name, sheet_data in sheets.items()
    )

def extract_text(file_path, output_path):
    """Extract the text of a file, choosing the extractor from its type.

    For PDFs and Word documents the embedded images are first written to
    ``output_path``; spreadsheets and images never use ``output_path``.

    Args:
        file_path: Path of the file to read.
        output_path: Directory handed to the image extractors.

    Returns:
        str: The extracted text, or an empty string when
        ``identify_file_type`` reports a type with no extractor.
    """
    kind = identify_file_type(file_path)

    if kind == "image":
        return extract_text_from_image(file_path)

    if kind == "excel_or_csv":
        return extract_text_from_excel(file_path)

    if kind == "pdf":
        # Images are saved before the text is read.
        extract_images_from_pdf(file_path, output_path)
        return extract_text_from_pdf(file_path)

    if kind == "word_doc":
        extract_images_from_word_doc(file_path, output_path)
        return extract_text_from_word(file_path)

    return ""
    
def identify_file_type(file_path):
    """Classify a file by its extension (case-insensitive).

    Args:
        file_path: Path or file name; only the final extension is examined.

    Returns:
        str: ``'pdf'``, ``'word_doc'``, ``'excel_or_csv'`` or ``'image'`` for
        a recognised extension, otherwise ``'other'``.
    """
    # Each file type paired with the extensions that select it.
    extensions_by_type = (
        ('pdf', ('.pdf',)),
        ('word_doc', ('.doc', '.docx')),
        ('excel_or_csv', ('.xls', '.xlsx', '.csv')),
        ('image', ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')),
    )

    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    for type_name, extensions in extensions_by_type:
        if suffix in extensions:
            return type_name
    return 'other'

def extract_images_from_word_doc(input_file, output_path):
    """Save every picture found in a Word document into ``output_path``.

    The document tree is walked breadth-first. Pictures are written as
    ``Image-0.png``, ``Image-1.png``, ... in the order they are found.

    Args:
        input_file: Path of the Word document, loaded with
            ``Document.LoadFromFile``.
        output_path: Directory for the image files; created if missing.
    """
    os.makedirs(output_path, exist_ok=True)

    document = Document()
    try:
        document.LoadFromFile(input_file)

        # Raw bytes of each picture, in discovery order.
        images = []

        # FIFO of composite nodes whose children still need visiting.
        nodes = queue.Queue()
        nodes.put(document)

        while not nodes.empty():
            node = nodes.get()
            for i in range(node.ChildObjects.Count):
                obj = node.ChildObjects[i]
                if isinstance(obj, DocPicture):
                    images.append(obj.ImageBytes)
                elif isinstance(obj, ICompositeObject):
                    # Containers may hold pictures further down the tree.
                    nodes.put(obj)

        # NOTE(review): every file is named ".png" whatever the embedded
        # image's real format is; confirm whether that is intended.
        for i, image_data in enumerate(images):
            file_name = f"Image-{i}.png"
            with open(os.path.join(output_path, file_name), 'wb') as image_file:
                image_file.write(image_data)
    finally:
        # Release the document even when loading, traversal or writing fails.
        document.Close()
    
def extract_images_from_pdf(file_path, output_path):
    """Save every image referenced by the pages of a PDF into ``output_path``.

    Files are named ``image_<page>_<n>.<ext>``: ``<page>`` is the zero-based
    page index, ``<n>`` counts that page's images from 1, and ``<ext>`` is
    the extension reported by ``extract_image``.

    Args:
        file_path: Path of the PDF, passed to ``fitz.open``.
        output_path: Directory for the image files; created if missing.
    """
    os.makedirs(output_path, exist_ok=True)

    # The context manager closes the PDF even if an image cannot be written.
    with fitz.open(file_path) as pdf_file:
        for page_number, page in enumerate(pdf_file):
            for image_index, img in enumerate(page.get_images(), start=1):
                xref = img[0]  # first item of each entry is the image xref
                base_image = pdf_file.extract_image(xref)
                image_ext = base_image["ext"]
                image_name = f"image_{page_number}_{image_index}.{image_ext}"

                # Write the extracted bytes as-is. They are already encoded
                # in the format named by "ext", so decoding and re-saving
                # through PIL would only re-compress them.
                with open(os.path.join(output_path, image_name), "wb") as image_file:
                    image_file.write(base_image["image"])

In [51]:
# Example run on a sample Word document: the cell displays the extracted
# text, and any pictures embedded in the document are written to output_path.
file_path = "./sample_docs/agreement.docx"
output_path = "./DocumentImages/"
extract_text(file_path=file_path, output_path=output_path)

'\n\n\n\nFreelance Agreement\n\nThis Freelance Software Development Agreement (herein after referred as "Agreement") is made and entered into as of Monday, 24-06-2024 interim (hereinafter referred as "Effective Date"), by and between:\n\nBitsBrewery (herein after referred as  "Agency"),\nEmail: contact@bitsbrewery.com,\nContact Numbers: +91 6386820064, +91 8909140299,\n\nAND\n\nClassTym (hereinafter referred as  "Client"),\nEmail: info@classtym.com,\nContact Number: +91 8929676861\n\nWHEREAS,\n\nThe Agency is in the business of providing software development services, including app development, web development, backend development, DevOps, and security services; and\n\nThe Client desires to engage the Agency to provide such services under the terms and conditions set forth in this Agreement.\n\nNOW, THEREFORE, in consideration of the mutual covenants and agreements herein contained, the parties hereto agree as follows:\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n1. Term\n\nThe term of this Agreem