## Extracting Embedded Files

In [8]:
import os
import zipfile
import mimetypes
import io
import csv
import re
from PIL import Image
import xml.etree.ElementTree as ET
from urllib.parse import unquote
import shutil

def process_embedded_excel_files(input_xlsx_path, output_folder):
    def extract_embedded_files(file_path, save_path):
        """
        Extracts arbitrary embedded files from an Excel (xlsx) file and saves them.

        Parameters:
        ----------
        file_path : str,
            The path to the xlsx file.

        save_path : str,
            Directory path to save the extracted files.
        """
            # Create the directory if it doesn't exist
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        with zipfile.ZipFile(file_path, 'r') as zip_file:
            # List all files in the archive
            all_files = zip_file.namelist()

            # Iterate through all files in the archive
            for file_info in all_files:
                # Extract the file content
                file_content = zip_file.read(file_info)

                # Save the extracted file to the specified directory
                extracted_file_path = os.path.join(save_path, os.path.basename(file_info))
                with open(extracted_file_path, 'wb') as f:
                    f.write(file_content)

    def convert_files_to_txt(input_folder, output_folder):
        """
        Converts each file in the input folder to a plain text file and saves it in the output folder.

        Parameters:
        ----------
        input_folder : str,
            The path to the folder containing XML and RELS files.

        output_folder : str,
            The path to the folder where the converted text files will be saved.
        """
        # Create the output folder if it doesn't exist
        os.makedirs(output_folder, exist_ok=True)

        for filename in os.listdir(input_folder):
            input_file_path = os.path.join(input_folder, filename)
            output_file_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.txt")

            try:
                with open(input_file_path, 'r', encoding='utf-8') as input_file:
                    # Read the content of the input file
                    file_content = input_file.read()

                # Optionally, you can parse XML content if needed
                # For example, if the file is in XML format
                # tree = ET.fromstring(file_content)
                # parsed_content = ET.tostring(tree, encoding='utf-8').decode('utf-8')

            # Write the content to the output text file
                with open(output_file_path, 'w', encoding='utf-8') as output_file:
                    output_file.write(file_content)
            except Exception as e:
                print(f"Error processing file {input_file_path}: {e}")
    def extract_text_from_folder(folder_path):
        """
        Extracts text content matching the pattern from text files within a folder.

        Parameters:
        ----------
        folder_path : str,
            The path to the folder containing text files.

        Returns:
        -------
        matching_paths : list,
            A list of concatenated file paths found within the text files.
        """
        matching_paths = []

        # Iterate through each file in the folder
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)

            # Check if it's a text file
            if os.path.isfile(file_path) and filename.lower().endswith('.txt'):
                try:
                    # Read the content of the text file
                    with open(file_path, 'r', encoding='utf-8') as file:
                        file_content = file.read()

                    # Use format() to build the string
                    matches = re.findall(r'file:///C:\\Users\\([^"]+)', file_content)
                    matching_paths.extend(["C:/Users/{}".format(match.replace('\\', '/')) for match in matches])
                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")

        return matching_paths

    def fetch_files(matching_paths, output_folder):
        # Create the output folder if it doesn't exist
        os.makedirs(output_folder, exist_ok=True)

        for file_path in matching_paths:
            try:
                file_name = os.path.basename(unquote(file_path))
                shutil.copy(unquote(file_path), os.path.join(output_folder, file_name))
            except Exception as e:
                print(f"Error fetching file {file_path}: {e}")

    # Create output folders
    extracted_folder = os.path.join(output_folder, 'extracted_embedded_files')
    converted_folder = os.path.join(output_folder, 'converted_text_files')
    fetched_folder = os.path.join(output_folder, 'fetched_excel_files')

    # Extract embedded files
    extract_embedded_files(input_xlsx_path, extracted_folder)

    # Convert files to text
    convert_files_to_txt(extracted_folder, converted_folder)

    # Extract matching paths from text files
    matching_paths = extract_text_from_folder(converted_folder)

    # Fetch and copy Excel files
    fetch_files(matching_paths, fetched_folder)

    # Delete 'converted_text_files' and 'extracted_embedded_files' folders
    shutil.rmtree(converted_folder)
    shutil.rmtree(extracted_folder)
    
    

# Example usage:
input_xlsx_path = "C:/Users/Subhadeep/Downloads/Test SOT.xlsx"
output_folder = "C:/Users/Subhadeep/Downloads/embedded_files_folder"
process_embedded_excel_files(input_xlsx_path, output_folder)



Error processing file C:/Users/Subhadeep/Downloads/embedded_files_folder\extracted_embedded_files\image1.emf: 'utf-8' codec can't decode byte 0x80 in position 72: invalid start byte
Error processing file C:/Users/Subhadeep/Downloads/embedded_files_folder\extracted_embedded_files\image2.emf: 'utf-8' codec can't decode byte 0x80 in position 72: invalid start byte
Error processing file C:/Users/Subhadeep/Downloads/embedded_files_folder\extracted_embedded_files\image3.emf: 'utf-8' codec can't decode byte 0x80 in position 72: invalid start byte
Error processing file C:/Users/Subhadeep/Downloads/embedded_files_folder\extracted_embedded_files\image4.jpeg: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


## Converting the excel files into a single pdf

In [11]:
from spire.xls import *
from spire.xls.common import *
import os

def combine_and_convert_to_pdf(input_folder, output_directory, output_file_name):
    def combine_excel_files(input_folder, output_directory, output_file_name):
        # Create a new workbook
        newbook = Workbook()
        newbook.Version = ExcelVersion.Version2013
        # Clear all default worksheets
        newbook.Worksheets.Clear()

        # Create a temporary workbook
        tempbook = Workbook()

        # Iterate through each file in the folder
        for file in os.listdir(input_folder):
            if file.endswith('.xlsx') or file.endswith('.xls'):
                file_path = os.path.join(input_folder, file)

                # Load the file into the temporary workbook
                tempbook.LoadFromFile(file_path)

                # Iterate through each worksheet in the temporary workbook
                for sheet_index in range(tempbook.Worksheets.Count):
                    # Copy the entire worksheet from the temporary workbook to the new workbook
                    new_sheet = newbook.Worksheets.AddCopy(tempbook.Worksheets[sheet_index], WorksheetCopyType.CopyAll)

        # Specify the output file path
        output_path = os.path.join(output_directory, output_file_name)

        # Create the output directory if it doesn't exist
        os.makedirs(output_directory, exist_ok=True)

        # Save the merged file to the specified directory
        newbook.SaveToFile(output_path, ExcelVersion.Version2013)

        # Dispose of the workbooks
        newbook.Dispose()
        tempbook.Dispose()

    def convert_excel_to_pdf(input_excel_path, output_pdf_path):
        # Create a Workbook object
        workbook = Workbook()

        try:
            # Load an Excel document
            workbook.LoadFromFile(input_excel_path)

            # Iterate through the worksheets in the workbook
            for sheet in workbook.Worksheets:
                # Get the PageSetup object
                pageSetup = sheet.PageSetup

                # Set page margins
                pageSetup.TopMargin = 0.3
                pageSetup.BottomMargin = 0.3
                pageSetup.LeftMargin = 0.3
                pageSetup.RightMargin = 0.3

            # Set worksheet to fit to page when converting
            workbook.ConverterSetting.SheetFitToPage = True

            # Convert to PDF file
            workbook.SaveToFile(output_pdf_path, FileFormat.PDF)
            print(f"Conversion successful. PDF saved to {output_pdf_path}")

        except Exception as e:
            print(f"Error during conversion: {e}")

        finally:
            # Dispose of the workbook
            workbook.Dispose()

    # Combine Excel files
    combined_excel_path = os.path.join(output_directory, "CombinedExcelFiles.xlsx")
    combine_excel_files(input_folder, output_directory, "CombinedExcelFiles.xlsx")

    # Convert combined Excel file to PDF
    output_pdf_path = os.path.join(output_directory, output_file_name)
    convert_excel_to_pdf(combined_excel_path, output_pdf_path)

# Example usage:
input_folder_path = "C:/Users/Subhadeep/Downloads/embedded_files_folder/fetched_excel_files"
output_directory_path = "C:/Users/Subhadeep/Downloads/PDFs for Text Detection/"
output_file_name = "Embedded Bartrack Data.pdf"

combine_and_convert_to_pdf(input_folder_path, output_directory_path, output_file_name)

Conversion successful. PDF saved to C:/Users/Subhadeep/Downloads/PDFs for Text Detection/Embedded Bartrack Data.pdf
