In [25]:
import os
import xml.etree.ElementTree as ET

def _pick_output_name(filename, taken):
    """
    Chooses a ``.txt`` file name for ``filename`` that is not present in ``taken``.

    The preferred name is ``<stem>.txt``. If that name is already taken, the full
    original file name (extension included) is used as the stem instead, and a
    numeric suffix is appended if even that is taken.

    Parameters:
    ----------
    filename : str,
        The name of the input file (no directory part).

    taken : set,
        Output names already written during this run, normalised with
        ``os.path.normcase``.

    Returns:
    -------
    candidate : str,
        A ``.txt`` file name whose normalised form is not in ``taken``.
    """
    candidate = f"{os.path.splitext(filename)[0]}.txt"
    if os.path.normcase(candidate) not in taken:
        return candidate

    candidate = f"{filename}.txt"
    suffix = 1
    while os.path.normcase(candidate) in taken:
        candidate = f"{filename}.{suffix}.txt"
        suffix += 1
    return candidate


def convert_files_to_txt(input_folder, output_folder):
    """
    Converts each file in the input folder to a plain text file and saves it in the output folder.

    Every regular file directly inside ``input_folder`` is read as UTF-8 and its
    content is written unchanged to ``<stem>.txt`` in ``output_folder``. Files
    that cannot be read or decoded (for example images or other binary data) are
    reported on standard output and skipped. Sub-directories are ignored.

    When two input files share the same stem (for example ``a.xml`` and
    ``a.rels``), the first one (in sorted order) gets ``a.txt`` and the later one
    keeps its full name as the stem (``a.xml.txt``), so no converted file is
    overwritten by another.

    Parameters:
    ----------
    input_folder : str,
        The path to the folder containing XML and RELS files.

    output_folder : str,
        The path to the folder where the converted text files will be saved.
    """
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Normalised names of the files written so far; used to detect collisions.
    written = set()

    # Sorted so that collision handling gives the same names on every run.
    for filename in sorted(os.listdir(input_folder)):
        input_file_path = os.path.join(input_folder, filename)

        # os.listdir also returns sub-directories; those cannot be converted.
        if not os.path.isfile(input_file_path):
            continue

        try:
            with open(input_file_path, 'r', encoding='utf-8') as input_file:
                file_content = input_file.read()

            # The name is picked only after a successful read, so an unreadable
            # file never reserves an output name.
            output_name = _pick_output_name(filename, written)
            output_file_path = os.path.join(output_folder, output_name)

            # Write the content to the output text file
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                output_file.write(file_content)
            written.add(os.path.normcase(output_name))
        except (OSError, UnicodeError) as e:
            # Best effort: report the file and carry on with the remaining ones.
            print(f"Error processing file {input_file_path}: {e}")

# Example usage: convert everything in the extracted folder into a sibling
# folder of plain-text copies.
input_folder, output_folder = (
    "C:/Users/Shreshtha/Downloads/extracted_files/",
    "C:/Users/Shreshtha/Downloads/extracted_files_text/",
)
convert_files_to_txt(input_folder=input_folder, output_folder=output_folder)


Error processing file C:/Users/Shreshtha/Downloads/extracted_files/image1.jpeg: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
Error processing file C:/Users/Shreshtha/Downloads/extracted_files/image2.emf: 'utf-8' codec can't decode byte 0x8e in position 16: invalid start byte
Error processing file C:/Users/Shreshtha/Downloads/extracted_files/image3.png: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte
Error processing file C:/Users/Shreshtha/Downloads/extracted_files/image4.jpeg: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
Error processing file C:/Users/Shreshtha/Downloads/extracted_files/printerSettings1.bin: 'utf-8' codec can't decode byte 0xdc in position 68: invalid continuation byte


In [36]:
import os
import re

def extract_text_from_folder(folder_path):
    """
    Extracts ``file:///C:\\Users\\...`` references from the text files within a folder.

    Every ``.txt`` file directly inside ``folder_path`` is read as UTF-8 and
    searched for file URIs that start with ``file:///C:\\Users\\``. Each hit is
    returned as a forward-slash path beginning with ``C:/Users/``.

    A reference ends at the first character that cannot be part of a Windows
    path (``"``, ``<``, ``>``, ``|``, ``?``, ``*``) or at a line break or tab, so
    a URI that is not wrapped in double quotes does not swallow the markup that
    follows it.

    Files that cannot be read or decoded are reported on standard output and
    skipped.

    Parameters:
    ----------
    folder_path : str,
        The path to the folder containing text files.

    Returns:
    -------
    matching_paths : list,
        The paths found, in sorted file-name order and then in order of
        appearance within each file. Duplicates are kept.
    """
    # The capture group is the part of the path after "C:\Users\".
    uri_pattern = re.compile(r'file:///C:\\Users\\([^"<>|?*\r\n\t]+)')

    matching_paths = []

    # Sorted so the result order does not depend on the directory listing order.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        # Only regular files with a .txt extension are searched.
        if not (os.path.isfile(file_path) and filename.lower().endswith('.txt')):
            continue

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                file_content = file.read()
        except (OSError, UnicodeError) as e:
            # Best effort: report the file and carry on with the remaining ones.
            print(f"Error processing file {file_path}: {e}")
            continue

        # Re-attach the fixed prefix and switch to forward slashes.
        matching_paths.extend(
            "C:/Users/{}".format(match.replace('\\', '/'))
            for match in uri_pattern.findall(file_content)
        )

    return matching_paths

# Example usage: scan the converted text files and list every referenced path.
folder_path = "C:/Users/Shreshtha/Downloads/extracted_files_text/"
matching_paths = extract_text_from_folder(folder_path=folder_path)

if not matching_paths:
    print("No matching paths found in the folder.")
else:
    print("Matching paths:")
    for path in matching_paths:
        print(path)


Matching paths:
C:/Users/Shreshtha/Downloads/Extractthis.xlsx
