# Data cleaning

In [6]:
# Import
import os
from pprint import pprint
from concurrent.futures import ProcessPoolExecutor
import unicodedata

# Third party
import pandas as pd
import fitz # PyMuPDF


In [7]:
# Settings
# All folders are resolved relative to the parent of the current working directory.
working_directory = os.path.split(os.getcwd())
project_root = working_directory[0]

data_folder = os.path.join(project_root, "data")
input_folder = os.path.join(data_folder, "unclean")
output_folder = os.path.join(data_folder, "clean")

# Report, in order, whether the data, input and output folders exist.
for folder in (data_folder, input_folder, output_folder):
    print(os.path.exists(folder))

True
True
True


In [8]:
# Functions
def file_paths_from_directory(dir_path:str, abs_path:bool=False, file_exts:list=None, hidden:bool=False, sys_files:bool=False) -> list:
    """
    Get a list of all files in a directory including subdirectories.

    Side effect: every file found is passed through get_clean_file_name, which
    renames it on disk so that its file name contains no spaces. The returned
    paths are the paths after renaming.

    Args:
    - dir_path (str): The path to the directory to search.
    - abs_path (bool): Return absolute file paths. Default is False.
    - file_exts (list or None): Keep only files ending in one of these extensions (e.g. [".pdf"]). Matching is case-insensitive. Default is None (no filtering).
    - hidden (bool): Include hidden files, i.e. files whose name starts with ".". Default is False.
    - sys_files (bool): Include system files (desktop.ini, thumbs.db, .DS_Store). Default is False.

    Returns:
    - list: A list of file paths in the order produced by os.walk. If abs_path is True, the file paths will be absolute. Otherwise, they are built from dir_path as given.

    Raises:
    - FileNotFoundError: If dir_path is not an existing directory.
    """
    # Check if the directory exists
    if not os.path.isdir(dir_path):
        raise FileNotFoundError(f"The directory '{dir_path}' does not exist.")

    # Get all files in the directory
    file_list: list = []
    for root, _, files in os.walk(dir_path):
        for file in files:
            file_list.append(os.path.join(root, file))

    # Remove spaces from the file names.
    # The rename happens on disk, so keep the path that get_clean_file_name
    # returns; the original path may no longer exist afterwards.
    file_list = [get_clean_file_name(file_path) for file_path in file_list]

    # Filter the file list by file extension
    if file_exts is not None:
        # Accept a single extension passed as a plain string as well
        if isinstance(file_exts, str):
            file_exts = [file_exts]
        # Lower-case both sides so ".PDF" and ".pdf" are treated alike; a single
        # pass with a tuple also avoids listing a file once per matching extension
        wanted_exts = tuple(file_ext.lower() for file_ext in file_exts)
        file_list = [f for f in file_list if f.lower().endswith(wanted_exts)]

    # Filter the file list by hidden files
    if not hidden:
        file_list = [f for f in file_list if not os.path.basename(f).startswith(".")]

    # Get the absolute file paths
    if abs_path:
        file_list = [os.path.abspath(f) for f in file_list]

    # Filter the file list by system files
    if not sys_files:
        sys_file_names = (".ds_store", "desktop.ini", "thumbs.db")
        file_list = [f for f in file_list if not os.path.basename(f).lower().startswith(sys_file_names)]

    # Return the file list
    return file_list

def str_from_pdf_paths(pdf_paths: list) -> pd.DataFrame:
    """
    Get all text from a list of pdf files.
    This function uses concurrent.futures.ProcessPoolExecutor to parallelize the extraction.
    Uses PyMuPDF (fitz) as the pdf reader.

    Worker processes must be able to import the extraction function. When this
    code runs from an interactive session (e.g. a notebook) with the "spawn"
    start method, they cannot, and the pool breaks with BrokenProcessPool.
    In that case the files are processed sequentially in the current process.

    Args:
    - pdf_paths (list): A list of paths to the pdf files.

    Returns a DataFrame with columns:
    - pdf_path (str): The path to the pdf file.
    - status_ok (bool): True if the extraction was successful, False if not.
    - result (str): The extracted text if successful, the error message if not.
    """
    # Local imports keep this function self-contained
    import warnings
    from concurrent.futures.process import BrokenProcessPool

    pdf_paths = list(pdf_paths)
    results: list = []

    # Nothing to do: skip creating a pool altogether
    if pdf_paths:
        try:
            with ProcessPoolExecutor() as executor:
                results = list(executor.map(str_from_pdf_path, pdf_paths))
        except BrokenProcessPool:
            # NOTE: a worker that crashed for another reason also ends up here;
            # the sequential run below then processes that file in this process.
            warnings.warn("Process pool is not usable; extracting the pdf files sequentially instead.")
            results = [str_from_pdf_path(pdf_path) for pdf_path in pdf_paths]

    return pd.DataFrame(results, columns=["pdf_path", "status_ok", "result"])

def str_from_pdf_path(pdf_path: str) -> dict:
    """
    Get all text from a pdf file.
    Uses PyMuPDF (fitz) as the pdf reader.

    Errors are not raised; they are reported through the returned dictionary.

    Args:
    - pdf_path (str): The path to the pdf file.

    Returns dictionary with keys:
    - pdf_path: The path that was passed in.
    - status_ok: True if the extraction was successful, False if not.
    - result: The text of all pages joined by a single space if successful, the error message if not.
    """
    try:
        # Open by path and use the document as a context manager so that it is
        # closed again once the text has been read.
        with fitz.open(pdf_path) as doc:
            page_texts = [page.get_text() for page in doc]

        return {
            "pdf_path": pdf_path,
            "status_ok": True,
            "result": " ".join(page_texts)
        }

    except Exception as e:
        return {
            "pdf_path": pdf_path,
            "status_ok": False,
            "result": str(e)
        }
    
def get_clean_file_name(path: str) -> str:
    """
    Remove spaces from the file name by renaming the file on disk.

    Only the last path component is changed (spaces become underscores);
    directory names are left as they are. Calling the function again with
    the same path after a successful rename returns the new path.

    Args:
    - path (str): The path to the file.

    Returns:
    - str: The new path to the file (equal to path if the name has no spaces).

    Raises:
    - FileExistsError: If both the original file and a file with the cleaned name exist.
    - OSError: If the rename itself fails (e.g. the file does not exist).
    """
    file_name = os.path.basename(path)
    new_file_name = file_name.replace(" ", "_")
    new_path = os.path.join(os.path.dirname(path), new_file_name)

    # Nothing to rename
    if new_path == path:
        return path

    if os.path.exists(new_path):
        # The file was already renamed by an earlier call
        if not os.path.exists(path):
            return new_path
        # os.rename would silently replace the existing file on POSIX systems
        raise FileExistsError(f"Cannot rename '{path}': '{new_path}' already exists.")

    os.rename(path, new_path)

    return new_path

def normalize_text(text:str) -> str:
    """
    Apply Unicode NFKD normalization (compatibility decomposition) to a text.

    Compatibility characters are replaced by their plain equivalents and
    accented characters are split into base character plus combining mark.
    Whitespace is left untouched and non-ASCII characters are not removed.

    Args:
    - text (str): The text to normalize.

    Returns:
    - str: The NFKD-normalized text.
    """
    normalized_text = unicodedata.normalize("NFKD", text)
    return normalized_text


def str_from_doc_path(doc_path: str) -> dict:
    """
    Get the body text from a .docx file.
    Uses only the standard library: a .docx file is a zip archive whose main
    text is stored in "word/document.xml".

    Legacy binary .doc files are not zip archives and are reported as failed.
    Only the main document body is read. This is a best-effort extraction;
    complex layouts (e.g. text boxes) may not come out in reading order.

    Errors are not raised; they are reported through the returned dictionary.

    Args:
    - doc_path (str): The path to the document.

    Returns dictionary with keys:
    - doc_path: The path that was passed in.
    - status_ok: True if the extraction was successful, False if not.
    - result: The paragraphs joined by newlines if successful, the error message if not.
    """
    # Local imports keep this function self-contained
    import zipfile
    import xml.etree.ElementTree as ET

    # Namespace of the WordprocessingML elements (w:p, w:t, ...)
    w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    paragraph_tag = w_ns + "p"
    text_tag = w_ns + "t"
    tab_tag = w_ns + "tab"
    break_tags = (w_ns + "br", w_ns + "cr")

    try:
        with zipfile.ZipFile(doc_path) as archive:
            with archive.open("word/document.xml") as xml_file:
                root = ET.parse(xml_file).getroot()

        # Walk the document once in document order and rebuild the plain text
        chunks: list = []
        for node in root.iter():
            if node.tag == paragraph_tag:
                # Separate paragraphs by a newline (none before the first text)
                if chunks:
                    chunks.append("\n")
            elif node.tag == text_tag:
                chunks.append(node.text or "")
            elif node.tag == tab_tag:
                chunks.append("\t")
            elif node.tag in break_tags:
                chunks.append("\n")

        return {
            "doc_path": doc_path,
            "status_ok": True,
            "result": "".join(chunks)
        }

    except zipfile.BadZipFile:
        return {
            "doc_path": doc_path,
            "status_ok": False,
            "result": "Not a .docx file (zip archive expected); legacy .doc files are not supported."
        }

    except Exception as e:
        return {
            "doc_path": doc_path,
            "status_ok": False,
            "result": str(e)
        }

In [11]:
# Main
## PDF to text
file_paths = file_paths_from_directory(input_folder, file_exts=[".pdf"])
all_results = [str_from_pdf_path(get_clean_file_name(file_path)) for file_path in file_paths]

pdf_texts = pd.DataFrame(all_results, columns=["pdf_path", "status_ok", "result"])

# Make sure the destination exists before writing into it
os.makedirs(output_folder, exist_ok=True)

for row in pdf_texts.itertuples(index=False):

    # For failed extractions "result" holds the error message, not document
    # text, so do not write it into the clean data.
    if not row.status_ok:
        print(f"Skipping '{row.pdf_path}': {row.result}")
        continue

    # Files are named after the pdf's base name only, so pdfs with the same
    # name in different sub-folders end up in the same text file.
    new_filename = os.path.splitext(os.path.basename(row.pdf_path))[0] + ".txt"

    new_path = os.path.join(output_folder, new_filename)

    # Explicit encoding: the platform default may not be able to encode the text
    with open(new_path, "w", encoding="utf-8") as f:
        f.write(row.result)

## DOC and DOCX to text
file_paths = file_paths_from_directory(input_folder, file_exts=[".doc", ".docx"])


Process SpawnProcess-1:
Process SpawnProcess-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/mauruswollensak/.pyenv/versions/3.10.13/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/mauruswollensak/.pyenv/versions/3.10.13/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/mauruswollensak/.pyenv/versions/3.10.13/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/mauruswollensak/.pyenv/versions/3.10.13/lib/python3.10/concurrent/futures/process.py", line 240, in _process_worker
    call_item = call_queue.get(block=True)
  File "/Users/mauruswollensak/.pyenv/versions/3.10.13/lib/python3.10/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'str_from_pdf_path' on <module '__main__' (built-in)>
  File "/Users/mauruswollen

BrokenProcessPool: A child process terminated abruptly, the process pool is not usable anymore

## JSON export

[
    {
        "file_path": "...",
        "text_section": "..."
    },
]

Each text is split into sections of at most 2000 characters.

In [12]:
# Maximum length (in characters) of one exported text section
SECTION_LENGTH = 2000

all_text_files = file_paths_from_directory(output_folder, file_exts=[".txt"])

results = []

for file in all_text_files:
    # Explicit encoding so reading does not depend on the platform default
    with open(file, encoding="utf-8") as f:
        text = f.read()

    # One record per consecutive slice of the text; the last slice may be shorter
    file_name = os.path.basename(file)
    results.extend(
        {
            "file_path": file_name,
            "text_section": text[start:start + SECTION_LENGTH]
        }
        for start in range(0, len(text), SECTION_LENGTH)
    )

# Fixed columns keep the schema stable even when no text was found
text_df = pd.DataFrame(results, columns=["file_path", "text_section"])
text_df.to_json(os.path.join(data_folder, "text_data.json"), orient="records", indent=4)
