In [6]:
#is_matching_filename

import os
import re
from typing import Union, Pattern

# Case-insensitive regex, compiled once when the cell runs, that finds the phrase
# "Objection <number> Response" anywhere in a file name.
global_compiled_pattern = re.compile(r"Objection \d+ Response", flags=re.IGNORECASE)

def is_matching_filename(file_name: str, pattern: Union[str, Pattern]) -> bool:
    """
    Report whether a regex pattern occurs anywhere in a file name.

    The pattern may be supplied as a plain string or as an already compiled
    regex object. A string is compiled on the fly (with no flags); a compiled
    object is used exactly as given, which lets callers compile once and reuse
    the same object for many file names.

    Args:
        file_name (str): The file name to inspect.
        pattern (Union[str, Pattern]): Regex source text or a precompiled regex object.

    Returns:
        bool: True when the pattern is found somewhere in the file name, False otherwise.
    """
    # Normalise the argument so the search below always runs on a compiled object.
    if isinstance(pattern, str):
        compiled = re.compile(pattern)
    else:
        compiled = pattern

    # search() scans the whole name, so the phrase does not have to be at the start.
    found = compiled.search(file_name)
    return bool(found)

def check_files_in_directory(directory: str, pattern: Union[str, Pattern]) -> None:
    """
    Print, for every file under a directory, whether its name contains the regex pattern.

    The directory is traversed with os.walk, so files in nested folders are
    reported too. Only the bare file name is tested against the pattern; the
    full path is used for display.

    Args:
        directory (str): The path of the directory to search.
        pattern (Union[str, Pattern]): Regex source text or a precompiled regex object.
    """
    for current_dir, _subdirs, names in os.walk(directory):
        for name in names:
            verdict = "Yes" if is_matching_filename(name, pattern) else "No"
            full_path = os.path.join(current_dir, name)
            print(f"Does '{full_path}' contain the phrase? {verdict}")

# Example usage
if __name__ == "__main__":
    # Folder whose files are checked. The raw-string prefix keeps the Windows
    # backslashes literal; in a plain string "\S", "\O" and "\T" are invalid
    # escape sequences, which Python already warns about.
    directory_path = r"D:\Scholarship\Objection database code\Test"

    # Report, for every file under the folder, whether its name matches the global pattern
    check_files_in_directory(directory_path, global_compiled_pattern)


Does 'D:\Scholarship\Objection database code\Test\2024 04 11 - ME eSA - Objection 1 Response - Copy - Copy.pdf' contain the phrase? Yes
Does 'D:\Scholarship\Objection database code\Test\2024 04 11 - ME eSA - Objection 1 Response.pdf' contain the phrase? Yes
Does 'D:\Scholarship\Objection database code\Test\202404 11 - ME eSA - Objection 1 Response-Copy.pdf' contain the phrase? Yes
Does 'D:\Scholarship\Objection database code\Test\combined_output.xlsx' contain the phrase? No
Does 'D:\Scholarship\Objection database code\Test\LA GL SERF 1234_v1.docx' contain the phrase? No


In [10]:
import os
import pandas as pd

def classify_files_by_pdf_status(directory: str, pdf_csv_path: str, non_pdf_csv_path: str):
    """
    Split the files under a directory into PDF / non-PDF and write each group to its own CSV.

    The directory is traversed with os.walk, so nested folders are included.
    Each CSV has a single column holding full file paths: "PDF_Files" in one
    file and "Non_PDF_Files" in the other.

    Args:
        directory (str): The path of the directory to search.
        pdf_csv_path (str): Destination CSV for the list of PDF files.
        non_pdf_csv_path (str): Destination CSV for the list of non-PDF files.

    Returns:
        None
    """

    def is_pdf(file_name: str) -> bool:
        """
        Tell whether a file name ends with ".pdf", ignoring letter case.

        Args:
            file_name (str): The name of the file to check.

        Returns:
            bool: True if the file is a PDF, False otherwise.
        """
        lowered = file_name.lower()
        return lowered.endswith(".pdf")

    # Full paths, collected in the order os.walk yields them
    pdf_paths = []
    other_paths = []

    for current_dir, _subdirs, names in os.walk(directory):
        for name in names:
            target = pdf_paths if is_pdf(name) else other_paths
            target.append(os.path.join(current_dir, name))

    # One single-column table per category, written without the index column
    for paths, column, destination in (
        (pdf_paths, "PDF_Files", pdf_csv_path),
        (other_paths, "Non_PDF_Files", non_pdf_csv_path),
    ):
        pd.DataFrame(paths, columns=[column]).to_csv(destination, index=False)

# Example usage
if __name__ == "__main__":
    # Folder whose files are classified. The raw-string prefix keeps the Windows
    # backslashes literal; in a plain string "\S", "\O" and "\T" are invalid
    # escape sequences, which Python already warns about.
    directory_path = r"D:\Scholarship\Objection database code\Test"

    # Destination CSV files for the two categories
    pdf_csv_output = r"D:\Scholarship\Objection database code\Test\pdf_files.csv"
    non_pdf_csv_output = r"D:\Scholarship\Objection database code\Test\non_pdf_files.csv"

    # Classify files and write the results to separate CSV files
    classify_files_by_pdf_status(directory_path, pdf_csv_output, non_pdf_csv_output)

    print("Classification complete. The results have been saved to CSV files.")


Classification complete. The results have been saved to CSV files.


In [17]:
# Getting last modified time for a file

import os
from datetime import datetime

def get_last_modified_date(file_path: str) -> str:
    """
    Return a file's last-modified date formatted as MM/DD/YY.

    Args:
        file_path (str): Path of the file to inspect.

    Returns:
        str: The last modified date in MM/DD/YY format.

    Raises:
        ValueError: If the provided path is not a file.
    """
    # Happy path first: anything that is not a regular file falls through to the error.
    if os.path.isfile(file_path):
        # getmtime yields a POSIX timestamp; fromtimestamp converts it to a datetime
        modified_at = datetime.fromtimestamp(os.path.getmtime(file_path))
        return modified_at.strftime("%m/%d/%y")

    raise ValueError(f"The path '{file_path}' is not a valid file.")

# Example usage
if __name__ == "__main__":
    # File whose last-modified date is looked up
    example_file_path = r"D:\Scholarship\Objection database code\Test\202404 11 - ME eSA - Objection 1 Response-Copy.pdf"

    try:
        last_modified_date = get_last_modified_date(example_file_path)
    except ValueError as e:
        # Raised when the path does not point at a regular file
        print(e)
    else:
        print(f"Last Modified Date of '{example_file_path}': {last_modified_date}")


Last Modified Date of 'D:\Scholarship\Objection database code\Test\202404 11 - ME eSA - Objection 1 Response-Copy.pdf': 04/01/24


In [14]:
# File classification system
#
# is_pdf: checks whether a file name has a .pdf extension.
# is_matching_filename: checks whether a file name matches a given regex pattern.
# classify_files_by_pdf_status: splits the files under a directory into PDF and non-PDF lists.
# main_function:
#     - retrieves the lists of all PDF and non-PDF files,
#     - selects the PDF files whose names match the given pattern,
#     - saves the three lists to separate CSV files.






import os
import re
import pandas as pd
from typing import Union, Pattern, List

# Shared, precompiled regex: matches "Objection <number> Response" anywhere in a
# file name, regardless of letter case.
global_compiled_pattern = re.compile(r"Objection \d+ Response", flags=re.IGNORECASE)

def is_pdf(file_name: str) -> bool:
    """
    Tell whether a file name ends with ".pdf", ignoring letter case.

    Only the name is inspected; the file's contents are never opened.

    Args:
        file_name (str): The name of the file to check.

    Returns:
        bool: True if the name has a .pdf extension, False otherwise.
    """
    lowered = file_name.lower()
    return lowered.endswith(".pdf")

def is_matching_filename(file_name: str, pattern: Union[str, Pattern]) -> bool:
    """
    Report whether a regex pattern occurs anywhere in a file name.

    Callers may hand over either the regex source as a string or a regex object
    they compiled themselves. Strings are compiled here (without flags), while
    compiled objects are used untouched, so a caller that checks many names can
    pay the compilation cost once.

    Args:
        file_name (str): The file name to inspect.
        pattern (Union[str, Pattern]): Regex source text or a precompiled regex object.

    Returns:
        bool: True when the pattern is found somewhere in the file name, False otherwise.
    """
    # Only strings need compiling; anything else is assumed to be a compiled regex.
    if isinstance(pattern, str):
        pattern = re.compile(pattern)

    # A successful search yields a match object, a failed one yields None.
    return bool(pattern.search(file_name))

def classify_files_by_pdf_status(directory: str) -> (List[str], List[str]):
    """
    Collect the files under a directory and split them into PDF and non-PDF paths.

    The directory is traversed with os.walk, so nested folders are included.
    Classification is delegated to is_pdf and is based on the file name only.

    Args:
        directory (str): The path of the directory to search.

    Returns:
        tuple: (pdf_files, non_pdf_files), each a list of full file paths in
        the order os.walk produced them.
    """
    pdf_files = []
    non_pdf_files = []

    for current_dir, _subdirs, names in os.walk(directory):
        for name in names:
            # Pick the destination list first, then store the full path in it
            bucket = pdf_files if is_pdf(name) else non_pdf_files
            bucket.append(os.path.join(current_dir, name))

    return pdf_files, non_pdf_files

def main_function(directory: str, pattern: Union[str, Pattern], pdf_csv_path: str, matching_pdf_csv_path: str, non_pdf_csv_path: str):
    """
    Classify the files under a directory and save three CSV listings.

    The listings are: every PDF file, the PDF files whose file name matches
    the pattern, and every non-PDF file. Each CSV has a single column of full
    file paths.

    Args:
        directory (str): The path of the directory to search.
        pattern (Union[str, Pattern]): Regex source text or a precompiled regex object.
        pdf_csv_path (str): Destination CSV for the list of all PDF files.
        matching_pdf_csv_path (str): Destination CSV for the PDF files that match the pattern.
        non_pdf_csv_path (str): Destination CSV for the list of non-PDF files.

    Returns:
        None
    """
    pdf_files, non_pdf_files = classify_files_by_pdf_status(directory)

    # The pattern is tested against the bare file name, not the containing folders
    matching_pdf_files = []
    for pdf_path in pdf_files:
        if is_matching_filename(os.path.basename(pdf_path), pattern):
            matching_pdf_files.append(pdf_path)

    # (paths, column header, destination) for each listing, in write order
    listings = (
        (pdf_files, "All_PDF_Files", pdf_csv_path),
        (matching_pdf_files, "Matching_PDF_Files", matching_pdf_csv_path),
        (non_pdf_files, "Non_PDF_Files", non_pdf_csv_path),
    )
    for paths, column, destination in listings:
        pd.DataFrame(paths, columns=[column]).to_csv(destination, index=False)

    print("Files have been classified, and the results have been saved to the specified CSV files.")

# Example usage
if __name__ == "__main__":
    # Folder whose files are classified
    directory_path = r"D:\Scholarship\Objection database code\Test"

    # Destination CSV files: all PDFs, pattern-matching PDFs, and non-PDF files
    pdf_csv_output = r"D:\Scholarship\Objection database code\Test\all_pdf_files.csv"
    matching_pdf_csv_output = r"D:\Scholarship\Objection database code\Test\matching_pdf_files.csv"
    non_pdf_csv_output = r"D:\Scholarship\Objection database code\Test\non_pdf_files.csv"

    # Classify the files and write the three listings
    main_function(
        directory=directory_path,
        pattern=global_compiled_pattern,
        pdf_csv_path=pdf_csv_output,
        matching_pdf_csv_path=matching_pdf_csv_output,
        non_pdf_csv_path=non_pdf_csv_output,
    )


Files have been classified, and the results have been saved to the specified CSV files.


In [18]:
#file classification system with last modified time 

import os
import re
import pandas as pd
from typing import Union, Pattern, List
from datetime import datetime

# Regex compiled once for reuse across calls: finds "Objection <number> Response"
# in a file name, ignoring letter case.
global_compiled_pattern = re.compile(r"Objection \d+ Response", flags=re.IGNORECASE)

def is_pdf(file_name: str) -> bool:
    """
    Decide from the name alone whether a file is a PDF.

    The comparison is case-insensitive, so ".PDF" and ".Pdf" count as well.

    Args:
        file_name (str): The name of the file to check.

    Returns:
        bool: True if the name ends with ".pdf", False otherwise.
    """
    pdf_suffix = ".pdf"
    return file_name.lower().endswith(pdf_suffix)

def is_matching_filename(file_name: str, pattern: Union[str, Pattern]) -> bool:
    """
    Report whether a regex pattern occurs anywhere in a file name.

    Accepts the pattern either as regex source text or as a compiled regex
    object. Source text is compiled inside the function with no flags; a
    compiled object keeps whatever flags it was built with.

    Args:
        file_name (str): The file name to inspect.
        pattern (Union[str, Pattern]): Regex source text or a precompiled regex object.

    Returns:
        bool: True when the pattern is found somewhere in the file name, False otherwise.
    """
    needs_compiling = isinstance(pattern, str)
    matcher = re.compile(pattern) if needs_compiling else pattern

    # search() looks at every position in the name, unlike match()
    hit = matcher.search(file_name)
    return bool(hit)

def get_last_modified_date(file_path: str) -> str:
    """
    Return a file's last-modified date formatted as MM/DD/YY.

    No existence check is made here; a path that cannot be inspected makes the
    underlying os call raise.

    Args:
        file_path (str): Path of the file to inspect.

    Returns:
        str: The last modified date in MM/DD/YY format.
    """
    # st_mtime is the same POSIX timestamp os.path.getmtime reports
    modified_at = datetime.fromtimestamp(os.stat(file_path).st_mtime)
    return f"{modified_at:%m/%d/%y}"

def classify_files_by_pdf_status(directory: str) -> (List[str], List[str]):
    """
    Collect the files under a directory and split them into PDF and non-PDF paths.

    The directory is traversed with os.walk, so nested folders are included.
    Classification is delegated to is_pdf and is based on the file name only.

    Args:
        directory (str): The path of the directory to search.

    Returns:
        tuple: (pdf_files, non_pdf_files), each a list of full file paths in
        the order os.walk produced them.
    """
    # Every file as (is-a-PDF flag, full path), in walk order
    tagged_paths = [
        (is_pdf(name), os.path.join(current_dir, name))
        for current_dir, _subdirs, names in os.walk(directory)
        for name in names
    ]

    pdf_files = [path for flagged, path in tagged_paths if flagged]
    non_pdf_files = [path for flagged, path in tagged_paths if not flagged]

    return pdf_files, non_pdf_files

def main_function(directory: str, pattern: Union[str, Pattern], pdf_csv_path: str, matching_pdf_csv_path: str, non_pdf_csv_path: str):
    """
    Classify the files under a directory and save three CSV reports with modification dates.

    The reports are: every PDF file, the PDF files whose file name matches the
    pattern, and every non-PDF file. Each report has the columns "File" (full
    path) and "Last_Modified" (MM/DD/YY), and the header row is written even
    when a report has no rows.

    Args:
        directory (str): The path of the directory to search.
        pattern (Union[str, Pattern]): The regex pattern in string format or a precompiled regex object.
        pdf_csv_path (str): The path to the CSV file where the list of all PDF files will be saved.
        matching_pdf_csv_path (str): The path to the CSV file where the list of matching PDF files will be saved.
        non_pdf_csv_path (str): The path to the CSV file where the list of non-PDF files will be saved.

    Returns:
        None
    """
    report_columns = ["File", "Last_Modified"]

    def _records(paths):
        """Pair each path with its last-modified date."""
        return [{"File": path, "Last_Modified": get_last_modified_date(path)} for path in paths]

    # Get lists of PDF and non-PDF files
    pdf_files, non_pdf_files = classify_files_by_pdf_status(directory)

    pdf_records = _records(pdf_files)
    # Matching PDFs are a subset of all PDFs, so reuse their records instead of
    # reading each file's modification time a second time.
    matching_pdf_records = [
        record for record in pdf_records
        if is_matching_filename(os.path.basename(record["File"]), pattern)
    ]
    non_pdf_records = _records(non_pdf_files)

    # Explicit columns keep the header row even for an empty list of records;
    # without them an empty report would be written with no header at all.
    pdf_df = pd.DataFrame(pdf_records, columns=report_columns)
    matching_pdf_df = pd.DataFrame(matching_pdf_records, columns=report_columns)
    non_pdf_df = pd.DataFrame(non_pdf_records, columns=report_columns)

    # Save DataFrames to CSV files
    pdf_df.to_csv(pdf_csv_path, index=False)
    matching_pdf_df.to_csv(matching_pdf_csv_path, index=False)
    non_pdf_df.to_csv(non_pdf_csv_path, index=False)

    print("Files have been classified, and the results have been saved to the specified CSV files.")

# Example usage
if __name__ == "__main__":
    # Folder whose files are classified
    directory_path = r"D:\Scholarship\Objection database code\Test"

    # Destination CSV reports: all PDFs, pattern-matching PDFs, and non-PDF files
    pdf_csv_output = r"D:\Scholarship\Objection database code\Test\all_pdf_files.csv"
    matching_pdf_csv_output = r"D:\Scholarship\Objection database code\Test\matching_pdf_files.csv"
    non_pdf_csv_output = r"D:\Scholarship\Objection database code\Test\non_pdf_files.csv"

    # Classify the files and write the three reports, each with last-modified dates
    main_function(
        directory=directory_path,
        pattern=global_compiled_pattern,
        pdf_csv_path=pdf_csv_output,
        matching_pdf_csv_path=matching_pdf_csv_output,
        non_pdf_csv_path=non_pdf_csv_output,
    )


Files have been classified, and the results have been saved to the specified CSV files.


In [None]:
# sub folder testing
import os
import re
import pandas as pd
from typing import Union, Pattern, List
from datetime import datetime

# Precompiled, case-insensitive regex for the phrase "Objection <number> Response";
# it may appear anywhere in the file name.
global_compiled_pattern = re.compile(r"Objection \d+ Response", flags=re.IGNORECASE)

def is_pdf(file_name: str) -> bool:
    """
    Check the last four characters of a file name for a ".pdf" extension.

    Letter case is ignored, and only the name is examined.

    Args:
        file_name (str): The name of the file to check.

    Returns:
        bool: True if the name ends with ".pdf", False otherwise.
    """
    lowered_name = file_name.lower()
    return lowered_name[-4:] == ".pdf"

def is_matching_filename(file_name: str, pattern: Union[str, Pattern]) -> bool:
    """
    Report whether a regex pattern occurs anywhere in a file name.

    The pattern can be given as regex source text, which is compiled here with
    no flags, or as a regex object the caller compiled beforehand, which is
    used as-is.

    Args:
        file_name (str): The file name to inspect.
        pattern (Union[str, Pattern]): Regex source text or a precompiled regex object.

    Returns:
        bool: True when the pattern is found somewhere in the file name, False otherwise.
    """
    if isinstance(pattern, str):
        # Text pattern: compile before searching
        return bool(re.compile(pattern).search(file_name))

    # Already compiled: search directly
    return bool(pattern.search(file_name))

def get_last_modified_date(file_path: str) -> str:
    """
    Return a file's last-modified date formatted as MM/DD/YY.

    No existence check is made here; a path that cannot be inspected makes
    os.path.getmtime raise.

    Args:
        file_path (str): Path of the file to inspect.

    Returns:
        str: The last modified date in MM/DD/YY format.
    """
    date_format = "%m/%d/%y"
    # Modification time arrives as a POSIX timestamp and is converted before formatting
    modified_at = datetime.fromtimestamp(os.path.getmtime(file_path))
    return modified_at.strftime(date_format)

def classify_files_in_specified_subfolders(parent_directory: str, subfolders_to_include: List[str]) -> (List[str], List[str]):
    """
    Classify files found inside selected subfolders of a parent directory by PDF status.

    A directory is in scope when its path relative to ``parent_directory`` is
    one of the requested subfolders or is nested below one. The comparison is
    case-insensitive and is made on whole path components, so requesting
    "Folder 1" does not also pull in a sibling such as "Folder 10".

    Args:
        parent_directory (str): The parent directory that contains subfolders.
        subfolders_to_include (List[str]): Subfolder names (or relative paths)
            to include. Trailing separators are ignored. An empty name or "."
            refers to the parent directory itself and selects the whole tree.

    Returns:
        tuple: Two lists, one with PDF files and one with non-PDF files found in the specified subfolders.
    """
    pdf_files = []
    non_pdf_files = []

    # normpath removes trailing/duplicate separators (and converts "/" on Windows);
    # lowercasing makes the comparison with the walked paths case-insensitive.
    normalized_subfolders = [os.path.normpath(folder).lower() for folder in subfolders_to_include]

    # Walk through the parent directory
    for root, _dirs, files in os.walk(parent_directory):
        # Path of the current directory relative to the parent ("." for the parent itself)
        relative_path = os.path.relpath(root, parent_directory).lower()

        # In scope when the directory IS a requested subfolder or lies below it.
        # Appending the separator before the prefix test is what stops
        # "folder 1" from matching "folder 10".
        in_scope = any(
            folder == os.curdir
            or relative_path == folder
            or relative_path.startswith(folder + os.sep)
            for folder in normalized_subfolders
        )
        if not in_scope:
            continue

        for file_name in files:
            file_path = os.path.join(root, file_name)
            if is_pdf(file_name):
                pdf_files.append(file_path)
            else:
                non_pdf_files.append(file_path)

    return pdf_files, non_pdf_files

def main_function(directory: str, pattern: Union[str, Pattern], subfolders: List[str], pdf_csv_path: str, matching_pdf_csv_path: str, non_pdf_csv_path: str):
    """
    Classify the files in selected subfolders and save three CSV reports with modification dates.

    The reports are: every PDF file, the PDF files whose file name matches the
    pattern, and every non-PDF file. Each report has the columns "File" (full
    path) and "Last_Modified" (MM/DD/YY), and the header row is written even
    when a report has no rows, for example when no requested subfolder exists.

    Args:
        directory (str): The path of the parent directory to search.
        pattern (Union[str, Pattern]): The regex pattern in string format or a precompiled regex object.
        subfolders (List[str]): List of subfolders to include.
        pdf_csv_path (str): The path to the CSV file where the list of all PDF files will be saved.
        matching_pdf_csv_path (str): The path to the CSV file where the list of matching PDF files will be saved.
        non_pdf_csv_path (str): The path to the CSV file where the list of non-PDF files will be saved.

    Returns:
        None
    """
    report_columns = ["File", "Last_Modified"]

    def _records(paths):
        """Pair each path with its last-modified date."""
        return [{"File": path, "Last_Modified": get_last_modified_date(path)} for path in paths]

    # Get lists of PDF and non-PDF files only from the specified subfolders
    pdf_files, non_pdf_files = classify_files_in_specified_subfolders(directory, subfolders)

    pdf_records = _records(pdf_files)
    # Matching PDFs are a subset of all PDFs, so reuse their records instead of
    # reading each file's modification time a second time.
    matching_pdf_records = [
        record for record in pdf_records
        if is_matching_filename(os.path.basename(record["File"]), pattern)
    ]
    non_pdf_records = _records(non_pdf_files)

    # Explicit columns keep the header row even for an empty list of records;
    # without them an empty report would be written with no header at all.
    pdf_df = pd.DataFrame(pdf_records, columns=report_columns)
    matching_pdf_df = pd.DataFrame(matching_pdf_records, columns=report_columns)
    non_pdf_df = pd.DataFrame(non_pdf_records, columns=report_columns)

    # Save DataFrames to CSV files
    pdf_df.to_csv(pdf_csv_path, index=False)
    matching_pdf_df.to_csv(matching_pdf_csv_path, index=False)
    non_pdf_df.to_csv(non_pdf_csv_path, index=False)

    print("Files have been classified, and the results have been saved to the specified CSV files.")

# Example usage
if __name__ == "__main__":
    # Parent folder that holds the subfolders of interest
    parent_directory_path = r"D:\Scholarship\Objection database code\ALL"

    # Only files inside these subfolders of the parent are classified
    subfolders_to_include = ["Folder 1", "Folder 2"]

    # Destination CSV reports: all PDFs, pattern-matching PDFs, and non-PDF files
    pdf_csv_output = r"D:\Scholarship\Objection database code\ALL\all_pdf_files.csv"
    matching_pdf_csv_output = r"D:\Scholarship\Objection database code\ALL\matching_pdf_files.csv"
    non_pdf_csv_output = r"D:\Scholarship\Objection database code\ALL\non_pdf_files.csv"

    # Classify the files from the chosen subfolders and write the three reports
    main_function(
        directory=parent_directory_path,
        pattern=global_compiled_pattern,
        subfolders=subfolders_to_include,
        pdf_csv_path=pdf_csv_output,
        matching_pdf_csv_path=matching_pdf_csv_output,
        non_pdf_csv_path=non_pdf_csv_output,
    )
