In [1]:
import os
import re
from pathlib import Path
from collections import defaultdict

def parse_filename(filename):
    """
    Parse filename to extract base name and version number.
    
    Args:
        filename (str): The PDF filename without extension
        
    Returns:
        tuple: (base_name, version_number) or (base_name, 0) if no version
    """
    # Remove .pdf extension if present
    name = filename.replace('.pdf', '')
    
    # Pattern to match version at the end: V followed by digits
    version_pattern = r'V(\d+)$'
    match = re.search(version_pattern, name)
    
    if match:
        version_num = int(match.group(1))
        base_name = name[:match.start()]
        return base_name, version_num
    else:
        # No version number found, treat as version 0
        return name, 0

def cleanup_old_pdf_versions(folder_path, dry_run=True):
    """
    Delete old versions of PDF files, keeping only the latest version.
    
    Args:
        folder_path (str): Path to the folder containing PDF files
        dry_run (bool): If True, only show what would be deleted without actually deleting
    """
    folder = Path(folder_path)
    
    if not folder.exists():
        print(f"Error: Folder '{folder_path}' does not exist.")
        return
    
    if not folder.is_dir():
        print(f"Error: '{folder_path}' is not a directory.")
        return
    
    # Get all PDF files in the folder
    pdf_files = [f for f in folder.glob('*.pdf')]
    
    if not pdf_files:
        print("No PDF files found in the specified folder.")
        return
    
    # Group files by base name and track versions
    file_groups = defaultdict(list)
    
    for pdf_file in pdf_files:
        base_name, version = parse_filename(pdf_file.stem)
        file_groups[base_name].append((pdf_file, version))
    
    # Process each group
    deleted_count = 0
    kept_count = 0
    
    for base_name, files in file_groups.items():
        if len(files) == 1:
            # Only one file for this base name, keep it
            kept_count += 1
            continue
        
        # Sort by version number (descending) to get the latest first
        files.sort(key=lambda x: x[1], reverse=True)
        
        latest_file, latest_version = files[0]
        old_files = files[1:]
        
        print(f"\nProcessing group: {base_name}")
        print(f"  Latest version: {latest_file.name} (V{latest_version})")
        
        if old_files:
            print(f"  Old versions to delete:")
            for old_file, old_version in old_files:
                print(f"    - {old_file.name} (V{old_version})")
                
                if not dry_run:
                    try:
                        old_file.unlink()
                        print(f"      ✓ Deleted: {old_file.name}")
                        deleted_count += 1
                    except Exception as e:
                        print(f"      ✗ Error deleting {old_file.name}: {e}")
                else:
                    deleted_count += 1
        
        kept_count += 1
    
    print(f"\n{'=== DRY RUN SUMMARY ===' if dry_run else '=== SUMMARY ==='}")
    print(f"Files that would be kept: {kept_count}")
    print(f"Files that would be deleted: {deleted_count}")
    
    if dry_run:
        print("\nThis was a dry run. No files were actually deleted.")
        print("Set dry_run=False to perform the actual cleanup.")

In [3]:
# Example usage
if __name__ == "__main__":
    # Replace with your folder path
    folder_path = r"F:\OneDrive - Green Energy\Sakib\GE\Financial-Offer-Challan-Purchase-Order-System\assets\covers\All"
    
    # First run with dry_run=True to see what would be deleted
    print("=== DRY RUN ===")
    cleanup_old_pdf_versions(folder_path, dry_run=False)
    
    # Uncomment the line below to actually delete the files
    # cleanup_old_pdf_versions(folder_path, dry_run=False)

# For Jupyter notebook usage:
def run_cleanup(folder_path, dry_run=True):
    """
    Convenience function for Jupyter notebook usage.
    
    Args:
        folder_path (str): Path to the folder containing PDF files
        dry_run (bool): If True, only show what would be deleted
    """
    cleanup_old_pdf_versions(folder_path, dry_run=dry_run)

# Usage in Jupyter:
# run_cleanup("/path/to/your/folder", dry_run=True)  # Preview
# run_cleanup("/path/to/your/folder", dry_run=False)  # Actually delete

=== DRY RUN ===

Processing group: AMC_UFL_252202GE21
  Latest version: AMC_UFL_252202GE21V2.pdf (V2)
  Old versions to delete:
    - AMC_UFL_252202GE21.pdf (V0)
      ✓ Deleted: AMC_UFL_252202GE21.pdf

Processing group: FO_FC_NSGL_221009GE76
  Latest version: FO_FC_NSGL_221009GE76V2.pdf (V2)
  Old versions to delete:
    - FO_FC_NSGL_221009GE76.pdf (V0)
      ✓ Deleted: FO_FC_NSGL_221009GE76.pdf

Processing group: FO_FC_NSWL_NSGL_220703GE73
  Latest version: FO_FC_NSWL_NSGL_220703GE73V2.pdf (V2)
  Old versions to delete:
    - FO_FC_NSWL_NSGL_220703GE73.pdf (V0)
      ✓ Deleted: FO_FC_NSWL_NSGL_220703GE73.pdf

Processing group: FO_FDS_FD_Snowtex_242303GE58
  Latest version: FO_FDS_FD_Snowtex_242303GE58V3.pdf (V3)
  Old versions to delete:
    - FO_FDS_FD_Snowtex_242303GE58V2.pdf (V2)
      ✓ Deleted: FO_FDS_FD_Snowtex_242303GE58V2.pdf

Processing group: FO_FDS_FPS_DSEL_231004GE410
  Latest version: FO_FDS_FPS_DSEL_231004GE410V6.pdf (V6)
  Old versions to delete:
    - FO_FDS_FPS_DSEL_