In [None]:
#!/usr/bin/env python3
import os
import sys
from collections import defaultdict
from pathlib import Path

In [None]:
#!/usr/bin/env python3
import os
import sys
from collections import defaultdict
from pathlib import Path

# Configuration
# SAME_NAME: include the file name in the duplicate-matching key.
SAME_NAME = True
# SAME_SIZE: include the file size (bytes) in the duplicate-matching key.
SAME_SIZE = True
# INPUT_PATH: root directory that is scanned recursively for duplicates.
INPUT_PATH = r"D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv"

In [9]:
#!/usr/bin/env python3
import os
import sys
from collections import defaultdict
from pathlib import Path

# Configuration
# SAME_NAME: include the file name in the duplicate-matching key
# (main() forces this to True whenever SAME_SIZE is False).
SAME_NAME = True
# SAME_SIZE: include the file size (bytes) in the duplicate-matching key.
SAME_SIZE = True
# INPUT_PATH: root directory that is scanned recursively for duplicates;
# main() may override it from the command line.
INPUT_PATH = r"D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv"

def find_duplicates(path, check_name=True, check_size=True):
    """
    Group files under a directory tree that share a name and/or a size.

    Files are compared by metadata only; file contents are never read.

    Args:
        path (str): Root directory to scan recursively
        check_name (bool): Include the file's base name in the match key
        check_size (bool): Include the file's size in bytes in the match key

    Returns:
        dict: Maps each match key to the list of paths sharing that key.
            Only keys with two or more paths are returned. The key is a
            (name, size) tuple when both flags are set, the name alone when
            only check_name is set, and the size in every other case.
            An empty dict is returned when the path is empty or missing.
    """
    path_is_usable = bool(path) and os.path.exists(path)
    if not path_is_usable:
        print(f"Error: Path '{path}' does not exist or is empty")
        return {}

    grouped = defaultdict(list)

    for current_dir, _subdirs, names in os.walk(path):
        for name in names:
            full_path = os.path.join(current_dir, name)

            # Only the stat call can fail here; unreadable files are reported
            # and skipped so one bad entry does not abort the whole scan.
            try:
                size = os.stat(full_path).st_size
            except (OSError, IOError) as e:
                print(f"Warning: Could not access {full_path}: {e}")
                continue

            if not check_name:
                match_key = size
            elif check_size:
                match_key = (name, size)
            else:
                match_key = name

            grouped[match_key].append(full_path)

    # A group with a single member is not a duplicate.
    return {
        match_key: paths
        for match_key, paths in grouped.items()
        if len(paths) > 1
    }

def format_size(size_bytes):
    """Render a byte count as a one-decimal string with a 1024-based unit.

    Units run from B through GB; anything that is still 1024 or more after
    the GB step is expressed in TB.
    """
    units = ('B', 'KB', 'MB', 'GB')
    value = size_bytes
    index = 0
    while index < len(units):
        if value < 1024.0:
            return f"{value:.1f} {units[index]}"
        value = value / 1024.0
        index += 1
    return f"{value:.1f} TB"

def print_duplicates(duplicates, check_name=True, check_size=True):
    """Print every duplicate group with a header and its sorted file paths.

    The flags must match those used to build ``duplicates`` because they
    decide how each key is interpreted: a (name, size) tuple when both are
    set, a name when only check_name is set, and a size otherwise.
    """
    if not duplicates:
        print("No duplicates found.")
        return

    print(f"\nFound {len(duplicates)} duplicate groups:")
    print("=" * 50)

    group_no = 0
    for key, paths in duplicates.items():
        group_no += 1

        if not check_name:
            header = f"\nGroup {group_no}: Size {format_size(key)}"
        elif check_size:
            filename, size = key
            header = f"\nGroup {group_no}: '{filename}' ({format_size(size)})"
        else:
            header = f"\nGroup {group_no}: '{key}'"
        print(header)

        print(f"Found {len(paths)} duplicates:")
        for path in sorted(paths):
            print(f"  {path}")

def remove_duplicates(duplicates, keep_strategy='first', dry_run=True):
    """
    Remove duplicate files, keeping one per group based on strategy.

    Args:
        duplicates (dict): Dictionary of duplicate groups from find_duplicates()
        keep_strategy (str): Which file of each group survives:
            'first' (alphabetically first path), 'last' (alphabetically last),
            'shortest_path' or 'longest_path' (by path string length).
            Any other value keeps the first path in the group's stored order.
        dry_run (bool): If True, only report what would be deleted without
            actually deleting

    Returns:
        tuple: (files_to_delete, files_kept, total_space_saved) where
            files_to_delete is a list of (path, size_in_bytes) tuples.
            In a dry run it lists the files that would be removed; otherwise
            it lists only files whose removal actually succeeded, and
            total_space_saved is the sum of exactly those sizes.
    """
    files_to_delete = []
    files_kept = []
    total_space_saved = 0

    for file_list in duplicates.values():
        if len(file_list) <= 1:
            continue

        # Order the group so that the file to keep ends up at index 0.
        if keep_strategy == 'first':
            sorted_files = sorted(file_list)
        elif keep_strategy == 'last':
            sorted_files = sorted(file_list, reverse=True)
        elif keep_strategy == 'shortest_path':
            sorted_files = sorted(file_list, key=len)
        elif keep_strategy == 'longest_path':
            sorted_files = sorted(file_list, key=len, reverse=True)
        else:
            sorted_files = file_list

        keep_file, delete_files = sorted_files[0], sorted_files[1:]
        files_kept.append(keep_file)

        for file_path in delete_files:
            try:
                file_size = os.path.getsize(file_path)
                if not dry_run:
                    os.remove(file_path)
            except OSError as e:
                # Nothing is recorded for this file: it was not removed, so
                # it must not count towards the deleted list or saved space.
                print(f"Error deleting {file_path}: {e}")
            else:
                total_space_saved += file_size
                files_to_delete.append((file_path, file_size))
                if not dry_run:
                    print(f"Deleted: {file_path}")

    return files_to_delete, files_kept, total_space_saved

def preview_removal(duplicates, keep_strategy='first'):
    """Print which files a removal pass would keep and delete; delete nothing.

    Runs remove_duplicates() in dry-run mode and returns its result as a
    (files_to_delete, files_kept, total_space_saved) tuple.
    """
    print(f"\n=== REMOVAL PREVIEW (Strategy: {keep_strategy}) ===")

    doomed, survivors, reclaimable = remove_duplicates(
        duplicates, keep_strategy, dry_run=True
    )

    if doomed:
        print(f"\nFiles that would be KEPT ({len(survivors)}):")
        for kept_path in sorted(survivors):
            print(f"  ✓ {kept_path}")

        print(f"\nFiles that would be DELETED ({len(doomed)}):")
        for doomed_path, doomed_size in sorted(doomed):
            print(f"  ✗ {doomed_path} ({format_size(doomed_size)})")

        print(f"\nTotal space that would be saved: {format_size(reclaimable)}")
    else:
        print("No files to delete.")

    return doomed, survivors, reclaimable

def confirm_and_delete(duplicates, keep_strategy='first'):
    """Show a removal preview, ask for confirmation, then delete duplicates.

    Nothing is deleted unless the user answers 'yes' or 'y'. The closing
    summary reports the file count and size returned by the real deletion
    call rather than the figures from the preview, so it reflects what that
    call reported.

    Args:
        duplicates (dict): Dictionary of duplicate groups from find_duplicates()
        keep_strategy (str): Strategy forwarded to remove_duplicates()
    """
    planned, _kept, _planned_space = preview_removal(duplicates, keep_strategy)

    if not planned:
        return

    print("\n" + "="*50)
    response = input(f"Delete {len(planned)} duplicate files? (yes/no): ").strip().lower()

    if response not in ('yes', 'y'):
        print("Deletion cancelled.")
        return

    print("\nDeleting files...")
    deleted, _kept, space_saved = remove_duplicates(
        duplicates, keep_strategy, dry_run=False
    )
    print(f"\n✓ Deleted {len(deleted)} duplicate files")
    print(f"✓ Saved {format_size(space_saved)} of disk space")

def main():
    """Scan INPUT_PATH for duplicate files, print them and return the groups.

    Configuration comes from the module-level INPUT_PATH / SAME_NAME /
    SAME_SIZE values. When run from a command line they can be overridden
    positionally: ``script.py PATH [SAME_NAME] [SAME_SIZE]``, where the two
    flags are True only for the text 'true' (case-insensitive).

    Returns:
        dict: Duplicate groups as produced by find_duplicates(); an empty
            dict when no input path is available.
    """
    global INPUT_PATH, SAME_NAME, SAME_SIZE

    # Inside a Jupyter kernel sys.argv holds the kernel's own launch
    # arguments, not ours. Detect the kernel by its loaded module instead of
    # searching argv for the text 'kernel', which would also match an
    # ordinary user path such as "/src/linux-kernel".
    running_in_kernel = 'ipykernel' in sys.modules

    if len(sys.argv) > 1 and not running_in_kernel:
        INPUT_PATH = sys.argv[1]

        if len(sys.argv) > 2:
            SAME_NAME = sys.argv[2].lower() == 'true'

        if len(sys.argv) > 3:
            SAME_SIZE = sys.argv[3].lower() == 'true'

    # Validate input
    if not INPUT_PATH:
        INPUT_PATH = input("Enter the path to search for duplicates: ").strip()

    if not INPUT_PATH:
        print("Error: No input path provided")
        # Same type as the normal return so callers can use it directly.
        return {}

    # Expand user path if needed
    INPUT_PATH = os.path.expanduser(INPUT_PATH)

    # If size is false, only name will be matched. Normalise before printing
    # so the reported settings are the ones actually used for the search.
    if not SAME_SIZE:
        SAME_NAME = True

    print(f"Searching for duplicates in: {INPUT_PATH}")
    print(f"Match by name: {SAME_NAME}")
    print(f"Match by size: {SAME_SIZE}")

    # Find duplicates
    duplicates = find_duplicates(INPUT_PATH, SAME_NAME, SAME_SIZE)

    # Print results
    print_duplicates(duplicates, SAME_NAME, SAME_SIZE)

    # Summary
    total_duplicate_files = sum(len(files) for files in duplicates.values())
    print(f"\nSummary: {total_duplicate_files} duplicate files in {len(duplicates)} groups")

    return duplicates

# Entry point: run the duplicate scan when this code is executed directly.
# NOTE(review): the captured output after this cell suggests the guard also
# fires when the cell is run in the notebook, so the scan runs on every
# execution of this cell.
if __name__ == "__main__":
    main()

Searching for duplicates in: D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv
Match by name: True
Match by size: True

Found 39 duplicate groups:

Group 1: 'accounting_accounts.csv' (8.2 KB)
Found 2 duplicates:
  D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables-before\accounting_accounts.csv
  D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables\accounting_accounts.csv

Group 2: 'accounting_account_types.csv' (11.0 KB)
Found 2 duplicates:
  D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables-before\accounting_account_types.csv
  D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables\accounting_account_types.csv

Group 3: 'account_detail_types.csv' (33.9 KB)
Found 2 duplicates:
  D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables-before\account_detail_types.csv
  D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables\account_

# Deletion

In [11]:
# Step 1: Scan for duplicates. Fall back to an empty dict in case main()
# returns nothing (e.g. no input path was provided), so the call below
# reports "No files to delete." instead of failing.
duplicates = main() or {}

# Step 2: Preview and, only after an explicit "yes", delete.
# confirm_and_delete() prints the removal preview itself, so a separate
# preview_removal() call beforehand would just print the same listing twice.
confirm_and_delete(duplicates, 'shortest_path')

Searching for duplicates in: D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv
Match by name: True
Match by size: True

Found 39 duplicate groups:

Group 1: 'accounting_accounts.csv' (8.2 KB)
Found 2 duplicates:
  D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables-before\accounting_accounts.csv
  D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables\accounting_accounts.csv

Group 2: 'accounting_account_types.csv' (11.0 KB)
Found 2 duplicates:
  D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables-before\accounting_account_types.csv
  D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables\accounting_account_types.csv

Group 3: 'account_detail_types.csv' (33.9 KB)
Found 2 duplicates:
  D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables-before\account_detail_types.csv
  D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables\account_


Deleting files...
Deleted: D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables-before\accounting_accounts.csv
Deleted: D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables-before\accounting_account_types.csv
Deleted: D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables-before\account_detail_types.csv
Deleted: D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables-before\account_subtypes.csv
Deleted: D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables-before\barcodes.csv
Deleted: D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables-before\brands.csv
Deleted: D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables-before\business.csv
Deleted: D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all_tables-before\business_locations.csv
Deleted: D:\OneDrive - Green Energy\Sakib\Scripts\downloaded_db_tables_csv\all