<a href="https://colab.research.google.com/github/OlajideFemi/Carbon-Footprint/blob/main/Rename_files_Dynamically.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

In [12]:
import os
from Levenshtein import distance as levenshtein_distance # Make sure you have this library installed (pip install python-Levenshtein)

# (The functions standardize_deepdive_filename, _find_candidate_files and _handle_duplicates are defined in the next cell.)

if __name__ == "__main__":
    # Folder that holds the files to process — replace with the actual path.
    # NOTE(review): the path assumes Google Drive is mounted at /content/drive — confirm.
    target_folder = "/content/drive/MyDrive/files"

    # Optional destination for duplicates; replace it, or set it to None
    # if duplicates should be deleted instead of archived.
    archive_location = "/content/drive/MyDrive/files/archive/folder"

    # For demonstration purposes, make sure the target folder is present
    # (a no-op when it already exists).
    os.makedirs(target_folder, exist_ok=True)


In [13]:
import os
import re
import shutil
from datetime import datetime
from Levenshtein import distance as levenshtein_distance

def standardize_deepdive_filename(folder_path, archive_folder=None, dry_run=False):
    """
    Rename the best-matching Deep Dive feedback CSV to the standard filename.

    Candidates are collected by ``_find_candidate_files`` and ranked by:
      1. edit distance to the standard name (computed case-insensitively),
      2. exact (case-sensitive) match of the standard name,
      3. most recent modification time.
    The winner ends up as "Deep Dive feedback survey.csv"; every other
    candidate is passed to ``_handle_duplicates``.

    Args:
        folder_path: Directory containing files to process
        archive_folder: Where to move duplicates (None = delete)
        dry_run: Test mode (no actual changes, messages tagged "[DRY RUN]")

    Returns:
        Path to the standardized file after a real run. None when no
        matching file is found, when a filesystem operation fails with
        OSError, and always in dry-run mode.
    """
    standardized_name = "Deep Dive feedback survey.csv"
    standardized_path = os.path.join(folder_path, standardized_name)

    def log(action, message):
        """Print one status line, tagged when running in dry-run mode."""
        prefix = "[DRY RUN] " if dry_run else ""
        print(f"{prefix}{action} {message}")

    # Find candidate files
    candidates = _find_candidate_files(folder_path, standardized_name)

    # Handle no matches case
    if not candidates:
        if os.path.exists(standardized_path):
            log("✓", "Using existing standardized file")
            return standardized_path if not dry_run else None
        log("⚠️", "No matching Deep Dive feedback files found")
        return None

    # Rank by similarity, then exact name, then recency. The distance ignores
    # case, so a case-only variant also scores 0; the exact-name flag makes the
    # real standardized file win that tie instead of being treated as a duplicate.
    candidates.sort(
        key=lambda c: (c['distance'], c['original'] != standardized_name, -c['mtime'])
    )
    best_candidate = candidates[0]
    duplicates = candidates[1:]

    # Already standardized only when the name matches exactly; distance 0 alone
    # is not enough because it is case-insensitive.
    if best_candidate['original'] == standardized_name:
        log("✓", "Correct file already exists")
        _handle_duplicates(duplicates, archive_folder, dry_run)
        return standardized_path if not dry_run else None

    # Standardization process
    try:
        # Remove whatever currently occupies the standardized path, unless it is
        # the best candidate itself (possible on case-insensitive filesystems,
        # where a case-only variant resolves to the same file).
        if os.path.exists(standardized_path) and not dry_run:
            if not os.path.samefile(best_candidate['path'], standardized_path):
                os.remove(standardized_path)
                log("🗑️", "Removed outdated standardized file")

        # Perform the rename
        if not dry_run:
            os.rename(best_candidate['path'], standardized_path)
        log("🔄", f"Renamed '{best_candidate['original']}' (distance: {best_candidate['distance']}) → '{standardized_name}'")

        # Handle duplicates
        _handle_duplicates(duplicates, archive_folder, dry_run)

        return standardized_path if not dry_run else None

    except OSError as e:
        log("⚠️", f"Operation failed: {e}")
        return None

def _find_candidate_files(folder_path, standardized_name):
    """Collect CSV files in ``folder_path`` whose names look like Deep Dive feedback files.

    A directory entry qualifies when it is not a directory, its name ends in
    ".csv" (any letter case) and the name contains "deep", "dive" and
    "feedback" in one of the two orders accepted by the pattern below.

    Args:
        folder_path: Directory to scan (non-recursive).
        standardized_name: Target filename used as the distance reference.

    Returns:
        A list of dicts, one per qualifying file, with the keys
        'original' (filename), 'path' (folder_path joined with the filename),
        'mtime' (modification time from os.path.getmtime) and
        'distance' (Levenshtein distance between the casefolded filename and
        the casefolded standardized name).
    """
    name_matches = re.compile(
        r'deep.*dive.*feedback|feedback.*deep.*dive', re.IGNORECASE
    ).search

    def qualifies(entry_name, entry_path):
        """True for non-directory .csv entries whose name matches the pattern."""
        if os.path.isdir(entry_path):
            return False
        if not entry_name.lower().endswith('.csv'):
            return False
        return name_matches(entry_name) is not None

    matches = []
    for entry_name in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry_name)
        if not qualifies(entry_name, entry_path):
            continue

        modified_at = os.path.getmtime(entry_path)
        # Case differences are ignored when measuring closeness to the target name.
        edit_distance = levenshtein_distance(
            entry_name.casefold(), standardized_name.casefold()
        )
        matches.append({
            'original': entry_name,
            'path': entry_path,
            'mtime': modified_at,
            'distance': edit_distance,
        })
    return matches

def _unique_archive_path(archive_folder, filename):
    """Return a path for ``filename`` inside ``archive_folder`` that is not taken yet.

    When the plain name already exists, "_1", "_2", ... is inserted before the
    extension until a free name is found.
    """
    destination = os.path.join(archive_folder, filename)
    stem, ext = os.path.splitext(filename)
    counter = 1
    while os.path.exists(destination):
        destination = os.path.join(archive_folder, f"{stem}_{counter}{ext}")
        counter += 1
    return destination


def _handle_duplicates(duplicates, archive_folder, dry_run):
    """Move duplicate files to the archive, or delete them when no archive is given.

    Args:
        duplicates: Iterable of candidate dicts; each needs the keys
            'original' (filename), 'path' (current location) and 'distance'.
        archive_folder: Destination directory for duplicates. Created on
            demand. A falsy value means the duplicates are deleted.
        dry_run: When true, nothing is moved or deleted; the messages are
            tagged with "[DRY RUN]".

    A failure on one file is reported and does not stop the remaining files
    from being processed.
    """
    prefix = "[DRY RUN] " if dry_run else ""

    for dup in duplicates:
        try:
            if archive_folder:
                archived_name = dup['original']
                if not dry_run:
                    os.makedirs(archive_folder, exist_ok=True)
                    # Never overwrite a file archived by an earlier run.
                    destination = _unique_archive_path(archive_folder, dup['original'])
                    archived_name = os.path.basename(destination)
                    shutil.move(dup['path'], destination)
                renamed_note = "" if archived_name == dup['original'] else f" as '{archived_name}'"
                print(f"{prefix}📦 Moved duplicate to archive: {dup['original']}{renamed_note} (distance: {dup['distance']})")
            else:
                if not dry_run:
                    os.remove(dup['path'])
                print(f"{prefix}🗑️ Removed duplicate: {dup['original']} (distance: {dup['distance']})")
        except Exception as e:
            # Best effort: report the problem and continue with the next file.
            print(f"⚠️ Failed to handle duplicate '{dup['original']}': {e}")



In [14]:
# Example usage
if __name__ == "__main__":
    # Folder to scan, and the archive folder that receives duplicate files.
    test_folder = "/content/drive/MyDrive/files"
    archive = "/content/drive/MyDrive/files/archive"

    # Rehearsal first: report the planned actions without changing any file.
    print("=== DRY RUN ===")
    standardize_deepdive_filename(
        folder_path=test_folder,
        archive_folder=archive,
        dry_run=True,
    )

    # Real pass with the same settings; keep the returned path for display.
    print("\n=== ACTUAL RUN ===")
    result = standardize_deepdive_filename(
        folder_path=test_folder,
        archive_folder=archive,
    )
    print(f"\nResult: {result}")