In [1]:
# Unzip all zip files under datasets/kanazawa, delete zips, and move images to kanazawa root
import os
import zipfile
import shutil
from pathlib import Path

# Folder that holds the downloaded zip archives; images are later moved to its top level.
KANAZAWA_DIR = Path('/Users/takato/proj-crack_seg/datasets/kanazawa')
# Lower-case file suffixes that are treated as images when moving files.
IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif', '.gif', '.webp'}

print(f"Target dir: {KANAZAWA_DIR}")
# Raise explicitly instead of using assert: assert statements are removed when
# Python runs with -O, and the later iterdir() call needs a real directory,
# not merely a path that exists.
if not KANAZAWA_DIR.is_dir():
    raise FileNotFoundError(f"Directory not found: {KANAZAWA_DIR}")

# 1) List zip files
# Only regular files qualify: a directory whose name happens to end in ".zip"
# would otherwise be handed to zipfile.ZipFile and abort the extraction step.
zip_files = sorted(
    p for p in KANAZAWA_DIR.iterdir()
    if p.is_file() and p.suffix.lower() == '.zip'
)
print(f"Found {len(zip_files)} zip files")
for z in zip_files:
    print(" -", z.name)

# 2) Unpack every archive into a sibling folder named after the archive's stem
extracted_dirs = []
for archive in zip_files:
    target = KANAZAWA_DIR / archive.stem
    target.mkdir(exist_ok=True)  # an already-existing folder is reused
    print(f"Extracting {archive.name} -> {target}")
    with zipfile.ZipFile(archive, 'r') as zf:
        zf.extractall(path=target)
    # Reached only when extractall() did not raise.
    extracted_dirs.append(target)

# 3) Delete zip files after successful extraction
# The extraction loop has no error handling, so any failure there stops the
# script before this point; every archive listed here was unpacked.
for z in zip_files:
    try:
        z.unlink()
    except OSError as e:
        # Best effort: a leftover archive is reported but does not stop the run.
        print(f"Could not delete {z}: {e}")
    else:
        print(f"Deleted zip: {z.name}")

# 4) Move images from extracted folders into kanazawa root
def _unique_dest(directory: Path, filename: str) -> Path:
    """Return a path inside *directory* for *filename* that does not exist yet.

    If ``directory / filename`` is free it is returned as is. Otherwise a
    numeric suffix (``_1``, ``_2``, ...) is appended to the stem until an
    unused name is found.
    """
    candidate = directory / filename
    if not candidate.exists():
        return candidate
    base, ext = candidate.stem, candidate.suffix
    i = 1
    while True:
        candidate = directory / f"{base}_{i}{ext}"
        if not candidate.exists():
            return candidate
        i += 1


moved_count = 0
skipped_count = 0
conflict_count = 0

for d in extracted_dirs:
    # Walk through all files under extracted dir
    for root, dirs, files in os.walk(d):
        for fname in files:
            src = Path(root) / fname
            if src.suffix.lower() not in IMAGE_EXTS:
                skipped_count += 1
                continue

            dest = _unique_dest(KANAZAWA_DIR, src.name)
            try:
                shutil.move(str(src), str(dest))
            except OSError as e:
                # shutil.Error derives from OSError, so it is covered as well.
                print(f"Failed to move {src} -> {dest}: {e}")
                continue

            moved_count += 1
            # A conflict counts as resolved only once the renamed file has
            # actually been moved.
            if dest.name != src.name:
                conflict_count += 1

print(f"Moved images: {moved_count}")
print(f"Skipped non-images: {skipped_count}")
print(f"Name conflicts resolved: {conflict_count}")

# 5) Remove extraction folders that are now empty; keep and report the rest
removed_dirs = 0
for d in extracted_dirs:
    # Walk bottom-up so nested empty subfolders are pruned before their parents.
    for root, dirs, files in os.walk(d, topdown=False):
        for sub in dirs:
            try:
                (Path(root) / sub).rmdir()
            except OSError:
                # rmdir() refuses non-empty folders (for example ones still
                # holding skipped non-image files); leave those in place.
                pass
    # Only the final rmdir decides whether the extraction folder is gone.
    try:
        d.rmdir()
    except OSError:
        print(f"Kept non-empty dir: {d}")
    else:
        removed_dirs += 1
        print(f"Removed empty dir: {d}")

print(f"Removed extraction dirs: {removed_dirs}")

# Final summary: show a sample of the regular files now sitting in kanazawa root
final_files = sorted(
    entry.name for entry in KANAZAWA_DIR.iterdir() if entry.is_file()
)
print(f"Files now in kanazawa root: {len(final_files)} (showing up to 20)")
preview = final_files[:20]
for shown in preview:
    print(" *", shown)


Target dir: /Users/takato/proj-crack_seg/datasets/kanazawa
Found 12 zip files
 - クラック-20251230T014300Z-3-001.zip
 - クラック-20251230T014300Z-3-002.zip
 - クラック-20251230T014300Z-3-003.zip
 - クラック-20251230T014300Z-3-004.zip
 - クラック-20251230T014300Z-3-005.zip
 - クラック-20251230T014300Z-3-006.zip
 - クラック-20251230T014300Z-3-007.zip
 - クラック-20251230T014300Z-3-008.zip
 - クラック-20251230T014300Z-3-009.zip
 - クラック-20251230T014300Z-3-010.zip
 - クラック-20251230T014300Z-3-011.zip
 - クラック-20251230T014300Z-3-012.zip
Extracting クラック-20251230T014300Z-3-001.zip -> /Users/takato/proj-crack_seg/datasets/kanazawa/クラック-20251230T014300Z-3-001
Extracting クラック-20251230T014300Z-3-002.zip -> /Users/takato/proj-crack_seg/datasets/kanazawa/クラック-20251230T014300Z-3-002
Extracting クラック-20251230T014300Z-3-002.zip -> /Users/takato/proj-crack_seg/datasets/kanazawa/クラック-20251230T014300Z-3-002
Extracting クラック-20251230T014300Z-3-003.zip -> /Users/takato/proj-crack_seg/datasets/kanazawa/クラック-20251230T014300Z-3-003
Extracting クラック-20