In [3]:
# 📦 Import necessary libraries
import os
import hashlib
import pandas as pd
from PIL import Image
from tqdm import tqdm

# 📁 Set your dataset directory
BASE_DIR = "."  # you're already inside data/train
BAD_DIR = "../bad_images"  # save bad images in data/bad_images
os.makedirs(BAD_DIR, exist_ok=True)

# 📋 Initialize data structures
hashes = {}
manifest = []

# 🔄 Traverse through each species folder
for species in tqdm(os.listdir(BASE_DIR), desc="Scanning species folders"):
    species_path = os.path.join(BASE_DIR, species)
    
    # Skip if not a folder
    if not os.path.isdir(species_path):
        continue

    for filename in os.listdir(species_path):
        file_path = os.path.join(species_path, filename)

        # Skip non-image files
        if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        try:
            # 🧪 Validate the image
            with Image.open(file_path) as img:
                img.verify()
        except Exception:
            # Move corrupted file
            os.rename(file_path, os.path.join(BAD_DIR, filename))
            print(f"❌ Corrupt image removed: {file_path}")
            continue

        # 🧬 Check for duplicates using MD5 hash
        with open(file_path, 'rb') as f:
            file_hash = hashlib.md5(f.read()).hexdigest()

        if file_hash in hashes:
            # Duplicate found – remove
            os.remove(file_path)
            print(f"🗑️ Duplicate removed: {file_path}")
            continue

        # Save the hash and record for manifest
        hashes[file_hash] = file_path
        manifest.append({
            'filename': os.path.relpath(file_path, start=".."),
            'species': species
        })

# 📄 Create and save manifest CSV one level up (in data/)
df = pd.DataFrame(manifest)
df.to_csv("../manifest.csv", index=False)

print(f"✅ Cleaning completed! {len(df)} valid images listed in 'manifest.csv'")


Scanning species folders:   9%|█████                                                    | 3/34 [00:00<00:02, 14.64it/s]

🗑️ Duplicate removed: .\amla\6.jpg
🗑️ Duplicate removed: .\amla\images (32).jpeg
🗑️ Duplicate removed: .\amla\images (4).jpeg
🗑️ Duplicate removed: .\amla\images (7).jpeg
🗑️ Duplicate removed: .\amla\images.jpeg
🗑️ Duplicate removed: .\asopalav\images (3).jpeg
🗑️ Duplicate removed: .\asopalav\images (37).jpeg
🗑️ Duplicate removed: .\asopalav\images (7).jpeg


Scanning species folders:  32%|██████████████████                                      | 11/34 [00:00<00:01, 16.81it/s]

🗑️ Duplicate removed: .\bili\download (6).jpeg
🗑️ Duplicate removed: .\bili\download (8).jpeg
🗑️ Duplicate removed: .\bili\images (27).jpeg
🗑️ Duplicate removed: .\bili\images (29).jpeg
🗑️ Duplicate removed: .\bili\images (33).jpeg
🗑️ Duplicate removed: .\bili\images (4).jpeg
🗑️ Duplicate removed: .\champa\images (22).jpg
🗑️ Duplicate removed: .\champa\images (28).jpg
🗑️ Duplicate removed: .\champa\images (30).jpg
🗑️ Duplicate removed: .\champa\images (33).jpg
🗑️ Duplicate removed: .\coconut\download (3).jpg


Scanning species folders:  41%|███████████████████████                                 | 14/34 [00:00<00:01, 17.03it/s]

🗑️ Duplicate removed: .\garmalo\image11.jpg
🗑️ Duplicate removed: .\garmalo\image19.jpg
🗑️ Duplicate removed: .\garmalo\image34.jpg
🗑️ Duplicate removed: .\garmalo\image9.jpg
🗑️ Duplicate removed: .\gulmohor\download (6).jpeg
🗑️ Duplicate removed: .\gulmohor\download.jpeg
🗑️ Duplicate removed: .\gulmohor\images (14).jpeg
🗑️ Duplicate removed: .\gulmohor\images (24).jpeg
🗑️ Duplicate removed: .\gulmohor\images (25).jpeg
🗑️ Duplicate removed: .\gulmohor\images (28).jpeg
🗑️ Duplicate removed: .\gulmohor\images (3).jpeg


Scanning species folders:  53%|█████████████████████████████▋                          | 18/34 [00:01<00:00, 16.11it/s]

🗑️ Duplicate removed: .\gunda\images (9).jpg
🗑️ Duplicate removed: .\jamun\images (20).jpg
🗑️ Duplicate removed: .\jamun\images (24).jpg
🗑️ Duplicate removed: .\jamun\images (9).jpg
🗑️ Duplicate removed: .\kanchan\download.jpg
🗑️ Duplicate removed: .\kanchan\images (11).jpg
🗑️ Duplicate removed: .\kanchan\images (3).jpg
🗑️ Duplicate removed: .\kesudo\images (20).jpeg
🗑️ Duplicate removed: .\kesudo\images (25).jpeg
🗑️ Duplicate removed: .\kesudo\images (32).jpeg
🗑️ Duplicate removed: .\kesudo\images (33).jpeg
🗑️ Duplicate removed: .\kesudo\images (36).jpeg
🗑️ Duplicate removed: .\kesudo\images (4).jpeg
🗑️ Duplicate removed: .\kesudo\images (41).jpeg
🗑️ Duplicate removed: .\kesudo\images (43).jpeg
🗑️ Duplicate removed: .\kesudo\images (6).jpeg
🗑️ Duplicate removed: .\kesudo\images.jpeg


Scanning species folders:  65%|████████████████████████████████████▏                   | 22/34 [00:01<00:00, 14.64it/s]

🗑️ Duplicate removed: .\motichanoti\download (4).jpg
🗑️ Duplicate removed: .\motichanoti\download (5).jpeg
🗑️ Duplicate removed: .\motichanoti\images (22).jpg
🗑️ Duplicate removed: .\motichanoti\images (24).jpg
🗑️ Duplicate removed: .\nilgiri\image28.jpg
🗑️ Duplicate removed: .\nilgiri\image38.jpg
🗑️ Duplicate removed: .\nilgiri\image4.jpg
🗑️ Duplicate removed: .\nilgiri\image41.jpg
🗑️ Duplicate removed: .\nilgiri\image5.jpg


Scanning species folders:  76%|██████████████████████████████████████████▊             | 26/34 [00:01<00:00, 12.00it/s]

🗑️ Duplicate removed: .\pilikaren\download (18).jpg
🗑️ Duplicate removed: .\pilikaren\download (2).jpg
🗑️ Duplicate removed: .\pilikaren\images (27).jpg
🗑️ Duplicate removed: .\pilikaren\images (28).jpg
🗑️ Duplicate removed: .\pipal\image12.jpg
🗑️ Duplicate removed: .\pipal\image17.jpg
🗑️ Duplicate removed: .\pipal\image4.jpg
🗑️ Duplicate removed: .\pipal\image45.jpg
🗑️ Duplicate removed: .\saptaparni\image6.jpg
🗑️ Duplicate removed: .\shirish\download (4).jpeg


Scanning species folders:  88%|█████████████████████████████████████████████████▍      | 30/34 [00:02<00:00, 13.95it/s]

🗑️ Duplicate removed: .\simlo\download (4).jpeg
🗑️ Duplicate removed: .\simlo\images (18).jpeg
🗑️ Duplicate removed: .\simlo\images (19).jpeg
🗑️ Duplicate removed: .\simlo\images (2).jpeg
🗑️ Duplicate removed: .\simlo\images (23).jpeg
🗑️ Duplicate removed: .\simlo\images (29).jpeg
🗑️ Duplicate removed: .\simlo\images (30).jpeg
🗑️ Duplicate removed: .\simlo\images (31).jpeg
🗑️ Duplicate removed: .\simlo\images (35).jpeg
🗑️ Duplicate removed: .\simlo\images (36).jpeg
🗑️ Duplicate removed: .\simlo\images (39).jpeg
🗑️ Duplicate removed: .\simlo\images (40).jpeg
🗑️ Duplicate removed: .\simlo\images (9).jpeg
🗑️ Duplicate removed: .\simlo\images.jpeg
🗑️ Duplicate removed: .\sitafal\images (2).jpeg
🗑️ Duplicate removed: .\sitafal\images (3).jpeg
🗑️ Duplicate removed: .\sitafal\images (5).jpeg
🗑️ Duplicate removed: .\sitafal\images (7).jpeg
🗑️ Duplicate removed: .\sonmahor\download (6).jpeg


Scanning species folders: 100%|████████████████████████████████████████████████████████| 34/34 [00:02<00:00, 14.90it/s]


🗑️ Duplicate removed: .\vad\download (4).jpeg
✅ Cleaning completed! 1514 valid images listed in 'manifest.csv'


In [1]:
import pandas as pd
df = pd.read_csv("data/manifest.csv")
print(df.head())


FileNotFoundError: [Errno 2] No such file or directory: 'data/manifest.csv'