In [2]:
%pip install split-folders==0.5.1

Collecting split-folders
  Using cached split_folders-0.5.1-py3-none-any.whl.metadata (6.2 kB)
Using cached split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split-folders
Successfully installed split-folders-0.5.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import hashlib
from PIL import Image
import shutil

def _rgb_pixel_digest(path):
    """Return an MD5 hex digest of the image at ``path`` decoded to RGB.

    The digest covers the pixel dimensions as well as the raw pixel bytes.
    """
    with Image.open(path) as im:
        rgb = im.convert("RGB")
        width, height = rgb.size
        # The raw byte stream alone cannot tell a 2x4 image from a 4x2 image
        # holding the same values, so the dimensions are hashed first.
        digest = hashlib.md5(f"{width}x{height}:".encode("ascii"))
        digest.update(rgb.tobytes())
    return digest.hexdigest()


def deduplicate_images(source_dir, clean_dir, duplicates_dir):
    """
    Finds and separates duplicate images from a source directory.

    Two files are duplicates when they decode to the same RGB pixels at the
    same dimensions, whatever their file names. Directories and files are
    visited in sorted order; the first copy seen is COPIED to ``clean_dir``
    and every later copy is MOVED out of ``source_dir`` into
    ``duplicates_dir`` (so the source dataset is modified). The sub-folder
    layout of ``source_dir`` is mirrored in both output folders. Files that
    cannot be processed are reported and left where they are.

    Args:
        source_dir (str): The folder containing the original dataset (e.g., 'dataset_full').
        clean_dir (str): Where to save the unique images.
        duplicates_dir (str): Where to move the found duplicates.

    Returns:
        None. Progress and a summary are printed.
    """
    os.makedirs(clean_dir, exist_ok=True)
    os.makedirs(duplicates_dir, exist_ok=True)

    # If an output folder lives inside source_dir it must not be scanned as input.
    output_roots = {os.path.abspath(clean_dir), os.path.abspath(duplicates_dir)}

    hashes = {}
    total_files = 0
    duplicate_count = 0
    failed_count = 0

    print(f"Starting de-duplication of '{source_dir}'...")

    # Walk through all subdirectories and files
    for dirpath, dirnames, filenames in os.walk(source_dir):
        # Editing dirnames in place steers os.walk: skip the output folders and
        # fix the visiting order so the kept copy is the same on every platform.
        dirnames[:] = sorted(
            d for d in dirnames
            if os.path.abspath(os.path.join(dirpath, d)) not in output_roots
        )

        # Create corresponding subdirectories in clean and duplicates folders
        rel_dir = os.path.relpath(dirpath, source_dir)
        structure_clean = os.path.join(clean_dir, rel_dir)
        structure_duplicates = os.path.join(duplicates_dir, rel_dir)
        os.makedirs(structure_clean, exist_ok=True)
        os.makedirs(structure_duplicates, exist_ok=True)

        for filename in sorted(filenames):
            total_files += 1
            path = os.path.join(dirpath, filename)
            try:
                file_hash = _rgb_pixel_digest(path)

                # If we've seen this hash before, it's a duplicate
                if file_hash in hashes:
                    print(f"  DUPLICATE: '{path}' is a copy of '{hashes[file_hash]}'")
                    shutil.move(path, os.path.join(structure_duplicates, filename))
                    duplicate_count += 1
                else:
                    # New hash: copy first, and only then remember it, so a
                    # failed copy is not mistaken for a kept original later.
                    shutil.copy2(path, os.path.join(structure_clean, filename))
                    hashes[file_hash] = path

            except Exception as e:
                # Best effort: report the file and carry on with the rest.
                failed_count += 1
                print(f"Could not process {path}: {e}")

    print("\n--- De-duplication Complete ---")
    print(f"Total files scanned: {total_files}")
    print(f"Unique images found: {len(hashes)}")
    print(f"Duplicates moved: {duplicate_count}")
    if failed_count:
        print(f"Files skipped (could not be processed): {failed_count}")
    print(f"Clean dataset saved to: '{clean_dir}'")

# --- CONFIGURE AND RUN ---
# Input: the original dataset, duplicates included.
source_dataset_folder = "Bharatanatyam-Mudra-Dataset"

# Output: destination folder for the unique images.
clean_dataset_folder = "dataset_clean"

# Output: duplicates are set aside here so they can be inspected.
duplicates_folder = "dataset_duplicates"

deduplicate_images(
    source_dir=source_dataset_folder,
    clean_dir=clean_dataset_folder,
    duplicates_dir=duplicates_folder,
)

Starting de-duplication of 'Bharatanatyam-Mudra-Dataset'...
  DUPLICATE: 'Bharatanatyam-Mudra-Dataset\Alapadmam(1)\Alapadmam_128.jpg' is a copy of 'Bharatanatyam-Mudra-Dataset\Alapadmam(1)\Alapadmam_11.jpg'
  DUPLICATE: 'Bharatanatyam-Mudra-Dataset\Alapadmam(1)\Alapadmam_130.jpg' is a copy of 'Bharatanatyam-Mudra-Dataset\Alapadmam(1)\Alapadmam_126.jpg'
  DUPLICATE: 'Bharatanatyam-Mudra-Dataset\Alapadmam(1)\Alapadmam_142.jpg' is a copy of 'Bharatanatyam-Mudra-Dataset\Alapadmam(1)\Alapadmam_114.jpg'
  DUPLICATE: 'Bharatanatyam-Mudra-Dataset\Alapadmam(1)\Alapadmam_16.jpg' is a copy of 'Bharatanatyam-Mudra-Dataset\Alapadmam(1)\Alapadmam_154.jpg'
  DUPLICATE: 'Bharatanatyam-Mudra-Dataset\Alapadmam(1)\Alapadmam_160.jpg' is a copy of 'Bharatanatyam-Mudra-Dataset\Alapadmam(1)\Alapadmam_152.jpg'
  DUPLICATE: 'Bharatanatyam-Mudra-Dataset\Alapadmam(1)\Alapadmam_166.jpg' is a copy of 'Bharatanatyam-Mudra-Dataset\Alapadmam(1)\Alapadmam_152.jpg'
  DUPLICATE: 'Bharatanatyam-Mudra-Dataset\Alapadmam(1)

In [10]:
import splitfolders

# Source: the single combined dataset to be split.
input_folder = "dataset_foundational_29"

# Destination: where the split copy of the dataset is written.
output_folder = "dataset_split"

# 80% train / 10% validation / 10% test.
# split-folders takes no 'stratify' argument: per its documentation it splits
# each class folder on its own, so every class is divided in the same ratio
# and a file lands in exactly one of the three subsets.
splitfolders.ratio(
    input_folder,
    ratio=(0.8, 0.1, 0.1),  # train, validation, test
    seed=42,  # fixed seed: the same split on every run
    output=output_folder,
)

print("Dataset has been split successfully!")

Copying files: 11861 files [00:05, 2061.51 files/s]

Dataset has been split successfully!





In [12]:
import os

def find_overlap(train_dir, test_dir):
    """Report file names that occur in both the train and the test tree.

    Only the bare file name is compared; the sub-folder a file sits in is
    ignored, so the same name under two different class folders also counts.

    Args:
        train_dir (str): Root folder of the training split.
        test_dir (str): Root folder of the test split.

    Returns:
        list[str]: One entry per test file whose name also exists somewhere
        under ``train_dir``, in sorted walk order. Empty when nothing overlaps.

    Raises:
        FileNotFoundError: If either folder does not exist. os.walk yields
            nothing for a missing folder, which would otherwise look exactly
            like "no overlap".
    """
    for label, directory in (("train", train_dir), ("test", test_dir)):
        if not os.path.isdir(directory):
            raise FileNotFoundError(f"{label} directory not found: {directory!r}")

    train_files = {
        name
        for _, _, files in os.walk(train_dir)
        for name in files
    }

    overlaps = []
    for _, dirs, files in os.walk(test_dir):
        dirs.sort()  # stable traversal order, independent of the filesystem
        for f in sorted(files):
            if f in train_files:
                print("Exact filename overlap:", f)
                overlaps.append(f)
    return overlaps
# Check the train split against the test split for shared file names.
find_overlap(
    train_dir="dataset_split/train",
    test_dir="dataset_split/test",
)


In [13]:
from PIL import Image
import os
import hashlib

def img_hash(path):
    """Return an MD5 hex digest identifying the decoded content of an image.

    The image is converted to RGB and the digest is computed over its
    dimensions followed by its raw pixel bytes, so two files hash equal only
    when they decode to the same pixels at the same width and height.

    Args:
        path (str): Path of the image file to hash.

    Returns:
        str: Hexadecimal MD5 digest.

    Raises:
        Whatever ``Image.open`` / ``convert`` raise for a file that cannot be
        read or decoded; nothing is caught here.
    """
    with Image.open(path) as im:
        rgb = im.convert("RGB")
        width, height = rgb.size
        data = rgb.tobytes()
    # Hash the dimensions first: the byte stream alone cannot tell a 2x4 image
    # from a 4x2 image that holds the same values.
    digest = hashlib.md5(f"{width}x{height}:".encode("ascii"))
    digest.update(data)
    return digest.hexdigest()

def find_pixel_duplicates(train_dir, test_dir):
    """Report test images whose content hash matches a training image.

    Every file under ``train_dir`` is hashed with ``img_hash``; every file
    under ``test_dir`` is then hashed the same way and looked up among the
    training hashes. Each match is printed (with up to three of the matching
    training paths) and collected.

    Args:
        train_dir (str): Root folder of the training split.
        test_dir (str): Root folder of the test split.

    Returns:
        list[tuple[str, list[str]]]: One ``(test_path, train_paths)`` pair per
        test file that duplicates training data, in sorted walk order. Empty
        when no duplicates are found.

    Raises:
        FileNotFoundError: If either folder does not exist. os.walk yields
            nothing for a missing folder, which would otherwise look exactly
            like a clean result.
    """
    for label, directory in (("train", train_dir), ("test", test_dir)):
        if not os.path.isdir(directory):
            raise FileNotFoundError(f"{label} directory not found: {directory!r}")

    train_hashes = {}
    for root, dirs, files in os.walk(train_dir):
        dirs.sort()  # stable traversal order, independent of the filesystem
        for f in sorted(files):
            p = os.path.join(root, f)
            try:
                h = img_hash(p)
            except Exception as e:
                # Still best effort, but say so: a file that was never hashed
                # has not been checked for leakage.
                print(f"Could not process {p}: {e}")
                continue
            train_hashes.setdefault(h, []).append(p)

    matches = []
    for root, dirs, files in os.walk(test_dir):
        dirs.sort()
        for f in sorted(files):
            p = os.path.join(root, f)
            try:
                h = img_hash(p)
            except Exception as e:
                print(f"Could not process {p}: {e}")
                continue
            if h in train_hashes:
                print("Pixel duplicate found:", p, "matches", train_hashes[h][:3])
                matches.append((p, list(train_hashes[h])))
    return matches

# Check the test split for images whose content hash matches a training image.
find_pixel_duplicates(
    train_dir="dataset_split/train",
    test_dir="dataset_split/test",
)


In [9]:
import os
import shutil

# --- Configuration ---

# 1. The source of your clean, de-duplicated dataset.
#    Make sure this path is correct.
source_dir = "dataset_clean"

# 2. The name of the new folder for your focused dataset.
target_dir = "dataset_foundational_29"

# 3. The list of the 29 foundational classes to copy.
#    These names should match the folder names in your source_dir.
classes_to_copy = [
    'Pathaka(1)', 'Tripathaka(1)', 'Ardhapathaka(1)', 'Katrimukha(1)',
    'Mayura(1)', 'Ardhachandran(1)', 'Aralam(1)', 'Shukatundam(1)',
    'Mushti(1)', 'Sikharam(1)', 'Kapith(1)', 'Katakamukha_1',
    'Katakamukha_2', 'Katakamukha_3', 'Suchi(1)', 'Chandrakala(1)',
    'Padmakosha(1)', 'Sarpasirsha(1)', 'Mrigasirsha(1)', 'Simhamukham(1)',
    'Kangulam(1)', 'Alapadmam(1)', 'Chaturam(1)', 'Bramaram(1)',
    'Hamsasyam(1)', 'Hamsapaksha(1)', 'Mukulam(1)', 'Tamarachudam(1)',
    'Trishulam(1)'
]

# --- Script Logic ---

# Existing class folders in target_dir are replaced below, so the two
# locations must never be the same folder.
if os.path.abspath(source_dir) == os.path.abspath(target_dir):
    raise ValueError("source_dir and target_dir must be different folders.")

print(f"Creating new dataset at '{target_dir}'...")

# Create the main target directory if it doesn't exist
os.makedirs(target_dir, exist_ok=True)

copied_count = 0
not_found_count = 0

# Loop through the list of classes to copy
for class_name in classes_to_copy:
    source_class_path = os.path.join(source_dir, class_name)
    target_class_path = os.path.join(target_dir, class_name)

    # A class must be a folder in the source directory; anything else is skipped
    if not os.path.isdir(source_class_path):
        print(f"  WARNING: Class '{class_name}' not found in source directory.")
        not_found_count += 1
        continue

    # shutil.copytree refuses to write into an existing folder, which made a
    # second run of this cell fail. Dropping the previous copy first keeps the
    # cell re-runnable and the result identical to a fresh run (no stale files).
    if os.path.isdir(target_class_path):
        shutil.rmtree(target_class_path)

    print(f"  Copying '{class_name}'...")
    # Use shutil.copytree to copy the entire folder
    shutil.copytree(source_class_path, target_class_path)
    copied_count += 1

print("\n--- Process Complete ---")
print(f"Successfully copied {copied_count} class folders.")
if not_found_count > 0:
    print(f"Skipped {not_found_count} classes that were not found.")
print(f"Your new, focused dataset is ready in the '{target_dir}' folder.")

Creating new dataset at 'dataset_foundational_29'...
  Copying 'Pathaka(1)'...
  Copying 'Tripathaka(1)'...
  Copying 'Ardhapathaka(1)'...
  Copying 'Katrimukha(1)'...
  Copying 'Mayura(1)'...
  Copying 'Ardhachandran(1)'...
  Copying 'Aralam(1)'...
  Copying 'Shukatundam(1)'...
  Copying 'Mushti(1)'...
  Copying 'Sikharam(1)'...
  Copying 'Kapith(1)'...
  Copying 'Katakamukha_1'...
  Copying 'Katakamukha_2'...
  Copying 'Katakamukha_3'...
  Copying 'Suchi(1)'...
  Copying 'Chandrakala(1)'...
  Copying 'Padmakosha(1)'...
  Copying 'Sarpasirsha(1)'...
  Copying 'Mrigasirsha(1)'...
  Copying 'Simhamukham(1)'...
  Copying 'Kangulam(1)'...
  Copying 'Alapadmam(1)'...
  Copying 'Chaturam(1)'...
  Copying 'Bramaram(1)'...
  Copying 'Hamsasyam(1)'...
  Copying 'Hamsapaksha(1)'...
  Copying 'Mukulam(1)'...
  Copying 'Tamarachudam(1)'...
  Copying 'Trishulam(1)'...

--- Process Complete ---
Successfully copied 29 class folders.
Your new, focused dataset is ready in the 'dataset_foundational_29'