## Import and path Definitions


In [17]:
import os
import glob
import random
import shutil

img_path = 'images'
ann_path = 'annotations'
output_path = 'dataset'



### Create Train/Val/Test Folder Structure

In [18]:
for split in ['train', 'val', 'test']:
    os.makedirs(os.path.join(output_path, split, 'images'), exist_ok=True)
    os.makedirs(os.path.join(output_path, split, 'annotations'), exist_ok=True)

print("Folder structure created successfully.")

Folder structure created successfully.


### Step 3: List & Shuffle Annotation Files

In [19]:
ann_files = glob.glob(os.path.join(ann_path, '*.xml'))
random.seed(42)  # For reproducibility
random.shuffle(ann_files)

num_files = len(ann_files)
print(f"Total annotation files: {num_files}")

Total annotation files: 853


### Step 4: Compute Split Indices

In [20]:
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

n_train=(int(num_files * train_ratio))
n_val=int(num_files * val_ratio)

train_files = ann_files[:n_train]
val_files = ann_files[n_train:n_train + n_val]
test_files = ann_files[n_train + n_val:]

print(f"Train files: {len(train_files)}, Val files: {len(val_files)}, Test files: {len(test_files)}")

Train files: 597, Val files: 127, Test files: 129


In [21]:
def copypath(ann_list,split_list):
    for ann_file in ann_list:
        fname=os.path.splitext(os.path.basename(ann_file))[0]
        img_file = os.path.join(img_path, fname + '.png')

        shutil.copy2(ann_file,os.path.join(output_path, split_list, 'annotations'))
        shutil.copy2(img_file, os.path.join(output_path, split_list, 'images'))

In [22]:
copypath(train_files, 'train')
copypath(val_files, 'val')
copypath(test_files, 'test')

for split in ['train', 'val', 'test']:
    n_images = len(os.listdir(os.path.join(output_path, split, 'images')))
    n_annotations = len(os.listdir(os.path.join(output_path, split, 'annotations')))
    print(f"{split.capitalize()} - Images: {n_images}, Annotations: {n_annotations}")

Train - Images: 597, Annotations: 597
Val - Images: 127, Annotations: 127
Test - Images: 129, Annotations: 129


In [None]:
import os
import xml.etree.ElementTree as ET

# 1) Adjust these if your folder names differ
BASE_DIR = "dataset"
SPLITS   = ["train", "val", "test"]

# 2) Map exactly what your XML <name> tag contains
CLASS_MAP = {
    "with_mask":      1,
    "without_mask": 0,
    "mask_weared_incorrect":2
}

def xml_to_label(xml_path):
    """Parse one Pascal‑VOC XML and return its class as 0 or 1."""
    tree = ET.parse(xml_path)
    root = tree.getroot()
    # assume exactly one <object> per file
    name = root.find("object/name").text.lower()
    return CLASS_MAP[name]

def collect_paths_and_labels(split_name):
    """Returns two lists: [img_paths], [labels] for that split."""
    img_dir = os.path.join(BASE_DIR, split_name, "images")
    ann_dir = os.path.join(BASE_DIR, split_name, "annotations")
    
    paths, labels = [], []
    for fn in os.listdir(img_dir):
        # only image files
        if not fn.lower().endswith((".png", ".jpg", ".jpeg")):
            continue
        
        img_path = os.path.join(img_dir, fn)
        xml_path = os.path.join(ann_dir, fn.rsplit(".",1)[0] + ".xml")
        if not os.path.exists(xml_path):
            # skip if no matching XML
            continue
        
        # append to lists
        paths.append(img_path)
        labels.append(xml_to_label(xml_path))
    
    return paths, labels

# 3) Run for each split
train_paths, train_labels = collect_paths_and_labels("train")
val_paths,   val_labels   = collect_paths_and_labels("val")
test_paths,  test_labels  = collect_paths_and_labels("test")



# 4) Sanity‐check counts & balance
print(f"▶️ Train: {len(train_paths)} images  (mask={sum(train_labels)}, no_mask={len(train_labels)-sum(train_labels)})")
print(f"▶️ Val:   {len(val_paths)} images   (mask={sum(val_labels)},   no_mask={len(val_labels)-sum(val_labels)})")
print(f"▶️ Test:  {len(test_paths)} images  (mask={sum(test_labels)},  no_mask={len(test_labels)-sum(test_labels)})")


['dataset\\train\\images\\maksssksksss1.png', 'dataset\\train\\images\\maksssksksss10.png', 'dataset\\train\\images\\maksssksksss101.png', 'dataset\\train\\images\\maksssksksss102.png', 'dataset\\train\\images\\maksssksksss105.png']
[1, 1, 1, 1, 1]
▶️ Train: 597 images  (mask=534, no_mask=63)
▶️ Val:   127 images   (mask=112,   no_mask=15)
▶️ Test:  129 images  (mask=124,  no_mask=5)
