In [None]:
import os
import json
import shutil
import pandas as pd
from PIL import Image
from tqdm import tqdm
from collections import defaultdict
import numpy as np





# --- Input locations (Kaggle dataset mount) ---
DATASET_PATH = "/kaggle/input/datasets/kagglertw/dal-shemagh/dal-shemagh-detection-challenge"
IMAGES_DIR, LABELS_DIR, CSV_PATH = (
    os.path.join(DATASET_PATH, relative)
    for relative in ("images/train", "labels/train", "train_labels.csv")
)

# --- Output locations (one folder per split under the working directory) ---
OUTPUT_DIR = "/kaggle/working/data"
TRAIN_DIR, VAL_DIR = (os.path.join(OUTPUT_DIR, split) for split in ("train", "val"))

# Create both split folders up front; exist_ok keeps the cell re-runnable.
for split_dir in (TRAIN_DIR, VAL_DIR):
    os.makedirs(split_dir, exist_ok=True)





# Load the per-image CSV and build a filename -> right_place lookup table.
df = pd.read_csv(CSV_PATH)
right_place_dict = {
    name: flag for name, flag in zip(df["filename"], df["right_place"])
}

# Echo what was loaded and where the images/labels will be read from.
print(
    f"CSV loaded: {len(df)} entries",
    f"Images directory: {IMAGES_DIR}",
    f"Labels directory: {LABELS_DIR}",
    sep="\n",
)





# List the training images in sorted order so every later step sees them in
# the same sequence from run to run.
image_files = sorted([f for f in os.listdir(IMAGES_DIR) if f.endswith(".jpg")])
print(f"\nFound {len(image_files)} images\n")

# One record per image: the CSV flag, which classes appear in its label file,
# how many boxes it has, and the key used to stratify the train/val split.
image_info = []

for img_file in tqdm(image_files, desc="Analyzing images"):
    # Swap only the extension. str.replace(".jpg", ".txt") would rewrite every
    # ".jpg" occurrence inside the name, not just the trailing one.
    label_path = os.path.join(LABELS_DIR, os.path.splitext(img_file)[0] + ".txt")

    has_head = False
    has_shemagh = False
    num_objects = 0

    # A missing label file is treated as an image without any objects.
    if os.path.exists(label_path):
        with open(label_path, "r") as label_file:
            for line in label_file:
                parts = line.split()
                # Only rows with exactly five fields are counted; anything
                # else (blank or malformed lines) is ignored.
                if len(parts) != 5:
                    continue
                # The class id may be written as "0" or "0.0".
                class_id = int(float(parts[0]))
                num_objects += 1
                if class_id == 0:
                    has_head = True
                elif class_id == 1:
                    has_shemagh = True

    # Images absent from the CSV default to right_place = False.
    right_place = right_place_dict.get(img_file, False)

    # Bucket the object count so rare exact counts do not create tiny strata.
    if num_objects == 0:
        obj_bucket = "none"
    elif num_objects <= 2:
        obj_bucket = "few"
    else:
        obj_bucket = "many"
    strat_key = f"rp{int(right_place)}_h{int(has_head)}_s{int(has_shemagh)}_{obj_bucket}"

    image_info.append({
        "filename": img_file,
        "right_place": right_place,
        "has_head": has_head,
        "has_shemagh": has_shemagh,
        "num_objects": num_objects,
        "strat_key": strat_key
    })






# Tunables for the split, kept in one place instead of inline magic numbers.
VAL_FRACTION = 0.2   # share of each stratum held out for validation
SPLIT_SEED = 42      # seed used to shuffle every stratum

# Group file names by stratification key.
strat_groups = defaultdict(list)
for info in image_info:
    strat_groups[info["strat_key"]].append(info["filename"])

train_files = []
val_files = []

print("\n" + "="*60)
print("STRATIFIED SPLIT DISTRIBUTION:")
print("="*60)

for key, files in sorted(strat_groups.items()):
    n_total = len(files)
    # Hold out at least one image per stratum, except for one-image strata:
    # those stay in train, otherwise the class combination would never be
    # seen during training.
    n_val = max(1, int(n_total * VAL_FRACTION)) if n_total > 1 else 0
    n_train = n_total - n_val

    # A locally seeded legacy generator yields the same permutation as
    # re-seeding the global numpy RNG with the same seed, but leaves the
    # global random state untouched for the rest of the notebook.
    shuffled = np.random.RandomState(SPLIT_SEED).permutation(files).tolist()

    train_files.extend(shuffled[:n_train])
    val_files.extend(shuffled[n_train:])

    print(f"{key:30s} | Total: {n_total:3d} | Train: {n_train:3d} | Val: {n_val:3d}")

# Guard the percentage against an empty image list (avoids ZeroDivisionError).
val_percent = len(val_files) / len(image_files) * 100 if image_files else 0.0

print("="*60)
print(f"Total images: {len(image_files)}")
print(f"Train images: {len(train_files)}")
print(f"Val images: {len(val_files)}")
print(f"Split ratio: {val_percent:.1f}% validation")
print("="*60 + "\n")





# COCO category table: the id of each entry is its position in this tuple,
# matching the class ids used in the YOLO label files (0 = head, 1 = shemagh).
categories = [
    {"id": class_index, "name": class_name}
    for class_index, class_name in enumerate(("head", "shemagh"))
]





def create_coco_json(image_list, split_name):
    """Copy one split's images and write its COCO-format annotation file.

    For every file name in ``image_list`` the image is opened from
    ``IMAGES_DIR`` to read its size, copied into the split's output folder,
    and its YOLO label file in ``LABELS_DIR`` (when one exists) is converted
    from normalized ``class cx cy w h`` rows to absolute-pixel COCO boxes
    ``[x_min, y_min, width, height]``.

    Args:
        image_list: Image file names (relative to ``IMAGES_DIR``) belonging
            to this split. The position in the list becomes the image id.
        split_name: ``"train"`` writes into ``TRAIN_DIR``; any other value
            writes into ``VAL_DIR``. Also used in progress/log messages.

    Returns:
        None. Side effects: images are copied into the output folder and
        ``_annotations.coco.json`` is written next to them.

    Raises:
        ValueError: if a five-field label row contains a non-numeric value.
        OSError: if an image cannot be opened or copied.
    """
    coco_output = {
        "images": [],
        "annotations": [],
        "categories": categories,
    }

    # Annotation ids start at 1: pycocotools' COCOeval records matched ids in
    # zero-initialized arrays and treats 0 as "no match", so a ground-truth
    # box with id 0 would be scored as unmatched.
    annotation_id = 1

    output_img_dir = TRAIN_DIR if split_name == "train" else VAL_DIR
    # Harmless if the folder already exists; makes the function usable on its own.
    os.makedirs(output_img_dir, exist_ok=True)

    for image_id, img_file in enumerate(tqdm(image_list, desc=f"Processing {split_name}")):

        img_path = os.path.join(IMAGES_DIR, img_file)
        # Swap only the extension; str.replace would rewrite every ".jpg"
        # occurrence inside the file name.
        label_path = os.path.join(LABELS_DIR, os.path.splitext(img_file)[0] + ".txt")

        # Only the size is needed, so the file is closed again right away.
        with Image.open(img_path) as img:
            width, height = img.size

        coco_output["images"].append({
            "id": image_id,
            "file_name": img_file,
            "width": width,
            "height": height,
            # bool() makes the CSV value JSON-serializable; images missing
            # from the CSV default to False.
            "right_place": bool(right_place_dict.get(img_file, False))
        })

        shutil.copy(img_path, os.path.join(output_img_dir, img_file))

        # Images without a label file simply contribute no annotations.
        if not os.path.exists(label_path):
            continue

        with open(label_path, "r") as label_file:
            for line in label_file:
                parts = line.split()

                # Skip blank or malformed rows (anything but five fields).
                if len(parts) != 5:
                    continue

                class_id, x_center, y_center, w, h = map(float, parts)

                # Normalized YOLO values -> absolute pixels.
                x_center *= width
                y_center *= height
                w *= width
                h *= height

                # Box centre -> top-left corner, as COCO expects.
                x_min = x_center - (w / 2)
                y_min = y_center - (h / 2)

                coco_output["annotations"].append({
                    "id": annotation_id,
                    "image_id": image_id,
                    "category_id": int(class_id),
                    "bbox": [x_min, y_min, w, h],
                    "area": w * h,
                    "iscrowd": 0
                })

                annotation_id += 1

    json_path = os.path.join(output_img_dir, "_annotations.coco.json")
    with open(json_path, "w") as f:
        json.dump(coco_output, f, indent=2)

    print(f"✅ {split_name} COCO JSON saved at: {json_path}")
    print(f"   Images: {len(coco_output['images'])}")
    print(f"   Annotations: {len(coco_output['annotations'])}\n")





# Build the train split first, then the validation split.
for split_label, split_images in (("train", train_files), ("val", val_files)):
    create_coco_json(split_images, split_label)





# Banner for the closing summary.
banner_rule = "=" * 60
print(banner_rule)
print("FINAL DIRECTORY STRUCTURE:")
print(banner_rule)

# Static sketch of the output layout, printed as one joined string.
layout_lines = [
    f"\n{OUTPUT_DIR}/",
    "├── train/",
    "│   ├── _annotations.coco.json",
    "│   └── [images]",
    "└── val/",
    "    ├── _annotations.coco.json",
    "    └── [images]\n",
]
print("\n".join(layout_lines))

# Each split folder holds the copied images plus one annotation JSON,
# hence the "- 1" when counting images from the directory listing.
print(f"Train images: {len(os.listdir(TRAIN_DIR)) - 1}")
print(f"Val images: {len(os.listdir(VAL_DIR)) - 1}")

print("\n✅ Dataset ready for MMDetection training!")
print("\nNote: The original 'test' folder has no labels, so we created")
print("a validation set from the training data (80/20 split).")

CSV loaded: 651 entries
Images directory: /kaggle/input/datasets/justforfun44/dal-shemagh/dal-shemagh-detection-challenge/images/train
Labels directory: /kaggle/input/datasets/justforfun44/dal-shemagh/dal-shemagh-detection-challenge/labels/train

Found 651 images



Analyzing images: 100%|██████████| 651/651 [00:02<00:00, 236.39it/s]



STRATIFIED SPLIT DISTRIBUTION:
rp0_h0_s0_none                 | Total: 203 | Train: 163 | Val:  40
rp0_h0_s1_few                  | Total:  98 | Train:  79 | Val:  19
rp0_h0_s1_many                 | Total:   2 | Train:   1 | Val:   1
rp0_h1_s0_few                  | Total: 281 | Train: 225 | Val:  56
rp0_h1_s0_many                 | Total:  13 | Train:  11 | Val:   2
rp0_h1_s1_few                  | Total:  31 | Train:  25 | Val:   6
rp0_h1_s1_many                 | Total:  18 | Train:  15 | Val:   3
rp1_h1_s1_few                  | Total:   3 | Train:   2 | Val:   1
rp1_h1_s1_many                 | Total:   2 | Train:   1 | Val:   1
Total images: 651
Train images: 522
Val images: 129
Split ratio: 19.8% validation



Processing train: 100%|██████████| 522/522 [00:04<00:00, 116.97it/s]


✅ train COCO JSON saved at: /kaggle/working/data/train/_annotations.coco.json
   Images: 522
   Annotations: 489



Processing val: 100%|██████████| 129/129 [00:01<00:00, 91.26it/s]

✅ val COCO JSON saved at: /kaggle/working/data/val/_annotations.coco.json
   Images: 129
   Annotations: 122

FINAL DIRECTORY STRUCTURE:

/kaggle/working/data/
├── train/
│   ├── _annotations.coco.json
│   └── [images]
└── val/
    ├── _annotations.coco.json
    └── [images]

Train images: 522
Val images: 129

✅ Dataset ready for MMDetection training!

Note: The original 'test' folder has no labels, so we created
a validation set from the training data (80/20 split).





In [2]:
!mkdir /kaggle/working/data/test

In [None]:
# Copy everything under the dataset's images/test folder into the working data/test folder.
!cp -r /kaggle/input/datasets/kagglertw/dal-shemagh/dal-shemagh-detection-challenge/images/test/* /kaggle/working/data/test

In [4]:
def count_png_files(folder_path, extension=".jpg"):
    """Count the entries in ``folder_path`` whose name ends with ``extension``.

    Despite its name, this helper has always counted ``.jpg`` files; the name
    is kept so existing calls keep working, and the extension is now a
    parameter so other file types can be counted too.

    Args:
        folder_path: Directory to scan (not recursive).
        extension: File-name suffix to match, compared case-insensitively.
            Defaults to ``".jpg"``.

    Returns:
        Number of directory entries whose name ends with ``extension``.
    """
    suffix = extension.lower()
    return sum(1 for entry in os.listdir(folder_path) if entry.lower().endswith(suffix))



In [5]:
# Sanity check: number of .jpg files in the test folder (the helper matches ".jpg" despite its name).
print(count_png_files("/kaggle/working/data/test"))

842


In [6]:
# Sanity check: number of .jpg files in the train split folder.
print(count_png_files("/kaggle/working/data/train"))

522


In [7]:
# Sanity check: number of .jpg files in the val split folder.
print(count_png_files("/kaggle/working/data/val"))

129
