In [None]:
import os
import json
import shutil
import pandas as pd
from PIL import Image
from tqdm import tqdm
from collections import defaultdict
import numpy as np





DATASET_PATH = "/kaggle/input/datasets/kagglertw/dal-shemagh/dal-shemagh-detection-challenge"
IMAGES_DIR = os.path.join(DATASET_PATH, "images/train")
LABELS_DIR = os.path.join(DATASET_PATH, "labels/train")
CSV_PATH = os.path.join(DATASET_PATH, "train_labels.csv")

OUTPUT_DIR = "/kaggle/working/data_5fold"
N_FOLDS = 5


os.makedirs(OUTPUT_DIR, exist_ok=True)





df = pd.read_csv(CSV_PATH)
right_place_dict = dict(zip(df["filename"], df["right_place"]))

print(f"CSV loaded: {len(df)} entries")
print(f"Images directory: {IMAGES_DIR}")
print(f"Labels directory: {LABELS_DIR}")





image_files = sorted([f for f in os.listdir(IMAGES_DIR) if f.endswith(".jpg")])
print(f"\nFound {len(image_files)} images\n")

image_info = []

for img_file in tqdm(image_files, desc="Analyzing images"):
    label_path = os.path.join(LABELS_DIR, img_file.replace(".jpg", ".txt"))
    
    has_head = False
    has_shemagh = False
    num_objects = 0
    
    if os.path.exists(label_path):
        with open(label_path, "r") as f:
            lines = f.readlines()
        
        for line in lines:
            parts = line.strip().split()
            if len(parts) == 5:
                class_id = int(float(parts[0]))
                num_objects += 1
                if class_id == 0:
                    has_head = True
                elif class_id == 1:
                    has_shemagh = True
    
    right_place = right_place_dict.get(img_file, False)
    
    
    obj_bucket = "none" if num_objects == 0 else ("few" if num_objects <= 2 else "many")
    strat_key = f"rp{int(right_place)}_h{int(has_head)}_s{int(has_shemagh)}_{obj_bucket}"
    
    image_info.append({
        "filename": img_file,
        "right_place": right_place,
        "has_head": has_head,
        "has_shemagh": has_shemagh,
        "num_objects": num_objects,
        "strat_key": strat_key
    })






strat_groups = defaultdict(list)
for info in image_info:
    strat_groups[info["strat_key"]].append(info["filename"])


fold_assignments = {}

print("\n" + "="*80)
print("STRATIFIED 5-FOLD DISTRIBUTION:")
print("="*80)
print(f"{'Stratification Key':<35} | {'Total':>5} | Fold Distribution")
print("-"*80)

np.random.seed(42)

for key, files in sorted(strat_groups.items()):
    n_total = len(files)
    
    
    shuffled = np.random.permutation(files).tolist()
    
    
    fold_counts = [0] * N_FOLDS
    fold_dist = []
    
    for idx, file in enumerate(shuffled):
        fold_id = idx % N_FOLDS
        fold_assignments[file] = fold_id
        fold_counts[fold_id] += 1
    
    fold_dist_str = " | ".join([f"F{i}:{fold_counts[i]}" for i in range(N_FOLDS)])
    print(f"{key:<35} | {n_total:>5} | {fold_dist_str}")

print("="*80 + "\n")


fold_totals = [0] * N_FOLDS
for file, fold_id in fold_assignments.items():
    fold_totals[fold_id] += 1

print("FOLD SIZES:")
for i in range(N_FOLDS):
    print(f"  Fold {i}: {fold_totals[i]} images")
print()





categories = [
    {"id": 0, "name": "head"},
    {"id": 1, "name": "shemagh"},
]





def create_coco_json(image_list, output_dir, split_name):
    
    coco_output = {
        "images": [],
        "annotations": [],
        "categories": categories,
    }
    
    annotation_id = 0
    
    for image_id, img_file in enumerate(tqdm(image_list, desc=f"  {split_name}", leave=False)):
        
        img_path = os.path.join(IMAGES_DIR, img_file)
        label_path = os.path.join(LABELS_DIR, img_file.replace(".jpg", ".txt"))
        
        
        with Image.open(img_path) as img:
            width, height = img.size
        
        
        coco_output["images"].append({
            "id": image_id,
            "file_name": img_file,
            "width": width,
            "height": height,
            "right_place": bool(right_place_dict.get(img_file, False))
        })
        
        
        shutil.copy(img_path, os.path.join(output_dir, img_file))
        
        
        if os.path.exists(label_path):
            with open(label_path, "r") as f:
                lines = f.readlines()
            
            for line in lines:
                parts = line.strip().split()
                
                if len(parts) != 5:
                    continue
                
                class_id, x_center, y_center, w, h = map(float, parts)
                
                
                x_center *= width
                y_center *= height
                w *= width
                h *= height
                
                x_min = x_center - (w / 2)
                y_min = y_center - (h / 2)
                
                coco_output["annotations"].append({
                    "id": annotation_id,
                    "image_id": image_id,
                    "category_id": int(class_id),
                    "bbox": [x_min, y_min, w, h],
                    "area": w * h,
                    "iscrowd": 0
                })
                
                annotation_id += 1
    
    
    json_path = os.path.join(output_dir, "_annotations.coco.json")
    with open(json_path, "w") as f:
        json.dump(coco_output, f, indent=2)
    
    return len(coco_output['images']), len(coco_output['annotations'])





print("="*80)
print("CREATING 5-FOLD CROSS-VALIDATION SPLITS")
print("="*80 + "\n")

for fold_id in range(N_FOLDS):
    print(f"Processing Fold {fold_id}...")
    
    
    fold_dir = os.path.join(OUTPUT_DIR, f"fold_{fold_id}")
    train_dir = os.path.join(fold_dir, "train")
    val_dir = os.path.join(fold_dir, "val")
    
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)
    
    
    train_files = []
    val_files = []
    
    for file, assigned_fold in fold_assignments.items():
        if assigned_fold == fold_id:
            val_files.append(file)
        else:
            train_files.append(file)
    
    
    n_train_imgs, n_train_anns = create_coco_json(train_files, train_dir, "train")
    n_val_imgs, n_val_anns = create_coco_json(val_files, val_dir, "val")
    
    print(f"  ✅ Fold {fold_id}: Train={n_train_imgs} imgs ({n_train_anns} anns) | "
          f"Val={n_val_imgs} imgs ({n_val_anns} anns)")
    print()





print("Copying test data...")
test_dir = os.path.join(OUTPUT_DIR, "test")
os.makedirs(test_dir, exist_ok=True)

test_src = os.path.join(DATASET_PATH, "images/test")
if os.path.exists(test_src):
    for file in tqdm(os.listdir(test_src), desc="  test images"):
        if file.lower().endswith(".jpg"):
            shutil.copy(os.path.join(test_src, file), os.path.join(test_dir, file))
    n_test = len([f for f in os.listdir(test_dir) if f.endswith(".jpg")])
    print(f"  ✅ Test: {n_test} images\n")





print("=" * 80)
print("FINAL DIRECTORY STRUCTURE:")
print("=" * 80)
print(f"\n{OUTPUT_DIR}/")
print(f"├── fold_0/")
print(f"│   ├── train/")
print(f"│   │   ├── _annotations.coco.json")
print(f"│   │   └── [images]")
print(f"│   └── val/")
print(f"│       ├── _annotations.coco.json")
print(f"│       └── [images]")
print(f"├── fold_1/")
print(f"│   ├── ...")
print(f"├── fold_2/")
print(f"│   ├── ...")
print(f"├── fold_3/")
print(f"│   ├── ...")
print(f"├── fold_4/")
print(f"│   ├── ...")
print(f"└── test/")
print(f"    └── [images]\n")

print("="*80)
print("SUMMARY")
print("="*80)
for fold_id in range(N_FOLDS):
    fold_dir = os.path.join(OUTPUT_DIR, f"fold_{fold_id}")
    train_dir = os.path.join(fold_dir, "train")
    val_dir = os.path.join(fold_dir, "val")
    
    n_train = len([f for f in os.listdir(train_dir) if f.endswith(".jpg")])
    n_val = len([f for f in os.listdir(val_dir) if f.endswith(".jpg")])
    
    print(f"Fold {fold_id}: Train={n_train:3d} | Val={n_val:3d} | Total={n_train+n_val}")

print("="*80)
print("\n✅ 5-Fold Cross-Validation dataset ready for MMDetection training!")
print(f"\nTo train on a specific fold, use the paths:")
print(f"  Train: {OUTPUT_DIR}/fold_X/train")
print(f"  Val:   {OUTPUT_DIR}/fold_X/val")

CSV loaded: 651 entries
Images directory: /kaggle/input/datasets/justforfun44/dal-shemagh/dal-shemagh-detection-challenge/images/train
Labels directory: /kaggle/input/datasets/justforfun44/dal-shemagh/dal-shemagh-detection-challenge/labels/train

Found 651 images



Analyzing images: 100%|██████████| 651/651 [00:05<00:00, 122.41it/s]



STRATIFIED 5-FOLD DISTRIBUTION:
Stratification Key                  | Total | Fold Distribution
--------------------------------------------------------------------------------
rp0_h0_s0_none                      |   203 | F0:41 | F1:41 | F2:41 | F3:40 | F4:40
rp0_h0_s1_few                       |    98 | F0:20 | F1:20 | F2:20 | F3:19 | F4:19
rp0_h0_s1_many                      |     2 | F0:1 | F1:1 | F2:0 | F3:0 | F4:0
rp0_h1_s0_few                       |   281 | F0:57 | F1:56 | F2:56 | F3:56 | F4:56
rp0_h1_s0_many                      |    13 | F0:3 | F1:3 | F2:3 | F3:2 | F4:2
rp0_h1_s1_few                       |    31 | F0:7 | F1:6 | F2:6 | F3:6 | F4:6
rp0_h1_s1_many                      |    18 | F0:4 | F1:4 | F2:4 | F3:3 | F4:3
rp1_h1_s1_few                       |     3 | F0:1 | F1:1 | F2:1 | F3:0 | F4:0
rp1_h1_s1_many                      |     2 | F0:1 | F1:1 | F2:0 | F3:0 | F4:0

FOLD SIZES:
  Fold 0: 135 images
  Fold 1: 133 images
  Fold 2: 131 images
  Fold 3: 126 images

                                                        

  ✅ Fold 0: Train=516 imgs (478 anns) | Val=135 imgs (133 anns)

Processing Fold 1...


                                                         

  ✅ Fold 1: Train=518 imgs (479 anns) | Val=133 imgs (132 anns)

Processing Fold 2...


                                                         

  ✅ Fold 2: Train=520 imgs (488 anns) | Val=131 imgs (123 anns)

Processing Fold 3...


                                                        

  ✅ Fold 3: Train=525 imgs (500 anns) | Val=126 imgs (111 anns)

Processing Fold 4...


                                                        

  ✅ Fold 4: Train=525 imgs (499 anns) | Val=126 imgs (112 anns)

Copying test data...


  test images: 100%|██████████| 842/842 [00:08<00:00, 96.10it/s]

  ✅ Test: 842 images

FINAL DIRECTORY STRUCTURE:

/kaggle/working/data_5fold/
├── fold_0/
│   ├── train/
│   │   ├── _annotations.coco.json
│   │   └── [images]
│   └── val/
│       ├── _annotations.coco.json
│       └── [images]
├── fold_1/
│   ├── ...
├── fold_2/
│   ├── ...
├── fold_3/
│   ├── ...
├── fold_4/
│   ├── ...
└── test/
    └── [images]

SUMMARY
Fold 0: Train=516 | Val=135 | Total=651
Fold 1: Train=518 | Val=133 | Total=651
Fold 2: Train=520 | Val=131 | Total=651
Fold 3: Train=525 | Val=126 | Total=651
Fold 4: Train=525 | Val=126 | Total=651

✅ 5-Fold Cross-Validation dataset ready for MMDetection training!

To train on a specific fold, use the paths:
  Train: /kaggle/working/data_5fold/fold_X/train
  Val:   /kaggle/working/data_5fold/fold_X/val





In [2]:
!mkdir /kaggle/working/data/test

mkdir: cannot create directory ‘/kaggle/working/data/test’: No such file or directory


In [None]:
!cp -r /kaggle/input/datasets/kagglertw/dal-shemagh/dal-shemagh-detection-challenge/images/test/* /kaggle/working/data/test

cp: target '/kaggle/working/data/test' is not a directory


In [4]:
def count_png_files(folder_path):
    count = 0
    for file in os.listdir(folder_path):
        if file.lower().endswith(".jpg"):
            count += 1
    return count



In [5]:
print(count_png_files("/kaggle/working/data/test"))

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/working/data/test'