In [1]:
import pandas as pd
import os

In [4]:
UECFOOD100_folder_path = r"D:\Projects\CDS\Capstone Project\UECFOOD100"
uecfood100 = pd.read_pickle(os.path.join(UECFOOD100_folder_path, 'images.pkl'))
uecfood100

Unnamed: 0,ingredient_category,name,image_bytes,x1,y1,x2,y2
0,1,rice,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,0,143,370,486
1,1,rice,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x02\x00...,71,16,478,328
2,1,rice,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,0,49,360,420
3,1,rice,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,19,69,355,372
4,1,rice,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,0,4,91,246
...,...,...,...,...,...,...,...
14356,100,goya chanpuru,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,39,58,328,220
14357,100,goya chanpuru,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,4,11,166,92
14358,100,goya chanpuru,b'\xff\xd8\xff\xe1\x03\x88Exif\x00\x00II*\x00\...,0,0,299,175
14359,100,goya chanpuru,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,456,125,800,351


In [7]:
import os
import shutil
import pandas as pd
import cv2
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# -------------- PATHS -------------- #
base_path = r"D:\Projects\CDS\Capstone Project\UECFOOD100"
output_path = os.path.join(base_path, "processed_for_yolo")

train_images_dir = os.path.join(output_path, "train", "images")
train_labels_dir = os.path.join(output_path, "train", "labels")
val_images_dir = os.path.join(output_path, "val", "images")
val_labels_dir = os.path.join(output_path, "val", "labels")

for path in [train_images_dir, train_labels_dir, val_images_dir, val_labels_dir]:
    os.makedirs(path, exist_ok=True)

# -------------- READ CATEGORY MAPPING -------------- #
category_path = os.path.join(base_path, "category.txt")
cat_df = pd.read_csv(category_path, sep='\t')

# Build mapping dictionary
id2name = {}
for _, row in cat_df.iterrows():
    id2name[int(row['id'])] = row['name']

# -------------- GATHER ALL DATA -------------- #
all_data = []

# Each class folder
for class_id in tqdm(sorted(os.listdir(base_path))):
    class_folder = os.path.join(base_path, class_id)
    if not os.path.isdir(class_folder):
        continue
    bb_info_file = os.path.join(class_folder, "bb_info.txt")
    if not os.path.exists(bb_info_file):
        continue

    # Read bounding box info
    bb_df = pd.read_csv(bb_info_file, sep=' ')

    for _, row in bb_df.iterrows():
        image_filename = f"{int(row['img'])}.jpg"
        image_path = os.path.join(class_folder, image_filename)

        if not os.path.exists(image_path):
            continue  # Some missing images maybe, be safe

        all_data.append({
            "image_path": image_path,
            "class_id": int(class_id),
            "x1": row['x1'],
            "y1": row['y1'],
            "x2": row['x2'],
            "y2": row['y2']
        })

# To dataframe
all_df = pd.DataFrame(all_data)
print(f"Total images found: {len(all_df)}")

# -------------- SPLIT INTO TRAIN & VAL -------------- #
train_df, val_df = train_test_split(all_df, test_size=0.2, random_state=42, stratify=all_df['class_id'])

# -------------- HELPER: Convert bbox to YOLO format -------------- #
def convert_bbox(size, box):
    dw = 1.0 / size[0]
    dh = 1.0 / size[1]
    x_center = (box[0] + box[2]) / 2.0
    y_center = (box[1] + box[3]) / 2.0
    w = box[2] - box[0]
    h = box[3] - box[1]
    return (x_center * dw, y_center * dh, w * dw, h * dh)

# -------------- COPY IMAGES AND CREATE LABEL FILES -------------- #
def process_split(df_split, split):
    for idx, row in tqdm(df_split.iterrows(), total=len(df_split), desc=f"Processing {split}"):
        img = cv2.imread(row['image_path'])
        h, w = img.shape[:2]

        # Copy image
        img_filename = os.path.basename(row['image_path'])
        dest_img_path = os.path.join(output_path, split, "images", img_filename)
        shutil.copy(row['image_path'], dest_img_path)

        # Create label file
        yolo_bbox = convert_bbox((w, h), (row['x1'], row['y1'], row['x2'], row['y2']))
        label_filename = img_filename.replace(".jpg", ".txt")
        dest_label_path = os.path.join(output_path, split, "labels", label_filename)

        with open(dest_label_path, 'w') as f:
            f.write(f"{row['class_id'] - 1} " + " ".join([f"{coord:.6f}" for coord in yolo_bbox]) + "\n")

process_split(train_df, "train")
process_split(val_df, "val")

# -------------- CREATE YAML FILE FOR YOLOv8 -------------- #
names_list = [id2name[i] for i in sorted(id2name.keys())]

yaml_content = f"""
path: {output_path}
train: train/images
val: val/images

names: {names_list}
"""

with open(os.path.join(output_path, 'dataset.yaml'), 'w') as f:
    f.write(yaml_content)

print("✅ Dataset ready for YOLOv8 training!")


100%|████████████████████████████████████████████████████████████████████████████████| 112/112 [00:01<00:00, 59.30it/s]


Total images found: 14611


Processing train: 100%|█████████████████████████████████████████████████████████| 11688/11688 [01:54<00:00, 101.66it/s]
Processing val: 100%|█████████████████████████████████████████████████████████████| 2923/2923 [00:28<00:00, 103.40it/s]

✅ Dataset ready for YOLOv8 training!



