In [1]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
from collections import defaultdict

# Defining all paths
base_dir = '/content/drive/My Drive/Colab Notebooks/food_items'
splits = ['train', 'val', 'test']
split_stats = {}

# Loop through each split
for split in splits:
    split_path = os.path.join(base_dir, split)
    class_counts = defaultdict(int)
    total_images = 0
    class_names = []

    for class_name in os.listdir(split_path):
        class_path = os.path.join(split_path, class_name)
        if os.path.isdir(class_path):
            image_files = [
                f for f in os.listdir(class_path)
                if f.lower().endswith(('.jpg', '.jpeg', '.png'))
            ]
            count = len(image_files)
            class_counts[class_name] = count
            total_images += count
            class_names.append(class_name)

    split_stats[split] = {
        'total_images': total_images,
        'num_classes': len(class_counts),
        'class_counts': class_counts,
        'class_names': sorted(class_names)
    }

# Checking class consistency across all splits
train_classes = set(split_stats['train']['class_names'])
val_classes = set(split_stats['val']['class_names'])
test_classes = set(split_stats['test']['class_names'])

print("Split Summary:")
for split in splits:
    print(f"{split.upper()} — {split_stats[split]['num_classes']} classes, {split_stats[split]['total_images']} images")

print("\n Class Matching Check:")
print("Classes in all splits identical? ", train_classes == val_classes == test_classes)
print("Classes only in train:", train_classes - val_classes - test_classes)
print("Classes only in val:", val_classes - train_classes - test_classes)
print("Classes only in test:", test_classes - train_classes - val_classes)

Split Summary:
TRAIN — 101 classes, 70705 images
VAL — 101 classes, 15150 images
TEST — 101 classes, 15154 images

 Class Matching Check:
Classes in all splits identical?  True
Classes only in train: set()
Classes only in val: set()
Classes only in test: set()


In [None]:
import os
from collections import defaultdict

# Using the confirmed train directory
train_dir = '/content/drive/My Drive/Colab Notebooks/food_items/train'

# Counting the number of images per class
class_counts = defaultdict(int)

for class_name in os.listdir(train_dir):
    class_path = os.path.join(train_dir, class_name)
    if os.path.isdir(class_path):
        image_files = [f for f in os.listdir(class_path) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
        class_counts[class_name] = len(image_files)

# Sorting the image count (descending)
sorted_classes = sorted(class_counts.items(), key=lambda x: x[1], reverse=True)


print("Top 20 food classes by image count:")
for name, count in sorted_classes[:20]:
    print(f"{name}: {count} images")

Top 20 food classes by image count:
hamburger: 704 images
huevos_rancheros: 701 images
baby_back_ribs: 700 images
beef_tartare: 700 images
baklava: 700 images
beef_carpaccio: 700 images
apple_pie: 700 images
beet_salad: 700 images
bibimbap: 700 images
beignets: 700 images
breakfast_burrito: 700 images
caprese_salad: 700 images
bread_pudding: 700 images
cannoli: 700 images
carrot_cake: 700 images
ceviche: 700 images
caesar_salad: 700 images
bruschetta: 700 images
chicken_wings: 700 images
chicken_curry: 700 images


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import shutil
from pathlib import Path
from google.colab import drive

# Step 1: Mount Google Drive
drive.mount('/content/drive')

# Step 2: Define source and target directories
original_base = Path("/content/drive/MyDrive/food_subset")
target_dir = Path("/content/drive/MyDrive/food_subset_shared_final")

# Step 3: Define selected categories
categories = {
    "main_course": ["chicken_curry", "hamburger", "fried_rice", "caesar_salad"],
    "dessert": ["apple_pie", "ice_cream", "carrot_cake", "baklava"]
}

# Step 4: Limit number of images per class
MAX_IMAGES = 100

# Step 5: Copy files to new structure
for split in ["train", "val", "test"]:
    src_split_path = original_base / split
    dst_split_path = target_dir / split

    for label, foods in categories.items():
        for food in foods:
            src = src_split_path / food
            dst = dst_split_path / label
            dst.mkdir(parents=True, exist_ok=True)
            if src.exists():
                files = sorted(os.listdir(src))[:MAX_IMAGES]
                for file in files:
                    full_file = src / file
                    if full_file.is_file():
                        shutil.copy(full_file, dst)

# Step 6: summary of what was saved
subset_summary = {
    split: {
        label: len(os.listdir(target_dir / split / label))
        for label in os.listdir(target_dir / split)
    }
    for split in ["train", "val", "test"]
}

subset_summary

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


{'train': {'main_course': 400, 'dessert': 200},
 'val': {'main_course': 400, 'dessert': 200},
 'test': {'main_course': 400, 'dessert': 200}}

In [6]:
import os
import pandas as pd

base_dir = "/content/drive/MyDrive/food_subset_shared_final"
split_counts = {}

for split in ["train", "val", "test"]:
    total = 0
    split_path = os.path.join(base_dir, split)
    if not os.path.exists(split_path):
        print(f"Folder not found: {split_path}")
        continue
    for category in os.listdir(split_path):
        class_dir = os.path.join(split_path, category)
        if os.path.isdir(class_dir):
            total += len(os.listdir(class_dir))
    split_counts[split] = total

total_images = sum(split_counts.values())

if total_images == 0:
    print(" No images found in any split. Please check the folder structure.")
else:
    for split, count in split_counts.items():
        ratio = round((count / total_images) * 100, 2)
        print(f"{split.upper():<5} → {count} images ({ratio}%)")

TRAIN → 600 images (33.33%)
VAL   → 600 images (33.33%)
TEST  → 600 images (33.33%)


In [7]:
import os
from PIL import Image
import pandas as pd

# Set your dataset base path
base_dir = "/content/drive/MyDrive/food_subset_shared_final"

# To collect invalid image info
corrupt_images = []

# Loop through each split folder
for split in ["train", "val", "test"]:
    split_path = os.path.join(base_dir, split)
    if os.path.exists(split_path):
        for category in os.listdir(split_path):
            category_path = os.path.join(split_path, category)
            if os.path.isdir(category_path):
                for fname in os.listdir(category_path):
                    file_path = os.path.join(category_path, fname)
                    try:
                        with Image.open(file_path) as img:
                            img.verify()  # Check for file corruption
                    except Exception as e:
                        corrupt_images.append({
                            "Split": split,
                            "Category": category,
                            "File": fname,
                            "Error": str(e)
                        })

# Creating summary table
df_corrupt = pd.DataFrame(corrupt_images)

#results
if df_corrupt.empty:
    print(" All images are valid. No corrupt images found.")
else:
    print(" Found corrupt images:")
    print(df_corrupt)

✅ All images are valid. No corrupt images found.


In [8]:
import os
import shutil
import random
from tqdm import tqdm

# Step 1: Set paths
original_data_dir = "/content/drive/MyDrive/food_subset_shared_final/train"
resplit_dir = "/content/drive/MyDrive/food_subset_resplit_70_10_10"


# Step 3: Clean old folder if exists
if os.path.exists(resplit_dir):
    shutil.rmtree(resplit_dir)

# Step 4: Create folder structure
splits = ["train", "val", "test"]
categories = ["main_course", "dessert"]

for split in splits:
    for category in categories:
        os.makedirs(os.path.join(resplit_dir, split, category), exist_ok=True)

# Step 5: Perform splitting
for category in tqdm(categories, desc="Processing categories"):
    category_path = os.path.join(original_data_dir, category)
    files = [f for f in os.listdir(category_path) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
    random.shuffle(files)

    total = len(files)
    train_end = int(0.7 * total)
    val_end = train_end + int(0.1 * total)

    split_map = {
        "train": files[:train_end],
        "val": files[train_end:val_end],
        "test": files[val_end:]
    }

    for split, split_files in split_map.items():
        for f in split_files:
            src = os.path.join(category_path, f)
            dst = os.path.join(resplit_dir, split, category, f)
            shutil.copy2(src, dst)

print(" Done: Images re-split into 70% train, 10% val, 10% test.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Processing categories: 100%|██████████| 2/2 [00:06<00:00,  3.40s/it]

✅ Done: Images re-split into 70% train, 10% val, 10% test.





In [13]:

resplit_dir = "/content/drive/MyDrive/food_subset_resplit_70_10_10"

split_counts = {}
total_images = 0

for split in ["train", "val", "test"]:
    count = 0
    split_path = os.path.join(resplit_dir, split)
    if not os.path.exists(split_path):
        print(f"Folder not found: {split_path}")
        continue
    for category in os.listdir(split_path):
        category_path = os.path.join(split_path, category)
        count += len([img for img in os.listdir(category_path) if img.lower().endswith(('.jpg', '.jpeg', '.png'))])
    split_counts[split] = count
    total_images += count

#summary
print("Final Image Split Summary:")
for split, count in split_counts.items():
    ratio = round((count / total_images) * 100, 2) if total_images else 0
    print(f"{split.upper():<5} → {count} images ({ratio}%)")

Final Image Split Summary:
TRAIN → 420 images (70.0%)
VAL   → 60 images (10.0%)
TEST  → 120 images (20.0%)
