In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import os
import random
import shutil
import tempfile

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

In [2]:
# Root folder that contains both the raw and the split dataset
SOURCE = './'

# Main dataset: expected to hold one sub-folder per food category
DATASET_PATH = os.path.join(SOURCE, "raw_dataset")

# Split dataset
SPLIT_DATASET_PATH = os.path.join(SOURCE, "split_dataset") # Change it

TRAIN_PATH = os.path.join(SPLIT_DATASET_PATH, 'train')
VAL_PATH = os.path.join(SPLIT_DATASET_PATH, 'val')
TEST_PATH = os.path.join(SPLIT_DATASET_PATH, 'test')

# Food Categories
# os.listdir() returns entries in arbitrary order and also lists plain files,
# so keep only sub-directories and sort them to get a stable category list.
FOOD_LIST = sorted(
    entry for entry in os.listdir(DATASET_PATH)
    if os.path.isdir(os.path.join(DATASET_PATH, entry))
)
NUM_OF_FOOD = len(FOOD_LIST)

print('Food total:', NUM_OF_FOOD)

print('\nFood category:', FOOD_LIST)

Food total: 10

Food category: ['ayam_goreng', 'bakso', 'bubur', 'gado_gado', 'mie_ayam', 'nasi_goreng', 'nasi_padang', 'rawon', 'sate_ayam', 'soto_ayam']


In [13]:
def print_dataset_info(dataset_path):
  """Print the number of entries per category folder plus summary statistics.

  Every sub-directory of ``dataset_path`` is treated as one category and every
  entry inside it is counted as one image. Categories are listed in sorted
  order, followed by the total, lowest, highest and mean count.

  Args:
    dataset_path: Directory that contains one sub-folder per category.

  Returns:
    None. The report is written to stdout.
  """
  # Only directories are categories; calling os.listdir() on a stray file
  # in the dataset root would raise NotADirectoryError.
  food_list = sorted(
    name for name in os.listdir(dataset_path)
    if os.path.isdir(os.path.join(dataset_path, name))
  )

  print('Number of Images')
  image_count = []
  for index, food in enumerate(food_list):
    num_images = len(os.listdir(os.path.join(dataset_path, food)))
    image_count.append(num_images)

    print('{:2}. {:16} : {:5} '.format(index + 1, food, num_images))

  if not image_count:
    # np.min / np.max raise ValueError on an empty array, so report and stop.
    print('\nTotal images: 0')
    return

  image_count = np.array(image_count)

  print('\nTotal images: {}'.format(np.sum(image_count)))
  print('Lowest: {}'.format(np.min(image_count)))
  print('Highest: {}'.format(np.max(image_count)))

  print('\nMean: {:.2f}'.format(np.mean(image_count)))

# Report the per-category counts of the raw dataset folder
print_dataset_info(DATASET_PATH)

Number of Images
 1. ayam_goreng      :   151 
 2. bakso            :   148 
 3. bubur            :   152 
 4. gado_gado        :   148 
 5. mie_ayam         :   151 
 6. nasi_goreng      :   149 
 7. nasi_padang      :   145 
 8. rawon            :   148 
 9. sate_ayam        :   147 
10. soto_ayam        :   150 

Total images: 1489
Lowest: 145
Highest: 152

Mean: 148.90


In [14]:
def reset_split_dataset(clean_data_path, train_path, val_path, test_path, food_list=None):
    """Delete any previous split and recreate empty per-category folders.

    Args:
        clean_data_path: Root folder of the split dataset; removed entirely
            if it exists.
        train_path: Folder that receives one empty sub-folder per category.
        val_path: Same as ``train_path`` for the validation split.
        test_path: Same as ``train_path`` for the test split.
        food_list: Category names to create. Defaults to the module-level
            ``FOOD_LIST`` when omitted.

    Returns:
        None. Prints a confirmation message when done.
    """
    if food_list is None:
        food_list = FOOD_LIST

    # Delete existing
    if os.path.exists(clean_data_path):
        shutil.rmtree(clean_data_path)

    # Create new empty directories
    for split_path in (train_path, val_path, test_path):
        # A split folder that does not live inside clean_data_path survives
        # the rmtree above; clear it too so no stale images remain in it.
        if os.path.exists(split_path):
            shutil.rmtree(split_path)

        for food in food_list:
            os.makedirs(os.path.join(split_path, food), exist_ok=True)

    print('Reset complete.\n')

In [15]:
def resize_rename_save(source_dir, temp, food_name, thresh, image_size=(384, 384)):
  """Convert the images in ``source_dir`` to resized RGB JPEGs inside ``temp``.

  Each successfully converted image is saved as ``<food_name><NNNN>.jpeg``
  where ``NNNN`` is a zero-padded running counter. Files that cannot be
  opened or converted are reported on stdout and skipped.

  Args:
    source_dir: Folder holding the raw images of one category.
    temp: Destination folder; created if it does not exist.
    food_name: Prefix used for the output file names.
    thresh: Maximum number of images to convert, or ``None`` for no limit.
    image_size: Target size handed to ``Image.resize`` (width, height).

  Returns:
    None. Prints the number of converted images.
  """
  # Ensure the destination directory exists
  os.makedirs(temp, exist_ok=True)

  count = 0
  # Sorted so the counter-based output names do not depend on the arbitrary
  # order in which os.listdir() returns entries.
  for filename in sorted(os.listdir(source_dir)):
    # Stop once the requested number of images has been written.
    if thresh is not None and count >= thresh:
      break

    # Construct the full image path
    image_path = os.path.join(source_dir, filename)

    try:
      # Open, convert and resize; the source file is closed on leaving `with`
      with Image.open(image_path) as img:
        resized = img.convert('RGB').resize(image_size)

      # Rename and save
      jpeg_name = food_name + str(count).zfill(4) + ".jpeg"
      image_dest_path = os.path.join(temp, jpeg_name)
      resized.save(image_dest_path, "JPEG")

      count += 1
    except Exception as e:
      # Best effort: a broken or non-image file must not abort the whole run.
      print(f"Skipping file {filename} due to an error: {e}")

  print(f"Processed {count} images.")

In [16]:
def split_dataset(dataset_path, train_path, val_path, test_path, split=(0.8, 0.1, 0.1), seed=None):
    """Randomly move the files of ``dataset_path`` into train/val/test folders.

    Zero-byte files are ignored and stay in ``dataset_path``. The train and
    validation shares are truncated to whole files; the test split receives
    whatever remains, so every non-empty file is moved exactly once.

    Args:
        dataset_path: Folder containing the files to distribute.
        train_path: Destination folder for the training share.
        val_path: Destination folder for the validation share.
        test_path: Destination folder for the remaining files.
        split: ``(train, val, test)`` fractions that must add up to 1.0.
        seed: Optional seed for a private random generator. When ``None`` the
            global ``random`` state is used, as before.

    Raises:
        ValueError: If the fractions in ``split`` do not add up to 1.0.
    """
    train_size, val_size, test_size = split
    if not round(train_size + val_size + test_size, 2) == 1.0:
        raise ValueError("Split values must add up to 1.0")

    # Sorted listing so a seeded shuffle yields the same split on every run;
    # zero-size files are dropped.
    final_list = [
        filename for filename in sorted(os.listdir(dataset_path))
        if os.path.getsize(os.path.join(dataset_path, filename)) > 0
    ]
    total_images = len(final_list)

    # Truncate to whole files. The tiny epsilon keeps float error from
    # dropping an item (0.29 * 100 evaluates to 28.999999999999996).
    epsilon = 1e-9
    train_split = int(train_size * total_images + epsilon)
    val_split = int(val_size * total_images + epsilon)

    # Shuffle the data for random distribution
    if seed is None:
        random.shuffle(final_list)
    else:
        random.Random(seed).shuffle(final_list)

    val_end = train_split + val_split
    assignments = (
        (train_path, final_list[:train_split]),
        (val_path, final_list[train_split:val_end]),
        (test_path, final_list[val_end:]),  # remainder, so nothing is lost
    )

    # Move files to respective directories
    for destination, filenames in assignments:
        os.makedirs(destination, exist_ok=True)
        for filename in filenames:
            shutil.move(os.path.join(dataset_path, filename), os.path.join(destination, filename))

In [17]:
def create_split_dataset(dataset_path, train_path, val_path, test_path, split, tresh):
    """Resize every category of the raw dataset and split it into train/val/test.

    For each name in the module-level ``FOOD_LIST`` the images in
    ``dataset_path/<name>`` are converted into a scratch folder by
    ``resize_rename_save`` and then distributed by ``split_dataset`` into
    ``<train|val|test>_path/<name>``.

    Args:
        dataset_path: Root folder of the raw dataset.
        train_path: Root folder of the training split.
        val_path: Root folder of the validation split.
        test_path: Root folder of the test split.
        split: ``(train, val, test)`` fractions forwarded to ``split_dataset``.
        tresh: Forwarded to ``resize_rename_save`` as its ``thresh`` argument.
    """
    for food in FOOD_LIST:
        source_path = os.path.join(dataset_path, food)

        train = os.path.join(train_path, food)
        val = os.path.join(val_path, food)
        test = os.path.join(test_path, food)

        # A uniquely named scratch folder in the working directory: it cannot
        # clobber an unrelated ./tmp and is removed even if a step fails.
        with tempfile.TemporaryDirectory(prefix='tmp_split_', dir='.') as temp_folder_path:
            # Process and save images to the temp directory
            resize_rename_save(source_path, temp_folder_path, food, tresh)

            # Split data into training, validation, and test sets
            split_dataset(temp_folder_path, train, val, test, split)

        print(f'Data processed and splitting completed for: {food} \n')

In [18]:
# IMAGE_SIZE = (384, 384)
SPLIT_SIZE = (.8, .15, .05)  # (train, val, test) fractions
THRESHOLD = None  # passed through create_split_dataset to resize_rename_save
SEED = 42

# split_dataset shuffles with the global `random` state; seeding it here makes
# the train/val/test assignment repeatable across runs.
random.seed(SEED)

reset_split_dataset(SPLIT_DATASET_PATH, TRAIN_PATH, VAL_PATH, TEST_PATH)
create_split_dataset(DATASET_PATH, TRAIN_PATH, VAL_PATH, TEST_PATH, SPLIT_SIZE, THRESHOLD)

Reset complete.

Processed 151 images.
Data processed and splitting completed for: ayam_goreng 

Processed 148 images.
Data processed and splitting completed for: bakso 

Processed 152 images.
Data processed and splitting completed for: bubur 

Processed 148 images.
Data processed and splitting completed for: gado_gado 

Processed 151 images.
Data processed and splitting completed for: mie_ayam 

Processed 149 images.
Data processed and splitting completed for: nasi_goreng 

Processed 145 images.
Data processed and splitting completed for: nasi_padang 

Processed 148 images.
Data processed and splitting completed for: rawon 

Processed 147 images.
Data processed and splitting completed for: sate_ayam 

Processed 150 images.
Data processed and splitting completed for: soto_ayam 



In [24]:
# Re-point the path constants at the renamed split folder.
# NOTE(review): this overrides the SPLIT_DATASET_PATH set in the config cell;
# the folder name must match whatever the split output was renamed to.
SPLIT_DATASET_PATH = os.path.join(SOURCE, "split_dataset_4") # I've renamed it

TRAIN_PATH, VAL_PATH, TEST_PATH = (
    os.path.join(SPLIT_DATASET_PATH, subset)
    for subset in ('train', 'val', 'test')
)

def check_total_images(folder_name, data_path):
    """Count the entries of every sub-directory found below ``data_path``.

    The tree is walked recursively; for each directory encountered (other
    than ``data_path`` itself) the number of entries it contains is added
    to the total.

    Args:
        folder_name: Label used in the printed summary line.
        data_path: Root folder to walk.

    Returns:
        The summed entry count, which is also printed.
    """
    total_sum = sum(
        len(os.listdir(os.path.join(parent, child)))
        for parent, children, _ in os.walk(data_path)
        for child in children
    )
    print('Total Images in {}: {}'.format(folder_name, total_sum))
    return total_sum

# Count the entries in each split folder
train_count = check_total_images('Train', TRAIN_PATH)
dev_count = check_total_images('Val', VAL_PATH)
test_count = check_total_images('Test', TEST_PATH)

total_count = train_count + dev_count + test_count

# A missing or empty split folder yields a total of zero; fail with a clear
# message instead of a bare ZeroDivisionError from the ratio computation.
if total_count == 0:
    raise FileNotFoundError(
        'No images found under {}; run the split step or fix SPLIT_DATASET_PATH.'.format(SPLIT_DATASET_PATH)
    )

ratio_train = round(train_count / total_count, 2)
ratio_dev = round(dev_count / total_count, 2)
ratio_test = round(test_count / total_count, 2)

print('\nTotal Images in Clean Dataset: {}\n\nTrain Ratio: {}\nDev Ratio: {}\nTest Ratio: {}'.format(total_count, ratio_train, ratio_dev, ratio_test))
# Sanity check: the rounded ratios should add up to roughly 1.0
print(round((ratio_train + ratio_dev + ratio_test), 2))

Total Images in Train: 1178
Total Images in Val: 220
Total Images in Test: 80

Total Images in Clean Dataset: 1478

Train Ratio: 0.8
Dev Ratio: 0.15
Test Ratio: 0.05
1.0


In [20]:
# Run this to create a .txt file listing the categories
def write_final_categories(txt_file, directory):
  """Write the sorted entry names of ``directory`` to ``txt_file``.

  Args:
    txt_file: Path of the text file to (over)write, one name per line.
    directory: Folder whose entry names are written.

  Returns:
    None. Prints a confirmation message when done.
  """
  lines = [name + "\n" for name in sorted(os.listdir(directory))]

  with open(txt_file, 'w') as out:
    out.writelines(lines)

  print('Re-Write Successfull')

# Save the entry names of the raw dataset folder to ./categories.txt
write_final_categories('./categories.txt', DATASET_PATH)

Re-Write Successfull
