In [1]:
# Mount Google Drive at /content/drive so the Drive paths used later in this
# notebook resolve. This import is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [30]:
import os
import csv
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

In [22]:
# creates custom train test split
def create_train_test_split_folders_custom(src_path, dst_path, test_size = 0.4):
    """Copy a class-per-folder tree into ``train``/``test`` subtrees using a random split.

    Every sub-directory of ``src_path`` is treated as one class. Its entries are
    split with ``train_test_split`` (fixed ``random_state=42``) and copied to
    ``dst_path/train/<class>`` and ``dst_path/test/<class>``. The source files
    are copied, not moved, so ``src_path`` is left untouched.

    Args:
        src_path: Directory containing one sub-directory per class.
        dst_path: Output directory; created (with parents) if it is missing.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.

    Raises:
        ValueError: Propagated from ``train_test_split`` when a class folder
            cannot be split with the requested ``test_size`` (e.g. it is empty).
    """
    train_folder = os.path.join(dst_path, 'train')
    test_folder = os.path.join(dst_path, 'test')
    # makedirs creates dst_path itself as a parent when it does not exist yet.
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting keeps runs comparable.
    for class_name in sorted(os.listdir(src_path)):
        class_path = os.path.join(src_path, class_name)

        # Stray files sitting next to the class folders are not classes.
        if not os.path.isdir(class_path):
            continue

        # Sort before splitting: the fixed random_state only yields the same
        # split if the input order is the same on every run.
        files = sorted(os.listdir(class_path))
        train_files, test_files = train_test_split(files, test_size=test_size, random_state=42)

        for split_folder, split_files in ((train_folder, train_files), (test_folder, test_files)):
            class_folder = os.path.join(split_folder, class_name)
            os.makedirs(class_folder, exist_ok=True)

            for file in split_files:
                shutil.copy(os.path.join(class_path, file), os.path.join(class_folder, file))

# creates train/test split based on the original file
def create_train_test_split_folders_original(src_path, dst_path, train_test_df):
    """Copy a class-per-folder tree into ``train``/``test`` subtrees using a given split table.

    Every sub-directory of ``src_path`` is treated as one class. Each file in it
    is looked up in ``train_test_df`` under the key ``<class_name>/<file>``
    (built with ``os.path.join``) and copied to ``dst_path/train/<class>`` when
    its ``train`` value is truthy, otherwise to ``dst_path/test/<class>``.
    Source files are copied, not moved.

    Args:
        src_path: Directory containing one sub-directory per class.
        dst_path: Output directory; created (with parents) if it is missing.
        train_test_df: DataFrame with a ``filename`` column holding the lookup
            keys and a ``train`` column holding values convertible with ``int``.

    Raises:
        KeyError: A file found on disk has no row in ``train_test_df``.
    """
    train_folder = os.path.join(dst_path, 'train')
    test_folder = os.path.join(dst_path, 'test')
    # makedirs creates dst_path itself as a parent when it does not exist yet.
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)

    # Build the filename -> flag lookup once instead of scanning the whole
    # DataFrame for every file.
    train_flag_by_file = dict(zip(train_test_df['filename'], train_test_df['train']))

    for class_name in os.listdir(src_path):
        class_path_src = os.path.join(src_path, class_name)

        # Stray files sitting next to the class folders are not classes.
        if not os.path.isdir(class_path_src):
            continue

        for file in os.listdir(class_path_src):
            # Must be the variable, not the literal string 'file_key'.
            file_key = os.path.join(class_name, file)
            if file_key not in train_flag_by_file:
                raise KeyError(f"no train/test entry for '{file_key}' in train_test_df")

            if int(train_flag_by_file[file_key]):
                split_folder = train_folder
            else:
                split_folder = test_folder

            class_path_dst = os.path.join(split_folder, class_name)
            os.makedirs(class_path_dst, exist_ok=True)
            shutil.copy(os.path.join(class_path_src, file), os.path.join(class_path_dst, file))


# create df with image names and test info
def get_train_test_df(an_file, tr_tst_file):
    """Build one table holding each image's filename, label and train flag.

    Args:
        an_file: CSV read without a header row; its columns are named
            ``filename`` and ``label``.
        tr_tst_file: Space-separated text file read without a header row; only
            its second column (index 1) is kept, named ``train``.

    Returns:
        DataFrame with columns ``filename``, ``label`` and ``train``. The two
        inputs are joined side by side on their row index, i.e. by row position.
    """
    # Annotations: header-less CSV, columns named after loading.
    annotations = pd.read_csv(an_file, header=None)
    annotations.columns = ['filename', 'label']

    # Split flags: keep only column 1 of the space-separated file.
    split_flags = pd.read_csv(
        tr_tst_file,
        sep=' ',
        header=None,
        usecols=[1],
        names=['train'],
    )

    return pd.concat([annotations, split_flags], axis=1)


def create_annotations_file(label_map, root_folder, csv_dest):
    """Write a CSV listing every image under ``root_folder`` with its class label.

    Each sub-directory of ``root_folder`` is one class. For every entry in it
    whose lower-cased name ends in ``.jpg``, ``.jpeg`` or ``.png``, one row is
    written with the bare image name and the label mapped to the folder name.

    Args:
        label_map: Mapping from class-folder name to label; folders without an
            entry get ``None`` as their label.
        root_folder: Directory whose sub-directories are scanned.
        csv_dest: Path of the CSV to (over)write; it starts with the header
            ``ImageName,Label``.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')

    with open(csv_dest, 'w', newline='') as out_handle:
        row_writer = csv.DictWriter(out_handle, fieldnames=['ImageName', 'Label'])
        row_writer.writeheader()

        for entry in os.listdir(root_folder):
            entry_path = os.path.join(root_folder, entry)

            # Only directories count as classes.
            if not os.path.isdir(entry_path):
                continue

            entry_label = label_map.get(entry)

            for candidate in os.listdir(entry_path):
                # Anything without a known image extension is skipped.
                if not candidate.lower().endswith(image_suffixes):
                    continue
                row_writer.writerow({'ImageName': candidate, 'Label': entry_label})

        print(f'Successfully created annotations.csv file at {csv_dest}.\n')

In [23]:
def main():
    """Load the filename/label/train table from the hard-coded Drive paths.

    Returns:
        The DataFrame produced by ``get_train_test_df`` for the annotations
        file and the train/test split file, so that ``df = main()`` yields the
        table rather than ``None``.
    """
    images_dir_path = '/content/drive/MyDrive/Colab Notebooks/Data/CUB_200_2011/images_cropped'
    train_test_dest_path = '/content/drive/MyDrive/Colab Notebooks/Data/CUB_200_2011/train_test_original_cropped'
    annotations_file = '/content/drive/MyDrive/Colab Notebooks/Data/CUB_200_2011/image_labels.csv'
    train_test_file = '/content/drive/MyDrive/Colab Notebooks/Data/CUB_200_2011/train_test_split.txt'

    # create concatenated df with filenames and train info
    df = get_train_test_df(annotations_file, train_test_file)

    # TODO: create train/test folders (step not implemented here;
    # images_dir_path and train_test_dest_path are kept for it).

    # TODO: create separate train/test annotations file (step not implemented here).

    return df

In [24]:
# Run the entry point and keep whatever it returns in `df`.
df = main()

Unnamed: 0,filename,label,train
0,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0
1,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1
2,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0
3,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1
4,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1
