In [1]:
# Mount Google Drive at /content/drive (google.colab is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [59]:
import os
import csv
import shutil
import pandas as pd
import multiprocessing
from functools import partial
from multiprocessing import Pool
from sklearn.model_selection import train_test_split

## Utils

In [None]:
# create df with image names and test info
def get_train_test_df(an_file, tr_tst_file):
    """Combine image labels and train/test flags into a single DataFrame.

    Args:
        an_file: Path to a header-less CSV whose two columns are read as
            ``filename`` and ``label``.
        tr_tst_file: Path to a space-separated, header-less text file; only
            its second column is read, as ``train``.

    Returns:
        DataFrame with the columns ``filename``, ``label`` and ``train``. The
        two inputs are placed side by side and matched on their row index.
    """
    # annotations: one row per image
    labels = pd.read_csv(an_file, header=None)
    labels.columns = ['filename', 'label']

    # split info: keep only the flag column (second field of each line)
    split_flags = pd.read_csv(
        tr_tst_file,
        sep=' ',
        header=None,
        usecols=[1],
        names=['train'],
    )

    return pd.concat([labels, split_flags], axis=1)

# create annotation file for individual folders
def create_annotation_file_for_custom(root_f, ann_path, folder_name, fname):
    """Write a CSV annotation file for an image folder organised by class.

    The folder ``root_f/folder_name`` is expected to contain one sub-folder per
    class, named ``<integer>.<anything>``; the integer prefix is used as the
    label. One row is written per image file (``.jpg``, ``.jpeg``, ``.png``,
    case-insensitive).

    Args:
        root_f: Directory that contains ``folder_name``.
        ann_path: Directory the CSV is written to (created if missing).
        folder_name: Name of the sub-folder of ``root_f`` to scan.
        fname: File name of the CSV to create inside ``ann_path``.

    Raises:
        ValueError: If a class folder that contains images has a name that
            does not start with an integer followed by ``.``.
    """
    image_exts = ('.jpg', '.jpeg', '.png')

    os.makedirs(ann_path, exist_ok=True)
    csv_path = os.path.join(ann_path, fname)
    root_folder = os.path.join(root_f, folder_name)

    with open(csv_path, 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=['ImageName', 'Label'])
        writer.writeheader()

        # os.listdir returns entries in arbitrary order; sort so the CSV is
        # identical from run to run.
        for class_folder in sorted(os.listdir(root_folder)):
            class_path = os.path.join(root_folder, class_folder)

            # skip stray files sitting next to the class folders
            if not os.path.isdir(class_path):
                continue

            images = sorted(
                entry for entry in os.listdir(class_path)
                if entry.lower().endswith(image_exts)
            )

            # folders without images (e.g. notebook checkpoint folders) are
            # ignored before their name is parsed
            if not images:
                continue

            # get label from folder name, once per class
            label = int(class_folder.split('.')[0])

            for file in images:
                image_name = os.path.join(class_folder, file)
                writer.writerow({'ImageName': image_name, 'Label': label})

    # report only after the file has been closed and flushed
    print(f'Successfully {fname} file at {csv_path}.\n')

# create train and test annotations file from df
def create_annotations_file(df, ann_dest):
    """Write the train and test annotation CSVs derived from ``df``.

    Rows with ``train == 1`` go to ``annotations_orig_train.csv`` and rows with
    ``train == 0`` go to ``annotations_orig_test.csv``. Both files have the
    columns ``ImageName`` and ``Label`` and no index column.

    Args:
        df: DataFrame with the columns ``filename``, ``label`` and ``train``.
        ann_dest: Directory the two CSVs are written to (created if missing).
    """
    wanted = ['filename', 'label']
    renamed = {'filename': 'ImageName', 'label': 'Label'}

    train_rows = df.loc[df['train'] == 1, wanted].rename(columns=renamed)
    test_rows = df.loc[df['train'] == 0, wanted].rename(columns=renamed)

    # output folder for both annotation files
    os.makedirs(ann_dest, exist_ok=True)

    train_rows.to_csv(os.path.join(ann_dest, 'annotations_orig_train.csv'), index=False)
    test_rows.to_csv(os.path.join(ann_dest, 'annotations_orig_test.csv'), index=False)

In [82]:
# function to pool: for each class folder, copies files to train and test folders
def copy_files_custom(class_name, src_path, dst_path, test_size):
    """Split one class folder into train/test and copy its files accordingly.

    Intended to be mapped over class names by a process pool. Files are
    copied, not moved; the source folder is left untouched.

    Args:
        class_name: Name of the class sub-folder inside ``src_path``.
        src_path: Directory containing one sub-folder per class.
        dst_path: Directory receiving ``train/<class_name>`` and
            ``test/<class_name>``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
    """
    class_path = os.path.join(src_path, class_name)

    # os.listdir returns entries in arbitrary order. The fixed random_state
    # only yields the same split on every run if the input order is fixed too,
    # so sort before splitting.
    files = sorted(os.listdir(class_path))

    # Split the files into train and test sets
    train_files, test_files = train_test_split(files, test_size=test_size, random_state=42)

    for split_name, split_files in (('train', train_files), ('test', test_files)):
        # Create the sub-directory for this class in the split folder
        split_class_folder = os.path.join(dst_path, split_name, class_name)
        os.makedirs(split_class_folder, exist_ok=True)

        # Copy the files of this split into it
        for file in split_files:
            src = os.path.join(class_path, file)
            dst = os.path.join(split_class_folder, file)
            shutil.copy(src, dst)

# creates custom train test split
def create_train_test_split_folders_custom(src_path, dst_path, test_size = 0.3):
    """Build ``train``/``test`` folders under ``dst_path`` with a custom split.

    Every class sub-folder of ``src_path`` is handed to ``copy_files_custom``
    in a pool of 4 worker processes.

    Args:
        src_path: Directory containing one sub-folder per class.
        dst_path: Destination directory (created if missing).
        test_size: Forwarded to ``copy_files_custom``; defaults to 0.3.
    """
    # only directories are classes; a stray file here would crash a worker
    classes = sorted(
        entry for entry in os.listdir(src_path)
        if os.path.isdir(os.path.join(src_path, entry))
    )

    # makedirs also creates dst_path itself when it does not exist yet
    for split_name in ('train', 'test'):
        os.makedirs(os.path.join(dst_path, split_name), exist_ok=True)

    copy_files_custom_x = partial(
        copy_files_custom, src_path=src_path, dst_path=dst_path, test_size=test_size
    )

    # The context manager shuts the worker processes down when the block ends;
    # map() blocks until every class is done, so no work is cut short.
    with multiprocessing.Pool(processes=4) as pool:
        pool.map(copy_files_custom_x, classes)

    print('Finished creating custom train/test split folders.')




# function to pool: for each class folder, copies files to train and test folders
def copy_files_original(class_name, src_path, dst_path, train_test_df):
    """Copy one class folder into train/test according to ``train_test_df``.

    Intended to be mapped over class names by a process pool. Each file in
    ``src_path/class_name`` is looked up in ``train_test_df`` under the key
    ``os.path.join(class_name, file)`` and copied to
    ``dst_path/train/class_name`` (flag 1) or ``dst_path/test/class_name``
    (flag 0).

    Args:
        class_name: Name of the class sub-folder inside ``src_path``.
        src_path: Directory containing one sub-folder per class.
        dst_path: Directory receiving the ``train`` and ``test`` trees.
        train_test_df: DataFrame with the columns ``filename`` and ``train``.

    Raises:
        KeyError: If a file has no row in ``train_test_df``.
        ValueError: If a file's ``train`` flag is neither 1 nor 0.
    """
    class_path_src = os.path.join(src_path, class_name)

    # Build the filename -> flag lookup once instead of scanning the whole
    # DataFrame for every file. setdefault keeps the first row per filename.
    split_flags = {}
    for name, flag in zip(train_test_df['filename'], train_test_df['train']):
        split_flags.setdefault(name, flag)

    # for each file in the class
    for file in os.listdir(class_path_src):

        # check if file is train or test
        file_key = os.path.join(class_name, file)
        if file_key not in split_flags:
            raise KeyError(f'No train/test entry found for {file_key!r}')
        train_val = split_flags[file_key]

        if train_val == 1:
            split_name = 'train'
        elif train_val == 0:
            split_name = 'test'
        else:
            # e.g. NaN from misaligned inputs; never fall through and reuse
            # the destination of the previous file
            raise ValueError(f'Unexpected train flag {train_val!r} for {file_key!r}')

        class_path_dst = os.path.join(dst_path, split_name, class_name)

        # copy file
        os.makedirs(class_path_dst, exist_ok=True)
        shutil.copy(os.path.join(class_path_src, file), os.path.join(class_path_dst, file))

# creates train/test split based on the original file
def create_train_test_split_folders_original(src_path, dst_path, train_test_df):
    """Build ``train``/``test`` folders under ``dst_path`` from a given split.

    Every class sub-folder of ``src_path`` is handed to ``copy_files_original``
    in a pool of 4 worker processes.

    Args:
        src_path: Directory containing one sub-folder per class.
        dst_path: Destination directory (created if missing).
        train_test_df: DataFrame forwarded to ``copy_files_original``.
    """
    # list of class names; only directories count, a stray file here would
    # crash a worker
    classes = sorted(
        entry for entry in os.listdir(src_path)
        if os.path.isdir(os.path.join(src_path, entry))
    )

    # makedirs also creates dst_path itself when it does not exist yet
    for split_name in ('train', 'test'):
        os.makedirs(os.path.join(dst_path, split_name), exist_ok=True)

    copy_files_original_x = partial(
        copy_files_original, src_path=src_path, dst_path=dst_path, train_test_df=train_test_df
    )

    # The context manager shuts the worker processes down when the block ends;
    # map() blocks until every class is done, so no work is cut short.
    with multiprocessing.Pool(processes=4) as pool:
        pool.map(copy_files_original_x, classes)

    print('Finished creating original train/test split folders.')


In [85]:
def main(build_split_folders=False):
    """Load the train/test DataFrame and optionally build the split folders.

    Args:
        build_split_folders: When True, also copy the images into the
            original train/test folder layout. Defaults to False, which keeps
            the behaviour of calling ``main()`` with no arguments: only the
            DataFrame is loaded and returned.

    Returns:
        DataFrame produced by ``get_train_test_df``.
    """
    data_root = '/content/drive/MyDrive/Colab Notebooks/Data/CUB_200_2011'

    images_dir_path = os.path.join(data_root, 'images_cropped')
    train_test_dest1 = os.path.join(data_root, 'train_test_original_cropped')
    annotations_file = os.path.join(data_root, 'image_labels.csv')
    train_test_file = os.path.join(data_root, 'train_test_split.txt')

    # only referenced by the optional steps that are commented out below
    train_test_dest2 = os.path.join(data_root, 'train_test_custom_cropped')
    ann_dest1 = os.path.join(train_test_dest1, 'annotations')
    ann_dest2 = os.path.join(train_test_dest2, 'annotations')

    # create concatenated df with filenames and train info
    df = get_train_test_df(annotations_file, train_test_file)

    if build_split_folders:
        # create train/test folders
        # create_train_test_split_folders_custom(images_dir_path, train_test_dest2)
        create_train_test_split_folders_original(images_dir_path, train_test_dest1, df)

        # create annotation files
        # create_annotations_file(df, ann_dest1)

    return df


In [86]:
# Run the entry point and keep the returned DataFrame for inspection.
df = main()