# Mount Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Change path
## Set the path to where this notebook and the 'global_images' as well as 'global_annotations' are

In [1]:
!pip install pandas



# Imports

In [2]:
from pathlib import Path
import random
import pandas as pd

# Global Files

In [3]:
annotation_folder = Path("./global_annotations")
image_folder = Path("./global_images")

# Read file types

In [4]:
annotation_files = {f.stem for f in Path(annotation_folder).iterdir()}
image_files = {f.stem for f in Path(image_folder).iterdir() if f.suffix.lower() in {'.jpg', '.jpeg', '.png'}}

matching_base_names = annotation_files.intersection(image_files)

matching_annotation_files = sorted([f.name for f in Path(annotation_folder).iterdir() if f.stem in matching_base_names])
matching_image_files = sorted([f.name for f in Path(image_folder).iterdir() if f.stem in matching_base_names])

print(f"Matching annotation files count {len(matching_annotation_files)}:", matching_annotation_files)
print(f"Matching image files count {len(matching_image_files)}:", matching_image_files)

Matching annotation files count 3616: ['1.txt', '10.txt', '1000.txt', '1001.txt', '1002.txt', '1003.txt', '1004.txt', '1005.txt', '1006.txt', '1007.txt', '1008.txt', '1009.txt', '1010.txt', '1011.txt', '1012.txt', '1013.txt', '1014.txt', '1015.txt', '1016.txt', '1017.txt', '1018.txt', '1019.txt', '1020.txt', '1021.txt', '1022.txt', '1023.txt', '1024.txt', '1025.txt', '1026.txt', '1027.txt', '1028.txt', '1029.txt', '1030.txt', '1031.txt', '1032.txt', '1033.txt', '1034.txt', '1035.txt', '1036.txt', '1037.txt', '1038.txt', '1039.txt', '1040.txt', '11.txt', '12.txt', '13.txt', '14.txt', '1441.txt', '1442.txt', '1443.txt', '1444.txt', '1445.txt', '1446.txt', '1447.txt', '1448.txt', '1449.txt', '1450.txt', '1451.txt', '1452.txt', '1453.txt', '1454.txt', '1455.txt', '1456.txt', '1457.txt', '1458.txt', '1459.txt', '1460.txt', '1461.txt', '1462.txt', '1463.txt', '1464.txt', '1465.txt', '1466.txt', '1467.txt', '1468.txt', '1469.txt', '1470.txt', '1471.txt', '1472.txt', '1473.txt', '1474.txt', '1

# Data List

In [5]:
data = list(zip(matching_image_files, matching_annotation_files))

# Select your CSE428 Section & Project-Group

In [7]:
# change section number and group number to generate csv files
section_number = 1
group_number = 9

seed = section_number*100+group_number
random.seed(seed)
random.shuffle(data)

total_size = len(data)
train_size = int(0.7 * total_size)
val_size = int(0.2 * total_size)
test_size = total_size - train_size - val_size

train_data = data[:train_size]
val_data = data[train_size:train_size + val_size]
test_data = data[train_size + val_size:]

# Data Split Sizes

In [8]:
print(f"Train set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

Train set size: 2531
Validation set size: 723
Test set size: 362


# Split list save

In [9]:
train_df = pd.DataFrame(train_data, columns=['Train Images', 'Train Annotations'])
val_df = pd.DataFrame(val_data, columns=['Val Images', 'Val Annotations'])
test_df = pd.DataFrame(test_data, columns=['Test Images', 'Test Annotations'])

group_folder = Path(f"section{section_number}-group{group_number}")
group_folder.mkdir(parents=True, exist_ok=True)

# 3 csv files will be saved which you will dedicatedly use for your detection work
# The csv files will be inside a folder where your notebook is, named: 'sectionX-groupY'
train_df.to_csv(group_folder.joinpath("train.csv"), index=False)
val_df.to_csv(group_folder.joinpath("val.csv"), index=False)
test_df.to_csv(group_folder.joinpath("test.csv"), index=False)