### `Notebook version of setup.py. Enables modification to the setup procedure`

# Setup

In [None]:
# Library import
import json
import os
import pandas as pd
from sklearn.utils import Bunch
import requests
 
from utils.GenerateFileList import unpack_json, balanced_category_sampling
from utils.FetchImages import download_images
from utils.ImageJsonGenerator import create_subset_json
from utils.YOLOLabelGenerator import generate_txt_files


## Collecting paths

In [None]:
# Root locations: everything is placed under <cwd>/dependencies/yolo_data
WORKING_DIRECTORY = os.getcwd()
DEPENDENCIES = os.path.join(WORKING_DIRECTORY, 'dependencies')
DATA_PATH = os.path.join(DEPENDENCIES, 'yolo_data')
COCO_ANNOTATIONS = os.path.join(DATA_PATH, 'coco2017')

# Every split (training / test / validation) gets the same three
# sub-folders: images, data and labels.
TRAINING_PATH = os.path.join(DATA_PATH, 'training')
TRAINING_IMAGES, TRAINING_DATA, TRAINING_LABEL = (
    os.path.join(TRAINING_PATH, leaf) for leaf in ('images', 'data', 'labels')
)

TEST_PATH = os.path.join(DATA_PATH, 'test')
TEST_IMAGES, TEST_DATA, TEST_LABEL = (
    os.path.join(TEST_PATH, leaf) for leaf in ('images', 'data', 'labels')
)

VALIDATION_PATH = os.path.join(DATA_PATH, 'validation')
VALIDATION_IMAGES, VALIDATION_DATA, VALIDATION_LABELS = (
    os.path.join(VALIDATION_PATH, leaf) for leaf in ('images', 'data', 'labels')
)

# Single attribute-style container holding every path defined above
PATHS = Bunch(
    # roots
    WORKING_DIRECTORY=WORKING_DIRECTORY,
    DEPENDENCIES=DEPENDENCIES,
    DATA_PATH=DATA_PATH,
    COCO_ANNOTATIONS=COCO_ANNOTATIONS,
    # training split
    TRAINING_PATH=TRAINING_PATH,
    TRAINING_IMAGES=TRAINING_IMAGES,
    TRAINING_DATA=TRAINING_DATA,
    TRAINING_LABEL=TRAINING_LABEL,
    # test split
    TEST_PATH=TEST_PATH,
    TEST_IMAGES=TEST_IMAGES,
    TEST_DATA=TEST_DATA,
    TEST_LABEL=TEST_LABEL,
    # validation split
    VALIDATION_PATH=VALIDATION_PATH,
    VALIDATION_IMAGES=VALIDATION_IMAGES,
    VALIDATION_DATA=VALIDATION_DATA,
    VALIDATION_LABELS=VALIDATION_LABELS,
)


In [None]:
# Category names passed to the unpacking, sampling and label-generation steps
CATEGORIES = [
    "traffic light",
    "bus",
    "train",
    "truck",
    "car",
    "bicycle",
    "person",
]

# Body

### 1. + 2. Generate directories and download data

In [None]:
# Create every directory listed in PATHS that is not on disk yet
for p in PATHS.values():
    if not os.path.exists(p):
        print(f'{os.path.basename(p)} does not exists')
        # exist_ok guards against the directory appearing between the check and the call
        os.makedirs(p, exist_ok=True)
print('Done')

# Download each COCO annotation file that is not already present locally
print('(2/x): Downloading COCO annotations...')
for split_name, annotation_file in (
    ('training', 'instances_train2017.json'),
    ('validation', 'instances_val2017.json'),
):
    annotation_target = os.path.join(PATHS.COCO_ANNOTATIONS, annotation_file)
    if os.path.exists(annotation_target):
        print(f'{split_name.capitalize()} annotations already exists')
        continue

    print(f'Downloading {split_name} annotations...')
    URL = f"https://huggingface.co/datasets/merve/coco/resolve/main/annotations/{annotation_file}"

    # Write to a temporary ".part" file and rename only after the download
    # finished, so an interrupted run never leaves a truncated JSON that the
    # existence check above would treat as a complete file.
    partial_target = annotation_target + '.part'
    with requests.get(URL, stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of saving an error page as JSON
        response.raise_for_status()
        with open(partial_target, "wb") as file:
            # Stream in 1 MiB chunks rather than holding the whole file in memory
            for chunk in response.iter_content(chunk_size=1 << 20):
                file.write(chunk)
    os.replace(partial_target, annotation_target)
    print('Done')



### 3. Extract necessary information from data

In [None]:
# Read the training annotation file through unpack_json; it returns a
# (files, data) pair for the requested labels
train_files, train_data = unpack_json(
    annotation_path=PATHS.COCO_ANNOTATIONS,
    annotation_file_name='instances_train2017.json',
    labels=CATEGORIES,
    max_img_categories=3,
)

# Same call, pointed at the validation annotation file
val_files, val_data = unpack_json(
    annotation_path=PATHS.COCO_ANNOTATIONS,
    annotation_file_name='instances_val2017.json',
    labels=CATEGORIES,
    max_img_categories=3,
)

### 4. Balance samples equally across categories

In [None]:
# Draw the training sample (size=2500), excluding every file name that is
# already present in the training images folder
train_images, train_annot = balanced_category_sampling(
    files=train_files,
    data=train_data,
    categories=CATEGORIES,
    size=2500,
    list_of_files_to_exclude=os.listdir(TRAINING_IMAGES),
)

# Draw the validation sample (size=800), excluding the keys of train_files
val_images, val_annot = balanced_category_sampling(
    files=val_files,
    data=val_data,
    categories=CATEGORIES,
    size=800,
    list_of_files_to_exclude=list(train_files.keys()),
)

### 5. Download Images

In [None]:
# Scratch cell: `5 % 5 == 0` is always true, so this simply prints a marker
print('hello')

In [None]:
# Fetch the sampled images: training set first, then validation set
print('(5/8): Downloading images...')
for image_frame, destination in (
    (train_images, PATHS.TRAINING_IMAGES),
    (val_images, PATHS.VALIDATION_IMAGES),
):
    download_images(image_frame, destination)

### Generate test set (optional)
*This requires that the images have already been ingested into the training images folder. To do this, increase the sample sizes for training and validation, specifying which images not to include (e.g. those already in the training images path, which ensures there are no duplicates). The sizes need to be increased because the dataset does not contain an abundance of instances across all categories.*

In [None]:
# Stack the training rows on top of the validation rows, once for the
# annotations and once for the image tables
annot = pd.concat(objs=[train_annot, val_annot])
img = pd.concat(objs=[train_images, val_images])

In [None]:
# Split the combined annotations in two: for every category, the first half
# of its rows (rounded with Python's round()) contributes its image ids to
# `first_half`.
# len(group) is used instead of len(group.category_id) so the lambda does not
# depend on the grouping column being passed into apply().
first_half = (
    annot.groupby('category_id')
    .apply(lambda group: group[:round(len(group) / 2)])
    .reset_index(drop=True)
    .image_id.to_list()
)

# Every annotation whose image is not in the first half belongs to the second.
# `~` is the idiomatic operator for inverting a boolean mask.
sec_half = annot[~annot.image_id.isin(first_half)].image_id.to_list()

# File names of the validation images that fall in the second half; these are
# the ones to move from validation to test
images_to_move = val_images[val_images.id.isin(sec_half)].file_name.to_list()

In [None]:
# Move the selected files from the validation images folder to the test
# images folder. The loop variable is deliberately not called `img`, so the
# concatenated image DataFrame of that name is not overwritten by a string.
for file_name in images_to_move:
    os.rename(
        os.path.join(VALIDATION_IMAGES, file_name),
        os.path.join(TEST_IMAGES, file_name),
    )

In [None]:
# Merged annotation dictionary: info, licenses and categories are taken from
# the training data, while images and annotations are the training entries
# followed by the validation entries.
data = dict(
    info=train_data['info'],
    licenses=train_data['licenses'],
    images=train_data['images'] + val_data['images'],
    annotations=train_data['annotations'] + val_data['annotations'],
    categories=train_data['categories'],
)


### Generate subset instance json data

In [None]:
# Write one instance JSON per subset.

# Training JSON: built from the training data only
create_subset_json(
    file_name='train',
    data=train_data,
    image_path=PATHS.TRAINING_IMAGES,
    data_path=PATHS.TRAINING_DATA,
)

# Validation JSON: built from the merged training + validation data
create_subset_json(
    file_name='validation',
    data=data,
    image_path=PATHS.VALIDATION_IMAGES,
    data_path=PATHS.VALIDATION_DATA,
)

# Test JSON: built from the merged training + validation data
create_subset_json(
    file_name='test',
    data=data,
    image_path=PATHS.TEST_IMAGES,
    data_path=PATHS.TEST_DATA,
)

### Generate yolo text labels

In [None]:
# Generate the YOLO .txt label files for each subset, in the order
# training -> test -> validation
for subset_json, subset_data, subset_images, subset_labels in (
    ('train.json', PATHS.TRAINING_DATA, PATHS.TRAINING_IMAGES, PATHS.TRAINING_LABEL),
    ('test.json', PATHS.TEST_DATA, PATHS.TEST_IMAGES, PATHS.TEST_LABEL),
    ('validation.json', PATHS.VALIDATION_DATA, PATHS.VALIDATION_IMAGES, PATHS.VALIDATION_LABELS),
):
    generate_txt_files(
        data_filename=subset_json,
        data_path=subset_data,
        img_path=subset_images,
        label_path=subset_labels,
        categories=CATEGORIES,
    )
