### `Notebook To Extract Data`

This notebook is used for extracting, processing and loading images needed for training *YOLOv5*, *TinyVGG* and the *Simplified Xception*.

**Note**\
YOLOv5 requires that its data is stored inside its own directory. Therefore, after running the YOLO section, locate the images in **`dependencies/yolo_data`** and move them into **`dependencies/yolov5/datasets`** (this is only necessary if you are cloning from GitHub).

# Setup

In [None]:
# Library import
import json
import os
import pandas as pd
from sklearn.utils import Bunch
import requests
from io import BytesIO
import uuid
from PIL import Image
 
from utils.GenerateFileList import unpack_json, balanced_category_sampling
from utils.FetchImages import download_images
from utils.ImageJsonGenerator import create_subset_json
from utils.YOLOLabelGenerator import generate_txt_files
from utils.ImageModifier import resize_images
from utils.DataLoader import import_data

## Collecting paths

In [None]:
# Necessary Paths
# Everything lives under <current working directory>/dependencies/yolo_data.
WORKING_DIRECTORY = os.getcwd()
DEPENDENCIES = os.path.join(WORKING_DIRECTORY, 'dependencies')
DATA_PATH = os.path.join(DEPENDENCIES, 'yolo_data')
COCO_ANNOTATIONS = os.path.join(DATA_PATH, 'coco2017')


def _split_paths(split_name):
    """Return the (root, images, data, labels) directories of one split under DATA_PATH."""
    root = os.path.join(DATA_PATH, split_name)
    return (
        root,
        os.path.join(root, 'images'),
        os.path.join(root, 'data'),
        os.path.join(root, 'labels'),
    )


TRAINING_PATH, TRAINING_IMAGES, TRAINING_DATA, TRAINING_LABEL = _split_paths('training')
TEST_PATH, TEST_IMAGES, TEST_DATA, TEST_LABEL = _split_paths('test')
VALIDATION_PATH, VALIDATION_IMAGES, VALIDATION_DATA, VALIDATION_LABELS = _split_paths('validation')

# Combining all paths (attribute access, e.g. PATHS.TRAINING_IMAGES).
# Key order is kept as-is because the directory-creation cell iterates PATHS.values().
PATHS = Bunch(
    WORKING_DIRECTORY=WORKING_DIRECTORY,
    DEPENDENCIES=DEPENDENCIES,
    DATA_PATH=DATA_PATH,
    COCO_ANNOTATIONS=COCO_ANNOTATIONS,
    TRAINING_PATH=TRAINING_PATH,
    TRAINING_IMAGES=TRAINING_IMAGES,
    TRAINING_DATA=TRAINING_DATA,
    TRAINING_LABEL=TRAINING_LABEL,
    TEST_PATH=TEST_PATH,
    TEST_IMAGES=TEST_IMAGES,
    TEST_DATA=TEST_DATA,
    TEST_LABEL=TEST_LABEL,
    VALIDATION_PATH=VALIDATION_PATH,
    VALIDATION_IMAGES=VALIDATION_IMAGES,
    VALIDATION_DATA=VALIDATION_DATA,
    VALIDATION_LABELS=VALIDATION_LABELS,
)


In [None]:
# Category names used by the YOLOv5 section; passed to the helper functions below
# as their `labels` / `categories` argument.
# NOTE: this name is reassigned to a shorter list in the TinyVGG / Xception section
# further down, so re-run this cell before re-running any YOLOv5 cell.
CATEGORIES = ["traffic light", "bus", "train", "truck", "car", "bicycle", "person"]

# `YOLOv5 DATA EXTRACTION`

### 1. + 2. Generate directories and download data

In [None]:
# Creating relevant directories
for p in PATHS.values():
    if not os.path.exists(p):
        print(f'{os.path.basename(p)} does not exists')
        os.makedirs(p, exist_ok=True)

# Downloading COCO annotations if none exists.
# Each file is streamed to disk in chunks (the annotation files are large), the HTTP
# status is checked so an error page is never saved as JSON, and the data is written
# to a temporary ".part" file that is renamed only after a complete download. An
# interrupted run therefore cannot leave a truncated file that later looks "already
# downloaded".
for annotation_file, split_name in (('instances_train2017.json', 'training'),
                                    ('instances_val2017.json', 'validation')):
    annotation_target = os.path.join(PATHS.COCO_ANNOTATIONS, annotation_file)

    if os.path.exists(annotation_target):
        print(f'{split_name.capitalize()} annotations already exists')
        continue

    print(f'Downloading {split_name} annotations...')
    URL = f"https://huggingface.co/datasets/merve/coco/resolve/main/annotations/{annotation_file}"
    partial_target = annotation_target + '.part'

    with requests.get(URL, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(partial_target, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)

    os.replace(partial_target, annotation_target)
    print('Done')





### 3. Extract necessary information from data

In [None]:
# Unpacking instances json
# Keyword arguments shared by every unpack_json call in this cell.
_unpack_options = dict(
    labels=CATEGORIES,
    max_img_categories=3,
    annotation_path=PATHS.COCO_ANNOTATIONS,
)

train_files, train_data = unpack_json(
    annotation_file_name='instances_train2017.json', **_unpack_options)

val_files, val_data = unpack_json(
    annotation_file_name='instances_val2017.json', **_unpack_options)

# The test split is unpacked from the same annotation file as the validation split.
test_files, test_data = unpack_json(
    annotation_file_name='instances_val2017.json', **_unpack_options)

### 4. Balance samples equally across categories

In [None]:
# Balancing categories, so all equally represented
train_images, train_annot = balanced_category_sampling(files=train_files,
                                                       data=train_data,
                                                       size=2500,
                                                       categories=CATEGORIES)

# Files that must never appear in the evaluation splits.
files_used_for_training = list(train_files.keys())

test_images, test_annot = balanced_category_sampling(files=val_files,
                                                     data=val_data,
                                                     size=500,
                                                     categories=CATEGORIES,
                                                     list_of_files_to_exclude=files_used_for_training)

# Test and validation are both drawn from the same val2017 pool. Sampling them with
# identical arguments lets the two splits overlap (or be identical), so the images
# already picked for the test split are excluded here as well.
# NOTE(review): assumes `list_of_files_to_exclude` is matched against file names (the
# same values as the `file_name` column of the returned frame) - confirm against
# utils.GenerateFileList.
val_images, val_annot = balanced_category_sampling(files=val_files,
                                                   data=val_data,
                                                   size=500,
                                                   categories=CATEGORIES,
                                                   list_of_files_to_exclude=(
                                                       files_used_for_training
                                                       + test_images.file_name.to_list()))


### 5. Download Images

In [None]:
# Checking first to see how many existing files there are
def extract_existing_files(size, path, images):
    """Work out which images still have to be downloaded into ``path``.

    Parameters
    ----------
    size : int
        Target number of files wanted in ``path``.
    path : str
        Existing directory whose current contents are counted.
    images : pandas.DataFrame
        Candidate images; must have a ``file_name`` column.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when ``path`` already holds at least ``size`` files. Otherwise the
        candidates that are not in ``path`` yet, limited to the number of files still
        missing, so the folder is topped up to ``size`` rather than beyond it.
    """
    # Fetch files in path
    files = os.listdir(path)

    if size <= len(files):
        print('Already sufficient images in folder')
        return None

    print(f'Existing files in folder, removing {len(files)} from {len(images)}')

    # `~` is the boolean negation for a pandas mask
    not_downloaded = images[~images.file_name.isin(files)]

    # Only the shortfall is needed, not another `size` images
    return not_downloaded.head(size - len(files))

In [None]:
# For each split: (target file count, destination folder, sampled candidates).
# Each name is rebound to the images still to be downloaded, or to None when the
# folder already holds enough files.
train_images, test_images, val_images = [
    extract_existing_files(target_count, image_folder, candidates)
    for target_count, image_folder, candidates in (
        (2500, PATHS.TRAINING_IMAGES, train_images),
        (500, PATHS.TEST_IMAGES, test_images),
        (500, PATHS.VALIDATION_IMAGES, val_images),
    )
]

In [None]:
# Download images
# extract_existing_files returns None when a folder already holds enough images.
# That case is checked explicitly instead of catching TypeError, so a genuine
# TypeError raised inside download_images is no longer reported as "already
# sufficient".
for pending_images, image_folder, skip_message in (
    (train_images, PATHS.TRAINING_IMAGES, 'Image count is already sufficient for traininig'),
    (val_images, PATHS.VALIDATION_IMAGES, 'Image count is already sufficient for validation'),
    (test_images, PATHS.TEST_IMAGES, 'Image count is already sufficient for test'),
):
    if pending_images is None:
        print(skip_message)
    else:
        download_images(pending_images, image_folder)

print('Done')

### 6. Resize Images

In [None]:
# Resize the images of every split in place.
# The original note states 640x640; the size itself is decided inside resize_images.
for image_folder in (PATHS.TRAINING_IMAGES, PATHS.TEST_IMAGES, PATHS.VALIDATION_IMAGES):
    resize_images(image_folder)

### 7.  Generate subset instance json data

In [None]:
# Generating new instance jsons for each subset
# Per subset: (unpacked annotation data, output file name, image folder, output folder)
for subset_data, subset_name, images_dir, output_dir in (
    (train_data, 'train', PATHS.TRAINING_IMAGES, PATHS.TRAINING_DATA),               # Training JSON
    (val_data, 'validation', PATHS.VALIDATION_IMAGES, PATHS.VALIDATION_DATA),        # Val JSON
    (test_data, 'test', PATHS.TEST_IMAGES, PATHS.TEST_DATA),                         # Test JSON
):
    create_subset_json(data=subset_data,
                       file_name=subset_name,
                       image_path=images_dir,
                       data_path=output_dir)

### 8. Generate yolo text labels

In [None]:
# Generate yolo .txt files
# Training labels
generate_txt_files(
    data_filename='train.json',
    data_path=PATHS.TRAINING_DATA,
    img_path=PATHS.TRAINING_IMAGES,
    label_path=PATHS.TRAINING_LABEL,
    categories=CATEGORIES,
)

# Test labels (the return value is kept in `ok`, as before)
ok = generate_txt_files(
    data_filename='test.json',
    data_path=PATHS.TEST_DATA,
    img_path=PATHS.TEST_IMAGES,
    label_path=PATHS.TEST_LABEL,
    categories=CATEGORIES,
)

# Validation labels
generate_txt_files(
    data_filename='validation.json',
    data_path=PATHS.VALIDATION_DATA,
    img_path=PATHS.VALIDATION_IMAGES,
    label_path=PATHS.VALIDATION_LABELS,
    categories=CATEGORIES,
)

# `TinyVGG and Simplified Xception Model Data Extraction`

In [None]:
# Needed paths
WORKING_DIRECTORY = os.getcwd()
_DEPENDENCIES_DIR = os.path.join(WORKING_DIRECTORY, 'dependencies')
_CNN_DATA_DIR = os.path.join(_DEPENDENCIES_DIR, 'cnn_data')

# COCO annotation files (same folder the YOLO section downloads them into)
DATA_FOLDER = os.path.join(_DEPENDENCIES_DIR, 'yolo_data', 'coco2017')
# Destination folders for the cropped images
TRAINING_FOLDER = os.path.join(_CNN_DATA_DIR, 'training')
TEST_FOLDER = os.path.join(_CNN_DATA_DIR, 'test')

# Specifying desired categories
CATEGORIES = ["car", "bicycle", "person"]

In [None]:
# Extract data using import data function
# Reads 'instances_train2017.json' from DATA_FOLDER; the result is handed to
# generate_annot_df below, which reads its `images`, `catagories` and `annotations`
# attributes.
data = import_data(DATA_FOLDER, file_name='instances_train2017.json')

In [None]:
# Generate annotations dataframe
def generate_annot_df(bunch, cats):
    """Build a per-annotation DataFrame restricted to the requested categories.

    Parameters
    ----------
    bunch : object
        Annotation container with the attributes ``images`` (dicts with ``id``,
        ``file_name``, ``coco_url``), ``catagories`` (dicts with ``id``, ``name``) and
        ``annotations`` (dicts with at least ``image_id``, ``area``, ``bbox``,
        ``category_id``, ``iscrowd``).
        NOTE(review): ``catagories`` is the attribute name this function has always
        read; confirm it matches what ``import_data`` produces.
    cats : collection of str
        Category names to keep.

    Returns
    -------
    df : pandas.DataFrame
        One row per kept annotation with the columns ``file_name``, ``coco_url``,
        ``image_id``, ``category_id`` (re-numbered from 0), ``area``, ``bbox``,
        ``x_min``, ``y_min``, ``x_max``, ``y_max`` and ``category_name``.
    category_names : dict
        Maps each new category id to its category name.
    """
    # Filename and URL lookups keyed by image id
    file_name_by_image = {img['id']: img['file_name'] for img in bunch.images}
    url_by_image = {img['id']: img['coco_url'] for img in bunch.images}

    # Original category id -> name, for the requested categories only
    category_ids = {cat['id']: cat['name']
                    for cat in bunch.catagories if cat['name'] in cats}

    df = pd.DataFrame(bunch.annotations)

    # Removing images of crowds: drop every annotation of any image that has at least
    # one crowd annotation. `.copy()` makes the later column assignments operate on an
    # independent frame rather than on a slice of the original.
    crowd_image_ids = df.loc[df.iscrowd == 1, 'image_id'].to_list()
    df = df.loc[~df.image_id.isin(crowd_image_ids),
                ['image_id', 'area', 'bbox', 'category_id']].copy()

    # Adding file name and the coco url needed for extraction
    df['file_name'] = df['image_id'].map(file_name_by_image)
    df['coco_url'] = df['image_id'].map(url_by_image)

    # Unpacking bounding box column: bbox is [x_min, y_min, width, height]
    df = df.assign(
        x_min=lambda frame: frame['bbox'].apply(lambda box: box[0]),
        y_min=lambda frame: frame['bbox'].apply(lambda box: box[1]),
        w=lambda frame: frame['bbox'].apply(lambda box: box[2]),
        h=lambda frame: frame['bbox'].apply(lambda box: box[3]),
        x_max=lambda frame: frame.x_min + frame.w,
        y_max=lambda frame: frame.y_min + frame.h)

    # Resetting index and dropping all annotations outside of desired list of categories
    df = df.reset_index(drop=True)
    kept_columns = ['file_name', 'coco_url', 'image_id', 'category_id',
                    'area', 'bbox', 'x_min', 'y_min', 'x_max', 'y_max']
    df = df.loc[df.category_id.isin(list(category_ids)), kept_columns].copy()

    # Creating new consecutive category ids (0, 1, 2, ...) in place of the original ones
    categories = {old_id: new_id for new_id, old_id in enumerate(category_ids)}

    # Get category names and add to dataframe
    category_names = {new_id: category_ids[old_id]
                      for old_id, new_id in categories.items()}
    df['category_id'] = df['category_id'].map(categories)
    df['category_name'] = df['category_id'].map(category_names)

    return df, category_names


In [None]:
# Build the annotation table and the {new category id: category name} mapping
# for the categories listed in CATEGORIES.
annot, category_names = generate_annot_df(data, CATEGORIES)
# del data # Removing data dict to not take up too much ram


In [None]:
def fetch_crop_and_save_image(annot_df, train_folder, test_folder, images_per_category=1000, test_size=0.15,
                              categories=None, new_size=(250, 250), timeout=30):
    """Download, crop and save one image per selected annotation.

    For every category the annotations with the largest ``area`` are selected, the
    source image is fetched from its ``coco_url``, cropped to the bounding box,
    resized to ``new_size`` and written to ``<folder>/<category_name>/``.

    Parameters
    ----------
    annot_df : pandas.DataFrame
        Needs the columns ``category_name``, ``category_id``, ``area``, ``image_id``,
        ``coco_url``, ``x_min``, ``y_min``, ``x_max`` and ``y_max``.
    train_folder, test_folder : str
        Destination roots; one sub-folder per category is created in each.
    images_per_category : int
        Number of training crops per category.
    test_size : float
        Test crops per category, as a fraction of ``images_per_category``.
    categories : iterable of str, optional
        Category names to export. Defaults to ``['person', 'car', 'bicycle']``.
    new_size : tuple of int
        (width, height) every crop is resized to.
    timeout : float
        Timeout in seconds for each image request.

    Raises
    ------
    requests.HTTPError
        If an image request returns an error status.
    """
    # Categories
    cats = ['person', 'car', 'bicycle'] if categories is None else list(categories)

    # Candidate annotations, largest bounding boxes first
    candidates = annot_df[annot_df.category_name.isin(cats)].sort_values(
        by='area', ascending=False)

    # Extract equal number of images based on biggest bbox from annotations within desired categories
    train = candidates.groupby('category_id').head(
        images_per_category).reset_index(drop=True)

    # Same for the test set, but only from images that are NOT used for training.
    # The previous condition kept images that ARE in the training selection, which
    # made the test crops duplicates of training crops.
    held_out = candidates[~candidates.image_id.isin(train.image_id)]
    test = held_out.groupby('category_id').head(
        round(images_per_category * test_size)).reset_index(drop=True)

    for subset, folder in zip([train, test], [train_folder, test_folder]):
        # Creating directories to store images (TensorFlow can infer labels from directory structure)
        for cat in cats:
            os.makedirs(os.path.join(folder, cat), exist_ok=True)

        print(f'Cropping and moving: {len(subset)} images')

        # looping through dataset to crop images according to category
        for images_done, (_, row) in enumerate(subset.iterrows(), start=1):
            # Fetching image contents; fail loudly on an HTTP error instead of handing
            # an error page to the image decoder
            response = requests.get(row.coco_url, timeout=timeout)
            response.raise_for_status()

            # Crop rectangle taken from the bounding box
            crop_box = (row.x_min, row.y_min, row.x_max, row.y_max)

            # Create uuid for naming (only using the first part of the uuid), to use for renaming
            uid = str(uuid.uuid4()).split('-')[0]
            target_file = os.path.join(
                folder, row.category_name, f'{row.category_name}_{uid}.jpg')

            # Extract, transform and load images into new folder
            with Image.open(BytesIO(response.content)) as source_image:
                source_image.crop(crop_box).resize(new_size).save(target_file)

            print(f'Images done: {images_done}/{len(subset)}')
    

In [None]:
# Crop the selected annotations and save them under TRAINING_FOLDER and TEST_FOLDER.
# No validation folder is written here: per the original note, TensorFlow is relied on
# to separate the training set into train and validation at training time.
fetch_crop_and_save_image(annot, TRAINING_FOLDER, TEST_FOLDER)