## Load data for Azure Machine Learning AutoML

This notebook extracts data from Open Images V7 (`open-images-v7`) and prepares it in the JSONL annotation format used by Azure Machine Learning AutoML for Images.

It assumes you are working with one class only.

### Pre-requisites:

1. Create a `.env`:

```bash
cp .env-template .env # copy file
```

2. Fill in the required fields in the new `.env` file.

In [None]:
# Start from a clean slate: remove any previous download at ~/fiftyone/open-images-v7.
import os
import shutil

# shutil.rmtree avoids depending on a POSIX shell; ignore_errors=True keeps the
# "a missing folder is fine" semantics of `rm -rf`.
shutil.rmtree(os.path.expanduser('~/fiftyone/open-images-v7'), ignore_errors=True)

In [None]:
import os
import fiftyone as fo
import fiftyone.zoo as foz

# Configuration: the Open Images class to extract and the maximum number of samples to download.
TargetLabel = "Coffee cup"
TargetImages = 200

try:
    # Download the matching validation samples (detections only) through the FiftyOne zoo.
    # The returned dataset object is not kept: later cells read the files written
    # under ~/fiftyone/open-images-v7 directly.
    foz.load_zoo_dataset(
        "open-images-v7",
        split="validation",
        label_types=["detections"],
        classes=[TargetLabel],
        max_samples=TargetImages,
    )
except Exception as e:
    # Errors whose message mentions MongoDB are treated as non-fatal (the notebook's
    # working assumption is that it is OK to proceed in that case); anything else
    # is re-raised unchanged.
    if 'MongoDB' not in str(e):
        raise
    # Report the ignored error so it is visible in the cell output instead of vanishing.
    print(f"Ignoring MongoDB-related error from fiftyone: {e}")

In [None]:
import shutil

# Rebuild ./data from scratch so files from a previous run cannot leak into this one.
shutil.rmtree('./data', ignore_errors=True)

# Images and labels are written into ~/fiftyone/open-images-v7/validation; copy that tree
# to ./data. copytree raises if the source folder is missing, so a failed download stops
# the notebook here instead of surfacing in a later cell.
shutil.copytree(os.path.expanduser('~/fiftyone/open-images-v7/validation'), './data')

# Show what was copied (rendered as the cell output).
sorted(os.listdir('./data'))


In [None]:
# Collect the paths of all files under ./data/data.
# glob returns entries in arbitrary order, so sort them to keep runs comparable.
import glob
files = sorted(glob.glob('./data/data/*'))

# file_names holds each file's base name without directory or extension,
# e.g. './data/data/abc.jpg' -> 'abc'.
file_names = [os.path.splitext(os.path.basename(f))[0] for f in files]

# Preview only the first few names rather than dumping the whole list.
file_names[:5]

In [None]:
import pandas as pd

# Load ./data/metadata/classes.csv. The file has no header row, so the two columns are
# named explicitly: LabelName first, LabelDisplayName second.
class_columns = ['LabelName', 'LabelDisplayName']
classes = pd.read_csv('./data/metadata/classes.csv', names=class_columns, header=None)

# Keep only the row(s) whose display name equals TargetLabel.
classes = classes.loc[classes['LabelDisplayName'].eq(TargetLabel)]

classes.head()

In [None]:
# read ./data/labels/detections.csv into a pandas dataframe
import pandas as pd

detect = pd.read_csv('./data/labels/detections.csv')

# A detection is kept when both conditions hold:
#   - its LabelName is one of the selected classes, and
#   - its ImageID is one of the image files found on disk.
has_target_label = detect['LabelName'].isin(classes['LabelName'])
has_local_image = detect['ImageID'].isin(file_names)
detect = detect[has_target_label & has_local_image]
detect


In [None]:
# Attach the human-readable class name and the image file path to every detection.

# Join with classes on LabelName to bring in LabelDisplayName.
detect = detect.merge(classes, on='LabelName')

detect = detect.assign(
    # Class mirrors LabelDisplayName.
    Class=detect['LabelDisplayName'],
    # Path is './data/data/' + ImageID + '.jpg'.
    Path='./data/data/' + detect['ImageID'] + '.jpg',
)

detect

In [None]:
# Reduce the frame to the image path, the class name and the four box coordinates.
kept_columns = ['Path', 'Class', 'XMin', 'XMax', 'YMin', 'YMax']
detect = detect.loc[:, kept_columns]
detect

In [None]:
from collections import defaultdict

# Group the bounding boxes by image: path -> [(XMin, XMax, YMin, YMax), ...].
image_bboxes = defaultdict(list)

# Walk the needed columns in lockstep instead of using iterrows(), which builds a
# full Series object for every row.
for image_path, x_min, x_max, y_min, y_max in zip(
    detect['Path'], detect['XMin'], detect['XMax'], detect['YMin'], detect['YMax']
):
    image_bboxes[image_path].append((x_min, x_max, y_min, y_max))

# Flatten to a list of (image_path, bboxes) tuples; dict ordering keeps the order in
# which each image was first seen.
merged_bboxes = list(image_bboxes.items())

# Preview the first entry.
merged_bboxes[0]

In [None]:
import shutil

# Rename ./data/data to ./data/images.
# When the rename has already happened (cell re-run), skip it instead of failing;
# when neither folder exists, stop with an explicit message.
if os.path.isdir('./data/data'):
    os.rename('./data/data', './data/images')
elif not os.path.isdir('./data/images'):
    raise FileNotFoundError("Neither './data/data' nor './data/images' exists; rebuild ./data first.")

# Remove every other directory under ./data (plain files are left alone).
# This removes the metadata and labels folders read by earlier cells, so those cells
# need ./data to be rebuilt before they can be run again.
for item in os.listdir('./data'):
    item_path = os.path.join('./data', item)
    if item != 'images' and os.path.isdir(item_path):
        shutil.rmtree(item_path)

# List the contents of the data folder to verify
os.listdir('./data')

In [None]:
# One row per image: its path and the list of its bounding boxes.
bbox_columns = ['Path', 'BBoxes']
merged_bboxes_df = pd.DataFrame(data=merged_bboxes, columns=bbox_columns)

# Optional CSV export, currently disabled:
# merged_bboxes_df.to_csv('./data/detections.csv', index=False)

## Create output in the JSONL format used by AutoML for Images

In [None]:
import json


def create_annotations(df, output_file, label=None):
    """Write object-detection annotations as a JSONL file, one JSON object per image.

    Each output line has the form
    ``{"image_url": <file name>, "label": [{"label", "topX", "topY", "bottomX", "bottomY"}, ...]}``.

    Args:
        df: DataFrame with a ``Path`` column (image path using ``/`` separators) and a
            ``BBoxes`` column holding, per image, an iterable of
            ``(XMin, XMax, YMin, YMax)`` tuples.
        output_file: Path of the JSONL file to create; an existing file is overwritten.
        label: Class name written on every bounding box. When omitted, the
            notebook-level ``TargetLabel`` is used.

    Returns:
        None. The path of the written file is printed.
    """
    if label is None:
        # Fall back to the single class configured at the top of the notebook.
        label = TargetLabel

    with open(output_file, 'w', encoding='utf-8') as f:
        # Iterate the two needed columns directly; no per-row Series is required.
        for image_path, bboxes in zip(df['Path'], df['BBoxes']):
            image_dict = {
                # Only the file name is kept; the directory part of Path is dropped.
                "image_url": image_path.split('/')[-1],
                "label": [],
            }

            for x_min, x_max, y_min, y_max in bboxes:
                # Boxes are stored as (XMin, XMax, YMin, YMax); the output uses
                # (topX, topY) for the minimum corner and (bottomX, bottomY) for the maximum.
                image_dict["label"].append({
                    "label": label,
                    "topX": x_min,
                    "topY": y_min,
                    "bottomX": x_max,
                    "bottomY": y_max,
                })

            # One JSON object per line (JSONL).
            f.write(json.dumps(image_dict) + '\n')

    # Print the path to the output file
    print(f'JSONL file saved to {output_file}')

In [None]:
# Split merged_bboxes_df into 3 dataframes: 60% train, 20% validation, 20% test.

import numpy as np

# Fixed seed so the shuffle (and therefore the split) is reproducible across runs.
SPLIT_SEED = 42
shuffled_df = merged_bboxes_df.sample(frac=1, random_state=SPLIT_SEED)

# Slice with iloc rather than np.split: np.split on a DataFrame goes through
# DataFrame.swapaxes, which pandas has deprecated.
n_total = len(shuffled_df)
train_end = int(.6 * n_total)
validation_end = int(.8 * n_total)

train_df = shuffled_df.iloc[:train_end]
validation_df = shuffled_df.iloc[train_end:validation_end]
test_df = shuffled_df.iloc[validation_end:]

# Every image must land in exactly one split.
assert len(train_df) + len(validation_df) + len(test_df) == n_total



In [None]:
# Write one JSONL annotation file per split: train, validation, test (in that order).
annotation_targets = [
    (train_df, './data/train_annotations.jsonl'),
    (validation_df, './data/validation_annotations.jsonl'),
    (test_df, './data/test_annotations.jsonl'),
]

for split_df, output_path in annotation_targets:
    create_annotations(split_df, output_path)