# Define imports and base variables
TODO: Write more to explain what is done in this notebook and comment each line of code

In [1]:
import os
import zipfile
from tqdm import tqdm
import cv2
import numpy as np
import pandas as pd
from PIL import Image
from PIL.ExifTags import TAGS
import pickle

# Base locations for everything this notebook writes (relative to the working directory)
base_output_folder = '../output'
base_image_path = '../output/images'
base_metadata_path = '../output/metadata'
# Dataset archive to download (COCO test2017 images).
# A time-limited, signed Kaggle download link (dataset 1698586/2782287) used to be assigned
# here first; it was dead code because this assignment overwrote it, and signed URLs should
# not be committed, so only the effective value is kept.
base_dataset_url = 'http://images.cocodataset.org/zips/test2017.zip'

# Define base methods

In [125]:
def create_folder(path):
    """Create the directory ``path`` if nothing exists at that location yet.

    Missing parent directories are created as well. A confirmation message is
    printed only when the folder is actually created.

    Args:
        path: Directory path to create.
    """
    if os.path.exists(path):
        return
    # makedirs (unlike mkdir) also creates missing parents; exist_ok covers the
    # case where the folder appears between the check above and this call.
    os.makedirs(path, exist_ok=True)
    print("Folder " + path + " created")

def download(url, filename, timeout=(10, 60)):
    """Stream ``url`` to ``filename`` with a progress bar.

    The payload is first written to ``<filename>.part`` and only renamed to
    ``filename`` once the transfer has finished, so an interrupted download never
    leaves a truncated file at the final location.

    Args:
        url: Address of the file to download.
        filename: Destination path (``~`` is expanded, parents are created).
        timeout: ``(connect, read)`` timeout in seconds passed to ``requests.get``.

    Returns:
        pathlib.Path: Absolute path of the downloaded file.

    Raises:
        requests.HTTPError: For 4xx/5xx responses.
        RuntimeError: For any other non-200 status code.
    """
    import functools
    import pathlib
    import shutil
    import requests
    from tqdm.auto import tqdm

    # The context manager releases the connection even if the transfer fails
    with requests.get(url, stream=True, allow_redirects=True, timeout=timeout) as r:
        if r.status_code != 200:
            r.raise_for_status()  # Raises for 4xx/5xx only; other codes (e.g. 204) fall through, so...
            raise RuntimeError(f"Request to {url} returned status code {r.status_code}")
        file_size = int(r.headers.get('Content-Length', 0))

        path = pathlib.Path(filename).expanduser().resolve()
        path.parent.mkdir(parents=True, exist_ok=True)
        partial_path = path.with_name(path.name + ".part")

        desc = "(Unknown total file size)" if file_size == 0 else ""
        r.raw.read = functools.partial(r.raw.read, decode_content=True)  # Decompress if needed
        try:
            with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
                with partial_path.open("wb") as f:
                    shutil.copyfileobj(r_raw, f)
        except BaseException:
            # Do not leave a truncated file behind (also on KeyboardInterrupt)
            if partial_path.exists():
                partial_path.unlink()
            raise

    # Publish the file under its final name only once it is complete
    partial_path.replace(path)
    return path

# Define a function that creates all required folders

In [126]:
def init_folder(output_folder, image_path, metadata_path):
    """Make sure the output, image and metadata folders exist (created in that order)."""
    for folder in (output_folder, image_path, metadata_path):
        create_folder(folder)


In [127]:
# Create the output, images and metadata folders if they are missing
init_folder(base_output_folder, base_image_path, base_metadata_path)

# Define a function that downloads and extracts the dataset

In [128]:
def download_dataset(dataset_url, image_path):
    """Download the dataset archive and extract it into ``image_path``.

    The function is safe to re-run:

    * if ``archive.zip`` is already in the working directory (e.g. a previous run
      stopped before or during extraction) it is extracted without downloading again;
    * otherwise, if ``image_path`` already contains entries, nothing is downloaded;
    * otherwise the archive is downloaded, extracted and then deleted.

    Args:
        dataset_url: URL of the zip archive.
        image_path: Folder the archive is extracted into.

    Raises:
        zipfile.BadZipFile: If the archive is corrupt; the archive is deleted first
            so that the next run downloads a fresh copy.
    """
    archive = 'archive.zip'

    if not os.path.exists(archive):
        # No archive to resume from: skip the expensive download when the images
        # folder has already been populated by an earlier run.
        if os.path.isdir(image_path) and os.listdir(image_path):
            print("Dataset already present, skipping download")
            return
        download(dataset_url, archive)
        print("Dataset downloaded !")

    # Unzip archive.zip to images folder
    try:
        with zipfile.ZipFile(archive, 'r') as zip_ref:
            zip_ref.extractall(image_path)
            print("Dataset Unziped")
    except zipfile.BadZipFile:
        # A corrupt archive would otherwise block every later run
        os.remove(archive)
        raise

    # Remove archive.zip after unzip
    os.remove(archive)


In [129]:
# Download the dataset archive and extract it into the images folder
download_dataset(base_dataset_url, base_image_path)

# Define a method to easily get images

In [130]:
def get_all_images(image_path, extensions=(".png", ".jpg")):
    """Return the paths of all image files below ``image_path`` in sorted order.

    The folder is searched recursively. Extensions are matched case-insensitively
    (``.JPG`` counts as ``.jpg``). The result is sorted because ``os.walk`` gives no
    ordering guarantee and the checkpoint/resume logic needs the same order on
    every run.

    Args:
        image_path: Root folder to search.
        extensions: File name suffixes that identify an image.

    Returns:
        list[str]: Sorted paths, each joined onto ``image_path``.
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    return sorted(
        os.path.join(root, name)
        for root, _dirs, files in os.walk(image_path)
        for name in files
        if name.lower().endswith(suffixes)
    )

# Define methods to arrange images in folders

In [131]:
# Create a checkpoint to save progress of rearranging dataset (because of the time it takes to rearrange dataset)
def create_checkpoint(latest_file):
    """Record ``latest_file`` in ``checkpoint.txt``, replacing any previous content."""
    checkpoint_file = open('checkpoint.txt', 'w')
    try:
        checkpoint_file.write(latest_file)
    finally:
        checkpoint_file.close()


def load_checkpoint():
    """Return the content of ``checkpoint.txt``, or ``None`` when the file is absent."""
    # Guard clause: no checkpoint file means there is nothing to resume from
    if not os.path.exists('checkpoint.txt'):
        return None
    with open('checkpoint.txt', 'r') as checkpoint_file:
        saved = checkpoint_file.read()
    return saved


def remove_checkpoint():
    """Delete ``checkpoint.txt`` if it exists; do nothing otherwise."""
    if not os.path.exists('checkpoint.txt'):
        return
    os.remove('checkpoint.txt')


def set_test_dataset(image_path, amount=100):
    """Delete files until ``image_path`` holds at most ``amount`` top-level entries.

    Files are visited bottom-up and, within each folder, in sorted name order, so
    the same files survive on every run. The number of top-level entries is read
    once and tracked while deleting, instead of re-listing the folder for every
    file.

    Args:
        image_path: Folder to trim.
        amount: Maximum number of top-level entries to keep.
    """
    remaining = len(os.listdir(image_path))
    for root, _dirs, files in os.walk(image_path, topdown=False):
        if remaining <= amount:
            break
        for name in sorted(files):
            if remaining <= amount:
                break
            os.remove(os.path.join(root, name))
            # Only entries directly inside image_path count towards the limit
            if root == image_path:
                remaining -= 1
    print("All images removed except " + str(amount) + " images")


def arrange_dataset(image_path, is_test=False):
    """Flatten the dataset: move every image into ``image_path`` and drop the subfolders.

    The move is resumable without a checkpoint: files that already sit directly in
    ``image_path`` are left alone, so re-running after an interruption only moves
    what is still inside subfolders. (The earlier checkpoint stored the pre-move
    path, which no longer exists after the move, so a resumed run skipped every
    remaining file.)

    NOTE: images in different subfolders that share a file name end up at the same
    destination; ``os.rename`` silently replaces the earlier one on POSIX.

    Args:
        image_path: Root folder of the extracted dataset.
        is_test: When true, trim the folder afterwards with ``set_test_dataset``.

    Raises:
        OSError: If a subfolder still contains files (e.g. non-image files) and
            therefore cannot be removed.
    """
    img_files = get_all_images(image_path)
    # Move all file to images folder, with a progress bar
    for file in tqdm(img_files, desc="Moving all file to images folder"):
        destination = os.path.join(image_path, os.path.basename(file))
        if os.path.abspath(file) == os.path.abspath(destination):
            # Already in place (moved by an earlier run or extracted at top level)
            continue
        os.rename(file, destination)
    print("All file moved to images folder")
    # Clear any checkpoint left behind by an earlier run, as before
    remove_checkpoint()

    # Remove all subfolder (bottom-up so children are removed before their parents)
    for root, dirs, files in os.walk(image_path, topdown=False):
        for name in dirs:
            os.rmdir(os.path.join(root, name))
    print("All subfolder removed")

    if is_test:
        set_test_dataset(image_path)


# Arrange dataset

In [132]:
# Flatten the images folder; is_test=False keeps every image (no trimming to a test subset)
arrange_dataset(base_image_path, is_test=False)

# Define method to get metadata of one image

In [133]:
def get_metadata(img_file, image_path):
    """Compute EXIF data and pixel statistics for one image.

    EXIF data is read with Pillow from ``image_path/<basename of img_file>``; the
    pixel statistics are computed on the array returned by ``cv2.imread(img_file)``.

    Args:
        img_file: Path of the image (only its base name is used for the EXIF read).
        image_path: Folder that contains the image.

    Returns:
        dict | None: The metadata dictionary, or ``None`` (after printing a message)
        when the image cannot be read or a statistic cannot be computed.
    """
    img_name = os.path.basename(img_file)
    try:
        # Context manager so the file handle is released right after the EXIF read
        with Image.open(os.path.join(image_path, img_name)) as pil_image:
            img_exif = {TAGS[k]: v for k, v in pil_image.getexif().items() if k in TAGS}

        # Read image; cv2.imread returns None instead of raising on failure
        img = cv2.imread(img_file)
        if img is None:
            raise ValueError("cv2.imread could not read the image")

        height, width, channels = img.shape
        # Flatten once and reuse for the pandas-based statistics
        pixels = pd.Series(img.ravel())

        metadata = {
            'height': height,
            'width': width,
            'channels': channels,
            # Array dtype of the decoded image
            'format': str(img.dtype),
            # Size of the decoded array in bytes
            'size': img.nbytes,
            'dimension': img.ndim,
            'shape': str(img.shape),
            'mean': np.mean(img),
            'std': np.std(img),
            'min': np.min(img),
            'max': np.max(img),
            'median': np.median(img),
            'var': np.var(img),
            # 99th percentile of the pixel values
            'percentile': np.percentile(img, 99),
            # Variance of the Laplacian; the key name is historical, this is not a Shannon entropy
            'entropy': cv2.Laplacian(img, cv2.CV_64F).var(),
            # Number of distinct pixel values
            'unique_values': np.unique(img).size,
            # mode() returns every tied value (sorted); keep the smallest so images
            # with several modes are no longer rejected
            'mode': int(pixels.mode().iloc[0]),
            'skewness': pixels.skew(),
            'kurtosis': pixels.kurt(),
            'exif': img_exif
        }
    except Exception as exc:
        # Best effort: report the failing image and let the caller continue.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the run.
        print(f"Error in {img_name}: {exc!r}")
        return None
    return metadata

In [142]:
def get_basic_metadata(img_file, image_path):
    """Print the basic Pillow attributes and the EXIF data of one image.

    Args:
        img_file: Path or name of the image; only its base name is used.
        image_path: Folder that contains the image.
    """
    img_name = os.path.basename(img_file)
    # Context manager so the underlying file handle is closed when done
    with Image.open(os.path.join(image_path, img_name)) as image:
        exif = image.getexif()

        metadata = {
            "filename": image.filename,
            "size": image.size,
            "height": image.height,
            "width": image.width,
            "format": image.format,
            "mode": image.mode,
        }
        print(metadata)
        print(exif)

# Quick manual check on a single image.
# NOTE(review): assumes 000000000001.jpg exists in base_image_path — confirm
get_basic_metadata("000000000001.jpg", base_image_path)

# Define a method to save metadata to pickle file

In [134]:
def save_metadata(metadata, img_name, metadata_path):
    """Pickle ``metadata`` to ``<metadata_path>/<image base name without extension>.pickle``."""
    stem, _extension = os.path.splitext(os.path.basename(img_name))
    target = os.path.join(metadata_path, stem + '.pickle')
    # Binary mode is required by pickle
    with open(target, 'wb') as pickle_file:
        pickle.dump(metadata, pickle_file)

# Define a method to get all metadata of all images and save it to pickle file

In [135]:
def get_all_metadata(image_path, metadata_path):
    """Compute the metadata of every image and save each result as a pickle file.

    Progress is written to the checkpoint after each image, so an interrupted run
    resumes after the last processed image. Images are processed in sorted order so
    that the resume position means the same thing on every run. A checkpoint that
    does not match any image (for example one left behind by another step) is
    ignored; previously it caused every image to be skipped.

    Args:
        image_path: Folder that contains the images.
        metadata_path: Folder the pickle files are written to.
    """
    img_files = sorted(get_all_images(image_path))

    # Work out where to resume from
    start = 0
    checkpoint = load_checkpoint()
    if checkpoint is not None:
        try:
            start = img_files.index(checkpoint) + 1
        except ValueError:
            print("Checkpoint " + checkpoint + " does not match any image, starting from the first image")

    # `initial`/`total` keep the progress bar relative to the whole dataset on resume
    for img in tqdm(img_files[start:], total=len(img_files), initial=start,
                    desc="Get all metadata of the images and save to individual pikkle file"):
        # NOTE: get_metadata returns None for unreadable images; that None is still
        # pickled, as before, so every image gets a file.
        metadata = get_metadata(img, image_path)
        save_metadata(metadata, img, metadata_path)
        create_checkpoint(img)

    remove_checkpoint()

In [None]:
# Compute and pickle the metadata of every image (resumes from checkpoint.txt if present)
get_all_metadata(base_image_path, base_metadata_path)