## **Explore COCO Dataset**
This notebook performs a basic exploration of the COCO dataset.

In [None]:
import numpy as np
import os
import json
import coco_dataset
from coco_dataset import coco_dataset_download as cocod
from PIL import Image
import itertools
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import zipfile
import pandas as pd




In [None]:
## Globals
# Base directory from which every other path in this notebook is derived.
home_dir = "/home/tbaweja/"
# Directory holding the COCO archives, the extracted annotation JSON files
# and the "train2014" image folder read by the cells below.
dataset_path = os.path.join(home_dir, "coco_dataset")
# Directory the CSV metadata files are written to at the end of the notebook.
metadata_path = os.path.join(home_dir, "wproj", "metadata")

In [None]:
## Utils functions

def unzip_file(src_file: str, dst_dir: str):
    """Extract every member of the zip archive ``src_file`` into ``dst_dir``.

    Args:
        src_file: Path of the zip archive to read.
        dst_dir: Directory the archive members are extracted into.
    """
    with zipfile.ZipFile(src_file) as archive:
        archive.extractall(path=dst_dir)

def read_json(file_path: str):
    """Parse the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; pin the encoding so the result does not depend on
    # the platform's default locale.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

def load_image_and_annotations(metadata_dict: dict, image_id: int):
    """Load one ``train2014`` image together with the annotations linked to it.

    Args:
        metadata_dict: Parsed annotation metadata providing an ``"imgToAnns"``
            mapping (image id as string -> annotation ids) and an ``"anns"``
            mapping (annotation id as string -> annotation dict).
        image_id: Numeric COCO image id.

    Returns:
        A ``(img_arr, annotations)`` tuple: the image as a NumPy array and the
        list of annotation dicts referenced by the image.

    Raises:
        KeyError: If the image id or one of its annotation ids is missing
            from ``metadata_dict``.
        FileNotFoundError: If the image file does not exist under
            ``<dataset_path>/train2014``.
    """
    data_path = os.path.join(dataset_path, "train2014")
    image_to_ann = metadata_dict["imgToAnns"][str(image_id)]
    # COCO 2014 file names zero-pad the image id to 12 digits. Prefixing a
    # fixed run of six zeros only produced a valid name for six-digit ids.
    image_path = os.path.join(data_path, f"COCO_train2014_{int(image_id):012d}.jpg")
    annotations = [metadata_dict["anns"][str(ann_idx)] for ann_idx in image_to_ann]
    # Materialise the pixels inside the context manager so the underlying
    # file handle is released as soon as the array has been built.
    with Image.open(image_path) as img:
        img_arr = np.array(img)
    return img_arr, annotations

def show_image_and_annotations(image: np.ndarray, annotations: list, plot_bbox: bool = True):
    """Display an image, titled with the text of its annotations.

    Args:
        image: Image array handed to ``Axes.imshow``.
        annotations: Annotation dicts. Each must provide ``"utf8_string"``;
            ``"bbox"`` (unpacked as ``x, y, width, height``) is only read
            when ``plot_bbox`` is true.
        plot_bbox: When true, outline every annotation's bounding box in red.
    """
    # join() keeps the title free of the dangling ", " that appending the
    # separator after every item left at the end.
    title = "This image contains " + ", ".join(
        annotation["utf8_string"] for annotation in annotations
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(image)

    if plot_bbox:
        for annotation in annotations:
            x, y, width, height = annotation["bbox"]
            outline = patches.Rectangle(
                (x, y), width, height, linewidth=1, edgecolor='r', facecolor='none'
            )
            ax.add_patch(outline)

    fig.suptitle(title, fontsize=10)
    fig.show()




In [None]:

# zip_file_path = os.path.join(dataset_path, "cocotext.v2.zip")
# dst_dir = dataset_path
# unzip_file(zip_file_path, dst_dir)

In [None]:
# zip_file_path = os.path.join(dataset_path, "annotations_trainval2014.zip")
# dst_dir = dataset_path
# unzip_file(zip_file_path, dst_dir)

## **Link Annotations and Images**

In [None]:
# raw_metadata = os.path.join(dataset_path, "cocotext.v2.json")
# metadata_dict = read_json(raw_metadata)
# metadata_dict.keys()

In [None]:
# Caption annotations of the train2014 split, parsed into a dict.
annotations_dir = os.path.join(dataset_path, "annotations")
captions_metatdata = os.path.join(annotations_dir, "captions_train2014.json")
captions_metadata_dict = read_json(captions_metatdata)
# Show the top-level sections of the captions file.
captions_metadata_dict.keys()

In [None]:
# Instance annotations of the train2014 split, parsed into a dict.
annotations_dir = os.path.join(dataset_path, "annotations")
instances_metatdata = os.path.join(annotations_dir, "instances_train2014.json")
instances_metatdata_dict = read_json(instances_metatdata)
# Show which fields a single instance annotation carries.
first_instance_annotation = instances_metatdata_dict["annotations"][0]
first_instance_annotation.keys()

In [None]:
# # process captions and instances metadata
# caption_annotations = captions_metadata_dict["annotations"]
# caption_annotations = {item['image_id']: item for item in caption_annotations}

# instance_annotations = instances_metatdata_dict["annotations"]
# instance_annotations = {item["image_id"]: item for item in instance_annotations}

In [None]:
# sorted_captions = sorted(captions_metadata_dict["annotations"], key = lambda x: x["id"])
# sorted_captions[:5]

In [None]:
# The loader needs the COCO-Text metadata ("imgToAnns" / "anns" lookups).
# Load it here so this cell no longer depends on a name that is never
# assigned anywhere else in the notebook.
raw_metadata = os.path.join(dataset_path, "cocotext.v2.json")
metadata_dict = read_json(raw_metadata)

image_id = 167467
image, annotations = load_image_and_annotations(metadata_dict, image_id)


In [None]:
# Render the image loaded above together with its annotations.
show_image_and_annotations(image, annotations)

## **Create DataFrame for Images**

In [None]:
# One row per entry of the captions file's "images" list, ordered by image id.
# NOTE(review): reset_index() without drop=True keeps the pre-sort row labels
# as an extra "index" column, which is later written to the CSV as well;
# confirm this is intended.
images_caption = captions_metadata_dict["images"]
images_metadata_df = (
    pd.DataFrame(images_caption)
    .sort_values("id")
    .reset_index()
)
images_metadata_df.head()


In [None]:
# One row per caption annotation, ordered by the image it belongs to.
annotation_captions = captions_metadata_dict["annotations"]
annotation_captions_df = pd.DataFrame(annotation_captions).sort_values("image_id")
annotation_captions_df.head()

In [None]:
## combine image and captions

# Collect the captions of every image in a single pass over the caption
# frame. Filtering the whole frame once per image row scales with
# (number of images) x (number of captions).
captions_by_image = {
    image_id: list(captions)
    for image_id, captions in annotation_captions_df.groupby("image_id")["caption"]
}

# Images without any caption get an empty list, matching the per-row loop
# this replaces.
images_metadata_df["image_captions"] = [
    captions_by_image.get(image_id, []) for image_id in images_metadata_df["id"]
]
    

In [None]:
# Preview the first rows, now including the "image_captions" column.
images_metadata_df.head()

In [None]:
# Category definitions shipped with the instances file; display how many there are.
instance_categories = instances_metatdata_dict["categories"]
len(instance_categories)

In [None]:
# One row per instance annotation, ordered by the image it belongs to.
# NOTE(review): reset_index() without drop=True keeps the pre-sort row labels
# as an extra "index" column, which is later written to the CSV as well;
# confirm this is intended.
instance_annotations = instances_metatdata_dict["annotations"]
instance_annotations_df = (
    pd.DataFrame(instance_annotations)
    .sort_values("image_id")
    .reset_index()
)
instance_annotations_df.head()

In [None]:
## Concatenate label information

# Gather the distinct category ids of every image in a single pass over the
# instance frame. The previous loop iterated `images_metadata`, a name that
# is never defined (the frame is `images_metadata_df`), and re-filtered the
# whole instance frame for each image.
labels_by_image = {
    image_id: set(category_ids)
    for image_id, category_ids in instance_annotations_df.groupby("image_id")["category_id"]
}

# Images without any instance annotation get an empty set.
images_metadata_df["labels"] = [
    labels_by_image.get(image_id, set()) for image_id in images_metadata_df["id"]
]
    


In [None]:
# Preview the first rows, now including the "labels" column.
images_metadata_df.head()

In [None]:
## store all metadata files
# to_csv does not create missing parent directories, so make sure the
# output directory exists before writing.
os.makedirs(metadata_path, exist_ok=True)
file_path = os.path.join(metadata_path, "image_caption_metdata.csv")
images_metadata_df.to_csv(file_path, index = False)

In [None]:
# to_csv does not create missing parent directories, so make sure the
# output directory exists before writing.
os.makedirs(metadata_path, exist_ok=True)
file_path = os.path.join(metadata_path, "instance_annotations.csv")
instance_annotations_df.to_csv(file_path, index = False)