## Cleaning COCO annotations

In [29]:
import pandas as pd
import numpy as np
import json

Download COCO annotations from the following link:
<br>https://cocodataset.org/#download
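<br>Download the 2017 Panoptic Train/Val annotations archive and place `panoptic_train2017.json` and `panoptic_val2017.json` in the same directory as this notebook.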

In [34]:
# Load the COCO panoptic annotation files and flatten them into DataFrames.
def _load_json(path):
    """Return the decoded JSON document stored at ``path``."""
    # Be explicit about the encoding instead of relying on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _segments_table(panoptic):
    """Flatten ``panoptic['annotations']`` into one row per ``segments_info`` entry.

    Each row also carries the ``image_id`` and ``file_name`` of the annotation
    record that the segment belongs to.
    """
    return pd.json_normalize(panoptic['annotations'], record_path=['segments_info'],
                             meta=['image_id', 'file_name'])


# NOTE: `train_data` / `val_data` are rebound to DataFrames in a later cell, so
# re-run this cell first whenever the raw dictionaries are needed again.
val_data = _load_json('panoptic_val2017.json')
train_data = _load_json('panoptic_train2017.json')

# Normalizing data: one row per image
val_images = pd.json_normalize(val_data, record_path=['images'])
train_images = pd.json_normalize(train_data, record_path=['images'])

# One row per segment
val_annotations = _segments_table(val_data)
train_annotations = _segments_table(train_data)

# The category table is read from the training file only.
categories = pd.json_normalize(train_data, record_path=['categories'])

Let's see what the data looks like:

In [76]:
# Peek at the first rows of the flattened training image metadata.
train_images.head()

Unnamed: 0,license,file_name,coco_url,height,width,date_captured,flickr_url,id
0,3,000000391895.jpg,http://images.cocodataset.org/train2017/000000...,360,640,2013-11-14 11:18:45,http://farm9.staticflickr.com/8186/8119368305_...,391895
1,4,000000522418.jpg,http://images.cocodataset.org/train2017/000000...,480,640,2013-11-14 11:38:44,http://farm1.staticflickr.com/1/127244861_ab0c...,522418
2,3,000000184613.jpg,http://images.cocodataset.org/train2017/000000...,336,500,2013-11-14 12:36:29,http://farm3.staticflickr.com/2169/2118578392_...,184613
3,3,000000318219.jpg,http://images.cocodataset.org/train2017/000000...,640,556,2013-11-14 13:02:53,http://farm5.staticflickr.com/4125/5094763076_...,318219
4,3,000000554625.jpg,http://images.cocodataset.org/train2017/000000...,640,426,2013-11-14 16:03:19,http://farm5.staticflickr.com/4086/5094162993_...,554625


In [77]:
# Peek at the category table (supercategory, isthing flag, id, name).
categories.head()

Unnamed: 0,supercategory,isthing,id,name
0,person,1,1,person
1,vehicle,1,2,bicycle
2,vehicle,1,3,car
3,vehicle,1,4,motorcycle
4,vehicle,1,5,airplane


`pd.json_normalize` takes the nested JSON objects as input and flattens them into tabular DataFrames, which are previewed above.

Now let's create a map between category id and name. We'll use this to create a new "category_name" column in our annotations data.

In [43]:
# Map each integer category id to its name. Built column-wise rather than
# with iterrows(), which creates a Series object for every row.
category_map = dict(zip(categories['id'].astype(int).tolist(),
                        categories['name'].tolist()))

In [44]:
# Attach a human-readable category name to every segment row.
# Series.map with a dict lets pandas do the lookup instead of calling a Python
# lambda per row, but it yields NaN for unknown ids -- so fail loudly with a
# KeyError, as a plain dict lookup would.
for _annotations in (train_annotations, val_annotations):
    _names = _annotations['category_id'].map(category_map)
    _missing = _names.isna()
    if _missing.any():
        _unknown = sorted(set(_annotations.loc[_missing, 'category_id']))
        raise KeyError(f"category ids missing from category_map: {_unknown}")
    _annotations['category_name'] = _names

In [45]:
# Confirm that the new category_name column was added to the segment rows.
train_annotations.head()

Unnamed: 0,id,category_id,iscrowd,bbox,area,image_id,file_name,category_name
0,8345037,51,0,"[0, 14, 434, 374]",24315,9,000000000009.png,bowl
1,6968006,51,0,"[312, 4, 319, 229]",34234,9,000000000009.png,bowl
2,2005197,51,0,"[1, 189, 612, 285]",70036,9,000000000009.png,bowl
3,3658235,55,0,"[387, 74, 83, 70]",3566,9,000000000009.png,orange
4,2803959,55,0,"[376, 40, 76, 47]",2241,9,000000000009.png,orange


In [46]:
# (rows, columns) of the validation and training image tables.
val_images.shape, train_images.shape

((5000, 8), (118287, 8))

In [47]:
# Show the image table again for reference before joining it with the segments.
train_images.head()

Unnamed: 0,license,file_name,coco_url,height,width,date_captured,flickr_url,id
0,3,000000391895.jpg,http://images.cocodataset.org/train2017/000000...,360,640,2013-11-14 11:18:45,http://farm9.staticflickr.com/8186/8119368305_...,391895
1,4,000000522418.jpg,http://images.cocodataset.org/train2017/000000...,480,640,2013-11-14 11:38:44,http://farm1.staticflickr.com/1/127244861_ab0c...,522418
2,3,000000184613.jpg,http://images.cocodataset.org/train2017/000000...,336,500,2013-11-14 12:36:29,http://farm3.staticflickr.com/2169/2118578392_...,184613
3,3,000000318219.jpg,http://images.cocodataset.org/train2017/000000...,640,556,2013-11-14 13:02:53,http://farm5.staticflickr.com/4125/5094763076_...,318219
4,3,000000554625.jpg,http://images.cocodataset.org/train2017/000000...,640,426,2013-11-14 16:03:19,http://farm5.staticflickr.com/4086/5094162993_...,554625


In [48]:
# Show the segment table again for reference before joining it with the images.
train_annotations.head()

Unnamed: 0,id,category_id,iscrowd,bbox,area,image_id,file_name,category_name
0,8345037,51,0,"[0, 14, 434, 374]",24315,9,000000000009.png,bowl
1,6968006,51,0,"[312, 4, 319, 229]",34234,9,000000000009.png,bowl
2,2005197,51,0,"[1, 189, 612, 285]",70036,9,000000000009.png,bowl
3,3658235,55,0,"[387, 74, 83, 70]",3566,9,000000000009.png,orange
4,2803959,55,0,"[376, 40, 76, 47]",2241,9,000000000009.png,orange


Now let's create the full train and validation datasets, which contain the image information and, for each image, the number of occurrences of each category.

In [50]:
def _summarise_split(images, annotations):
    """Build the joined, counted and one-hot tables for one dataset split.

    Parameters
    ----------
    images : pandas.DataFrame
        Image metadata with an ``id`` column (and a ``file_name`` column).
    annotations : pandas.DataFrame
        Segment rows with ``id``, ``image_id``, ``file_name`` and
        ``category_name`` columns.

    Returns
    -------
    tuple
        ``(joined, cat_counts, one_hot_counts, data)`` where

        * ``joined`` has one row per segment with its image metadata,
        * ``cat_counts`` has one row per (image_id, category_name) pair with
          the number of segments in the ``annotation_id`` column,
        * ``one_hot_counts`` has one row per image_id and one column per
          category name, with 0 where the category does not occur,
        * ``data`` is ``images`` merged with ``one_hot_counts``.
    """
    # Joining image and annotation DFs to get the full info. Both frames have
    # `id` and `file_name` columns, so merge() suffixes them with _x (image) and
    # _y (annotation); keep the image file name and the annotation id, and
    # rename by column name rather than by position.
    joined = (
        images
        .merge(annotations, left_on='id', right_on='image_id')
        .drop(columns=['file_name_y', 'id_x'])
        .rename(columns={'file_name_x': 'file_name', 'id_y': 'annotation_id'})
    )

    # Number of (non-zero) occurrences of each category for every image.
    # Selecting the column before count() avoids counting every other column.
    cat_counts = (
        joined
        .groupby(['image_id', 'category_name'])['annotation_id']
        .count()
        .reset_index()
    )

    # Wide representation: one column per category, zeros included.
    # The string 'sum' replaces np.sum, which newer pandas versions warn about.
    one_hot_counts = pd.pivot_table(
        cat_counts, values='annotation_id', index=['image_id'],
        columns=['category_name'], aggfunc='sum',
    ).fillna(0)

    # Merge the images and one-hot counts to get the final dataset.
    data = images.merge(one_hot_counts, left_on='id', right_on='image_id')
    return joined, cat_counts, one_hot_counts, data


# NOTE: both merges are inner joins, so images without any segment rows do not
# appear in the results (the recorded outputs show 118287 training images but
# 118267 rows in `train_data`).
# NOTE: `train_data` / `val_data` are rebound here from the raw JSON dicts to
# DataFrames.
train, train_cat_counts, train_one_hot_counts, train_data = _summarise_split(
    train_images, train_annotations)

# Same steps for the validation split.
val, val_cat_counts, val_one_hot_counts, val_data = _summarise_split(
    val_images, val_annotations)

In [51]:
# (rows, columns) of the per-image category-count tables.
train_data.shape, val_data.shape

((118267, 141), (5000, 141))

In [52]:
# (rows, columns) of the image-segment joins.
train.shape, val.shape

((1329984, 14), (56728, 14))

In [53]:
# Compare the one-hot table (indexed by image_id) with the segment-level join.
train_one_hot_counts.shape, train.shape

((118267, 133), (1329984, 14))

In [54]:
# Compare the number of (image_id, category_name) pairs with the number of images.
train_cat_counts.shape, train_images.shape

((816269, 3), (118287, 8))

In [55]:
# First two rows of the training image metadata.
train_images.head(2)

Unnamed: 0,license,file_name,coco_url,height,width,date_captured,flickr_url,id
0,3,000000391895.jpg,http://images.cocodataset.org/train2017/000000...,360,640,2013-11-14 11:18:45,http://farm9.staticflickr.com/8186/8119368305_...,391895
1,4,000000522418.jpg,http://images.cocodataset.org/train2017/000000...,480,640,2013-11-14 11:38:44,http://farm1.staticflickr.com/1/127244861_ab0c...,522418


In [56]:
# Each row is one (image_id, category_name) pair; annotation_id holds the count.
train_cat_counts.head(2)

Unnamed: 0,image_id,category_name,annotation_id
0,9,bowl,3
1,9,broccoli,1


Now let's create the occurrences dataset, which stores the category name and the number of occurrences for every image.
<br>Note that a category is stored for an image only when it has a non-zero occurrence count in that image.

In [58]:
def _occurrence_table(images, cat_counts):
    """Attach image metadata to every (image_id, category_name) count row.

    ``images`` is joined to ``cat_counts`` on ``id`` == ``image_id``; the
    redundant ``id`` column is dropped and the count column ``annotation_id``
    is renamed to ``occurences``. Renaming by column name, rather than
    overwriting ``.columns`` positionally, keeps the labels correct even if
    the column order changes.
    """
    return (
        images
        .merge(cat_counts, left_on='id', right_on='image_id')
        .drop(columns='id')
        .rename(columns={'annotation_id': 'occurences'})
    )


train_cat_occurences = _occurrence_table(train_images, train_cat_counts)
val_cat_occurences = _occurrence_table(val_images, val_cat_counts)

In [59]:
# Peek at the occurrence table: image metadata plus one category count per row.
train_cat_occurences.head()

Unnamed: 0,license,file_name,coco_url,height,width,date_captured,flickr_url,image_id,category_name,occurences
0,3,000000391895.jpg,http://images.cocodataset.org/train2017/000000...,360,640,2013-11-14 11:18:45,http://farm9.staticflickr.com/8186/8119368305_...,391895,bicycle,1
1,3,000000391895.jpg,http://images.cocodataset.org/train2017/000000...,360,640,2013-11-14 11:18:45,http://farm9.staticflickr.com/8186/8119368305_...,391895,bridge,1
2,3,000000391895.jpg,http://images.cocodataset.org/train2017/000000...,360,640,2013-11-14 11:18:45,http://farm9.staticflickr.com/8186/8119368305_...,391895,dirt-merged,1
3,3,000000391895.jpg,http://images.cocodataset.org/train2017/000000...,360,640,2013-11-14 11:18:45,http://farm9.staticflickr.com/8186/8119368305_...,391895,motorcycle,1
4,3,000000391895.jpg,http://images.cocodataset.org/train2017/000000...,360,640,2013-11-14 11:18:45,http://farm9.staticflickr.com/8186/8119368305_...,391895,mountain-merged,1


Let's concatenate the train and validation splits of each table and store each combined table as its own CSV file (`data.csv` and `cat_occurences.csv`):

In [67]:
# Stack the train and validation rows. ignore_index=True gives the combined
# frames a fresh 0..n-1 index instead of repeating each split's row labels.
data = pd.concat([train_data, val_data], ignore_index=True)
cat_occurences = pd.concat([train_cat_occurences, val_cat_occurences], ignore_index=True)

# index=False keeps the row index out of the files.
data.to_csv('data.csv', index=False)
cat_occurences.to_csv('cat_occurences.csv', index=False)

In [68]:
# (rows, columns) of the two combined tables that were written to CSV.
data.shape, cat_occurences.shape

((123267, 141), (850999, 10))