In [26]:
import pandas as pd 
import json 
import numpy as np 

import os
from tqdm import tqdm

import shapely
import shapely.wkt
from shapely import Point, Polygon

import cv2
from PIL import Image

## Complete dataset building for training data

### Grab the buildings identified within each image

In [27]:
dir_path = "train/labels/"
training_labels = os.listdir(dir_path)
sample = training_labels

In [40]:
def find_building_dataset(dir_path, data_labels): 

    df = pd.DataFrame(columns = ['image_name', 'classification', 'uid', 'polygon'])
    for path in tqdm(data_labels): 
        d = pd.read_json(dir_path + path)
        # look to see if a feature is identified in this image 
        ll = d['features']['xy']
        if len(ll) > 0: 
            # look to see if any of these features are buildings 
            for i in range(0, len(ll)): 
                if ll[i]['properties']['feature_type'] == 'building' and 'subtype' in ll[i]['properties'].keys(): 
                    add = {'image_name': path, 'classification': ll[i]['properties']['subtype'], 'uid': ll[i]['properties']['uid'], 'polygon': ll[i]['wkt']}
                    df.loc[len(df.index)] = add
                    
    return df 

In [29]:
df = find_building_dataset(dir_path, sample)

100%|██████████| 5598/5598 [11:59<00:00,  7.79it/s]


In [30]:
df['classification'].value_counts()

no-damage        117426
minor-damage      14980
major-damage      14161
destroyed         13227
un-classified      2993
Name: classification, dtype: int64

In [31]:
df.head()

Unnamed: 0,image_name,classification,uid,polygon
0,midwest-flooding_00000308_post_disaster.json,no-damage,3350c4e6-e7dc-4313-8b23-59233a6e2078,"POLYGON ((319.9690811897951 118.8051167728774,..."
1,midwest-flooding_00000308_post_disaster.json,no-damage,4ac81fae-312a-4050-bd41-92f677311b09,"POLYGON ((448.3688195265795 92.6786134661785, ..."
2,midwest-flooding_00000308_post_disaster.json,no-damage,bcfa6989-a2b4-4aa6-a1f9-6adc62301b6c,"POLYGON ((862.4316509232316 13.3651563899852, ..."
3,midwest-flooding_00000308_post_disaster.json,no-damage,497560fe-6da5-4826-a443-cd76fa4f2f86,"POLYGON ((898.5940151494626 38.91158143295991,..."
4,midwest-flooding_00000308_post_disaster.json,no-damage,2d171f2b-c103-42d2-b1a4-8e954bbceaef,"POLYGON ((927.1242390329007 30.31084096671428,..."


### Restrict image to bounding box area and export

In [32]:
# tracking the scale of all the buildings so we can decide what to scale to 
width = []
height = []

In [42]:
def export_building(name, uid, polygon, file_path, folder): 
    # Extract the point values that define the perimeter of the polygon
    polygon = shapely.wkt.loads(polygon)
    x, y = polygon.exterior.coords.xy

    bbox = {
        'x_min': round(min(x.tolist())), 
        'x_max': round(max(x.tolist())), 
        'y_min': round(min(y.tolist())), 
        'y_max': round(max(y.tolist()))
    }

    image = cv2.imread(folder + "/images/" + name.split(".json")[0] + ".png")
    cut_image = Image.fromarray(image[bbox['y_min']:bbox['y_max'], bbox['x_min']:bbox['x_max']])

    # check to make sure image isn't cut out of a missing satelitte peice 

    avg_color_per_row = np.average(cut_image, axis=0)
    avg_color = np.average(avg_color_per_row, axis=0)

    if avg_color[0] > 10 and avg_color[1] > 10 and avg_color[2] > 10: 
        
        # tracking sizes for later scaling
        width.append(bbox['x_max'] - bbox['x_min'])
        height.append(bbox['y_max'] - bbox['y_min'])

        # to visualize exported image
        # display(cut_image)
        
        # to export image 
        cut_image.save(file_path + uid + '.png')

In [34]:
# testing to make sure function works correctly 
# val = 40
# export_building(df.loc[val]['image_name'], df.loc[val]['uid'], df.loc[val]['polygon'])

In [36]:
for i in tqdm(range(0, len(df))): 
    export_building(df.loc[i]['image_name'], df.loc[i]['uid'], df.loc[i]['polygon'], 'classification_images/')

100%|██████████| 162787/162787 [1:01:36<00:00, 44.04it/s]


In [37]:
# determine what we should scale data to 
print("Average pixel height is " + str(round(sum(height) / len(height))) + " pixels.")
print("Average pixel width is " + str(round(sum(width) / len(width))) + " pixels.")

Average pixel height is 35 pixels.
Average pixel width is 35 pixels.


### Export labels for dataset for training

In [38]:
# get all the successful uid's for training data 
dir_path = "classification_images/"
success_uids = os.listdir(dir_path)
print("The dataset to be used in training contains " + str(len(success_uids)) + " images.")

su = [x.split('.png')[0] for x in success_uids]
success_df = df[df['uid'].isin(su)]
success_df.to_csv('training_data.csv', index=None)

success_df['classification'].value_counts() 

The dataset to be used in training contains 162698 images.


no-damage        117346
minor-damage      14976
major-damage      14159
destroyed         13225
un-classified      2992
Name: classification, dtype: int64

## Complete dataset building for validation and test data

In [44]:
dir_path = "hold/labels/"
valid_labels = os.listdir(dir_path)

df = find_building_dataset(dir_path, valid_labels)

for i in tqdm(range(0, len(df))): 
    export_building(df.loc[i]['image_name'], df.loc[i]['uid'], df.loc[i]['polygon'], 'holdout_images/', 'hold')


100%|██████████| 1866/1866 [01:25<00:00, 21.77it/s]
100%|██████████| 54392/54392 [20:36<00:00, 44.00it/s]


In [45]:
# get all the successful uid's for training data 
dir_path = "holdout_images/"
success_uids = os.listdir(dir_path)
print("The dataset to be used in validation contains " + str(len(success_uids)) + " images.")

su = [x.split('.png')[0] for x in success_uids]
success_df = df[df['uid'].isin(su)]
success_df.to_csv('validation_data.csv', index=None)

The dataset to be used in validation contains 54307 images.


In [47]:
dir_path = "test/labels/"
testing_labels = os.listdir(dir_path)

df = find_building_dataset(dir_path, testing_labels)

for i in tqdm(range(0, len(df))): 
    export_building(df.loc[i]['image_name'], df.loc[i]['uid'], df.loc[i]['polygon'], 'testing_images/', 'test')

100%|██████████| 1866/1866 [01:25<00:00, 21.83it/s]
100%|██████████| 54862/54862 [20:31<00:00, 44.55it/s]


In [48]:
# get all the successful uid's for training data 
dir_path = "testing_images/"
success_uids = os.listdir(dir_path)
print("The dataset to be used in testing contains " + str(len(success_uids)) + " images.")

su = [x.split('.png')[0] for x in success_uids]
success_df = df[df['uid'].isin(su)]
success_df.to_csv('testing_data.csv', index=None)

The dataset to be used in testing contains 54842 images.


In [49]:
# determine what we should scale data to 
print("Average pixel height (training&hold&test) is " + str(round(sum(height) / len(height))) + " pixels.")
print("Average pixel width (training&hold&test) is " + str(round(sum(width) / len(width))) + " pixels.")

Average pixel height (training&hold&test) is 35 pixels.
Average pixel width (training&hold&test) is 35 pixels.


In [50]:
df.head()

Unnamed: 0,image_name,classification,uid,polygon
0,hurricane-harvey_00000037_post_disaster.json,major-damage,5b3cc30b-0144-484d-8132-b6dd145f5b14,"POLYGON ((99.9995775966231 770.3191669041304, ..."
1,hurricane-harvey_00000037_post_disaster.json,destroyed,ff5b6760-fcc4-4da6-802f-8b8cccbd2438,POLYGON ((-4.580597812984772e-06 662.348436749...
2,socal-fire_00000753_post_disaster.json,no-damage,7f095106-efeb-4a69-9c5e-212fbdffe691,"POLYGON ((16.81967734972283 339.0362917135125,..."
3,socal-fire_00000753_post_disaster.json,no-damage,2484b94c-a406-4314-b69d-1e15e274281f,POLYGON ((-5.539019670407656e-07 355.604034808...
4,socal-fire_00000753_post_disaster.json,no-damage,be68c2c5-0e18-4896-8177-2994ff3631db,"POLYGON ((17.49449341888473 391.3798187193235,..."
