In [None]:
from PIL import Image
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import math
import random
import json
import cv2
import shapely.wkt
import shapely
from shapely.geometry import Polygon
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [None]:
# Ordinal damage scale used as the class label. Any subtype that is not listed
# falls back to 0 through the defaultdict factory.
damage_intensity_encoding = defaultdict(
    lambda: 0,
    {
        'destroyed': 3,
        'major-damage': 2,
        'minor-damage': 1,
        'no-damage': 0,
    },
)

In [None]:
def process_img(img_array, polygon_pts, scale_pct):
    """Crop an image to the padded bounding box of a polygon.
            Args:
                img_array (numpy array): image as a 3-D array indexed [row, col, channel].
                polygon_pts (numpy array): 2-D array of polygon vertices; column 0 holds
                                           x (column) coordinates, column 1 holds y (row).
                scale_pct (float): fraction of the box width/height added as padding
                                   on every side before cropping.
            Returns:
                numpy array: the slice of img_array covering the padded box, clipped
                             to the image bounds.
    """

    n_rows, n_cols, _ = img_array.shape

    xs, ys = polygon_pts[:, 0], polygon_pts[:, 1]
    left, right = xs.min(), xs.max()
    top, bottom = ys.min(), ys.max()

    # Padding is proportional to the size of the (unpadded) bounding box
    pad_x = (right - left) * scale_pct
    pad_y = (bottom - top) * scale_pct

    # Truncate to whole pixels, then clamp to the image extent
    col_start = max(int(left - pad_x), 0)
    col_stop = min(int(right + pad_x), n_cols)
    row_start = max(int(top - pad_y), 0)
    row_stop = min(int(bottom + pad_y), n_rows)

    return img_array[row_start:row_stop, col_start:col_stop, :]

In [None]:
def process_data(input_path, output_path, output_csv_path, val_split_pct,
                 scale_pct=0.8, random_state=None):
    """Crop every labelled building out of the source images and index the crops in CSV files.

        For each PNG in ``<input_path>/Images`` the label file
        ``<input_path>/labels/<same stem>.json`` is read. Every entry of
        ``label_data['features']['xy']`` that has a ``subtype`` property is cropped
        with ``process_img``, saved as ``<output_path>/<uid>.png`` and recorded
        together with its encoded damage level.

        Args:
            input_path (path): Dataset root containing the ``Images`` and ``labels`` folders.
            output_path (path): Folder that receives the cropped images (created if missing).
            output_csv_path (path): Folder that receives ``train.csv`` and, when a split
                                    is requested, ``test.csv`` (created if missing).
            val_split_pct (float): Fraction of the samples written to ``test.csv``.
                                   A value <= 0 writes everything to ``train.csv``.
            scale_pct (float): Padding fraction forwarded to ``process_img``.
            random_state: Forwarded to ``train_test_split``; ``None`` keeps the
                          unseeded shuffle, an int makes the split reproducible.
        Returns:
            None. The results are the image files and CSV files written to disk.
    """
    image_dir = os.path.join(input_path, "Images")
    label_dir = os.path.join(input_path, "labels")

    # Writing into a missing folder would otherwise fail (or, with cv2, fail silently)
    os.makedirs(output_path, exist_ok=True)
    os.makedirs(output_csv_path, exist_ok=True)

    x_data = []
    y_data = []

    # Sorted so the processing order does not depend on the file system
    img_names = sorted(
        name for name in os.listdir(image_dir) if name.lower().endswith(".png")
    )

    for img_name in tqdm(img_names):

        with Image.open(os.path.join(image_dir, img_name)) as img_obj:
            img_array = np.array(img_obj)

        # Get corresponding label for the current image: same stem, .json extension.
        # Built from the file name so a 'png' or 'Images' elsewhere in the path is left alone.
        label_name = os.path.splitext(img_name)[0] + ".json"
        with open(os.path.join(label_dir, label_name)) as label_file:
            label_data = json.load(label_file)

        for feat in label_data['features']['xy']:

            # Only post-disaster labels carry a damage subtype; features without
            # one are skipped (they are not written and not listed in the CSV).
            try:
                damage_type = feat['properties']['subtype']
            except (KeyError, TypeError):
                continue

            poly_uuid = feat['properties']['uid'] + ".png"

            polygon_geom = shapely.wkt.loads(feat['wkt'])
            polygon_pts = np.array(list(polygon_geom.exterior.coords))
            poly_img = process_img(img_array, polygon_pts, scale_pct)

            # A degenerate polygon yields an empty crop that cannot be encoded
            if poly_img.size == 0:
                continue

            # The array came from PIL, so its channels are in RGB(A) order. Saving with
            # PIL keeps that order; cv2.imwrite would interpret the data as BGR and
            # swap red and blue in the written file.
            Image.fromarray(poly_img).save(os.path.join(output_path, poly_uuid))

            # Record the sample only once its image is actually on disk
            x_data.append(poly_uuid)
            y_data.append(damage_intensity_encoding[damage_type])

    output_train_csv_path = os.path.join(output_csv_path, "train.csv")

    # The CSVs are written with the default index column; the cells that read them
    # back drop the resulting 'Unnamed: 0' column.
    if val_split_pct > 0:
        x_train, x_test, y_train, y_test = train_test_split(
            x_data, y_data, test_size=val_split_pct, random_state=random_state
        )
        output_test_csv_path = os.path.join(output_csv_path, "test.csv")
        pd.DataFrame({'uuid': x_train, 'labels': y_train}).to_csv(output_train_csv_path)
        pd.DataFrame({'uuid': x_test, 'labels': y_test}).to_csv(output_test_csv_path)
    else:
        pd.DataFrame({'uuid': x_data, 'labels': y_data}).to_csv(output_train_csv_path)

In [None]:
# Dataset root; process_data reads the images from <input_dir>/Images and the
# labels from the matching 'labels' folder.
input_dir = '/content/drive/MyDrive/TechFest'
# Folder that receives the cropped building images.
output_dir = '/content/drive/MyDrive/TechFest/Output_2'
# Folder that receives train.csv / test.csv.
output_dir_csv = '/content/drive/MyDrive/TechFest/csv_2'
# Fraction of the samples written to test.csv (a value <= 0 disables the split).
val_split_pct = 0.2

In [None]:
# Crop all labelled buildings and write the CSV indexes. The function returns
# nothing; its results are the files written under output_dir and output_dir_csv.
process_data(input_dir, output_dir, output_dir_csv, float(val_split_pct))

100%|██████████| 117/117 [05:43<00:00,  2.94s/it]


In [None]:
# Read the split from the folder process_data was told to write to, so the path
# cannot drift away from the configuration cell.
df_train = pd.read_csv(os.path.join(output_dir_csv, 'train.csv'))

In [None]:
# Read the held-out split from the folder process_data was told to write to, so
# the path cannot drift away from the configuration cell.
df_test = pd.read_csv(os.path.join(output_dir_csv, 'test.csv'))

In [None]:
# Class balance of the training split. The recorded output is dominated by
# label 0 and lists no label 3 at all.
df_train['labels'].value_counts()

0    24919
1       86
2       14
Name: labels, dtype: int64

In [None]:
# Class balance of the held-out split. In the recorded output label 3 appears
# only here.
# NOTE(review): the split is random and not stratified, so rare classes can land
# entirely in one split; confirm whether that is acceptable.
df_test['labels'].value_counts()

0    6231
1      18
2       4
3       2
Name: labels, dtype: int64

In [None]:
# Stack both splits back into one frame. DataFrame.append was removed in
# pandas 2.0; pd.concat is the equivalent (row indexes are kept as-is).
df = pd.concat([df_train, df_test])

In [None]:
# Remove the index column that to_csv/read_csv round-tripped as 'Unnamed: 0'.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')


In [None]:
# Persist the merged uuid/labels table; index=False so no extra index column is written.
df.to_csv("/content/drive/MyDrive/TechFest/csv_2/Train_final1.csv",index=False)

In [None]:
# Read back the merged file from the csv_2 folder it was just written to
# (the previous path pointed at a different 'csv' folder).
df_t = pd.read_csv('/content/drive/MyDrive/TechFest/csv_2/Train_final1.csv')
df_t.head()

Unnamed: 0,uuid,labels
0,f39855b0-ece9-49ec-9658-337666ef61d7.png,0
1,1df50457-bd22-4799-be92-ff8ced223565.png,0
2,8b191dda-494f-4053-8cde-85066d463849.png,0
3,549a4a75-8369-4ed8-bdc5-3fb1597e26c9.png,0
4,aa9062b6-16e8-467c-8be8-9c961df5c905.png,0


In [None]:
# Sanity check of the merged file: row count, column dtypes and null counts.
df_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31274 entries, 0 to 31273
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   uuid    31274 non-null  object
 1   labels  31274 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 488.8+ KB


In [None]:
import shutil
import os
# NOTE(review): this rebinds the name `tqdm` from the progress-bar class
# (`from tqdm import tqdm` in the first cell) to the tqdm module. The cells below
# rely on the module form (`tqdm.tqdm(...)`), but re-running process_data after
# this cell would fail because it calls `tqdm(...)` directly.
import tqdm
import sys

In [None]:
# Load the merged uuid/labels table from csv_2, the folder it is written to
# earlier in this notebook and the counterpart of the Output_2 image folder.
labels = pd.read_csv(r'/content/drive/MyDrive/TechFest/csv_2/Train_final1.csv')


In [None]:
# Folder holding the cropped images that are copied from.
train_dir =r'/content/drive/MyDrive/TechFest/Output_2/'
# Root under which one sub-folder per class label is created.
DR = r"/content/drive/MyDrive/TechFest/Data"

In [None]:
# Labels become strings so they can be used directly as folder names
labels = labels.assign(labels=labels['labels'].astype(str))

In [None]:
# Distinct label values, in order of first appearance
class_names = [name for name in labels['labels'].unique()]
class_names[0]

'0'

In [None]:
# One folder per class label. exist_ok=True lets the cell be re-run without
# failing on folders created by a previous run.
for class_name in class_names:
    os.makedirs(os.path.join(DR, class_name), exist_ok=True)

In [None]:
# Copy (not move) every crop into the folder of its class. The destination is
# built from DR so it stays in sync with the folders created for each class.
for c in tqdm.tqdm(class_names):  # one progress tick per class
    class_dir = os.path.join(DR, c)
    for image_name in labels.loc[labels['labels'] == c, 'uuid']:
        shutil.copy(os.path.join(train_dir, image_name), class_dir)

100%|██████████| 4/4 [04:45<00:00, 71.36s/it]
