# Start here if working in Colab

In [120]:
# Mount Google Drive
from google.colab import drive # import drive from google colab

ROOT = "/content/drive"     # default location for the drive

drive.mount(ROOT)           # we mount the google drive at /content/drive

ModuleNotFoundError: No module named 'google.colab'

In [20]:
# Set working directory
%cd /content/drive/My Drive/restoration-mapper

/content/drive/My Drive/restoration-mapper


# Start here if working locally

In [19]:
from tqdm import tqdm_notebook, tnrange
import numpy as np
from random import shuffle
import pandas as pd
import os
import random

### Additional function definitions

In [136]:
def reconstruct_images(plot_id):
    '''
    Takes a plot ID and subsets the input pd.DataFrame to that plot ID
    returns a (14, 14) array-like list with binary labels
    # LAT, LONG, TREE
    # 42.11, 1.02, 0
    # 42.1102, 1.02, 1
    
    '''
    subs = df[df['PLOT_ID'] == plot_id]
    rows = []
    lats = reversed(sorted(subs['LAT'].unique()))
    for i, val in enumerate(lats):
        subs_lat = subs[subs['LAT'] == val]
        subs_lat = subs_lat.sort_values('LON', axis = 0)
        rows.append(list(subs_lat['TREE']))
    return rows

### Data loading

In [137]:
source = 'test'
sentinel_1 = True
s2_path = "../data/{}-super/".format(source)
s1_path = "../data/{}-s1/".format(source)
csv_path = "../data/{}-csv/".format(source)
output_path = "../data/{}-processed/".format(source)

In [138]:
# For either train or test data, loop through each plot and determine whether there is
# labelled Y data for it -- returning one dataframe for the entire data set

dfs = []
for i in os.listdir(csv_path):
    if ".csv" in i:
        print(i)
        df = pd.read_csv(csv_path + i).drop('IMAGERY_TITLE', axis = 1)
        df['country'] = i.split(".")[0]
        dfs.append(df)

for i in range(len(dfs)):
    if "PL_PLOTID" in dfs[i].columns:
            dfs[i] = dfs[i].drop("PL_PLOTID", axis = 1)
    if 'STACKINGPROFILEDG' in dfs[i].columns:
        dfs[i] = dfs[i].drop('STACKINGPROFILEDG', axis = 1)
    if 'IMAGERYYEARDG' in dfs[i].columns:
        dfs[i] = dfs[i].drop('IMAGERYYEARDG', axis = 1)

df = pd.concat(dfs, ignore_index = True)
df = df.dropna(axis = 0)

existing = [int(x[:-4]) for x in os.listdir(s2_path) if ".DS" not in x]

df = df[df['PLOT_ID'].isin(existing)]
plot_ids = sorted(df['PLOT_ID'].unique())

india-test.csv
lac-north-test.csv
cameroon-test.csv
ghana-test.csv
global-test.csv
kenya-test.csv
lac-south-test.csv
ethiopia-test.csv
ghana-test-large.csv
africaeast-test.csv
africawest-test.csv


In [139]:
# Initiate empty lists to store the X and Y data in
data_x, data_y, lengths = [], [], []
    
    
countries = {}
count = 0
to_remove = []
# Iterate over each plot
for i in tnrange(len(plot_ids)):
    skip = True if sentinel_1 else False
    # Load the sentinel imagery
    if (str(plot_ids[i]) + ".npy")in os.listdir(s2_path):
        country = df[df['PLOT_ID'] == plot_ids[i]]['country'].unique()
        if str(country[0]) not in countries.keys():
            countries[str(country[0])] = [count, count]
        countries[str(country[0])][1] = count
        x = np.load(s2_path + str(plot_ids[i]) + ".npy")
        if sentinel_1 and os.path.isfile(s1_path + str(plot_ids[i]) + ".npy"):
            skip = False
            s1 = np.load(s1_path + str(plot_ids[i]) + ".npy")
            x = np.concatenate([x, s1], axis = -1)
        count += 1
    y = reconstruct_images(plot_ids[i])
    if not skip:
        lengths.append(x.shape[0])
        data_x.append(x)
        data_y.append(y)
print("Finished data loading")

data_x = np.stack(data_x)
data_y = np.stack(data_y)
lengths = np.stack(lengths)

HBox(children=(IntProgress(value=0, max=684), HTML(value='')))


Finished data loading


### Data writing

In [140]:
# This writes a (N_samples, time, width, height, channels) X data that is not standardized
# Training 4500, 24, 16, 16, 16
# and a (N_samples, width, height) Y data where the Y is a 0 / 1 binary tree presence
# The X data is 16x16, and the Y data is 14x14 -- so the X data incorporates an extra boundary
# pixel, which should be downsampled before output layer with a convolution layer with no padding

# The X data channels are as below:
# [B02,B03,B04,B05,B06,B07, B08, B8A,B11,B12, SLOPE, EVI, MSAVI2, Bare soil index,
#    Sentinel 1 VV, Sentinel 1 VH]

# GAN 

# Pretraining

# Normalization
#     layer norm,
#     batch renorm
#     batch norm, batch renormalization, instance normalization,
#     self normalization - https://arxiv.org/abs/1706.02515

# Loss functions
#   - binary cross entropy
#   - boundary loss
#   - lovasz softmax
#   - directly optimize dice score
#   - hausdorff distance metrics
#   - focal loss

# Baseline ResNet, InceptionNet, XceptionNet


if source == 'train':
    np.save(output_path + "data_x_l2a_processed.npy", data_x)
    np.save(output_path + "data_y_l2a_processed.npy", np.array(data_y))
    np.save(output_path + "length_l2a_processed.npy", np.array(lengths))
if source == 'test' or source == "project":
    print("Writing test data")
    np.save(output_path + "test_x_l2a_processed.npy", data_x)
    np.save(output_path + "test_y_l2a_processed.npy", data_y)
    np.save(output_path + "test_length_l2a_processed.npy", lengths)

Writing test data
