# Load, preprocess, and save train and test data

This notebook preprocesses and collates the training and testing data for model creation.

# John Brandt
# July 11, 2021

- Fuse Sentinel-1 and Sentinel-2 data
- Reconstruct a 2D array for each plot from the CEO output CSV
- Match Sentinel data to CEO labels
- Stack data_x, data_y, length
- Save arrays for data_x, data_y, length


# Package imports and source code

In [1]:
from tqdm import tqdm_notebook, tnrange
import pandas as pd
import numpy as np
from random import shuffle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
import random
import itertools
from scipy.ndimage import median_filter
import hickle as hkl

os.environ['KMP_DUPLICATE_LIB_OK']='True'

%run ../src/preprocessing/slope.py

In [2]:
def reconstruct_images(plot_id, data=None):
    '''Rebuild the 2-D grid of tree labels for a single plot.

       Selects the rows of the label table whose PLOT_ID equals `plot_id` and
       arranges their TREE values into a list of rows: one row per unique
       latitude, ordered from the highest latitude to the lowest, with the
       samples inside each row ordered by increasing longitude.

        Parameters:
          plot_id: value matched against the PLOT_ID column
          data (pd.DataFrame): label table with PLOT_ID, LAT, LON and TREE
                               columns. Defaults to the notebook-level `df`.

         Returns:
          rows (list): list of lists of TREE values. The loading cell stores
                       the result in a (14, 14) array, so a complete plot is
                       expected to give 14 rows of 14 values; that shape is
                       not enforced here.
    '''
    if data is None:
        # Fall back to the notebook-level label table (original behaviour)
        data = df
    subs = data[data['PLOT_ID'] == plot_id]
    rows = []
    # Highest latitude first, so row 0 is the northern edge of the plot
    for lat in sorted(subs['LAT'].unique(), reverse=True):
        subs_lat = subs[subs['LAT'] == lat].sort_values('LON', axis = 0)
        rows.append(list(subs_lat['TREE']))
    return rows

In [16]:
# Which data set to process; every input/output directory below is derived from it
source = 'test'
sentinel_1 = True
s2_path = f"../data/{source}-s2/"
s1_path = f"../data/{source}-s1/"
csv_path = f"../data/{source}-csv/"
output_path = f"../data/{source}-processed/"
dem_path = f"../data/{source}-dem/"

#s2_path = '../data/plantation/train-s2/'
#s1_path = '../data/plantation/train-s1/'
#dem_path = '../data/plantation/train-dem/'
#csv_path = "../data/plantation/train-csv/new/"

In [17]:
# Load the persisted list of bad plot ids, apply manual edits, and save it back
verified_lu_change = np.load("bad_plot_ids.npy")
# Remember the stored dtype so the edits below cannot change it
id_dtype = verified_lu_change.dtype

# IDs to append; anything already present in the file is skipped
to_add = [141238348]
to_add = [x for x in to_add if x not in verified_lu_change]
# Build the addition with the stored dtype: np.array([]) defaults to float64,
# which would silently turn every stored ID into a float once `to_add` is
# empty (i.e. on every re-run after the ID has been added)
verified_lu_change = np.concatenate([verified_lu_change,
                     np.asarray(to_add, dtype = id_dtype).flatten()])

# IDs to delete from the list
to_remove = []

verified_lu_change = [x for x in verified_lu_change if x not in to_remove]
np.save("bad_plot_ids.npy", np.array(verified_lu_change, dtype = id_dtype))
print(len(verified_lu_change))

2328


In [18]:
# Plot IDs that are skipped when choosing which plots to load
bad_test_plots = [
    10048, 10052, 10084, 20026, 20047, 20079, 20091,
    100111, 100120, 100191, 100209, 100213, 100216,
    200101, 139190217, 139270445, 150027, 150051, 150057, 200187,
    1500180, 136776649, 136776650, 139190100,
    139190109, 139190113, 139190268, 139190330, 139190396,
    139190452, 139190506, 139190534, 139190803,
    139190811, 139190892, 139190900, 139190903, 139190954,
    139191025, 139191125, 139191502, 139191557, 139191574,
    139252935, 139264527, 139264598, 139270017, 139270025,
    139270222, 139270102, 139270307, 139270436,
    139270494, 139270542,
]

In [19]:
# For either train or test data, read every label CSV, normalise its columns and
# plot IDs, and combine everything into one dataframe (`df`) for the entire data set
import re

cols_to_keep = ['PLOT_ID', 'SAMPLE_ID', 'LON', 'LAT', 'FLAGGED', 'ANALYSES', 'USER_ID',
       'COLLECTION_TIME', 'ANALYSIS_DURATION', 'TREE', 'plotid', 'sampleid', 'PLOT_FNAME', 'PLANTATION']
# Every CSV in the label directory, in sorted filename order.
# Add further filters here to restrict the run, e.g. `if "chaco" in x`.
csvs = [x for x in sorted(os.listdir(csv_path)) if ".csv" in x]

print(csvs)

dfs = []
for csv_name in csvs:
    csv_df = pd.read_csv(csv_path + csv_name, encoding = "ISO-8859-1")
    # 196 = 14 x 14 samples per plot, so this prints the number of plots in the file
    print(csv_name, len(csv_df) / 196)

    # Normalise the headers: strip non-word characters, then upper-case.
    # 'ïplotid' is presumably what remains of a UTF-8 byte-order mark read as
    # ISO-8859-1 -- TODO confirm against the raw files
    csv_df.columns = [re.sub(r'\W+', '', x) for x in csv_df.columns]
    csv_df = csv_df.rename(columns={'ïplotid':'plotid'})
    csv_df.columns = [x.upper() for x in csv_df.columns]
    csv_df.columns = ['PLOT_ID' if x == 'PLOTID' else x for x in csv_df.columns]
    csv_df.columns = ['SAMPLE_ID' if x == 'SAMPLEID' else x for x in csv_df.columns]
    # Whatever the first column is called, it is treated as the plot ID
    csv_df = csv_df.rename(columns={csv_df.columns[0]: 'PLOT_ID'})

    if len(csv_df) > 0:
        # If there are no unique IDs already (the first ID is +/-1), go ahead
        # and assign them: <two characters before ".csv"> + "00" + |original ID|
        if abs(csv_df['PLOT_ID'].iloc[0]) == 1:
            print(csv_df['PLOT_ID'].iloc[0])
            print(f"No unique ID for {csv_name}")
            prefix = str(csv_name[-6:-4]).zfill(2) + '00'
            # Vectorised replacement for per-row chained assignment
            # (`df['PLOT_ID'][index] = ...`), whose effect depends on the pandas
            # version and is a no-op under copy-on-write. The IDs are converted
            # back to numbers so they sort and compare like the other plot IDs.
            csv_df['PLOT_ID'] = pd.to_numeric(
                prefix + csv_df['PLOT_ID'].abs().astype(str))

        # Keep only the known columns
        csv_df = csv_df.loc[:, csv_df.columns.isin(cols_to_keep)]
        print(csv_df.columns)
        dfs.append(csv_df)

df = pd.concat(dfs, ignore_index = True, sort = True)
print(len(df) // 196)
# Discard samples that were never labelled
df = df[~pd.isna(df['TREE'])]
print(len(df) // 196)

plot_ids = sorted(df['PLOT_ID'].unique())
plot_ids_loaded = plot_ids

print(f"There are {len(plot_ids)} plots")

['ceo-oceana_middleast_test-02.csv', 'ceo-tml_asia_testplots-01.csv', 'tml-test-03.csv', 'tml-test-04.csv', 'tml-test-05.csv', 'tml-test-06.csv', 'tml-test-07.csv', 'tml-test-08.csv', 'tml-test-09.csv', 'tml-test-10.csv', 'tml-test-11.csv', 'tml-test-12.csv', 'tml-test-13.csv', 'tml-test-14.csv', 'tml-test-2022-15.csv']
ceo-oceana_middleast_test-02.csv 250.0
Index(['PLOT_ID', 'SAMPLE_ID', 'LON', 'LAT', 'FLAGGED', 'COLLECTION_TIME',
       'ANALYSIS_DURATION', 'TREE'],
      dtype='object')
ceo-tml_asia_testplots-01.csv 249.0
Index(['PLOT_ID', 'SAMPLE_ID', 'LON', 'LAT', 'FLAGGED', 'COLLECTION_TIME',
       'ANALYSIS_DURATION', 'TREE'],
      dtype='object')
tml-test-03.csv 115.0
Index(['PLOT_ID', 'SAMPLE_ID', 'LON', 'LAT', 'FLAGGED', 'ANALYSES',
       'COLLECTION_TIME', 'ANALYSIS_DURATION', 'TREE'],
      dtype='object')
tml-test-04.csv 81.0
Index(['PLOT_ID', 'SAMPLE_ID', 'LON', 'LAT', 'FLAGGED', 'ANALYSES',
       'COLLECTION_TIME', 'ANALYSIS_DURATION', 'TREE'],
      dtype='object')


In [20]:
# Preview the first five rows of the combined label table
df.head()

Unnamed: 0,ANALYSES,ANALYSIS_DURATION,COLLECTION_TIME,FLAGGED,LAT,LON,PLOT_ID,SAMPLE_ID,TREE,USER_ID
0,,20.2 secs,2022-07-20 19:26,False,-0.98449,131.822858,2001,1,1.0,
1,,20.2 secs,2022-07-20 19:26,False,-0.9844,131.822858,2001,2,1.0,
2,,20.2 secs,2022-07-20 19:26,False,-0.98431,131.822858,2001,3,1.0,
3,,20.2 secs,2022-07-20 19:26,False,-0.98422,131.822858,2001,4,1.0,
4,,20.2 secs,2022-07-20 19:26,False,-0.984131,131.822858,2001,5,1.0,


In [21]:
#df.to_csv("tml-india-train-plots.csv")

In [22]:
def to_int16(array: np.ndarray) -> np.ndarray:
    '''Quantises an array of values in [0, 1] to unsigned 16-bit integers.

       Despite the name, the result is np.uint16: values are scaled by 65535
       and truncated towards zero. Relative to float32 this halves the
       storage cost.

        Parameters:
          array (np.ndarray): values that must all lie in [0, 1]

         Returns:
          (np.ndarray): np.uint16 array of the same shape

         Raises:
          AssertionError: if any value is outside [0, 1] (a NaN also fails
                          the range check)
    '''
    assert np.min(array) >= 0, np.min(array)
    assert np.max(array) <= 1, np.max(array)

    scaled = np.trunc(np.clip(array, 0, 1) * 65535)
    # Check the extremes of the scaled values themselves; the previous form
    # `np.max(array <= 65535)` only tested whether *any* element was in range
    assert np.min(scaled) >= 0, np.min(scaled)
    assert np.max(scaled) <= 65535, np.max(scaled)

    return scaled.astype(np.uint16)

def process_dem(dem):
    '''Converts an elevation tile into a smoothed, rescaled slope layer.

       The tile is median filtered (size 5), passed to calcSlope (defined in
       the slope.py script run at the top of the notebook) as a
       (1, 34, 34) array, divided by 90, trimmed by one pixel on each side,
       median filtered again and trimmed by two more pixels on each side.

        Parameters:
          dem (arr): elevation values; must contain 34 * 34 elements for the
                     reshape to succeed

         Returns:
          (arr): array of shape (28, 28, 1)
    '''
    padded = 32 + 2
    smoothed = median_filter(dem, size = 5)
    # Two constant grids of 10 are handed to calcSlope -- presumably the
    # x / y cell sizes; verify against slope.py
    cell_a = np.full((padded, padded), 10)
    cell_b = np.full((padded, padded), 10)
    slope = calcSlope(smoothed.reshape((1, padded, padded)),
                      cell_a,
                      cell_b,
                      zScale = 1, minSlope = 0.02)
    slope = (slope / 90).reshape((padded, padded, 1))
    # 34 -> 32 pixels per side
    slope = slope[1:-1, 1:-1]
    # Smooth once more, then 32 -> 28 pixels per side
    return median_filter(slope, 5)[2:-2, 2:-2]

def grndvi(array):
    '''Computes (NIR - (green + red)) / (NIR + green + red + 1e-5).

       Channels 1, 2 and 3 of the last axis are used as green, red and NIR
       respectively, each clipped to [0, 1] first. The 1e-5 term keeps the
       denominator away from zero.

        Parameters:
          array (arr): array whose last axis has at least four channels

         Returns:
          (arr): index values, with the last axis of `array` removed
    '''
    green, red, nir = (np.clip(array[..., band], 0, 1) for band in (1, 2, 3))
    visible = green + red
    return (nir - visible) / ((nir + visible) + 1e-5)


In [23]:
from skimage.transform import resize

%run ../src/preprocessing/indices.py

def to_float32(array: np.array) -> np.array:
    """Returns `array` as float32 values no larger than 1.

    Arrays whose first element is not a NumPy float are treated as 16-bit
    encoded data: they must contain a value above 1 and are divided by
    65535. Float arrays are passed through unchanged, but must already be
    float32 with a maximum of at most 1.
    """
    already_float = isinstance(array.flat[0], np.floating)
    if not already_float:
        # A non-float array with no value above 1 is not 16-bit encoded data
        assert np.max(array) > 1
        array = array.astype(np.float32) / 65535.
    assert np.max(array) <= 1
    assert array.dtype == np.float32
    return array

count = 0
# Placeholder first row; the saving cell removes it before writing the CSV
dataframe = pd.DataFrame({'plot_id': [''], 'lat': [0.325], 'long': [0.325],
                          'y': [0]})

# Identify which plots have both Sentinel-1 and Sentinel-2 inputs on disk
excluded_test_plots = set(bad_test_plots)
plot_ids_to_load = []
for plot_id in plot_ids:
    s1_i = f'{s1_path}{str(plot_id)}.hkl'
    s2_i = f'{s2_path}{str(plot_id)}.hkl'
    s1_exists = os.path.exists(s1_i)
    s2_exists = os.path.isfile(s2_i)
    print(s1_exists, s2_exists, s2_i)

    if s2_exists and s1_exists and plot_id not in excluded_test_plots:
        plot_ids_to_load.append(plot_id)

print(f"There are {len(plot_ids_to_load)} plots")
# Further plots that are excluded after the count above has been printed
extra_excluded_plots = {139077414, 139187051, 139187043, 139187133, 139187134}
plot_ids_to_load = [x for x in plot_ids_to_load if x not in extra_excluded_plots]

# Per plot: 12 time steps, 28 x 28 pixels, 14 channels (Sentinel-2 stack + 2 Sentinel-1)
data_x = np.zeros((len(plot_ids_to_load), 12, 28, 28, 14), dtype = np.uint16)
data_y = np.zeros((len(plot_ids_to_load), 14, 14))

# Indices (into plot_ids_to_load) of samples that could not be loaded cleanly
to_remove = []
# One metadata record per plot, turned into dataframe rows after the loop
records = []

# Iterate over each plot
for i, plot_id in enumerate(plot_ids_to_load):
    s1_i = f'{s1_path}{str(plot_id)}.hkl'
    s2_i = f'{s2_path}{str(plot_id)}.hkl'
    dem_i = f'{dem_path}{str(plot_id)}.npy'

    x = to_float32(hkl.load(s2_i))

    # Sentinel-1: average each 2 x 2 pixel group (32 -> 16 per side), resize
    # back to 32 x 32 with order-1 interpolation, then trim 2 pixels per side
    s1 = hkl.load(s1_i)
    s1 = np.reshape(s1, (12, 16, 2, 16, 2, 2))
    s1 = np.mean(s1, axis = (2, 4))
    s1 = resize(s1, (12, 32, 32, 2), order = 1)
    s1 = s1[:, 2:-2, 2:-2, :]

    # Channel 10 of the Sentinel-2 stack is overwritten with the slope layer,
    # repeated for every time step
    dem = process_dem(np.load(dem_i))
    dem = np.tile(dem.reshape((1, 28, 28)), (x.shape[0], 1, 1))
    x[..., 10] = dem
    x = np.concatenate([x, s1], axis = -1)

    count += 1
    y = reconstruct_images(plot_id)
    plot_rows = df[df['PLOT_ID'] == plot_id]
    # Rows are summed one at a time so an incomplete (ragged) label grid is
    # reported by the shape check below instead of failing here
    records.append({'plot_id': str(plot_id),
                    'lat': plot_rows['LAT'].mean(),
                    'long': plot_rows['LON'].mean(),
                    'y': sum(np.sum(row) for row in y)})

    if np.isnan(x).any():
        to_remove.append(i)
        continue

    data_x[i] = to_int16(np.clip(x, 0, 1))
    print("X worked")
    try:
        data_y[i] = np.array(y)
    except (ValueError, TypeError):
        # The label grid is not 14 x 14. Previously a bare `except` left
        # all-zero labels in place; the sample is now dropped instead.
        print("Y didn't work", plot_id)
        to_remove.append(i)

# Build all metadata rows at once (DataFrame.append was removed in pandas 2.0)
dataframe = pd.concat([dataframe,
                       pd.DataFrame(records, columns = dataframe.columns)],
                      ignore_index = True)

# Remove any data samples that had missing values or unusable labels, so the
# arrays stay aligned with the metadata rows once the saving cell drops the
# same `to_remove` indices from `dataframe`
if len(to_remove) > 0:
    print(f"Removing {to_remove}")
    data_x = np.delete(data_x, to_remove, 0)
    data_y = np.delete(data_y, to_remove, 0)

print(f"Finished loading: {data_x.shape} of {data_x.dtype} type")

True True ../data/test-s2/1001.hkl
True True ../data/test-s2/1002.hkl
True True ../data/test-s2/1003.hkl
True True ../data/test-s2/1004.hkl
True True ../data/test-s2/1005.hkl
True True ../data/test-s2/1006.hkl
True True ../data/test-s2/1008.hkl
True True ../data/test-s2/1009.hkl
True True ../data/test-s2/2001.hkl
True True ../data/test-s2/2002.hkl
True True ../data/test-s2/2003.hkl
True True ../data/test-s2/2004.hkl
True True ../data/test-s2/2005.hkl
True True ../data/test-s2/2006.hkl
True True ../data/test-s2/2008.hkl
True True ../data/test-s2/2009.hkl
True True ../data/test-s2/10010.hkl
True True ../data/test-s2/10011.hkl
True True ../data/test-s2/10012.hkl
True True ../data/test-s2/10013.hkl
True True ../data/test-s2/10014.hkl
True True ../data/test-s2/10015.hkl
True True ../data/test-s2/10016.hkl
True True ../data/test-s2/10017.hkl
True True ../data/test-s2/10018.hkl
True True ../data/test-s2/10019.hkl
True True ../data/test-s2/10020.hkl
True True ../data/test-s2/10021.hkl
True Tru

True True ../data/test-s2/200124.hkl
True True ../data/test-s2/200129.hkl
True True ../data/test-s2/200130.hkl
True True ../data/test-s2/200132.hkl
True True ../data/test-s2/200133.hkl
True True ../data/test-s2/200134.hkl
True True ../data/test-s2/200135.hkl
True False ../data/test-s2/200136.hkl
True True ../data/test-s2/200137.hkl
True True ../data/test-s2/200138.hkl
True True ../data/test-s2/200139.hkl
True True ../data/test-s2/200140.hkl
True True ../data/test-s2/200142.hkl
True True ../data/test-s2/200144.hkl
True True ../data/test-s2/200146.hkl
True True ../data/test-s2/200152.hkl
True True ../data/test-s2/200155.hkl
True True ../data/test-s2/200158.hkl
True True ../data/test-s2/200162.hkl
True True ../data/test-s2/200167.hkl
True True ../data/test-s2/200168.hkl
True True ../data/test-s2/200171.hkl
True True ../data/test-s2/200172.hkl
True True ../data/test-s2/200173.hkl
True True ../data/test-s2/200174.hkl
True True ../data/test-s2/200175.hkl
True True ../data/test-s2/200176.hkl


True True ../data/test-s2/139190315.hkl
True True ../data/test-s2/139190316.hkl
True True ../data/test-s2/139190317.hkl
True True ../data/test-s2/139190318.hkl
True True ../data/test-s2/139190319.hkl
True True ../data/test-s2/139190320.hkl
True True ../data/test-s2/139190322.hkl
True True ../data/test-s2/139190323.hkl
True True ../data/test-s2/139190325.hkl
True True ../data/test-s2/139190326.hkl
True True ../data/test-s2/139190327.hkl
True True ../data/test-s2/139190328.hkl
True True ../data/test-s2/139190329.hkl
True True ../data/test-s2/139190330.hkl
True True ../data/test-s2/139190331.hkl
True True ../data/test-s2/139190332.hkl
True True ../data/test-s2/139190334.hkl
True True ../data/test-s2/139190335.hkl
True True ../data/test-s2/139190336.hkl
True True ../data/test-s2/139190337.hkl
True True ../data/test-s2/139190346.hkl
True True ../data/test-s2/139190347.hkl
True True ../data/test-s2/139190348.hkl
True True ../data/test-s2/139190349.hkl
True True ../data/test-s2/139190350.hkl


True True ../data/test-s2/139252908.hkl
True True ../data/test-s2/139252910.hkl
True True ../data/test-s2/139252911.hkl
True True ../data/test-s2/139252914.hkl
True True ../data/test-s2/139252919.hkl
True True ../data/test-s2/139252920.hkl
True True ../data/test-s2/139252925.hkl
True True ../data/test-s2/139252928.hkl
True True ../data/test-s2/139252933.hkl
True True ../data/test-s2/139252934.hkl
True True ../data/test-s2/139252935.hkl
True True ../data/test-s2/139264384.hkl
True True ../data/test-s2/139264385.hkl
True True ../data/test-s2/139264386.hkl
True True ../data/test-s2/139264390.hkl
True True ../data/test-s2/139264393.hkl
True False ../data/test-s2/139264395.hkl
True True ../data/test-s2/139264396.hkl
True False ../data/test-s2/139264397.hkl
True True ../data/test-s2/139264398.hkl
True True ../data/test-s2/139264401.hkl
True True ../data/test-s2/139264404.hkl
True True ../data/test-s2/139264407.hkl
True True ../data/test-s2/139264408.hkl
True True ../data/test-s2/139264430.hk

True True ../data/test-s2/139270491.hkl
True True ../data/test-s2/139270494.hkl
True True ../data/test-s2/139270495.hkl
True True ../data/test-s2/139270497.hkl
True True ../data/test-s2/139270498.hkl
True True ../data/test-s2/139270509.hkl
True True ../data/test-s2/139270510.hkl
True True ../data/test-s2/139270512.hkl
True True ../data/test-s2/139270513.hkl
True True ../data/test-s2/139270516.hkl
True True ../data/test-s2/139270518.hkl
True True ../data/test-s2/139270519.hkl
True True ../data/test-s2/139270520.hkl
True True ../data/test-s2/139270524.hkl
True True ../data/test-s2/139270527.hkl
True True ../data/test-s2/139270533.hkl
True True ../data/test-s2/139270534.hkl
True True ../data/test-s2/139270535.hkl
True True ../data/test-s2/139270536.hkl
True True ../data/test-s2/139270537.hkl
True True ../data/test-s2/139270539.hkl
True True ../data/test-s2/139270540.hkl
True True ../data/test-s2/139270542.hkl
True True ../data/test-s2/139270543.hkl
True True ../data/test-s2/139270544.hkl


X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X

In [24]:
import hickle as hkl

# Build the table to write under a new name instead of modifying `dataframe`
# in place, so re-running this cell cannot drop additional rows.
# Row 0 of `dataframe` is the placeholder created in the loading cell.
plot_metadata = dataframe.iloc[1:].reset_index(drop = True)
if len(to_remove) > 0:
    # `to_remove` holds positions in the list of loaded plots, which match the
    # row labels now that the placeholder is gone
    plot_metadata = plot_metadata.drop(index = to_remove).reset_index(drop = True)

if len(plot_metadata) != len(data_x):
    print(f"Warning: {len(plot_metadata)} metadata rows but {len(data_x)} samples in data_x")

print(f"Writing {source} data")
# Make sure the output directory exists before writing into it
os.makedirs(f"../data/{source}", exist_ok = True)
hkl.dump(data_x, f"../data/{source}/{source}_x.hkl", mode='w', compression='gzip')
hkl.dump(data_y, f"../data/{source}/{source}_y.hkl", mode='w', compression='gzip')
plot_metadata.to_csv(f"../data/{source}/{source}_plot_ids.csv", index = False)
print("Finished!")

Writing test data


  


Finished!


In [7]:
# Load the existing training labels and show their shape
x = np.load("../data/train/train_y.npy")
x.shape

(52, 14, 14)

In [8]:
# Load the extra label array stored in pineappley.npy and show its shape
p = np.load("pineappley.npy")
p.shape

(60, 14, 14)

In [9]:
# Append the extra labels to the training labels along the first axis and
# overwrite the training label file with the result.
# NOTE(review): the output path is the same file that `x` was loaded from, so
# re-running these cells appends `p` again each time.
x = np.concatenate([x, p], axis = 0)
np.save("../data/train/train_y.npy", x)

In [10]:
# Shape of the combined label array
x.shape

(112, 14, 14)