This notebook contains code that creates label files for the training, validation, and test data.

In [1]:
import boto3
import pandas as pd

## make dataframe of s3 filepaths

In [2]:
s3 = boto3.resource('s3')

bucket_name = 'canopy-production-ml'

pc_bucket = s3.Bucket(bucket_name)

print(pc_bucket)

s3.Bucket(name='canopy-production-ml')


In [3]:
# Get list of all chip geotiffs in a certain S3 folder

unittest_chips = []

unittest_uri = 'chips/model2_s2cloudless/training_v1/'

for obj in pc_bucket.objects.all():
    if unittest_uri in obj.key:
        unittest_chips.append(obj.key)
        
len(unittest_chips)

19989

In [4]:
unittest_chips[0]

'chips/model2_s2cloudless/training_v1/'

In [5]:
unittest_chips[1]

'chips/model2_s2cloudless/training_v1/DRC_train_multiclass_v1.csv'

In [15]:
# Note that each chip is put in a folder according to the label that applies.
# So since this chip is in the "ISL" folder, it's an ISL chip.
# If more than one label applies, the chip will be in two different folders.

unittest_chips[6]

'chips/model2_s2cloudless/training_v1/ISL/ISL-100-100-100_1000_1100-2020-12-15-2021-03-15.tif'

In [16]:
c = unittest_chips[6]
c.split('-')

['chips/model2_s2cloudless/training_v1/ISL/ISL',
 '100',
 '100',
 '100_1000_1100',
 '2020',
 '12',
 '15',
 '2021',
 '03',
 '15.tif']

In [4]:
def get_filename(path):
    split = path.split('/')
    label = split[0]
    filename = split[1]
    if label == 'null': # Null chips have a different filename format from labeled chips
        return filename[:-4]
    else:
        return filename.split('-')[2]

In [7]:
unittest_chips[2][-3:]

'csv'

In [11]:
unittest_chips[2].split('/')

['chips', 'model2_s2cloudless', 'training_v1', 'DRC_train_multilabel_v1.csv']

In [10]:
('/').join(unittest_chips[2].split('/')[3:])

'ISL/ISL-100-100-100_1000_1100-2020-12-15-2021-03-15.tif'

In [11]:
chip = unittest_chips[-1]

split = chip.split('/')
label = split[3]

label

'null'

In [19]:
chip[:-4]

'chips/model2_s2cloudless/training_v1/null/99_900_600'

In [5]:
def make_multiclass_dataframe(path_list):
    """
    Makes a multiclass label dataframe out of a list of chips in S3.
    path_list: List of S3 URLs
    """
    data = {0: [], 1: [], 2: [], 3: [], 'paths': []} # Change number of keys depending on how many labels you have
    for chip in unittest_chips:
        if chip[-3:] == 'tif': # only put geotiffs in the dataframe
            split = chip.split('/')
            label = split[3]
            # We're doing one-hot encoding
            if label == 'ISL':
                data[0].append(1) # First column represents ISL
                data[1].append(0)
                data[2].append(0)
                data[3].append(0)
            elif label == 'SAB':
                data[1].append(1) # second column represents slash and burn
                data[0].append(0)
                data[2].append(0)
                data[3].append(0)
            elif label == 'industrial_agriculture':
                data[2].append(1) # third column represents industrial ag
                data[0].append(0)
                data[1].append(0)
                data[3].append(0)
            elif label == 'null':
                data[3].append(1) # fourth column represents the null class
                data[0].append(0)
                data[1].append(0)
                data[2].append(0)
            else:
                raise ValueError('label error')
                
            path = ('/').join(chip.split('/')[3:])
            data['paths'].append(path) # append URL
    
    df = pd.DataFrame(data=data)
    
    def get_filename(path):
        split = path.split('/')
        label = split[0]
        filename = split[1]
        if label == 'null':
            return filename[:-4]
        else:
            return filename.split('-')[3]
    
    df['filenames'] = df['paths'].apply(get_filename) # get filenames
    
    # drop duplicate filenames (the same file can be in two different folders
    # if both labels apply to it)
    df = df.drop_duplicates(subset=['filenames'], ignore_index=True)
    df = df.drop('filenames', axis=1)

    return df

In [6]:
df = make_multiclass_dataframe(unittest_chips)

print(df.shape)
df.head()

(19819, 5)


Unnamed: 0,0,1,2,3,paths
0,1,0,0,0,ISL/ISL-100-100-100_1000_1100-2020-12-15-2021-...
1,1,0,0,0,ISL/ISL-100-100-100_1000_1500-2020-12-15-2021-...
2,1,0,0,0,ISL/ISL-100-100-100_1100_1000-2020-12-15-2021-...
3,1,0,0,0,ISL/ISL-100-100-100_1100_1100-2020-12-15-2021-...
4,1,0,0,0,ISL/ISL-100-100-100_1100_900-2020-12-15-2021-0...


In [7]:
df.columns

Index([0, 1, 2, 3, 'paths'], dtype='object')

In [15]:
# Make a new dataframe with just SAB and non-SAB labels
df_sab = df.copy()
df_sab[4] = df[1].apply(lambda x: 1 if x == 0 else 0) # non-SAB column
df_sab.head()

Unnamed: 0,0,1,2,3,paths,4
0,1,0,0,0,ISL/ISL-100-100-100_1000_1100-2020-12-15-2021-...,1
1,1,0,0,0,ISL/ISL-100-100-100_1000_1500-2020-12-15-2021-...,1
2,1,0,0,0,ISL/ISL-100-100-100_1100_1000-2020-12-15-2021-...,1
3,1,0,0,0,ISL/ISL-100-100-100_1100_1100-2020-12-15-2021-...,1
4,1,0,0,0,ISL/ISL-100-100-100_1100_900-2020-12-15-2021-0...,1


In [16]:
df_sab[1].value_counts()

0    16510
1     3309
Name: 1, dtype: int64

In [17]:
df_sab[4].value_counts()

1    16510
0     3309
Name: 4, dtype: int64

In [18]:
df_sab = df_sab.drop([0,2,3], axis=1)
df_sab = df_sab.rename(columns={1: 0, 4: 1})
df_sab = df_sab[[0,1,'paths']]
df_sab.head()

Unnamed: 0,0,1,paths
0,0,1,ISL/ISL-100-100-100_1000_1100-2020-12-15-2021-...
1,0,1,ISL/ISL-100-100-100_1000_1500-2020-12-15-2021-...
2,0,1,ISL/ISL-100-100-100_1100_1000-2020-12-15-2021-...
3,0,1,ISL/ISL-100-100-100_1100_1100-2020-12-15-2021-...
4,0,1,ISL/ISL-100-100-100_1100_900-2020-12-15-2021-0...


In [None]:
df_sab = df_sab.reset_index(drop=True)
df_sab = df_sab.rename({})

In [14]:
df_sab.shape

(19819, 3)

In [28]:
df.tail()

Unnamed: 0,0,1,2,3,paths
19814,0,0,0,1,null/99_900_2000.tif
19815,0,0,0,1,null/99_900_2100.tif
19816,0,0,0,1,null/99_900_2200.tif
19817,0,0,0,1,null/99_900_300.tif
19818,0,0,0,1,null/99_900_600.tif


In [29]:
# Saving the non-SAB dataframe as a CSV
df.to_csv('DRC_labels_multiclass_v2_unbalanced.csv', index=None, header=True)

In [30]:
# Create train, val, and test files
df_shuffled = df.sample(frac=1)
df_shuffled = df_shuffled.reset_index(drop=True)
df_shuffled.shape

(19819, 5)

In [31]:
total = df_shuffled.shape[0]

train_size = round(total * .6)
val_test_size = round(total * .2)

train_size + val_test_size * 2

19819

In [32]:
df_train = df_shuffled[:train_size]
df_val = df_shuffled[train_size:train_size+val_test_size]
df_test = df_shuffled[train_size+val_test_size:]

print(df_train.shape, df_val.shape, df_test.shape)

(11891, 5) (3964, 5) (3964, 5)


In [33]:
df_train.to_csv('DRC_labels_multiclass_v2_train.csv', index=None, header=True)
df_val = df_val.reset_index(drop=True)
df_val.to_csv('DRC_labels_multiclass_v2_val.csv', index=None, header=True)
df_test = df_test.reset_index(drop=True)
df_test.to_csv('DRC_labels_multiclass_v2_test.csv', index=None, header=True)

In [34]:
df_train.head()

Unnamed: 0,0,1,2,3,paths
0,0,0,0,1,null/81_8100_10900.tif
1,0,0,0,1,null/81_6900_10900.tif
2,0,0,0,1,null/81_3100_8400.tif
3,0,1,0,0,SAB/Shifting_cultivation-100-92-92_2500_2600-2...
4,0,0,0,1,null/50_2700_2400.tif


In [35]:
df_val.head()

Unnamed: 0,0,1,2,3,paths
0,0,0,0,1,null/60_100_8600.tif
1,0,0,0,1,null/92_4600_4000.tif
2,0,0,0,1,null/81_7400_2700.tif
3,0,0,0,1,null/81_2700_7000.tif
4,0,1,0,0,SAB/Shifting_cultivation-100-52-52_2000_2200-2...


In [36]:
df_test.head()

Unnamed: 0,0,1,2,3,paths
0,0,0,0,1,null/60_200_6400.tif
1,0,1,0,0,SAB/Shifting_cultivation-100-81-81_4900_12000-...
2,0,1,0,0,SAB/Shifting_cultivation-100-92-92_6300_3300-2...
3,0,1,0,0,SAB/Shifting_cultivation-100-81-81_3400_8400-2...
4,0,0,0,1,null/67_1900_6000.tif


In [29]:
def make_multilabel_dataframe(path_list):
    """
    Makes a multilabel label dataframe out of a list of chips in S3.
    path_list: List of S3 URLs
    """
    data = {0: [], 1: [], 2: [], 'paths': []} # Change number of keys depending on how many labels you want
    for chip in unittest_chips:
        if chip[-3:] == 'tif': # only put geotiffs in the dataframe
            split = chip.split('/')
            label = split[3]
            # we're doing one-hot encoding
            if label == 'ISL':
                data[0].append(1) # first column represents ISL
                data[1].append(0)
                data[2].append(0)
            elif label == 'SAB':
                data[1].append(1) # second column represents slash and burn
                data[0].append(0)
                data[2].append(0)
            elif label == 'industrial_agriculture':
                data[2].append(1) # third column represents industrial ag
                data[0].append(0)
                data[1].append(0)
            elif label == 'null':
                # if it's a null label, all label columns are zero
                data[0].append(0) 
                data[1].append(0)
                data[2].append(0)
            else:
                raise ValueError('label error')
                
            path = ('/').join(chip.split('/')[3:])
            data['paths'].append(path)
    
    df = pd.DataFrame(data=data)
  
    def get_filename(path):
        split = path.split('/')
        filename = split[-1]
        split2 = filename.split('-')
        neutral_name = ('-').join(split2[1:])
        return neutral_name
    
    df['filenames'] = df['paths'].apply(get_filename)
    
    df2 = df.copy()
    df2 = df2.groupby('filenames').sum().reset_index()
    new_paths = []
    for filename in df2['filenames']:
        for i in range(len(df)):
            if df['filenames'][i] == filename:
                new_paths.append(df['paths'][i])
                break
                
    df2['paths'] = new_paths
    df2 = df2.drop(['filenames'], axis=1)
    df2 = df2[[0, 1, 2, 'paths']]

    return df2

In [30]:
df2 = make_multilabel_dataframe(unittest_chips)

df2.head()

Unnamed: 0,0,1,2,paths
0,1,0,0,ISL/ISL-100-100-100_1000_1100-2020-12-15-2021-...
1,1,0,0,ISL/ISL-100-100-100_1000_1500-2020-12-15-2021-...
2,1,0,0,ISL/ISL-100-100-100_1100_1000-2020-12-15-2021-...
3,1,0,0,ISL/ISL-100-100-100_1100_1100-2020-12-15-2021-...
4,1,0,0,ISL/ISL-100-100-100_1100_900-2020-12-15-2021-0...


In [40]:
assert df.shape[0] == df2.shape[0]

In [32]:
df2.shape

(4135, 4)

In [33]:
df2['paths'].value_counts()

SAB/Shifting_cultivation-100-81-81_5800_3600-2019-12-01-2020-02-29.tif      1
SAB/Shifting_cultivation-100-81-81_8000_5900-2019-12-01-2020-02-29.tif      1
SAB/Shifting_cultivation-100-56-56_600_1000-2019-12-01-2020-02-29.tif       1
SAB/Shifting_cultivation-100-101-101_2300_3500-2019-12-01-2020-02-29.tif    1
SAB/Shifting_cultivation-100-81-81_3100_8700-2019-12-01-2020-02-29.tif      1
                                                                           ..
SAB/Shifting_cultivation-100-101-101_1000_800-2019-12-01-2020-02-29.tif     1
SAB/Shifting_cultivation-100-81-81_5200_11800-2019-12-01-2020-02-29.tif     1
SAB/Shifting_cultivation-100-56-56_1100_2700-2019-12-01-2020-02-29.tif      1
SAB/Shifting_cultivation-100-52-52_2000_1800-2019-12-01-2020-02-29.tif      1
ISL/ISL-100-73-73_400_300-2019-12-13-2020-03-12.tif                         1
Name: paths, Length: 4135, dtype: int64

In [35]:
df2_shuffle = df2.sample(frac=1)
df2_shuffle = df2_shuffle.reset_index(drop=True)

df2_train = df2_shuffle.loc[0:2480]
df2_val = df2_shuffle.loc[2481:2481+826]
df2_test = df2_shuffle.loc[2481+826:2481+826*2+1]

df_to_csv(df2_train, 'train_multilabel_full.csv')
df_to_csv(df2_val, 'val_multilabel_full.csv')
df_to_csv(df2_test, 'test_multilabel_full.csv')

In [41]:
df.to_csv('DRC_labels_multiclass_v1.csv', index=None, header=True)

df2.to_csv('DRC_labels_multilabel_v1.csv', index=None, header=True)

In [42]:
df_sample_train = df.sample(n=6)
df_sample_val = df.sample(n=2)
df2_sample_train = df2.sample(n=6)
df2_sample_val = df2.sample(n=2)

In [None]:
def df_to_csv(df, filename):
    df.to_csv(filename + '.csv', index=None, header=True)
    
df_to_csv(df_sample_train, 'DRC_train_multiclass_v1')
df_to_csv(df_sample_val, 'DRC_val_multiclass_v1')
df_to_csv(df2_sample_val, 'DRC_train_multilabel_v1')
df_to_csv(df)

In [24]:
df3 = df.copy()
def get_filename(path):
    split = path.split('/')
    filename = split[-1]
    split2 = filename.split('-')
    neutral_name = ('-').join(split2[1:])
    return neutral_name

df3['filenames'] = df3['paths'].apply(get_filename)

df3.head()

Unnamed: 0,0,paths,filenames
0,1,ISL/ISL-100-100-100_1000_1100-2020-12-15-2021-...,100-100-100_1000_1100-2020-12-15-2021-03-15.tif
1,1,ISL/ISL-100-100-100_1000_1500-2020-12-15-2021-...,100-100-100_1000_1500-2020-12-15-2021-03-15.tif
2,1,ISL/ISL-100-100-100_1100_1000-2020-12-15-2021-...,100-100-100_1100_1000-2020-12-15-2021-03-15.tif
3,1,ISL/ISL-100-100-100_1100_1100-2020-12-15-2021-...,100-100-100_1100_1100-2020-12-15-2021-03-15.tif
4,1,ISL/ISL-100-100-100_1100_900-2020-12-15-2021-0...,100-100-100_1100_900-2020-12-15-2021-03-15.tif


In [25]:
df3['filenames'].value_counts()

100-52-52_1400_2900-2020-12-15-2021-03-15.tif     1
100-81-81_6600_1700-2019-12-01-2020-02-29.tif     1
100-52-52_1200_3500-2019-12-01-2020-02-29.tif     1
100-81-81_5800_3400-2019-12-01-2020-02-29.tif     1
100-92-92_4900_3000-2019-12-01-2020-02-29.tif     1
                                                 ..
100-81-81_7800_5800-2019-12-01-2020-02-29.tif     1
100-49-49_800_2600-2019-12-01-2020-02-29.tif      1
100-82-82_1300_2100-2020-12-15-2021-03-15.tif     1
100-81-81_5100_10300-2019-12-01-2020-02-29.tif    1
100-81-81_5300_10800-2019-12-01-2020-02-29.tif    1
Name: filenames, Length: 4135, dtype: int64

In [28]:
df4 = df2.copy()
df4['filenames'] = df2['paths'].apply(get_filename)

df4 = df4.groupby('filenames').sum().reset_index()

In [29]:
df4.head()

Unnamed: 0,filenames,0,1,2
0,100-100-100_1000_1100-2020-12-15-2021-03-15.tif,1,0,0
1,100-100-100_1000_1500-2020-12-15-2021-03-15.tif,1,0,0
2,100-100-100_1100_1000-2020-12-15-2021-03-15.tif,1,0,0
3,100-100-100_1100_1100-2020-12-15-2021-03-15.tif,1,0,0
4,100-100-100_1100_900-2020-12-15-2021-03-15.tif,1,0,0


In [31]:
df4 = df4[[0, 1, 2, 'filenames']]

df4.head()

Unnamed: 0,0,1,2,filenames
0,1,0,0,100-100-100_1000_1100-2020-12-15-2021-03-15.tif
1,1,0,0,100-100-100_1000_1500-2020-12-15-2021-03-15.tif
2,1,0,0,100-100-100_1100_1000-2020-12-15-2021-03-15.tif
3,1,0,0,100-100-100_1100_1100-2020-12-15-2021-03-15.tif
4,1,0,0,100-100-100_1100_900-2020-12-15-2021-03-15.tif


## merge dataframes

In [12]:
test_1 = pd.DataFrame(data={'col1': [1,2,3], 'col2': [4,5,6]})

test_2 = pd.DataFrame(data={'col2': [5,6,4], 'col3': [8,9,10]})

test_1.merge(test_2, on='col2')

Unnamed: 0,col1,col2,col3
0,1,4,10
1,2,5,8
2,3,6,9


In [13]:
label_df.loc[0, 'filepaths']

'/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Chips/misha_polygons_cloudfreemerge/yes/ISL/100/59/59_800_2800.tif'

In [14]:
df_val_s3.loc[0, 's3_paths']

'chips/cloudfree-merge-polygons/split/val/1/1_1000_1000.tif'

In [15]:
def get_filename(path):
    return path.split('/')[-1]

label_df['filenames'] = label_df['filepaths'].apply(get_filename)
df_val_s3['filenames'] = df_val_s3['s3_paths'].apply(get_filename)

In [16]:
label_df.head()

Unnamed: 0,filepaths,labels,lengths,filenames
0,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...,[ISL],1,59_800_2800.tif
1,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...,[ISL],1,59_900_2800.tif
2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...,[ISL],1,59_900_2900.tif
3,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...,[ISL],1,59_1000_2900.tif
4,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...,[ISL],1,59_1000_3000.tif


In [17]:
df_val_s3.head()

Unnamed: 0,s3_paths,filenames
0,chips/cloudfree-merge-polygons/split/val/1/1_1...,1_1000_1000.tif
1,chips/cloudfree-merge-polygons/split/val/1/1_1...,1_1000_1100.tif
2,chips/cloudfree-merge-polygons/split/val/1/1_1...,1_1000_1200.tif
3,chips/cloudfree-merge-polygons/split/val/1/1_1...,1_1000_1300.tif
4,chips/cloudfree-merge-polygons/split/val/1/1_1...,1_1000_1400.tif


In [18]:
merge_df_val = df_val_s3.merge(label_df, on='filenames')

merge_df_val.head()

Unnamed: 0,s3_paths,filenames,filepaths,labels,lengths
0,chips/cloudfree-merge-polygons/split/val/1/1_1...,1_1000_1000.tif,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...,"[ISL, Rainforest]",2
1,chips/cloudfree-merge-polygons/split/val/1/1_1...,1_1000_1100.tif,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...,[Rainforest],1
2,chips/cloudfree-merge-polygons/split/val/1/1_1...,1_1000_1200.tif,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...,[Rainforest],1
3,chips/cloudfree-merge-polygons/split/val/1/1_1...,1_1000_1300.tif,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...,[Rainforest],1
4,chips/cloudfree-merge-polygons/split/val/1/1_1...,1_1000_1400.tif,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...,[Rainforest],1


In [19]:
assert len(merge_df_val) == len(val_chips)

In [20]:
from sklearn.preprocessing import MultiLabelBinarizer


mlb = MultiLabelBinarizer()

val_ohe = mlb.fit_transform(merge_df_val['labels'])

val_ohe

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [22]:
mlb.classes_

array(['Habitation', 'ISL', 'Industrial_agriculture', 'Mining',
       'Rainforest', 'River', 'Roads', 'Savannah', 'Shifting_cultivation',
       'Water'], dtype=object)

In [21]:
list(val_ohe)[0]

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 0])

In [23]:
len(val_ohe)

6795

In [26]:
with open('val.lst', 'w') as lst:
    for i in range(len(merge_df_val)):
        lst.write(str(i) + '\t')
        for label in list(val_ohe)[i]:
            lst.write(str(label) + '\t')
        lst.write(merge_df_val.loc[i, 's3_paths'])
        lst.write('\n')

## the function

In [27]:
from sklearn.preprocessing import MultiLabelBinarizer


def make_lst_file(label_df, chips, lst_path):
    s3_df = pd.DataFrame(data={'s3_paths': chips})
    
    def get_filename(path):
        return path.split('/')[-1]

    label_df['filenames'] = label_df['filepaths'].apply(get_filename)
    s3_df['filenames'] = df_val_s3['s3_paths'].apply(get_filename)
    
    merge_df = s3_df.merge(label_df, on='filenames')
    
    mlb = MultiLabelBinarizer()
    ohe = mlb.fit_transform(merge_df_val['labels'])

    with open(lst_path, 'w') as lst:
        for i in range(len(merge_df)):
            lst.write(str(i) + '\t')
            for label in list(ohe)[i]:
                lst.write(str(label) + '\t')
            lst.write(merge_df.loc[i, 's3_paths'])
            lst.write('\n')
            
    return mlb.classes_

In [28]:
make_lst_file(label_df, val_chips, 'val_2.lst')

array(['Habitation', 'ISL', 'Industrial_agriculture', 'Mining',
       'Rainforest', 'River', 'Roads', 'Savannah', 'Shifting_cultivation',
       'Water'], dtype=object)

## label json

In [1]:
import json


label_list = ['Habitation', 'ISL', 'Industrial_agriculture', 'Mining',
    'Rainforest', 'River', 'Roads', 'Savannah', 'Shifting_cultivation',
    'Water'
]

label_dict = {
    'label_names': {}
}

In [2]:
for i, label in enumerate(label_list, 1):
    label_dict['label_names'][i] = label

In [3]:
label_dict

{'label_names': {1: 'Habitation',
  2: 'ISL',
  3: 'Industrial_agriculture',
  4: 'Mining',
  5: 'Rainforest',
  6: 'River',
  7: 'Roads',
  8: 'Savannah',
  9: 'Shifting_cultivation',
  10: 'Water'}}

In [4]:
with open('labels.json', 'w') as json_file:
    json.dump(label_dict, json_file)