In [1]:
import boto3


s3 = boto3.resource('s3')

bucket_name = 'canopy-production-ml'

pc_bucket = s3.Bucket(bucket_name)

In [2]:
all_objects = pc_bucket.objects.all()

In [15]:
yes_chips = []
for obj in all_objects:
    if 'cloudfree-merge-polygons/yes' in obj.key:
        yes_chips.append(obj)
        
len(yes_chips)

49133

In [4]:
yes_chips[0].key

'chips/cloudfree-merge-polygons/yes/Habitation/100/101/101_1000_3000.tif'

In [5]:
key = yes_chips[0].key

key.split('/')

['chips',
 'cloudfree-merge-polygons',
 'yes',
 'Habitation',
 '100',
 '101',
 '101_1000_3000.tif']

In [6]:
split = key.split('/')
split[2] = 'dataset_v2'

'/'.join(split)

'chips/cloudfree-merge-polygons/dataset_v2/Habitation/100/101/101_1000_3000.tif'

In [13]:
total = len(yes_chips)

for i, chip in enumerate(yes_chips):
    print(f'Copying chip {i+1} of {total}', end='\r', flush=True)
    
    key = chip.key

    copy_source = {
          'Bucket': bucket_name,
          'Key': key
        }

    key_split = key.split('/')
    key_split[2] = 'dataset_v2'
    new_key = '/'.join(key_split)
    
    pc_bucket.copy(copy_source, new_key)

Copying chip 49133 of 49133

In [11]:
print(end - start)

37.38832712173462


In [12]:
all_chips = []
for obj in all_objects:
    if 'cloudfree-merge-polygons/full' in obj.key:
        all_chips.append(obj)
        
len(all_chips)

128161

In [16]:
yes_filenames = [chip.key.split('/')[-1] for chip in yes_chips]

yes_filenames[:10]

['101_1000_3000.tif',
 '101_1000_3100.tif',
 '101_1000_400.tif',
 '101_1000_500.tif',
 '101_1100_2600.tif',
 '101_1100_300.tif',
 '101_1100_3000.tif',
 '101_1100_3100.tif',
 '101_1100_400.tif',
 '101_1100_500.tif']

In [17]:
all_chips[0].key

'chips/cloudfree-merge-polygons/full/1/100/1_0_0.tif'

In [10]:
all_chips[0].key[-3:]

'tif'

In [17]:
no_chips = []

total = len(all_chips)

for i, chip in enumerate(all_chips):
    filename = chip.key.split('/')[-1]
    if filename not in yes_filenames:
        no_chips.append(chip)
        
len(no_chips)

86390

In [16]:
import collections


count = collections.Counter(yes_filenames)

duplicates = [i for i in count if count[i] >= 2]

duplicates

['101_1000_3100.tif',
 '101_1000_400.tif',
 '101_1000_500.tif',
 '101_1100_300.tif',
 '101_1100_3000.tif',
 '101_1100_3100.tif',
 '101_1100_400.tif',
 '101_1100_500.tif',
 '101_1200_2600.tif',
 '101_1200_300.tif',
 '101_1200_3300.tif',
 '101_1200_3400.tif',
 '101_1200_3500.tif',
 '101_1200_3600.tif',
 '101_1200_400.tif',
 '101_1300_1600.tif',
 '101_1300_1800.tif',
 '101_1300_3500.tif',
 '101_1300_3600.tif',
 '101_1400_1600.tif',
 '101_1400_1700.tif',
 '101_1400_2700.tif',
 '101_1500_2200.tif',
 '101_1500_2300.tif',
 '101_1500_2700.tif',
 '101_1500_2800.tif',
 '101_1500_3900.tif',
 '101_1500_4000.tif',
 '101_1500_4300.tif',
 '101_1500_800.tif',
 '101_1600_1500.tif',
 '101_1600_1600.tif',
 '101_1600_2000.tif',
 '101_1600_2300.tif',
 '101_1600_3400.tif',
 '101_1600_3500.tif',
 '101_1600_4300.tif',
 '101_1600_500.tif',
 '101_1600_800.tif',
 '101_1700_1500.tif',
 '101_1700_1600.tif',
 '101_1700_200.tif',
 '101_1700_2300.tif',
 '101_1700_2400.tif',
 '101_1700_2500.tif',
 '101_1700_300.tif',


In [13]:
odd_keys

[]

In [20]:
assert len(yes_chips) + len(no_chips) == len(all_chips)

AssertionError: 

In [14]:
len(yes_chips) + len(no_chips)

135523

In [15]:
len(all_chips)

128161

In [6]:
chip_filenames = [chip.key.split('/')[-1] for chip in all_chips]

missing = []

total = len(yes_chips)

for i, chip in enumerate(yes_chips):
    print(f'Chip {i+1} of {total}', end='\r', flush=True)
    
    if chip.key.split('/')[-1] not in chip_filenames:
        missing.append(chip.key)
        
len(missing)

Chip 12345 of 49133

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Chip 49133 of 49133

5

In [7]:
missing

['chips/cloudfree-merge-polygons/yes/ISL/100/1/1_300_1000.tif.aux.xml',
 'chips/cloudfree-merge-polygons/yes/ISL/100/1/1_300_1100.tif.aux.xml',
 'chips/cloudfree-merge-polygons/yes/ISL/100/1/1_400_700.tif.aux.xml',
 'chips/cloudfree-merge-polygons/yes/ISL/100/1/1_400_800.tif.aux.xml',
 'chips/cloudfree-merge-polygons/yes/ISL/100/1/1_400_900.tif.aux.xml']

In [24]:
len(no_chips)

86390

In [42]:
def boto3_copy_objs(keys, old_bucket_name, new_bucket,
                    folder_change_dict=None, folder_add_dict=None, folder_swap_dict=None):
    total = len(keys)
    
    for i, key in enumerate(keys):
        print(f'Copying object {i+1} of {total}', end='\r', flush=True)
        
        copy_source = {
            'Bucket': old_bucket_name,
            'Key': key
        }
        
        key_split = key.split('/')
        
        if folder_change_dict:
            for k in folder_change_dict:
                try:
                    key_split[k] = folder_change_dict[k]
                except:
                    raise ValueError('Error when applying the folder_change_dict')
                
        if folder_add_dict:
            for k in folder_add_dict:
                try:
                    key_split.insert(k, folder_add_dict[k])
                except:
                    raise ValueError('Error when applying the folder_add_dict')
                    
        if folder_swap_dict:
            for k in folder_swap_dict:
                i = folder_swap_dict[k]
                try:
                    key_split[k], key_split[i] = key_split[i], key_split[k]
                except:
                    raise ValueError('Error when applying the folder_swap_dict')
                
        new_key = '/'.join(key_split)
        
        #print(new_key)
        
        new_bucket.copy(copy_source, new_key)

In [18]:
no_chips[0].key

'chips/cloudfree-merge-polygons/full/1/100/1_0_0.tif'

In [21]:
boto3_copy_objs(no_chips, bucket_name, pc_bucket, {2: 'dataset_v2'}, {3: 'misc'}, {4: 5})

Copying object 86390 of 86390

In [22]:
keep_cats = ['ISL', 'Roads', 'Industrial_agriculture', 'Shifting_cultivation', 'Mining']

keep_filenames = []

for obj in all_objects:
    for category in keep_cats:
        if f'cloudfree-merge-polygons/dataset_v2/{category}' in obj.key:
            filename = obj.key.split('/')[-1]
            keep_filenames.append(filename)

In [23]:
len(keep_filenames)

14025

In [24]:
keep_filenames[0]

'1_1000_1000.tif'

In [33]:
remove_cats = ['River', 'Rainforest', 'Water', 'Savanna', 'Habitation']

move_chips = []

delete_chips = []

for obj in all_objects:
    for category in remove_cats:
        if f'cloudfree-merge-polygons/dataset_v2/{category}' in obj.key:
            filename = obj.key.split('/')[-1]
            if filename not in keep_filenames:
                move_chips.append(obj)
            else:
                delete_chips.append(obj)

In [34]:
len(move_chips) + len(delete_chips)

35108

In [35]:
import pandas as pd


data = {'Keys': [chip.key for chip in move_chips]}

df = pd.DataFrame(data=data)

df.head()

Unnamed: 0,Keys
0,chips/cloudfree-merge-polygons/dataset_v2/Habi...
1,chips/cloudfree-merge-polygons/dataset_v2/Habi...
2,chips/cloudfree-merge-polygons/dataset_v2/Habi...
3,chips/cloudfree-merge-polygons/dataset_v2/Habi...
4,chips/cloudfree-merge-polygons/dataset_v2/Habi...


In [36]:
df['Filenames'] = df['Keys'].apply(lambda x:x.split('/')[-1])

df.head()

Unnamed: 0,Keys,Filenames
0,chips/cloudfree-merge-polygons/dataset_v2/Habi...,101_1000_3000.tif
1,chips/cloudfree-merge-polygons/dataset_v2/Habi...,101_1000_3100.tif
2,chips/cloudfree-merge-polygons/dataset_v2/Habi...,101_1100_2600.tif
3,chips/cloudfree-merge-polygons/dataset_v2/Habi...,101_1100_3000.tif
4,chips/cloudfree-merge-polygons/dataset_v2/Habi...,101_1100_3100.tif


In [37]:
print(df.shape)

df2 = df.drop_duplicates(subset=['Filenames'])

print(df2.shape)

(30226, 2)
(28577, 2)


In [38]:
move_chips_no_dupes = df2['Keys'].tolist()

len(move_chips_no_dupes)

28577

In [39]:
move_chips_no_dupes[0]

'chips/cloudfree-merge-polygons/dataset_v2/Habitation/100/101/101_1000_3000.tif'

In [43]:
boto3_copy_objs(move_chips_no_dupes, bucket_name, pc_bucket, {3: 'misc'})

Copying object 28577 of 28577

In [44]:
delete_chips_full = move_chips + delete_chips

len(delete_chips_full)

35108

In [None]:
# delete_objs_list = [{'Key': chip.key for chip in delete_chips_full}]

In [None]:
#client = boto3.client('s3')

#response = client.delete_objects(
#    Bucket=bucket_name,
#    Delete=delete_objs_list
#)

### train/test split based off polygons

In [45]:
all_objects = pc_bucket.objects.all()

all_chips = []

for obj in all_objects:
    if 'cloudfree-merge-polygons/dataset_v2/' in obj.key:
        all_chips.append(obj)
        
print(len(all_chips))

print(all_chips[0].key)

128994
chips/cloudfree-merge-polygons/dataset_v2/


In [47]:
all_chips[1].key

'chips/cloudfree-merge-polygons/dataset_v2/ISL/100/1/1_1000_1000.tif'

In [48]:
assert 'chips/cloudfree-merge-polygons/dataset_v2/ISL/' in all_chips

AssertionError: 

In [49]:
chip_count = {}

for chip in all_chips:
    key = chip.key
    
    try:
        poly_id = key.split('/')[5]

        if poly_id in chip_count.keys():
            chip_count[poly_id] += 1
        else:
            chip_count[poly_id] = 1
            
    except:
        print(key)

chips/cloudfree-merge-polygons/dataset_v2/
chips/cloudfree-merge-polygons/dataset_v2/misc/


In [50]:
sum(chip_count.values())

128992

In [56]:
chip_count

{'1': 559,
 '100': 336,
 '11': 56,
 '13': 550,
 '15': 1400,
 '16': 418,
 '17': 799,
 '2': 262,
 '20': 1025,
 '21': 1152,
 '22': 1710,
 '24': 210,
 '25': 1240,
 '26': 256,
 '29': 1548,
 '3': 110,
 '30': 509,
 '32': 1591,
 '33': 845,
 '34': 100,
 '35': 304,
 '36': 792,
 '37': 899,
 '38': 500,
 '39': 1620,
 '40': 306,
 '41': 272,
 '42': 456,
 '43': 110,
 '44': 1824,
 '46': 210,
 '47': 182,
 '48': 1833,
 '5': 399,
 '53': 1185,
 '54': 90,
 '55': 3036,
 '59': 1189,
 '62': 195,
 '64': 994,
 '65': 294,
 '68': 961,
 '70': 420,
 '72': 1292,
 '73': 36,
 '74': 3658,
 '75': 4680,
 '76': 440,
 '77': 780,
 '79': 528,
 '82': 899,
 '83': 456,
 '84': 611,
 '85': 3424,
 '88': 858,
 '89': 2977,
 '90': 88,
 '91': 1184,
 '93': 211,
 '94': 48,
 '99': 456,
 '10': 2843,
 '27': 708,
 '31': 1440,
 '49': 1545,
 '52': 1411,
 '58': 832,
 '6': 450,
 '87': 48,
 '9': 1292,
 '4': 34,
 '61': 20,
 '63': 72,
 '69': 460,
 '8': 121,
 '86': 832,
 '101': 1980,
 '14': 812,
 '19': 1955,
 '45': 256,
 '56': 1233,
 '66': 1822,
 '9

In [58]:
import json


json_object = json.dumps(chip_count)

with open('chip_count.json', 'w') as fp:
    json.dump(chip_count, fp)

In [4]:
bucket_name = 'canopy-production-ml-output'
my_bucket = s3.Bucket(bucket_name)
path = 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/'

h5_files = [obj.key for obj in my_bucket.objects.filter(Prefix=path) if obj.key[-2:] == 'h5']

h5_files

['ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/last_chkpt.h5',
 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_1.h5',
 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_10.h5',
 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_11.h5',
 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_12.h5',
 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_13.h5',
 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_14.h5',
 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_15.h5',
 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_16.h5',
 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_17.h5',
 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_18.h5',
 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_19.h5',
 'ckpt/pc-tf-custom-container-2021-04-14

In [7]:
h5_files = h5_files[1:]

In [8]:
h5_files_dict = {}

for file in h5_files:
    key = int(file.split('_')[-1].split('.')[0])
    h5_files_dict[key] = file
    
h5_files_dict

{1: 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_1.h5',
 10: 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_10.h5',
 11: 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_11.h5',
 12: 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_12.h5',
 13: 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_13.h5',
 14: 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_14.h5',
 15: 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_15.h5',
 16: 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_16.h5',
 17: 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_17.h5',
 18: 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_18.h5',
 19: 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_19.h5',
 2: 'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resn

In [9]:
max_epoch = max(h5_files_dict)

h5_files_dict[max_epoch]

'ckpt/pc-tf-custom-container-2021-04-14-15-15-52-166/model_resnet_epoch_26.h5'

In [10]:
max(h5_files_dict.keys())

26