In [3]:
import boto3
import pandas as pd
import json

In [2]:
with open('train_test_polygons.json') as ttp:
    train_test_polygons = json.load(ttp)
    
train_test_polygons

{'train': [12,
  1,
  74,
  10,
  7,
  19,
  28,
  9,
  21,
  32,
  31,
  53,
  64,
  37,
  33,
  58,
  86,
  77,
  27,
  84,
  30,
  18,
  22,
  15,
  25,
  23,
  75,
  78,
  85,
  55,
  89,
  48,
  44,
  66,
  39,
  29,
  81,
  60,
  57,
  50,
  92,
  101],
 'test': [6,
  2,
  72,
  70,
  20,
  14,
  13,
  5,
  26,
  3,
  11,
  4,
  69,
  76,
  35,
  41,
  46,
  47,
  43,
  34,
  63,
  61,
  17,
  16,
  24,
  8,
  59,
  68,
  88,
  36,
  38,
  42,
  83,
  40,
  45,
  71,
  87,
  80,
  49,
  52,
  56,
  82,
  51,
  54,
  91,
  99,
  100,
  93,
  90,
  94,
  67,
  65,
  62,
  73,
  79]}

In [4]:
s3 = boto3.resource('s3')

bucket_name = 'canopy-production-ml'

pc_bucket = s3.Bucket(bucket_name)

all_objects = pc_bucket.objects.all()

In [5]:
all_chips = []

for obj in all_objects:
    if 'cloudfree-merge-polygons/dataset_v2' in obj.key:
        all_chips.append(obj)

In [7]:
def remove_duplicate_chips(keys):
    data = {'Keys': keys}
    
    df = pd.DataFrame(data=data)
    
    df['Filenames'] = df['Keys'].apply(lambda x:x.split('/')[-1])
    
    df = df.drop_duplicates(subset=['Filenames'])
    
    return df['Keys'].tolist()

In [16]:
poly_chip_dict = {}

for chip in all_chips:
    key = chip.key
    
    try:
        poly_id = int(key.split('/')[5])
        
        if poly_id in poly_chip_dict:
            poly_chip_dict[poly_id].append(key)
        else:
            poly_chip_dict[poly_id] = [key]
            
    except:
        print(key)

chips/cloudfree-merge-polygons/dataset_v2/
chips/cloudfree-merge-polygons/dataset_v2/misc/


In [17]:
len(poly_chip_dict)

97

In [18]:
len(poly_chip_dict[1])

559

In [19]:
poly_chip_dict[1][0]

'chips/cloudfree-merge-polygons/dataset_v2/ISL/100/1/1_1000_1000.tif'

In [20]:
labels = ['Industrial_agriculture', 'ISL', 'Mining', 'Roads', 'Shifting_cultivation']

test_chip_labels = {}

for test_poly in train_test_polygons['test']:
    chips = poly_chip_dict[test_poly]
    
    for chip in chips:
        filename = chip.split('/')[-1]
        
        if filename not in test_chip_labels:
            chip_labels = [0,0,0,0,0]
            
            for i in range(5):
                if labels[i] in chip:
                    chip_labels[i] = 1
                    
            test_chip_labels[filename] = {'labels': chip_labels, 'key': chip}
            
        else:
            for i in range(5):
                if labels[i] in chip:
                    test_chip_labels[filename]['labels'][i] = 1

In [22]:
list(test_chip_labels.keys())[:5]

['6_1000_1000.tif',
 '6_1000_1100.tif',
 '6_1000_1200.tif',
 '6_1000_1300.tif',
 '6_1000_600.tif']

In [23]:
test_chip_labels['6_1000_1000.tif']

{'labels': [1, 0, 0, 1, 0],
 'key': 'chips/cloudfree-merge-polygons/dataset_v2/Industrial_agriculture/100/6/6_1000_1000.tif'}

In [24]:
test_data = {
    0: [],
    1: [],
    2: [],
    3: [],
    4: [],
    'paths': []
}

for filename in test_chip_labels:
    chip_labels = test_chip_labels[filename]['labels']
    
    path = test_chip_labels[filename]['key']
    
    for i in range(5):
        test_data[i].append(chip_labels[i])
        
    test_data['paths'].append(path)
    
test_labels_df = pd.DataFrame(data=test_data)

test_labels_df.head()

Unnamed: 0,0,1,2,3,4,paths
0,1,0,0,1,0,chips/cloudfree-merge-polygons/dataset_v2/Indu...
1,1,0,0,1,0,chips/cloudfree-merge-polygons/dataset_v2/Indu...
2,1,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/Indu...
3,1,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/Indu...
4,1,0,0,1,0,chips/cloudfree-merge-polygons/dataset_v2/Indu...


In [25]:
test_labels_df.to_csv('new_test_labels_v1.csv', index=None, header=True)

In [26]:
labels

['Industrial_agriculture', 'ISL', 'Mining', 'Roads', 'Shifting_cultivation']

In [28]:
def get_chip_labels(chips, label_names, labels_dict):
    for chip in chips:
        filename = chip.split('/')[-1]
        
        if filename not in labels_dict:
            chip_labels = [0,0,0,0,0]
            
            for i in range(5):
                if label_names[i] in chip:
                    chip_labels[i] = 1
                    
            labels_dict[filename] = {'labels': chip_labels, 'key': chip}
            
        else:
            for i in range(5):
                if label_names[i] in chip:
                    labels_dict[filename]['labels'][i] = 1

In [29]:
labels

['Industrial_agriculture', 'ISL', 'Mining', 'Roads', 'Shifting_cultivation']

In [45]:
import random


train_chip_labels = {}
val_chip_labels = {}

for train_poly in train_test_polygons['train']:
    chips = poly_chip_dict[train_poly]
    
    val_slice = len(chips) // 5
    random.shuffle(chips)
    val_chips = chips[:val_slice]
    train_chips = chips[val_slice:]
    
    get_chip_labels(train_chips, labels, train_chip_labels)
    get_chip_labels(val_chips, labels, val_chip_labels)

In [46]:
len(train_chip_labels)

79499

In [47]:
len(val_chip_labels)

19905

In [48]:
list(train_chip_labels.keys())[:5]

['12_2200_4700.tif',
 '12_4100_4400.tif',
 '12_2700_1500.tif',
 '12_200_2600.tif',
 '12_2000_5200.tif']

In [49]:
train_chip_labels['12_1700_2800.tif']

{'labels': [0, 0, 0, 0, 0],
 'key': 'chips/cloudfree-merge-polygons/dataset_v2/misc/100/12/12_1700_2800.tif'}

In [50]:
def make_labels_df(labels_dict):
    data = {
        0: [],
        1: [],
        2: [],
        3: [],
        4: [],
        'paths': []
    }

    for filename in labels_dict:
        chip_labels = labels_dict[filename]['labels']

        path = labels_dict[filename]['key']

        for i in range(5):
            data[i].append(chip_labels[i])

        data['paths'].append(path)

    labels_df = pd.DataFrame(data=data)

    return labels_df

In [51]:
train_labels_df = make_labels_df(train_chip_labels)
val_labels_df = make_labels_df(val_chip_labels)

train_labels_df.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
1,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
2,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
3,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
4,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...


In [52]:
val_labels_df.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
1,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
2,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
3,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
4,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...


In [53]:
train_labels_df.describe()

Unnamed: 0,0,1,2,3,4
count,79499.0,79499.0,79499.0,79499.0,79499.0
mean,0.011271,0.044038,0.001598,0.009371,0.028101
std,0.105564,0.205182,0.039937,0.096351,0.165262
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0


In [54]:
val_labels_df.describe()

Unnamed: 0,0,1,2,3,4
count,19905.0,19905.0,19905.0,19905.0,19905.0
mean,0.012208,0.045416,0.001457,0.010249,0.028636
std,0.109816,0.208219,0.038143,0.100718,0.166785
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0


In [55]:
train_labels_df.to_csv('new_train_labels_v1.csv', index=None, header=True)
val_labels_df.to_csv('new_val_labels_v1.csv', index=None, header=True)