In [1]:
import boto3
import pandas as pd
import json

In [2]:
with open('train_test_polygons.json') as ttp:
    train_test_polygons = json.load(ttp)
    
train_test_polygons

{'train': [12,
  1,
  74,
  10,
  7,
  19,
  28,
  9,
  21,
  32,
  31,
  53,
  64,
  37,
  33,
  58,
  86,
  77,
  27,
  84,
  30,
  18,
  22,
  15,
  25,
  23,
  75,
  78,
  85,
  55,
  89,
  48,
  44,
  66,
  39,
  29,
  81,
  60,
  57,
  50,
  92,
  101],
 'test': [6,
  2,
  72,
  70,
  20,
  14,
  13,
  5,
  26,
  3,
  11,
  4,
  69,
  76,
  35,
  41,
  46,
  47,
  43,
  34,
  63,
  61,
  17,
  16,
  24,
  8,
  59,
  68,
  88,
  36,
  38,
  42,
  83,
  40,
  45,
  71,
  87,
  80,
  49,
  52,
  56,
  82,
  51,
  54,
  91,
  99,
  100,
  93,
  90,
  94,
  67,
  65,
  62,
  73,
  79]}

In [4]:
s3 = boto3.resource('s3')

bucket_name = 'canopy-production-ml'

pc_bucket = s3.Bucket(bucket_name)

all_objects = pc_bucket.objects.all()

In [5]:
all_chips = []

for obj in all_objects:
    if 'cloudfree-merge-polygons/dataset_v2' in obj.key:
        all_chips.append(obj)

In [7]:
def remove_duplicate_chips(keys):
    data = {'Keys': keys}
    
    df = pd.DataFrame(data=data)
    
    df['Filenames'] = df['Keys'].apply(lambda x:x.split('/')[-1])
    
    df = df.drop_duplicates(subset=['Filenames'])
    
    return df['Keys'].tolist()

In [16]:
poly_chip_dict = {}

for chip in all_chips:
    key = chip.key
    
    try:
        poly_id = int(key.split('/')[5])
        
        if poly_id in poly_chip_dict:
            poly_chip_dict[poly_id].append(key)
        else:
            poly_chip_dict[poly_id] = [key]
            
    except:
        print(key)

chips/cloudfree-merge-polygons/dataset_v2/
chips/cloudfree-merge-polygons/dataset_v2/misc/


In [17]:
len(poly_chip_dict)

97

In [18]:
len(poly_chip_dict[1])

559

In [19]:
poly_chip_dict[1][0]

'chips/cloudfree-merge-polygons/dataset_v2/ISL/100/1/1_1000_1000.tif'

In [20]:
labels = ['Industrial_agriculture', 'ISL', 'Mining', 'Roads', 'Shifting_cultivation']

test_chip_labels = {}

for test_poly in train_test_polygons['test']:
    chips = poly_chip_dict[test_poly]
    
    for chip in chips:
        filename = chip.split('/')[-1]
        
        if filename not in test_chip_labels:
            chip_labels = [0,0,0,0,0]
            
            for i in range(5):
                if labels[i] in chip:
                    chip_labels[i] = 1
                    
            test_chip_labels[filename] = {'labels': chip_labels, 'key': chip}
            
        else:
            for i in range(5):
                if labels[i] in chip:
                    test_chip_labels[filename]['labels'][i] = 1

In [22]:
list(test_chip_labels.keys())[:5]

['6_1000_1000.tif',
 '6_1000_1100.tif',
 '6_1000_1200.tif',
 '6_1000_1300.tif',
 '6_1000_600.tif']

In [23]:
test_chip_labels['6_1000_1000.tif']

{'labels': [1, 0, 0, 1, 0],
 'key': 'chips/cloudfree-merge-polygons/dataset_v2/Industrial_agriculture/100/6/6_1000_1000.tif'}

In [24]:
test_data = {
    0: [],
    1: [],
    2: [],
    3: [],
    4: [],
    'paths': []
}

for filename in test_chip_labels:
    chip_labels = test_chip_labels[filename]['labels']
    
    path = test_chip_labels[filename]['key']
    
    for i in range(5):
        test_data[i].append(chip_labels[i])
        
    test_data['paths'].append(path)
    
test_labels_df = pd.DataFrame(data=test_data)

test_labels_df.head()

Unnamed: 0,0,1,2,3,4,paths
0,1,0,0,1,0,chips/cloudfree-merge-polygons/dataset_v2/Indu...
1,1,0,0,1,0,chips/cloudfree-merge-polygons/dataset_v2/Indu...
2,1,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/Indu...
3,1,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/Indu...
4,1,0,0,1,0,chips/cloudfree-merge-polygons/dataset_v2/Indu...


In [25]:
test_labels_df.to_csv('new_test_labels_v1.csv', index=None, header=True)

In [26]:
labels

['Industrial_agriculture', 'ISL', 'Mining', 'Roads', 'Shifting_cultivation']

In [28]:
def get_chip_labels(chips, label_names, labels_dict):
    for chip in chips:
        filename = chip.split('/')[-1]
        
        if filename not in labels_dict:
            chip_labels = [0,0,0,0,0]
            
            for i in range(5):
                if label_names[i] in chip:
                    chip_labels[i] = 1
                    
            labels_dict[filename] = {'labels': chip_labels, 'key': chip}
            
        else:
            for i in range(5):
                if label_names[i] in chip:
                    labels_dict[filename]['labels'][i] = 1

In [29]:
labels

['Industrial_agriculture', 'ISL', 'Mining', 'Roads', 'Shifting_cultivation']

In [45]:
import random


train_chip_labels = {}
val_chip_labels = {}

for train_poly in train_test_polygons['train']:
    chips = poly_chip_dict[train_poly]
    
    val_slice = len(chips) // 5
    random.shuffle(chips)
    val_chips = chips[:val_slice]
    train_chips = chips[val_slice:]
    
    get_chip_labels(train_chips, labels, train_chip_labels)
    get_chip_labels(val_chips, labels, val_chip_labels)

In [46]:
len(train_chip_labels)

79499

In [47]:
len(val_chip_labels)

19905

In [48]:
list(train_chip_labels.keys())[:5]

['12_2200_4700.tif',
 '12_4100_4400.tif',
 '12_2700_1500.tif',
 '12_200_2600.tif',
 '12_2000_5200.tif']

In [49]:
train_chip_labels['12_1700_2800.tif']

{'labels': [0, 0, 0, 0, 0],
 'key': 'chips/cloudfree-merge-polygons/dataset_v2/misc/100/12/12_1700_2800.tif'}

In [50]:
def make_labels_df(labels_dict):
    data = {
        0: [],
        1: [],
        2: [],
        3: [],
        4: [],
        'paths': []
    }

    for filename in labels_dict:
        chip_labels = labels_dict[filename]['labels']

        path = labels_dict[filename]['key']

        for i in range(5):
            data[i].append(chip_labels[i])

        data['paths'].append(path)

    labels_df = pd.DataFrame(data=data)

    return labels_df

In [51]:
train_labels_df = make_labels_df(train_chip_labels)
val_labels_df = make_labels_df(val_chip_labels)

train_labels_df.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
1,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
2,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
3,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
4,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...


In [52]:
val_labels_df.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
1,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
2,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
3,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
4,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...


In [53]:
train_labels_df.describe()

Unnamed: 0,0,1,2,3,4
count,79499.0,79499.0,79499.0,79499.0,79499.0
mean,0.011271,0.044038,0.001598,0.009371,0.028101
std,0.105564,0.205182,0.039937,0.096351,0.165262
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0


In [54]:
val_labels_df.describe()

Unnamed: 0,0,1,2,3,4
count,19905.0,19905.0,19905.0,19905.0,19905.0
mean,0.012208,0.045416,0.001457,0.010249,0.028636
std,0.109816,0.208219,0.038143,0.100718,0.166785
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0


In [55]:
train_labels_df.to_csv('new_train_labels_v1.csv', index=None, header=True)
val_labels_df.to_csv('new_val_labels_v1.csv', index=None, header=True)

### duplicate ISL rows

In [51]:
import pandas as pd


df = pd.read_csv('new_train_labels_v1.csv')

df.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
1,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
2,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
3,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
4,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...


In [52]:
xml_rows = df[df['paths'].str.contains('xml')]

xml_rows

Unnamed: 0,0,1,2,3,4,paths
2235,0,1,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/ISL/...
2380,0,1,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/ISL/...


In [53]:
for path in xml_rows['paths']:
    print(path)

chips/cloudfree-merge-polygons/dataset_v2/ISL/100/1/1_300_1000.tif.aux.xml
chips/cloudfree-merge-polygons/dataset_v2/ISL/100/1/1_300_1100.tif.aux.xml


In [14]:
df = df.drop(xml_rows.index.tolist())

df[df['paths'].str.contains('xml')]

Unnamed: 0,0,1,2,3,4,paths


In [15]:
df = df.reset_index(drop=True)

In [16]:
df.columns

Index(['0', '1', '2', '3', '4', 'paths'], dtype='object')

In [17]:
labels = ['Industrial_agriculture', 'ISL', 'Mining', 'Roads', 'Shifting_cultivation']

for i in range(5):
    print(labels[i])
    print(df[str(i)].value_counts())

Industrial_agriculture
0    78601
1      896
Name: 0, dtype: int64
ISL
0    75998
1     3499
Name: 1, dtype: int64
Mining
0    79370
1      127
Name: 2, dtype: int64
Roads
0    78752
1      745
Name: 3, dtype: int64
Shifting_cultivation
0    77263
1     2234
Name: 4, dtype: int64


In [18]:
len(df)

79497

In [19]:
yes_labels = 0

for i in range(len(df)):
    for j in range(5):
        if df.loc[i, str(j)] == 1:
            yes_labels += 1
            break

In [20]:
yes_labels

7237

In [21]:
no_labels = len(df) - yes_labels
no_labels

72260

In [22]:
labels = ['Industrial_agriculture', 'ISL', 'Mining', 'Roads', 'Shifting_cultivation']

label_count = {
    'Industrial_agriculture': 0,
    'ISL': 0,
    'Mining': 0,
    'Roads': 0,
    'Shifting_cultivation': 0
}

for i in range(len(df)):
    for j in range(5):
        if df.loc[i, str(j)] == 1:
            lbl = labels[j]
            label_count[lbl] += 1

In [23]:
label_count

{'Industrial_agriculture': 896,
 'ISL': 3499,
 'Mining': 127,
 'Roads': 745,
 'Shifting_cultivation': 2234}

In [24]:
lbl_products = {}

non_isl_count = 0

for lbl in labels:
    if lbl != 'ISL':
        non_isl_count += label_count[lbl]
        
non_isl_tot_prod = no_labels // non_isl_count

non_isl_tot_prod

18

In [25]:
non_isl_count

4002

In [26]:
mult_by_18_tot = 0

for lbl in labels:
    if lbl != 'ISL':
        mult_by_18_tot += (label_count[lbl] * 18)
        
mult_by_18_tot

72036

In [27]:
no_labels // label_count['ISL']

20

In [28]:
only_nulls = df[
    (df['0'] == 0) & (df['1'] == 0) & (df['2'] == 0) & (df['3'] == 0) & (df['4'] == 0)
]

len(only_nulls)

72260

In [29]:
import numpy as np


df2 = only_nulls.copy()

for i in range(5):
    only_that_label = df[df[str(i)] == 1]
    
    if i == 1:
        mult = 20
    else:
        mult = 18
        
    dupes = pd.DataFrame(np.repeat(only_that_label.values, mult, axis=0), columns=df.columns)
    
    df2 = df2.append(dupes)
    
df2 = df2.reset_index(drop=True)

In [30]:
df2.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
1,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
2,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
3,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
4,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...


In [31]:
df2.tail()

Unnamed: 0,0,1,2,3,4,paths
214271,0,0,0,0,1,chips/cloudfree-merge-polygons/dataset_v2/Shif...
214272,0,0,0,0,1,chips/cloudfree-merge-polygons/dataset_v2/Shif...
214273,0,0,0,0,1,chips/cloudfree-merge-polygons/dataset_v2/Shif...
214274,0,0,0,0,1,chips/cloudfree-merge-polygons/dataset_v2/Shif...
214275,0,0,0,0,1,chips/cloudfree-merge-polygons/dataset_v2/Shif...


In [32]:
len(only_nulls) * 3

216780

In [33]:
df2[df2['paths'].str.contains('xml')]

Unnamed: 0,0,1,2,3,4,paths


In [29]:
df2.to_csv('new_train_labels_v2.csv', index=None, header=True)

In [34]:
df2.loc[0,'paths'].split('/')

['chips',
 'cloudfree-merge-polygons',
 'dataset_v2',
 'misc',
 '100',
 '12',
 '12_2200_4700.tif']

In [35]:
df3 = df2.copy()

df3['paths'] = df3['paths'].apply(lambda x: '/'.join(x.split('/')[3:]))

df3.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,misc/100/12/12_2200_4700.tif
1,0,0,0,0,0,misc/100/12/12_4100_4400.tif
2,0,0,0,0,0,misc/100/12/12_2700_1500.tif
3,0,0,0,0,0,misc/100/12/12_200_2600.tif
4,0,0,0,0,0,misc/100/12/12_2000_5200.tif


In [36]:
len(df3)

214276

In [37]:
df3.to_csv('new_train_labels_v4.csv', index=None, header=True)

In [38]:
val_df = pd.read_csv('new_val_labels_v1.csv')

val_df.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
1,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
2,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
3,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...
4,0,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/misc...


In [41]:
xml_rows = val_df[val_df['paths'].str.contains('xml')]

xml_rows

Unnamed: 0,0,1,2,3,4,paths
540,0,1,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/ISL/...
555,0,1,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/ISL/...
583,0,1,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/ISL/...


In [44]:
for path in xml_rows['paths']:
    print(path)

chips/cloudfree-merge-polygons/dataset_v2/ISL/100/1/1_400_800.tif.aux.xml
chips/cloudfree-merge-polygons/dataset_v2/ISL/100/1/1_400_700.tif.aux.xml
chips/cloudfree-merge-polygons/dataset_v2/ISL/100/1/1_400_900.tif.aux.xml


In [42]:
val_df = val_df.drop(xml_rows.index.tolist())
val_df = val_df.reset_index(drop=True)

val_df[val_df['paths'].str.contains('xml')]

Unnamed: 0,0,1,2,3,4,paths


In [45]:
def change_path(x):
    return '/'.join(x.split('/')[3:])

In [46]:
val_df['paths'] = val_df['paths'].apply(change_path)
val_df.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,misc/100/12/12_2800_2200.tif
1,0,0,0,0,0,misc/100/12/12_1800_0.tif
2,0,0,0,0,0,misc/100/12/12_3200_3800.tif
3,0,0,0,0,0,misc/100/12/12_4500_4100.tif
4,0,0,0,0,0,misc/100/12/12_900_4200.tif


In [47]:
val_df.shape

(19902, 6)

In [48]:
val_df.to_csv('new_val_labels_v3.csv', index=None, header=True)

In [40]:
df5 = pd.read_csv('new_train_labels_v3.csv')

df5.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,misc/100/12/12_2200_4700.tif
1,0,0,0,0,0,misc/100/12/12_4100_4400.tif
2,0,0,0,0,0,misc/100/12/12_2700_1500.tif
3,0,0,0,0,0,misc/100/12/12_200_2600.tif
4,0,0,0,0,0,misc/100/12/12_2000_5200.tif


## remove xml files

In [2]:
df_train = pd.read_csv('new_train_labels_v4.csv')
df_val = pd.read_csv('new_val_labels_v3.csv')
df_test = pd.read_csv('new_test_labels_v1.csv')

In [50]:
df_xml = pd.DataFrame()

for df in [df_train, df_val, df_test]:
    xml_rows = df[df['paths'].str.contains('xml')]
    df_xml = pd.concat([df_xml, xml_rows])
    
df_xml

Unnamed: 0,0,1,2,3,4,paths


In [5]:
len(df_val[df_val['1'] == 1]) / len(df_val)

0.04527183197668576

In [6]:
len(df_test[df_test['1'] == 1]) / len(df_test)

0.08869739756367663

In [1]:
labels = ['Industrial_agriculture', 'ISL', 'Mining', 'Roads', 'Shifting_cultivation']

def create_duplicate_rows(df, labels):

    yes_labels = 0

    for i in range(len(df)):
        for j in range(len(labels)):
            if df.loc[i, str(j)] == 1:
                yes_labels += 1
                break

    no_labels = len(df) - yes_labels

    label_count = {}

    for label in labels:
        label_count[label] = 0

    for i in range(len(df)):
        for j in range(len(labels)):
            if df.loc[i, str(j)] == 1:
                lbl = labels[j]
                label_count[lbl] += 1

    non_isl_count = 0

    for lbl in labels:
        if lbl != 'ISL':
            non_isl_count += label_count[lbl]

    non_isl_tot_prod = no_labels // non_isl_count

    isl_prod = no_labels // label_count['ISL']

    only_nulls = df[
        (df['0'] == 0) & (df['1'] == 0) & (df['2'] == 0) & (df['3'] == 0) & (df['4'] == 0)
    ]

    df2 = only_nulls.copy()

    for i in range(len(labels)):
        only_that_label = df[df[str(i)] == 1]

        if labels[i] == 'ISL':
            mult = isl_prod
        else:
            mult = non_isl_tot_prod

        dupes = pd.DataFrame(np.repeat(only_that_label.values, mult, axis=0), columns=df.columns)

        df2 = df2.append(dupes)

    df2 = df2.reset_index(drop=True)
    
    return df2

In [8]:
import numpy as np


new_val_df = create_duplicate_rows(df_val, labels)

new_val_df.shape

(52872, 6)

In [9]:
df_val.shape

(19902, 6)

In [10]:
len(new_val_df[new_val_df['1'] == 1]) / len(new_val_df)

0.3237819639885005

In [11]:
new_val_df.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,misc/100/12/12_2800_2200.tif
1,0,0,0,0,0,misc/100/12/12_1800_0.tif
2,0,0,0,0,0,misc/100/12/12_3200_3800.tif
3,0,0,0,0,0,misc/100/12/12_4500_4100.tif
4,0,0,0,0,0,misc/100/12/12_900_4200.tif


In [12]:
new_val_df.to_csv('new_val_labels_v4.csv', index=None, header=True)

In [13]:
new_test_df = create_duplicate_rows(df_test, labels)

new_test_df.to_csv('new_test_labels_v2.csv', index=None, header=True)

In [14]:
len(new_test_df[new_test_df['1'] == 1]) / len(new_test_df)

0.33111763151673596

In [15]:
import json


labels_json = {'label_names': {}}

for i, label in enumerate(labels, 1):
    labels_json['label_names'][i] = label
    
with open('new_labels.json', 'w') as out_file:
    json.dump(labels_json, out_file)

In [16]:
with open('C:/Users/David/canopy/cb_feature_detection/model-development/new_labels.json', 'w') as out_file:
    json.dump(labels_json, out_file)

In [3]:
import pandas as pd


train_df = pd.read_csv('new_train_labels_v4.csv')
val_df = pd.read_csv('new_val_labels_v4.csv')

len(train_df)

214276

In [4]:
len(val_df)

52872

In [7]:
train_df_new = train_df.sample(frac = 0.1)

len(train_df_new)

21428

In [8]:
train_df_new.head()

Unnamed: 0,0,1,2,3,4,paths
765,0,0,0,0,0,misc/100/12/12_2000_4900.tif
80085,1,0,0,0,1,Shifting_cultivation/100/9/9_1200_1900.tif
196061,0,0,0,0,1,Shifting_cultivation/100/81/81_7400_6300.tif
200459,0,0,0,0,1,Shifting_cultivation/100/92/92_3500_4000.tif
200123,0,0,0,0,1,Shifting_cultivation/100/92/92_3900_4400.tif


In [10]:
for i in range(5):
    print(labels[i])
    print(train_df_new[str(i)].value_counts())

Industrial_agriculture
0    19671
1     1757
Name: 0, dtype: int64
ISL
0    14442
1     6986
Name: 1, dtype: int64
Mining
0    21178
1      250
Name: 2, dtype: int64
Roads
0    19681
1     1747
Name: 3, dtype: int64
Shifting_cultivation
0    17006
1     4422
Name: 4, dtype: int64


In [11]:
train_df_new = train_df_new.reset_index(drop=True)

train_df_new.to_csv('new_train_labels_v5_10_percent.csv')

In [12]:
len(train_df_new)

21428

In [13]:
val_df_new = val_df.sample(len(train_df_new) // 2)

len(val_df_new)

10714

In [14]:
val_df_new = val_df_new.reset_index(drop=True)

val_df_new.to_csv('new_val_labels_v5_10000.csv')

In [17]:
test_df = pd.read_csv('new_test_labels_v2.csv')

len(test_df)

71911

In [18]:
test_df = pd.read_csv('new_test_labels_v1.csv')

len(test_df)

28896

In [19]:
test_df_sample = test_df.sample(frac = 0.33)

len(test_df_sample)

9536

In [20]:
test_df_sample = test_df_sample.reset_index(drop=True)

test_df_sample.to_csv('new_test_labels_v3_one_third.csv', index=None, header=True)

In [33]:
import numpy as np


labels = ['Industrial_agriculture', 'ISL', 'Mining', 'Roads', 'Shifting_cultivation']

def create_duplicate_rows(df, labels, frac_of_nulls=1, non_isl_mult=1, sample_frac=None):

    yes_labels = 0

    for i in range(len(df)):
        for j in range(len(labels)):
            if df.loc[i, str(j)] == 1:
                yes_labels += 1
                break

    no_labels = len(df) - yes_labels

    label_count = {}

    for label in labels:
        label_count[label] = 0

    for i in range(len(df)):
        for j in range(len(labels)):
            if df.loc[i, str(j)] == 1:
                lbl = labels[j]
                label_count[lbl] += 1

    non_isl_count = 0

    for lbl in labels:
        if lbl != 'ISL':
            non_isl_count += label_count[lbl]

    non_isl_tot_prod = ((no_labels * frac_of_nulls) // non_isl_count) * non_isl_mult

    isl_prod = (no_labels * frac_of_nulls) // label_count['ISL']

    only_nulls = df[
        (df['0'] == 0) & (df['1'] == 0) & (df['2'] == 0) & (df['3'] == 0) & (df['4'] == 0)
    ]

    df2 = only_nulls.copy()

    for i in range(len(labels)):
        only_that_label = df[df[str(i)] == 1]

        if labels[i] == 'ISL':
            mult = isl_prod
        else:
            mult = non_isl_tot_prod

        dupes = pd.DataFrame(np.repeat(only_that_label.values, mult, axis=0), columns=df.columns)

        df2 = df2.append(dupes)
    
    df2 = df2.reset_index(drop=True)
    
    if sample_frac:
        df2 = df2.sample(frac=sample_frac)
        
    df2 = df2.reset_index(drop=True)
    
    return df2

In [23]:
df = pd.read_csv('new_train_labels_v1.csv')

df['paths'] = df['paths'].apply(lambda x: '/'.join(x.split('/')[3:]))

xml_rows = df[df['paths'].str.contains('xml')]

df = df.drop(xml_rows.index.tolist())
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,misc/100/12/12_2200_4700.tif
1,0,0,0,0,0,misc/100/12/12_4100_4400.tif
2,0,0,0,0,0,misc/100/12/12_2700_1500.tif
3,0,0,0,0,0,misc/100/12/12_200_2600.tif
4,0,0,0,0,0,misc/100/12/12_2000_5200.tif


In [24]:
df.to_csv('new_train_labels_base.csv', index=None, header=True)

In [27]:
df2 = create_duplicate_rows(df, labels, frac_of_nulls=1, non_isl_mult=2, sample_frac=0.1)

In [29]:
for i in range(5):
    print(labels[i])
    print(df2[str(i)].value_counts())

Industrial_agriculture
0    25048
1     3583
Name: 0, dtype: int64
ISL
0    21454
1     7177
Name: 1, dtype: int64
Mining
0    28174
1      457
Name: 2, dtype: int64
Roads
0    25171
1     3460
Name: 3, dtype: int64
Shifting_cultivation
0    19974
1     8657
Name: 4, dtype: int64


In [30]:
len(df2)

28631

In [31]:
df2 = df2.reset_index(drop=True)

In [32]:
df2.to_csv('new_train_labels_v6_half_non_isl.csv', index=None, header=True)

### start here

In [1]:
import numpy as np
import pandas as pd

In [2]:
def create_duplicate_rows(df, labels, frac_of_nulls=1, non_isl_mult=1, sample_frac=None):

    yes_labels = 0

    for i in range(len(df)):
        for j in range(len(labels)):
            if df.loc[i, str(j)] == 1:
                yes_labels += 1
                break

    no_labels = len(df) - yes_labels

    label_count = {}

    for label in labels:
        label_count[label] = 0

    for i in range(len(df)):
        for j in range(len(labels)):
            if df.loc[i, str(j)] == 1:
                lbl = labels[j]
                label_count[lbl] += 1

    non_isl_count = 0

    for lbl in labels:
        if lbl != 'ISL':
            non_isl_count += label_count[lbl]

    non_isl_tot_prod = ((no_labels * frac_of_nulls) // non_isl_count) * non_isl_mult

    isl_prod = (no_labels * frac_of_nulls) // label_count['ISL']

    only_nulls = df[
        (df['0'] == 0) & (df['1'] == 0) & (df['2'] == 0) & (df['3'] == 0) & (df['4'] == 0)
    ]

    df2 = only_nulls.copy()

    for i in range(len(labels)):
        only_that_label = df[df[str(i)] == 1]

        if labels[i] == 'ISL':
            mult = isl_prod
        else:
            mult = non_isl_tot_prod

        dupes = pd.DataFrame(np.repeat(only_that_label.values, mult, axis=0), columns=df.columns)

        df2 = df2.append(dupes)
    
    df2 = df2.reset_index(drop=True)
    
    if sample_frac:
        df2 = df2.sample(frac=sample_frac)
        
    df2 = df2.reset_index(drop=True)
    
    return df2


df = pd.read_csv('new_train_labels_base.csv')

labels = ['Industrial_agriculture', 'ISL', 'Mining', 'Roads', 'Shifting_cultivation']

In [3]:
df2 = create_duplicate_rows(df, labels, frac_of_nulls=1, non_isl_mult=2, sample_frac=0.25)

df2.shape

(71578, 6)

In [4]:
df2.to_csv('new_train_labels_v7_one_quarter.csv', index=None, header=True)

In [2]:
import pandas as pd


df = pd.read_csv('new_train_labels_v5_10_percent.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,0,misc/100/12/12_2000_4900.tif
1,1,1,0,0,0,1,Shifting_cultivation/100/9/9_1200_1900.tif
2,2,0,0,0,0,1,Shifting_cultivation/100/81/81_7400_6300.tif
3,3,0,0,0,0,1,Shifting_cultivation/100/92/92_3500_4000.tif
4,4,0,0,0,0,1,Shifting_cultivation/100/92/92_3900_4400.tif


In [4]:
df2 = df.drop(columns=['Unnamed: 0'])

df2.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,misc/100/12/12_2000_4900.tif
1,1,0,0,0,1,Shifting_cultivation/100/9/9_1200_1900.tif
2,0,0,0,0,1,Shifting_cultivation/100/81/81_7400_6300.tif
3,0,0,0,0,1,Shifting_cultivation/100/92/92_3500_4000.tif
4,0,0,0,0,1,Shifting_cultivation/100/92/92_3900_4400.tif


In [5]:
df2.to_csv('new_train_labels_v5_10_percent.csv', index=None, header=True)

In [3]:
df2 = create_duplicate_rows(df, labels, frac_of_nulls=1, non_isl_mult=2)

df2.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,misc/100/12/12_2200_4700.tif
1,0,0,0,0,0,misc/100/12/12_4100_4400.tif
2,0,0,0,0,0,misc/100/12/12_2700_1500.tif
3,0,0,0,0,0,misc/100/12/12_200_2600.tif
4,0,0,0,0,0,misc/100/12/12_2000_5200.tif


In [4]:
df2.to_csv('new_train_labels_v8_half_nonisl_full.csv', index=None, header=True)

In [6]:
df = pd.read_csv('new_train_labels_v5_10_percent.csv')

df.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,0,0,0,0,misc/100/12/12_2000_4900.tif
1,1,0,0,0,1,Shifting_cultivation/100/9/9_1200_1900.tif
2,0,0,0,0,1,Shifting_cultivation/100/81/81_7400_6300.tif
3,0,0,0,0,1,Shifting_cultivation/100/92/92_3500_4000.tif
4,0,0,0,0,1,Shifting_cultivation/100/92/92_3900_4400.tif


In [4]:
val_df = pd.read_csv('new_val_labels_v3.csv')

val_df_2 = val_df.sample(frac=0.1)

len(val_df_2)

1990

In [5]:
val_df_2.head()

Unnamed: 0,0,1,2,3,4,paths
6695,0,0,0,0,0,misc/100/15/15_2900_400.tif
8289,0,0,0,0,0,misc/100/78/78_900_3900.tif
2000,0,0,0,0,0,misc/100/7/7_3600_1900.tif
12080,0,0,0,0,0,misc/100/39/39_2500_1400.tif
4452,0,0,0,0,0,misc/100/64/64_2900_2800.tif


In [6]:
val_df_2 = val_df_2.reset_index(drop=True)

val_df_2.to_csv('new_val_labels_v6_2000.csv', index=None, header=True)

In [3]:
val_df = pd.read_csv('new_val_labels_v3.csv')

df2 = df.sample(frac=.01)

val_df2 = val_df.sample(frac=.01)

print(len(df2))
print(len(val_df2))

795
199


In [5]:
df2 = df2.reset_index(drop=True)
val_df2 = val_df2.reset_index(drop=True)

In [6]:
df2.to_csv('new_train_labels_v9_verysmall.csv', index=None, header=True)
val_df2.to_csv('new_val_labels_v9_verysmall.csv', index=None, header=True)

In [1]:
import pandas as pd


df1 = pd.read_csv('new_test_labels_v1.csv')
df2 = pd.read_csv('new_test_labels_v2.csv')

In [2]:
print(df1.shape)
print(df2.shape)

(28896, 6)
(71911, 6)


In [3]:
df1.head()

Unnamed: 0,0,1,2,3,4,paths
0,1,0,0,1,0,chips/cloudfree-merge-polygons/dataset_v2/Indu...
1,1,0,0,1,0,chips/cloudfree-merge-polygons/dataset_v2/Indu...
2,1,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/Indu...
3,1,0,0,0,0,chips/cloudfree-merge-polygons/dataset_v2/Indu...
4,1,0,0,1,0,chips/cloudfree-merge-polygons/dataset_v2/Indu...


In [4]:
uri = df1.loc[0, 'paths']

uri

'chips/cloudfree-merge-polygons/dataset_v2/Industrial_agriculture/100/6/6_1000_1000.tif'

In [6]:
'/'.join(uri.split('/')[3:])

'Industrial_agriculture/100/6/6_1000_1000.tif'

In [7]:
df3 = df1.copy()
df3['paths'] = df3['paths'].apply(lambda x: '/'.join(x.split('/')[3:]))

df3.head()

Unnamed: 0,0,1,2,3,4,paths
0,1,0,0,1,0,Industrial_agriculture/100/6/6_1000_1000.tif
1,1,0,0,1,0,Industrial_agriculture/100/6/6_1000_1100.tif
2,1,0,0,0,0,Industrial_agriculture/100/6/6_1000_1200.tif
3,1,0,0,0,0,Industrial_agriculture/100/6/6_1000_1300.tif
4,1,0,0,1,0,Industrial_agriculture/100/6/6_1000_600.tif


In [9]:
df3.to_csv('new_test_labels_v4_no_duplicates.csv', index=None, header=True)

In [10]:
train_v7 = pd.read_csv('new_train_labels_v7_one_quarter.csv')
train_v8 = pd.read_csv('new_train_labels_v8_half_nonisl_full.csv')

In [11]:
train_v7.head()

Unnamed: 0,0,1,2,3,4,paths
0,0,1,0,0,0,ISL/100/89/89_2600_1100.tif
1,0,0,0,0,0,misc/100/81/81_1400_8800.tif
2,1,0,0,1,0,Roads/100/10/10_1000_2300.tif
3,0,0,0,0,0,misc/100/60/60_6500_300.tif
4,0,0,0,0,1,Shifting_cultivation/100/19/19_2900_4200.tif


In [18]:
train_v7['0'].value_counts()

0    62638
1     8940
Name: 0, dtype: int64

In [16]:
train_v7['0'].value_counts()[0]

62638

In [22]:
def calc_ratio(df, col):
    zero_count = df[col].value_counts()[0]
    one_count = df[col].value_counts()[1]
    return one_count / (zero_count + one_count)

In [23]:
labels = ['Industrial_agriculture', 'ISL', 'Mining', 'Roads', 'Shifting_cultivation']

ratios = []

for i in range(5):
    ratio_v7 = calc_ratio(train_v7, str(i))
    ratio_v8 = calc_ratio(train_v8, str(i))
    ratios.append((ratio_v7, ratio_v8))

In [24]:
ratios

[(0.12489871189471626, 0.1239766408673056),
 (0.24895917740087736, 0.25007683925228424),
 (0.016192126072256837, 0.016667132359104754),
 (0.11856995166112493, 0.11749420212914582),
 (0.30482829919807763, 0.30528933471178293)]