# Save the good optical (S2) observations for each tile

In [17]:
import os
import numpy as np
from matplotlib import pyplot as plt
import glob
from skimage import io

def get_good_observations(input_dir, total_tiles, mode, return_images=2, months=None):
    '''
    Select, per tile and per month, the acquisition dates whose blue band
    (B02.tif) has the lowest standard deviation.

    The blue-band standard deviation is used as the cloud score: within each
    month the dates are ranked by ascending std and the first `return_images`
    are kept. This is a heuristic; it does not guarantee the kept images are
    cloud free.

    Parameters
    ----------
    input_dir : str
        Directory containing one sub-folder per (tile, date), named
        'ref_south_africa_crops_competition_v1_<mode>_source_s2_<tile>...'.
        The last 10 characters of each folder name are taken as the date
        (e.g. '2017_04_14').
    total_tiles : int
        Tiles are processed for ids 1..total_tiles (inclusive), zero-padded
        to 4 digits when matching folder names.
    mode : str
        Inserted into the folder name pattern (the notebook uses 'train' and
        'test').
    return_images : int, default 2
        Maximum number of dates kept per month and per tile.
    months : list of str, optional
        Month tags searched for inside the date strings. Defaults to
        '2017_04' through '2017_11'.

    Returns
    -------
    dict
        {tile_id (int): [date, ...]} with the dates of each month in
        ascending order, months in the order given. Tiles without matching
        folders map to an empty list.
    '''
    if months is None:
        months = ['2017_04', '2017_05', '2017_06', '2017_07',
                  '2017_08', '2017_09', '2017_10', '2017_11']

    # dict to hold good dates per tile
    tile_dates = {k: [] for k in range(1, total_tiles + 1)}

    folder_prefix = 'ref_south_africa_crops_competition_v1_' + mode + '_source_s2_'

    for tile_id in range(1, total_tiles + 1):

        # lightweight progress indicator
        if tile_id % 100 == 0:
            print(tile_id)

        pattern = os.path.join(input_dir, folder_prefix + str(tile_id).zfill(4) + '*')
        # glob returns matches in arbitrary order; sorting makes the
        # tie-breaking between equal scores reproducible across runs
        sub_folders = sorted(glob.glob(pattern))

        # date -> std of the blue band for every acquisition of this tile
        cloud_score = {}
        for item in sub_folders:
            blue = io.imread(os.path.join(item, 'B02.tif'))
            cloud_score[item[-10:]] = np.std(blue)

        for month in months:
            monthly = {date: score for date, score in cloud_score.items() if month in date}
            # lowest std first, keep the best n, then restore chronological order
            best = sorted(monthly, key=monthly.get)[:return_images]
            tile_dates[tile_id].extend(sorted(best))

    return tile_dates

In [18]:
# Select the good S2 dates for all training tiles and time the run.
# The recorded output below shows this cell took about 1h32m, so avoid
# re-running it unnecessarily.
from datetime import datetime
start = datetime.now()
input_dir = '/share/projects/erasmus2/crop_zindi/data/ref_south_africa_crops_competition_v1_train_source_s2'
# tiles 1..2650 are scanned; result is {tile_id: [selected dates]}
s2_train_tile_dates = get_good_observations(input_dir, total_tiles=2650, mode='train')

# elapsed wall-clock time
print('complete in ', datetime.now() - start)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
complete in  1:31:53.958580


In [20]:
# Sanity check: show the first 10 (tile_id, selected dates) pairs.
list(s2_train_tile_dates.items())[:10]

[(1,
  ['2017_04_14',
   '2017_04_24',
   '2017_05_04',
   '2017_05_24',
   '2017_06_03',
   '2017_06_23',
   '2017_07_08',
   '2017_07_18',
   '2017_08_07',
   '2017_08_12',
   '2017_09_01',
   '2017_09_21',
   '2017_10_03',
   '2017_10_31',
   '2017_11_10',
   '2017_11_15']),
 (2,
  ['2017_04_11',
   '2017_04_14',
   '2017_05_24',
   '2017_05_31',
   '2017_06_13',
   '2017_06_30',
   '2017_07_08',
   '2017_07_28',
   '2017_08_09',
   '2017_08_14',
   '2017_09_01',
   '2017_09_26',
   '2017_10_01',
   '2017_10_11',
   '2017_11_02',
   '2017_11_05']),
 (3,
  ['2017_04_11',
   '2017_04_14',
   '2017_05_04',
   '2017_05_21',
   '2017_06_20',
   '2017_06_23',
   '2017_07_10',
   '2017_07_18',
   '2017_08_12',
   '2017_08_17',
   '2017_09_21',
   '2017_09_28',
   '2017_10_03',
   '2017_10_23',
   '2017_11_10',
   '2017_11_20']),
 (4,
  ['2017_04_11',
   '2017_04_21',
   '2017_05_21',
   '2017_05_31',
   '2017_06_10',
   '2017_06_20',
   '2017_07_05',
   '2017_07_25',
   '2017_08_09',
   '2

In [19]:
# save
import json

# Persist the selected training dates as pretty-printed JSON.
# JSON object keys are always strings, so the integer tile ids are written
# (and later read back) as strings.
with open('/share/projects/erasmus2/crop_zindi/data/pse_tsa_data/s2_train_dates.json', 'w') as file:
    json.dump(s2_train_tile_dates, file, indent=4)
    


# for test tiles

In [21]:
import json
from datetime import datetime

# Select the good S2 dates for all test tiles (ids 1..1137) and time the run.
start = datetime.now()
input_dir = '/share/projects/erasmus2/crop_zindi/data/ref_south_africa_crops_competition_v1_test_source_s2'
s2_test_tile_dates = get_good_observations(input_dir, total_tiles=1137, mode='test')

print('complete in ', datetime.now() - start)

# Persist the result as pretty-printed JSON; the integer tile ids become
# string keys in the file.
with open('/share/projects/erasmus2/crop_zindi/data/pse_tsa_data/s2_test_dates.json', 'w') as file:
    json.dump(s2_test_tile_dates, file, indent=4)
    


100
200
300
400
500
600
700
800
900
1000
1100
complete in  0:26:39.478904


# save npy for s1 and s2



In [24]:
# Load the previously saved JSON with the selected dates (train file here;
# point the path at the test file to load the test dates instead).

import json
with open('/share/projects/erasmus2/crop_zindi/data/pse_tsa_data/s2_train_dates.json', 'r') as file:
    # Keys come back as strings ('1', '2', ...), not the ints used when the
    # dict was built, because JSON object keys are always strings.
    s2_train_dates = json.load(file)
   
                  

2650

# load df, drop columns and convert to np array

In [49]:
# Load the saved per-field feature table and convert it to a NumPy array.
# NOTE(review): the variables are named s2_* but the file read here is
# 's1_df_avg.csv' - confirm which sensor this cell is meant to load.
import pandas as pd
s2_df = pd.read_csv('/share/projects/erasmus2/crop_zindi/df/s1_df_avg.csv')

# drop the saved index column and the field identifier so only the remaining
# columns end up in the array
s2_df = s2_df.drop(['Unnamed: 0', 'field_id'], axis=1)

# not displayed: only the last expression of a cell is rendered
s2_df.head(10)

s2_arr = s2_df.to_numpy()
# recorded output: (87114, 21)
s2_arr.shape



(87114, 21)

# Create SITS (satellite image time series) from the x,y DataFrame

In [55]:
# split df into multi
# Creates a temporal dimension from the flat multitemporal feature columns:
# consecutive column pairs (0:2, 2:4, ..., 18:20) of s2_arr become the
# time steps of a (n_samples, n_timesteps, n_features) array.

n_timesteps = 10
n_features = 2  # columns per time step

# Fail loudly instead of stacking short/empty slices if the table is narrower
# than expected.
assert s2_arr.shape[1] >= n_timesteps * n_features, 'not enough feature columns'

# Each step must read its own column window. The window offset is derived
# from the step index so it cannot be reset between iterations (resetting it
# would make every time step a copy of columns 0:2).
# Columns beyond n_timesteps * n_features are not used here
# (presumably a label/target column - TODO confirm).
sits = np.stack(
    [s2_arr[:, step * n_features:(step + 1) * n_features] for step in range(n_timesteps)],
    axis=1,
)
        
    

(87114, 1, 2)
(87114, 2, 2)
(87114, 3, 2)
(87114, 4, 2)
(87114, 5, 2)
(87114, 6, 2)
(87114, 7, 2)
(87114, 8, 2)
(87114, 9, 2)
(87114, 10, 2)
