# Preprocessing

## Step 1: Check missing images

In [1]:
import os.path
from datetime import date
import datetime


In [2]:
base_path = os.path.join("D:","Graduation_Thesis_Dataset")
S1_path = os.path.join(base_path, "S1_dataset")
L8_path = os.path.join(base_path, "L8_dataset")

In [3]:
years = [2017, 2018, 2019, 2020]
site_range = [1, 2, 3, 4, 5, 6]
satellites = ['Landsat_8', 'S1']
Missing_image_paths = []
valid_image_paths = []
cdl_paths = []
for satellite in satellites:
  cur_path = None
  if satellite == 'Landsat_8':
    cur_path = L8_path
  elif satellite == 'S1':
    cur_path = S1_path
  for site in site_range:
    # print(cur_path)
    site_path = os.path.join(cur_path,  str(site))
    # print(cur_path)
    for year in years:
      year_path = os.path.join(site_path,str(year))
      cdlpath = "CDL_" + str(site) + '_' + str(year) + '.tif'
      cdlpath = os.path.join(year_path, cdlpath)
      if os.path.isfile(cdlpath):
        cdl_paths.append(cdlpath)
      start_date = date(year, 9, 10)
      for advancement in range(0, 12):
        cur_time = start_date + datetime.timedelta(days=advancement*24)
        cur_time_str = cur_time.strftime("%Y%m%d")
        image_path = os.path.join(year_path, satellite + '_' + str(site) + '_'+ cur_time_str + '.tif')
        if os.path.isfile(image_path):
          valid_image_paths.append(image_path)
        else:
          Missing_image_paths.append(image_path)
    



In [4]:
print('Valid images: ', len(valid_image_paths))

Valid images:  572


In [5]:
print('Missing images: ', len(Missing_image_paths))
print(Missing_image_paths)
testpath = Missing_image_paths[0]


Missing images:  4
['D:Graduation_Thesis_Dataset\\S1_dataset\\3\\2017\\S1_3_20171004.tif', 'D:Graduation_Thesis_Dataset\\S1_dataset\\5\\2017\\S1_5_20171004.tif', 'D:Graduation_Thesis_Dataset\\S1_dataset\\5\\2017\\S1_5_20171028.tif', 'D:Graduation_Thesis_Dataset\\S1_dataset\\5\\2018\\S1_5_20181215.tif']


## Step 2 Fill in Nan Values

In [6]:
import cv2 as cv
from osgeo import gdal
from tqdm import tqdm
from sklearn.impute import KNNImputer
import numpy as np
import sys
np.set_printoptions(threshold=sys.maxsize)

In [15]:
def replace_nan(image, threshold = 6, window = 5):
  x = image.shape[0]
  y = image.shape[1]
  for i in range (2, x - 2):
    for j in range(2, y - 2):
      if not np.isnan(image[i, j]):
        continue
      img_window = image[i-2:i+3, j-2:j+3]
      if np.count_nonzero(np.isnan(img_window)) > threshold:
        continue
      mean = np.nanmean(img_window)
      std = np.nanstd(img_window)
      image[i, j] = np.random.normal(mean, std)
  return image

In [21]:
NAN_image_path = []
for path in tqdm(valid_image_paths):
  new_img = gdal.Open(path)
  band_cnt = new_img.RasterCount
  channel_list = []
  is_nan_too_much = False
  for b in range(1, band_cnt + 1):
    img_array = np.array(new_img.GetRasterBand(b).ReadAsArray())
    nan_percent = np.count_nonzero(np.isnan(img_array)) / (img_array.shape[0]*img_array.shape[1])
    # print('before: ',nan_percent)
    if nan_percent > 0.3:
      is_nan_too_much = True
      NAN_image_path.append(path)
    img_array = replace_nan(img_array)
    nan_percent = np.count_nonzero(np.isnan(img_array)) / (img_array.shape[0]*img_array.shape[1])
    # print('after: ',nan_percent)
    channel_list.append(img_array)
  if is_nan_too_much:
    NAN_image_path.append(path)
    continue
  output = np.moveaxis(np.stack(channel_list, axis=0), 0, -1)
  output_path = path.replace('.tif', '.npy')
  # print(output_path)
  # print(output.shape)
  np.save(output_path, output)
print(NAN_image_path)




100%|██████████| 572/572 [3:47:04<00:00, 23.82s/it]  

['D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2018\\Landsat_8_1_20180910.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2018\\Landsat_8_1_20180910.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2018\\Landsat_8_1_20180910.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2018\\Landsat_8_1_20180910.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2018\\Landsat_8_1_20180910.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2018\\Landsat_8_1_20180910.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2018\\Landsat_8_1_20180910.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2019\\Landsat_8_1_20200108.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2019\\Landsat_8_1_20200108.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2019\\Landsat_8_1_20200108.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2019\\Landsat_8_1_20200108.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2019\\Landsat_8_1_20200108.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2019\\Landsat_8_1_




In [22]:
Nan_paths = set(NAN_image_path)
print(Nan_paths)

{'D:Graduation_Thesis_Dataset\\L8_dataset\\3\\2017\\Landsat_8_3_20171215.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\6\\2017\\Landsat_8_6_20180225.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\6\\2018\\Landsat_8_6_20190414.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2020\\Landsat_8_1_20210508.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\6\\2018\\Landsat_8_6_20190321.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\3\\2018\\Landsat_8_3_20190414.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2018\\Landsat_8_1_20180910.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2020\\Landsat_8_1_20210108.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\5\\2018\\Landsat_8_5_20190225.tif', 'D:Graduation_Thesis_Dataset\\S1_dataset\\2\\2017\\S1_2_20170910.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2020\\Landsat_8_1_20210201.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\1\\2019\\Landsat_8_1_20200108.tif', 'D:Graduation_Thesis_Dataset\\L8_dataset\\5\\2017\\Landsat_8_5_2017102