## Full example of processing satellite raster data

Steps:
* HDF LST band to tiff
* Clip raster
* Calculate zonal statistics

In [1]:
import os
import subprocess
import datetime
import numpy as np
import pandas as pd
from osgeo import gdal, ogr

In [2]:
# HDF to tiff

In [8]:
def only_hdf(bag_of_files, file_end):
    f_list = []
    for f in bag_of_files:
        if f.endswith(file_end):
            f_list.append(f)
    return f_list

def only_area_of_interest(bag_of_files, area_of_interest):
    a_list = []
    for f in bag_of_files:
        if area_of_interest in f:
            a_list.append(f)
    return a_list

def get_filelist(folder, file_ending, aoi):
    filelist = os.listdir(folder)
    filelist = only_hdf(filelist, file_ending)
    filelist = only_area_of_interest(filelist, aoi)
    return filelist

In [9]:
hdf_files_for_processing = get_filelist('input/rastry/wawa/', '.hdf', '.h19v')

In [11]:
def hdf_to_tiff(list_of_files):
    for f in list_of_files:
        input_folder = 'input/rastry/wawa/'
        output_folder = 'input/rastry/tiffs/'
        output_file_name_day = 'day_' + f[:-4] + '.tif'
        output_file_name_night = 'night_' + f[:-4] + '.tif'

        modis_data = gdal.Open((input_folder + f))
        subdatasets = modis_data.GetSubDatasets()
        del modis_data
    
        day = subdatasets[0][0]
        night = subdatasets[4][0]
    
        subprocess.call(['gdal_translate', day, (output_folder + output_file_name_day)])
        subprocess.call(['gdal_translate', night, (output_folder + output_file_name_night)])

In [12]:
hdf_to_tiff(hdf_files_for_processing)

In [13]:
# Clip to area of interst

In [14]:
FILE_ENDING = '.tif'
FOLDER = 'input/rastry/tiffs/'

filelist = os.listdir(FOLDER)

def only_tif(bag_of_files):
    f_list = []
    for f in bag_of_files:
        if f.endswith(FILE_ENDING):
            f_list.append(f)
    return f_list

In [15]:
filelist = only_tif(filelist)

In [16]:
VECTOR = 'input/wektor/warszawa.shp'

for f in filelist:
    input_tif = 'input/rastry/tiffs/' + f
    output_tif = 'input/rastry/clipped/clipped_' + f
    subprocess.call(['gdalwarp', '-cutline', VECTOR, '-crop_to_cutline', '-dstalpha', input_tif, output_tif])

In [17]:
# Zonal statistics

In [18]:
FILE_ENDING = '.tif'
FOLDER = 'input/rastry/clipped/'

filelist = os.listdir(FOLDER)

def get_clipped_files(bag_of_files):
    f_list = []
    for f in bag_of_files:
        if f.endswith(FILE_ENDING):
            f_list.append(f)
    return f_list

In [19]:
filelist = get_clipped_files(filelist)

In [20]:
def detect_day_or_night(name_str):
    if 'day' in name_str:
        return 'day'
    elif 'night' in name_str:
        return 'night'
    else:
        return None

def leap_or_regular(year):
    if ((year % 4) == 0 and (year % 100) != 0) or ((year % 400) == 0):
        return True
    else:
        return False
    
def julian_date_to_month(name_str):
    """Function for MODIS file name processing"""
    
    LOOK_UP_TABLE_BASE = 'input/lut/julian_day_calendar_'
    
    # Get the day or night
    day_or_night = detect_day_or_night(name_str)
    
    # Get year and Julian day from filename
    position = name_str.find('.')
    position = position + 2
    position_end = position + 7
    date = name_str[position:position_end]
    m_year = date[:4]
    m_year = int(m_year)
    julian_day = date[4:]
    julian_day = int(julian_day)
    
    # Check if year is leap or not
    is_leap = leap_or_regular(m_year)
    
    # Find month of measurements
    lut_address = ''
    if is_leap:
        lut_address = LOOK_UP_TABLE_BASE + 'leap.csv'
    else:
        lut_address = LOOK_UP_TABLE_BASE + 'regular.csv'
    
    lut_df = pd.read_csv(lut_address, index_col=0)
    
    if lut_df.isin([julian_day]).iloc[0].any():
        str_month = lut_df.columns[(lut_df == julian_day).iloc[0]]
        nb_month = lut_df.columns.get_loc(str_month[0]) + 1
        # Set the date
        acquisition_time = datetime.date(year = m_year,
                                        month = nb_month,
                                        day = 1)
    
        return [name_str, acquisition_time, day_or_night]
    else:
        return ['-1', '-1', '-1']

In [23]:
COLS = ['FILENAME',
        'ACQUISITION TIME',
        'DAY OR NIGHT',
        'MIN TEMPERATURE',
        'MAX TEMPERATURE',
        'MEAN TEMPERATURE',
        'STD OF TEMPERATURE']

In [24]:
def zonal_stats(numpy_array, count_zeros = None):
    if count_zeros == None:
        mean_value = np.mean(numpy_array[numpy_array > 0])
        std_value = np.std(numpy_array[numpy_array > 0])
    else:
        mean_value = np.mean(numpy_array)
        std_value = np.std(numpy_array)
        
    max_value = np.max(numpy_array)
    min_value = np.min(numpy_array[numpy_array > 0])
    
    return [min_value, max_value, mean_value, std_value]

In [25]:
df = pd.DataFrame(columns = COLS)
for f in filelist:
    # Read file
    modis_data = gdal.Open((FOLDER + f))
    data_description = julian_date_to_month(f)
    
    # Show raster as array
    array_modis = (modis_data.ReadAsArray().astype(np.float32))
    temperature_matrix = array_modis[0] * 0.02
    
    # Count number of 'good' pixels
    pixel_counter = (temperature_matrix > 0).sum()
    
    # Decide to process data or not - if not save filename in log file
    threshold = 4
    stats = [-1, -1, -1, -1]
    if pixel_counter < threshold:
        with open("corrupted.log", "a") as log_file:
            log_file.write(data_description[0])
            log_file.write('\n')
    else:
        stats = zonal_stats(temperature_matrix)
        
    # Prepare data frame with all data
    cols = ['FILENAME', 'ACQUISITION TIME', 'DAY OR NIGHT', 'MIN TEMPERATURE', 'MAX TEMPERATURE', 'MEAN TEMPERATURE', 'STD OF TEMPERATURE']

    information_frame = pd.DataFrame(data = [data_description + stats], columns = cols)
    frames = [df, information_frame]
    df = pd.concat(frames)
    
    del modis_data

In [26]:
df.head()

Unnamed: 0,FILENAME,ACQUISITION TIME,DAY OR NIGHT,MIN TEMPERATURE,MAX TEMPERATURE,MEAN TEMPERATURE,STD OF TEMPERATURE
0,clipped_day_MOD11B3.A2000032.h19v03.006.201516...,2000-02-01,day,280.23999,283.23999,282.307983,0.839964
0,-1,-1,-1,280.23999,283.23999,282.307983,0.839964
0,clipped_day_MOD11B3.A2000061.h19v03.006.201516...,2000-03-01,day,277.019989,281.779999,279.126648,1.201174
0,clipped_day_MOD11B3.A2000092.h19v03.006.201516...,2000-04-01,day,296.059998,299.279999,297.433319,0.933201
0,clipped_day_MOD11B3.A2000122.h19v03.006.201516...,2000-05-01,day,298.019989,302.579987,300.953308,1.216894


In [27]:
df.to_csv('output/monthly_lst_measurements_warszawa.csv', index=False)