# Merged meteorological AGB zonal stats collation

This notebook searches the zonal stats subdirectories for the zonal stats CSV outputs they contain and concatenates all of the files into one data frame, which is exported as a CSV to the output directory.

The following conditions apply:

 - run after Seasonal Biomass Zonal Pipeline.
 - env = base

In [1]:
import pandas as pd
from glob import glob
import os
from calendar import monthrange
from datetime import datetime

In [2]:
# # processing date
# date = "20230104"
# # date of data exports
# field_date = "20230103"
# # out_date
# out_date = "20230109"

In [None]:
# Run configuration: three YYYYMMDD date stamps, edited by hand for each run.
# Processing date.
# NOTE(review): `date` is not referenced by the module-level code visible in this
# notebook -- confirm whether it is still needed.
date = "20230407"
# Date of the field data export; substituted into the path of the biomass csv.
field_date = "20230407"
# Date stamp substituted into the path of the output directory.
out_date = "20230407"

In [3]:
# Field biomass csv, read from the output folder stamped with field_date.
basal = r"F:\cdu\data\output\{0}\slats_tern_biomass.csv".format(field_date)
# Loaded in full; it is the left-hand frame of the nearest-date merges further down.
basal_df = pd.read_csv(basal)

In [4]:
# Folder whose subdirectories hold the zonal stats csv files to collate.
# NOTE(review): this is pinned to one specific run folder (20230104_0840) and does not
# follow the date variables set above -- confirm it is the run you intend to collate.
dir_ = r"F:\cdu\data\zonal_stats\meterological\gr_meteorological_20230104_0840"
# Destination folder for the collated csv files, stamped with out_date.
output_dir = r"F:\cdu\data\zonal_stats\output\{0}".format(out_date)

In [5]:
# Create the output folder together with any missing parent folders; an existing
# folder is left untouched.
os.makedirs(output_dir, exist_ok=True)

In [None]:
def start_seasonal_date(date_):
    """Return the first day of the month given as a YYYYMM string.

    The first four characters of ``date_`` are used as the year and everything
    after them as the month; "01" is appended, giving a YYYYMMDD string.
    """
    year_part, month_part = date_[:4], date_[4:]
    return "{0}{1}01".format(year_part, month_part)
    

def end_seasonal_date(date_):
    """Return the last day of the month given as a YYYYMM string.

    The first four characters of ``date_`` are used as the year and everything
    after them as the month; the number of days in that month is appended,
    giving a YYYYMMDD string.
    """
    year_part = str(date_[:4])
    month_part = str(date_[4:])

    # monthrange returns (weekday of the 1st, number of days in the month);
    # only the day count is needed here.
    days_in_month = monthrange(int(year_part), int(month_part))[1]

    return "".join([year_part, month_part, str(days_in_month)])


def im_date_season(df):
    """Add seasonal start and end date columns derived from the image names.

    For every value in ``df.im_name`` the second-to-last "_"-separated token is
    taken; characters [1:7] go to start_seasonal_date and characters [7:] go to
    end_seasonal_date. The results are stored in the ``im_s_date`` and
    ``im_e_date`` columns of ``df``, which is modified in place and returned.

    NOTE(review): the slicing presumably expects a one-character prefix followed
    by two YYYYMM stamps -- confirm against the image naming convention.
    """
    date_tokens = [image_name.split("_")[-2] for image_name in df.im_name]

    # Both lists are built before either column is written.
    season_starts = [start_seasonal_date(token[1:7]) for token in date_tokens]
    season_ends = [end_seasonal_date(token[7:]) for token in date_tokens]

    df["im_s_date"] = season_starts
    df["im_e_date"] = season_ends

    return df
        
    
def im_date_annual(df):
    """Add annual start and end date columns derived from the image names.

    For every value in ``df.im_name`` the second-to-last "_"-separated token is
    taken; "01" is appended before calling start_seasonal_date and "12" before
    calling end_seasonal_date. The results are stored in the ``im_s_date`` and
    ``im_e_date`` columns of ``df``, which is modified in place and returned.

    NOTE(review): the token is presumably a four-digit year -- confirm against
    the image naming convention.
    """
    year_tokens = [str(image_name.split("_")[-2]) for image_name in df.im_name]

    # Both lists are built before either column is written.
    year_starts = [start_seasonal_date(token + "01") for token in year_tokens]
    year_ends = [end_seasonal_date(token + "12") for token in year_tokens]

    df["im_s_date"] = year_starts
    df["im_e_date"] = year_ends

    return df

    
def convert_to_datetime(df, col_nm_s, col_nm_d):
    """Parse a column of YYYYMMDD values into datetimes held in another column.

    Args:
        df: DataFrame holding the source column; it is modified in place.
        col_nm_s: name of the column of dates written as YYYYMMDD. Each value
            is passed through str() before it is parsed.
        col_nm_d: name of the column that receives the parsed datetimes.

    Returns:
        The same DataFrame object, with ``col_nm_d`` added or overwritten.

    Raises:
        ValueError: if a value does not match the ``%Y%m%d`` format.
    """
    # Each value is parsed without being printed, so a large frame does not
    # write one output line per row.
    df[col_nm_d] = [
        datetime.strptime(str(value), '%Y%m%d') for value in df[col_nm_s]
    ]
    return df

In [None]:
# Parse the field dates into the basal_dt column, then order the rows by it;
# basal_dt is the left-hand key of the pd.merge_asof calls further down.
basal_df = convert_to_datetime(basal_df, "date", "basal_dt")
basal_df = basal_df.sort_values(by='basal_dt')

In [None]:
# Show the (rows, columns) size of the field data frame as the cell output.
basal_df.shape

In [None]:
def temp_dir_fn(output_dir, pos):
    """Return the path of the ``<pos>_temp`` folder in ``output_dir``, creating it if needed.

    Args:
        output_dir: folder that holds the temp folder.
        pos: label used as the folder name prefix (the callers in this notebook
            pass "near", "for" and "back").

    Returns:
        The path ``<output_dir>/<pos>_temp``.
    """
    temp_dir = os.path.join(output_dir, "{0}_temp".format(pos))

    # makedirs also creates output_dir when it is missing, and exist_ok makes a
    # repeated call harmless.
    os.makedirs(temp_dir, exist_ok=True)

    return temp_dir

def out_file_fn(temp_dir, pos, sub_dir, df__):
    """Write ``df__`` to ``<temp_dir>/<pos>_<sub_dir>_zonal_stats.csv`` and print the path.

    Args:
        temp_dir: folder the csv is written to.
        pos: label used as the file name prefix.
        sub_dir: name of the zonal stats subdirectory the data came from.
        df__: DataFrame to export; it is written without its index.
    """
    out_file = os.path.join(temp_dir, "{0}_{1}_zonal_stats.csv".format(pos, sub_dir))
    # out_file already includes temp_dir; joining temp_dir onto it a second time
    # doubles the folder part whenever temp_dir is a relative path.
    df__.to_csv(out_file, index=False)
    print("output: ", out_file)
    

In [None]:
# Extract the list of subdirectories found directly under the meteorological
# zonal stats folder.
sub_list = next(os.walk(dir_))[1]

# NOTE(review): only the subdirectory at position 19 of sub_list is processed.
# This looks like a leftover from stepping through the folders one at a time --
# confirm whether the whole of sub_list should be used instead.
working_sub_list = sub_list[19:20]

# One temp folder per merge direction; they do not depend on the subdirectory,
# so they are created once, ahead of the loop.
n_temp_dir = temp_dir_fn(output_dir, "near")
f_temp_dir = temp_dir_fn(output_dir, "for")
b_temp_dir = temp_dir_fn(output_dir, "back")

# Every zonal stats csv that was read, across all processed subdirectories.
sub_dir_list = []
for sub_dir in working_sub_list:
    sub_dir_path = os.path.join(dir_, sub_dir)
    sub_dir_df_list = []
    for file_ in glob(os.path.join(sub_dir_path, "*.csv")):
        # Test the file name only: the folder path in dir_ itself contains
        # "zonal_stats", so testing the full path would accept every csv.
        if "zonal_stats" not in os.path.basename(file_):
            continue
        sub_dir_list.append(file_)
        print(file_)

        df = pd.read_csv(file_)
        df = convert_to_datetime(df, "im_date", "image_dt")
        df.sort_values(by='image_dt', inplace=True)
        print(df.columns)
        sub_dir_df_list.append(df)

    # pd.concat raises on an empty list, so a subdirectory without any zonal
    # stats csv is reported and skipped.
    if not sub_dir_df_list:
        print("No zonal stats csv found in: ", sub_dir_path)
        continue

    df1 = pd.concat(sub_dir_df_list)

    # sort values
    df1.sort_values(by="im_date", inplace=True)

    # drop rows that are null in any column whose name contains "min"
    df_columns = df1.columns.tolist()
    min_column = [columns for columns in df_columns if "min" in columns]
    print("Min column: ", min_column)
    df1.dropna(subset=min_column, inplace=True)

    # Merge with the basal dataset per site, matching each field date to the
    # nearest, the next (forward) and the previous (backward) image date.
    n_df = pd.merge_asof(basal_df, df1, left_on="basal_dt", right_on="image_dt", by="site", direction="nearest")
    f_df = pd.merge_asof(basal_df, df1, left_on="basal_dt", right_on="image_dt", by="site", direction="forward")
    b_df = pd.merge_asof(basal_df, df1, left_on="basal_dt", right_on="image_dt", by="site", direction="backward")

    # export one csv per merge direction
    out_file_fn(n_temp_dir, "near", sub_dir, n_df)
    out_file_fn(f_temp_dir, "for", sub_dir, f_df)
    out_file_fn(b_temp_dir, "back", sub_dir, b_df)
        


In [9]:
def glob_fn(temp_dir):
    """Read every csv in ``temp_dir`` and join them side by side into one DataFrame.

    Args:
        temp_dir: folder searched (not recursively) for ``*.csv`` files.

    Returns:
        The files concatenated column-wise (``axis=1``), in file name order.

    Raises:
        ValueError: if ``temp_dir`` contains no csv file.
    """
    # glob returns paths in arbitrary order; sorting keeps the column order of
    # the result the same from run to run.
    csv_paths = sorted(glob(os.path.join(temp_dir, "*.csv")))
    if not csv_paths:
        raise ValueError("no csv files found in: {0}".format(temp_dir))

    frames = [pd.read_csv(csv_path) for csv_path in csv_paths]
    return pd.concat(frames, axis=1)
        
def export_fn(output_dir, pos, dff):
    """Write ``dff`` to ``<output_dir>/<pos>_met_zonal_stats.csv`` and print the path.

    The DataFrame is written without its index.
    """
    file_name = "{0}_met_zonal_stats.csv".format(pos)
    out = os.path.join(output_dir, file_name)
    dff.to_csv(out, index=False)
    print("output to: ", out)
    
    
def drop_cols_fn(df):
    """Return the two identifying columns followed by every "mean" column of ``df``.

    Args:
        df: DataFrame to reduce; it is left unchanged.

    Returns:
        A new DataFrame made of the columns at positions 1 and 12 of ``df``
        followed by all columns whose name contains "mean".

    Raises:
        IndexError: if ``df`` has fewer than 13 columns.
    """
    # Select with a boolean mask rather than dropping in place, so the frame
    # owned by the caller keeps all of its columns.
    mean_mask = ["mean" in column for column in df.columns]
    mean_df = df.loc[:, mean_mask]

    # NOTE(review): positions 1 and 12 are hard-coded -- presumably the site and
    # bio_agb_kg1ha columns; confirm against the csv layout.
    var_ = df.iloc[:, [1, 12]]

    return pd.concat([var_, mean_df], axis=1)
    

In [10]:
# Temp folders read by the collation step below, one per merge direction.
# NOTE(review): these are pinned to the 20230104 output folder instead of being built
# from output_dir, so they are not necessarily the folders that
# temp_dir_fn(output_dir, ...) writes to -- confirm this is intended before running.
n_temp_dir = r"F:\cdu\data\zonal_stats\output\20230104\near_temp"
f_temp_dir = r"F:\cdu\data\zonal_stats\output\20230104\for_temp"
b_temp_dir = r"F:\cdu\data\zonal_stats\output\20230104\back_temp"

In [11]:
# For each merge direction: join the temp csv files side by side, export the full
# table, then export the reduced table returned by drop_cols_fn under a
# "cleaned_" prefix.

# nearest image date
n_final_df = glob_fn(n_temp_dir)
export_fn(output_dir, "near", n_final_df)
df1 = drop_cols_fn(n_final_df)
export_fn(output_dir, "cleaned_near", df1)

# next image date (forward)
f_final_df = glob_fn(f_temp_dir)
export_fn(output_dir, "for", f_final_df)
df1 = drop_cols_fn(f_final_df)
export_fn(output_dir, "cleaned_for", df1)

# previous image date (backward)
b_final_df = glob_fn(b_temp_dir)
export_fn(output_dir, "back", b_final_df)
df1 = drop_cols_fn(b_final_df)
export_fn(output_dir, "cleaned_back", df1)

output to:  F:\cdu\data\zonal_stats\output\20230109\near_met_zonal_stats.csv
output to:  F:\cdu\data\zonal_stats\output\20230109\cleaned_near_met_zonal_stats.csv
output to:  F:\cdu\data\zonal_stats\output\20230109\for_met_zonal_stats.csv
output to:  F:\cdu\data\zonal_stats\output\20230109\cleaned_for_met_zonal_stats.csv
output to:  F:\cdu\data\zonal_stats\output\20230109\back_met_zonal_stats.csv
output to:  F:\cdu\data\zonal_stats\output\20230109\cleaned_back_met_zonal_stats.csv
