# Collate met and seasonal data

## Fire mask applied

In [36]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from glob import glob
import os

In [37]:
drive = "D"
met_date = "20230109"
direction = "near"

feature_date = "20230202"

# processing date
date = "20230202"

# define output directory
output_dir = r"{0}:\cdu\data\zonal_stats\output\{1}".format(drive, date)


# choose from one or the other to select fire mask applied or notscars or not

# choose for fire mask applied
type_basal_feat_dir = "no_fire_scar_zonal_basal"
fsm = "fms_applied"

# choose for fire mask NOT applied
# type_basal_feat_dir = "with_no_fire_scar_zonal_basal"
# fsm = "fms_NOT_applied"



In [38]:
def mk_dir_fn(dir_):
    """ Create a new directory if one does not already exist. """
    if not os.path.isdir(dir_):
        os.mkdir(dir_)

        
def export_csv_fn(list_, dir_, file_name):
    
    """ Create and export path from directory and file name and exports csv with no dropping the index column. """
    
    df_final = pd.concat(list_, axis =0)    
    output_path = os.path.join(dir_, file_name)
    df_final.to_csv(os.path.join(output_path), index=False)
    print("File output to: ", output_path)
    
    return df_final


def glob_concat_axis1_fn(d, s, m):
    """ Concatenate mosaic basal and feature data based on site index. """
    df_list = []

    for f in glob(os.path.join(d, s)):

        df = pd.read_csv(f)
#         df.set_index("uid", inplace = True) # change back to uid once met data has been corrected
        df.set_index("site", inplace = True)
        df_list.append(df)
    
    df_list.append(m)
    # concat data
    df_con = pd.concat(df_list, axis = 1)
    
    # Drop duplicate columns
    df_bm_dd = df_con.T.drop_duplicates().T
    

    return df_list, df_bm_dd


def glob_concat_axis1_fn(d, s, m):
    """ Concatenate mosaic basal and feature data based on site index. """
    df_list = []
    
    
    for f in glob(os.path.join(d, s)):
#         print(f)
        _, file_ = os.path.split(f)
        f_split = file_.split("_")
        
        old_s = f_split[0] 
#         print(old_s)
        new_s = f_split[0] + "_" + f_split[1]
#         print(new_s)

        df = pd.read_csv(f)
        col_list = df.columns.tolist()
        
        s_list = []
        for i in col_list:

            s = i.replace(old_s, new_s)
            s_list.append(s)
            
        df.columns = s_list

        
        # df.set_index("uid", inplace = True) # change back to uid once met data has been corrected
        df.set_index("site", inplace = True)
        df_list.append(df)
    
    df_list.append(m)
    # concat data
    df_con = pd.concat(df_list, axis = 1)
    
    # Drop duplicate columns
    df_bm_dd = df_con.T.drop_duplicates().T
    
    print(s_list)
    return df_list, df_bm_dd


def identify_null_values(df_in):
    """ Create and export column names and null values as a new df. """
    nc = df_in.isnull().sum()
    df_out = pd.DataFrame(nc[nc > 0].sort_values(ascending=False))
    for index, row in df_out.iterrows():
        print(f"{index}: {row[0]} Null values")

    df_out.reset_index(drop=False, inplace=True)
    
    return df_out


def drop_cols_fn(df, var1, var2, var3, var4, var5):
    """ Select out feature or stats values and export df's """
    df1 = df.copy(deep=True)
    df_columns = df1.columns.tolist()
    
    drop_list = [columns for columns in df_columns if var1 not in columns]  
    drop_list2 = [columns for columns in drop_list if var2 not in columns]
    drop_list3 = [columns for columns in drop_list2 if var3 not in columns]
    drop_list4 = [columns for columns in drop_list3 if var4 not in columns]
    drop_list5 = [columns for columns in drop_list4 if var5 not in columns]

    df1.drop(drop_list5, axis = 1, inplace=True)
    var_ = df.iloc[:, [0, 1, 11]]
    df_out = pd.concat([var_, df1], axis=1)
    

    return df1, var_, df_out

##### Create output directories

In [39]:
ml_data_dir = os.path.join(output_dir, "ml_data_dir")
mk_dir_fn(output_dir)
mk_dir_fn(ml_data_dir)

##### Load climate data 

In [40]:
m = r"{0}:\cdu\data\zonal_stats\output\{1}\cleaned_{2}_met_zonal_stats.csv".format(drive, met_date, direction)
print(m)

D:\cdu\data\zonal_stats\output\20230109\cleaned_near_met_zonal_stats.csv


In [41]:
m_df = pd.read_csv(m)
# set index to site or uid
m_df.set_index("site", inplace = True)
# m_df

In [42]:
# print(list(m_df))

##### Load basal feature zonal stats

In [43]:
bf = r"{0}:\cdu\data\zonal_stats\output\{1}\{2}".format(drive, feature_date, type_basal_feat_dir)
print(bf)

D:\cdu\data\zonal_stats\output\20230202\no_fire_scar_zonal_basal


In [44]:
# Call the glob_concat_axis1 function to concatenate features
bfm_df_list,df_bm_dd = glob_concat_axis1_fn(bf, "*.csv", m_df)

# call the drop columns function to select statistical values
taf_df, t_d_df, df_bm_dmmmms = drop_cols_fn(df_bm_dd, "mean", "max", "min", "med", "std")


['uid', 'site', 'date', 'lon_gda94', 'lat_gda94', 'bio_l_kg1ha', 'bio_t_kg1ha', 'bio_b_kg1ha', 'bio_w_kg1ha', 'bio_br_kg1ha', 'bio_s_kg1ha', 'bio_r_kg1ha', 'bio_agb_kg1ha', 'c_l_kg1ha', 'c_t_kg1ha', 'c_b_kg1ha', 'c_w_kg1ha', 'c_br_kg1ha', 'c_s_kg1ha', 'c_r_kg1ha', 'c_agb_kg1ha', 'basal_dt', 'uid_dp1_0509_0509', 'dt_no_a_fs', 'image', 'dp1_0509_s_dt', 'b1_dp1_0509_min', 'b1_dp1_0509_max', 'b1_dp1_0509_mean', 'b1_dp1_0509_count', 'b1_dp1_0509_std', 'b1_dp1_0509_med', 'b1_dp1_0509_p25', 'b1_dp1_0509_p50', 'b1_dp1_0509_p75', 'b1_dp1_0509_p95', 'b1_dp1_0509_p99', 'b1_dp1_0509_range', 'b2_dp1_0509_min', 'b2_dp1_0509_max', 'b2_dp1_0509_mean', 'b2_dp1_0509_count', 'b2_dp1_0509_std', 'b2_dp1_0509_med', 'b2_dp1_0509_p25', 'b2_dp1_0509_p50', 'b2_dp1_0509_p75', 'b2_dp1_0509_p95', 'b2_dp1_0509_p99', 'b2_dp1_0509_range', 'b3_dp1_0509_min', 'b3_dp1_0509_max', 'b3_dp1_0509_mean', 'b3_dp1_0509_count', 'b3_dp1_0509_med', 'b3_dp1_0509_p25', 'b3_dp1_0509_p50', 'b3_dp1_0509_p75', 'b3_dp1_0509_p95', 'b3_dp1

In [45]:
def drop_feature_fn(df):
    """ Remove features from dataframe """
    df1 = df.copy(deep=True)
    df_columns = df.columns.tolist()
    
    drop_list = [columns for columns in df_columns if 'dis_' in columns]
    drop_list2 = [columns for columns in df_columns if 'stc_' in columns]
    drop_list3 = [columns for columns in df_columns if 'mean.1' in columns]
    
    drop_list.extend(drop_list2)
    drop_list.extend(drop_list3)
    df1.drop(drop_list, axis = 1, inplace=True)
    
        
    return df1
 
    
dmmmms_nodisstc = drop_feature_fn(df_bm_dmmmms)

In [46]:
# dmmmms_nodisstc

In [47]:
def fill_min_list(df):
    """ Fill minimum values with 0 """
    
    df1 = df.copy(deep=True)
    
    df_columns = df.columns.tolist()
    min_list = [columns for columns in df_columns if '_min' in columns]
    
    for i in min_list:
#         print(i)

        df1.dropna(subset=[i], inplace=True)
        mean_value = df1[i].mean()
        print(f"feature {i} mean = {mean_value}")
        df[i].fillna(mean_value, inplace=True)
        
        
        #df[i].fillna((df1[i].mean()), inplace=True) #should be 0
         
    return df
 
    
dmmmms_min = fill_min_list(dmmmms_nodisstc)

feature b1_fpca2_0509_min mean = 7.467314164359848
feature b1_h99a_01122_min mean = 7.783339416088701
feature b1_dja_0305_min mean = 14.08843537414966
feature b1_dja_0608_min mean = 14.197278911564625
feature b1_dja_0911_min mean = 14.29251700680272
feature b1_dja_1202_min mean = 14.353741496598639
feature b1_dim_0305_min mean = 6.782312925170068
feature b2_dim_0305_min mean = 37.8843537414966
feature b3_dim_0305_min mean = 37.8843537414966
feature b1_dim_0608_min mean = 9.979591836734693
feature b2_dim_0608_min mean = 21.238095238095237
feature b3_dim_0608_min mean = 50.89795918367347
feature b1_dim_0911_min mean = 12.489795918367347
feature b2_dim_0911_min mean = 19.857142857142858
feature b3_dim_0911_min mean = 51.734693877551024
feature b1_dim_1202_min mean = 8.843537414965986
feature b2_dim_1202_min mean = 36.02721088435374
feature b3_dim_1202_min mean = 37.63945578231292
feature b1_dp1_0112_min mean = 10.164383561643836
feature b2_dp1_0112_min mean = 23.02962962962963
feature b3_

In [13]:
print(list(dmmmms_min.columns))

['uid', 'date', 'bio_agb_kg1ha', 'b1_fpca2_0509_min', 'b1_fpca2_0509_max', 'b1_fpca2_0509_mean', 'b1_fpca2_0509_med', 'b1_fpca2_0509_std', 'b1_h99a_01122_min', 'b1_h99a_01122_max', 'b1_h99a_01122_mean', 'b1_h99a_01122_med', 'b1_h99a_01122_std', 'b1_dja_0305_min', 'b1_dja_0305_max', 'b1_dja_0305_mean', 'b1_dja_0305_med', 'b1_dja_0305_std', 'b1_dja_0608_min', 'b1_dja_0608_max', 'b1_dja_0608_mean', 'b1_dja_0608_med', 'b1_dja_0608_std', 'b1_dja_0911_min', 'b1_dja_0911_max', 'b1_dja_0911_mean', 'b1_dja_0911_med', 'b1_dja_0911_std', 'b1_dja_1202_min', 'b1_dja_1202_max', 'b1_dja_1202_mean', 'b1_dja_1202_med', 'b1_dja_1202_std', 'b1_dim_0305_min', 'b1_dim_0305_max', 'b1_dim_0305_mean', 'b1_dim_0305_med', 'b1_dim_0305_std', 'b2_dim_0305_min', 'b2_dim_0305_max', 'b2_dim_0305_mean', 'b2_dim_0305_med', 'b2_dim_0305_std', 'b3_dim_0305_min', 'b3_dim_0305_max', 'b3_dim_0305_mean', 'b3_dim_0305_med', 'b3_dim_0305_std', 'b1_dim_0608_min', 'b1_dim_0608_max', 'b1_dim_0608_mean', 'b1_dim_0608_med', 'b1_di

In [14]:
# df1 = dmmmms_nodisstc.copy()
# df2 = dmmmms_nodisstc

# df1.dropna(subset=["b1_dja_min"], inplace=True)
# mean_value = df1["b1_dja_min"].mean()
# # print(f"feature {"b1_dja_min"} mean = {mean_value}")
# #dmmmms_nodisstc["b1_dja_min"]= dmmmms_nodisstc["b1_dja_min"].fillna(mean_value, inplace=True)
# df2["b1_dja_min"].fillna(value=mean_value, inplace=True)

  


In [15]:
# print(dmmmms_nodisstc.shape)
# print(df2.shape)

In [16]:
# dmmmms_nodisstc.dropna(inplace=True)
# dmmmms_nodisstc.shape

In [17]:
# df2.dropna(inplace=True)
# df2.shape

In [18]:
# call the identify null values function
df_bm_dmmmms_null = identify_null_values(dmmmms_min)

b3_dp1_0509_std: 21 Null values
b2_dp1_0509_max: 21 Null values
b3_dp1_0509_med: 21 Null values
b1_dp1_0509_max: 21 Null values
b1_dp1_0509_mean: 21 Null values
b1_dp1_0509_med: 21 Null values
b1_dp1_0509_std: 21 Null values
b2_dp1_0509_mean: 21 Null values
b2_dp1_0509_std: 21 Null values
b2_dp1_0509_med: 21 Null values
b3_dp1_0509_max: 21 Null values
b3_dp1_0509_mean: 21 Null values
b1_h99a_01122_std: 20 Null values
b1_h99a_01122_max: 20 Null values
b1_h99a_01122_mean: 20 Null values
b1_h99a_01122_med: 20 Null values
b1_fpca2_0509_mean: 13 Null values
b1_fpca2_0509_med: 13 Null values
b1_fpca2_0509_std: 13 Null values
b1_fpca2_0509_max: 13 Null values
b1_dp1_0112_med: 2 Null values
b1_dp1_0112_std: 2 Null values
b1_dp1_0112_mean: 2 Null values
b1_dp1_0112_max: 2 Null values
b2_dp1_0112_max: 2 Null values
b2_dp1_0112_mean: 2 Null values
b2_dp1_0112_std: 2 Null values
b3_dp1_0112_max: 2 Null values
b3_dp1_0112_mean: 2 Null values
b3_dp1_0112_med: 2 Null values
b3_dp1_0112_std: 2 Null va

In [19]:
def mean_null(df):
    """ Mean the null values of data excluding date and biomass."""
    df1 = df.copy(deep=True)

    df_columns = df1.columns.tolist()

    for i in df_columns[3:]:
        df1.dropna(subset=[i], inplace=True)
        mean_value = df1[i].mean()
        print(f"feature {i} mean = {mean_value}")
        df[i].fillna(mean_value, inplace=True)

        
    return df
    

In [20]:
dmmmms_mean = mean_null(dmmmms_min)

feature b1_fpca2_0509_min mean = 7.467314164359848
feature b1_fpca2_0509_max mean = 16.554122273024028
feature b1_fpca2_0509_mean mean = 11.514610472572071
feature b1_fpca2_0509_med mean = 11.319621596243474
feature b1_fpca2_0509_std mean = 2.4774822417224676
feature b1_h99a_01122_min mean = 7.783339416088701
feature b1_h99a_01122_max mean = 12.490026357222577
feature b1_h99a_01122_mean mean = 10.100467834288274
feature b1_h99a_01122_med mean = 10.099648404283588
feature b1_h99a_01122_std mean = 1.2457832533686888
feature b1_dja_0305_min mean = 14.08843537414966
feature b1_dja_0305_max mean = 22.27891156462585
feature b1_dja_0305_mean mean = 18.34656462585034
feature b1_dja_0305_med mean = 18.408163265306122
feature b1_dja_0305_std mean = 2.1755266317642628
feature b1_dja_0608_min mean = 14.197278911564625
feature b1_dja_0608_max mean = 22.238095238095237
feature b1_dja_0608_mean mean = 18.423724489795923
feature b1_dja_0608_med mean = 18.479591836734695
feature b1_dja_0608_std mean = 

In [21]:
print(list(dmmmms_mean))

['uid', 'date', 'bio_agb_kg1ha', 'b1_fpca2_0509_min', 'b1_fpca2_0509_max', 'b1_fpca2_0509_mean', 'b1_fpca2_0509_med', 'b1_fpca2_0509_std', 'b1_h99a_01122_min', 'b1_h99a_01122_max', 'b1_h99a_01122_mean', 'b1_h99a_01122_med', 'b1_h99a_01122_std', 'b1_dja_0305_min', 'b1_dja_0305_max', 'b1_dja_0305_mean', 'b1_dja_0305_med', 'b1_dja_0305_std', 'b1_dja_0608_min', 'b1_dja_0608_max', 'b1_dja_0608_mean', 'b1_dja_0608_med', 'b1_dja_0608_std', 'b1_dja_0911_min', 'b1_dja_0911_max', 'b1_dja_0911_mean', 'b1_dja_0911_med', 'b1_dja_0911_std', 'b1_dja_1202_min', 'b1_dja_1202_max', 'b1_dja_1202_mean', 'b1_dja_1202_med', 'b1_dja_1202_std', 'b1_dim_0305_min', 'b1_dim_0305_max', 'b1_dim_0305_mean', 'b1_dim_0305_med', 'b1_dim_0305_std', 'b2_dim_0305_min', 'b2_dim_0305_max', 'b2_dim_0305_mean', 'b2_dim_0305_med', 'b2_dim_0305_std', 'b3_dim_0305_min', 'b3_dim_0305_max', 'b3_dim_0305_mean', 'b3_dim_0305_med', 'b3_dim_0305_std', 'b1_dim_0608_min', 'b1_dim_0608_max', 'b1_dim_0608_mean', 'b1_dim_0608_med', 'b1_di

In [22]:
dmmmms_mean

Unnamed: 0,uid,date,bio_agb_kg1ha,b1_fpca2_0509_min,b1_fpca2_0509_max,b1_fpca2_0509_mean,b1_fpca2_0509_med,b1_fpca2_0509_std,b1_h99a_01122_min,b1_h99a_01122_max,...,evp_s_mean,tmax_mean,tmin_mean,rain_m_mean,mslp_mean,rad_mean,rh_tmax_mean,rh_tmin_mean,vp_mean,vp_d_mean
agb02,1,20120417,0,14.648533,25.453907,20.006735,20.787782,3.180215,10.246597,16.737152,...,7.30,35.60,22.3,202.70,1009.40,13.60,3319.0,3368.3,24.7,23.6
ntapck0001,126,20160504,8984.91,12.933355,38.921173,28.233608,28.756306,8.265865,15.406975,20.898926,...,5.60,35.40,25.6,168.70,1013.20,17.60,3330.6,3371.3,31.1,19.1
ntagfu0005,84,20120416,1549.09,7.467314,16.554122,11.514610,11.319622,2.477482,7.783339,12.490026,...,8.10,34.20,18.5,150.00,1015.20,21.70,3305.7,3350.3,15.7,27.4
hsf02,43,20120710,26636.3,33.189796,41.232662,36.739747,36.043419,2.569199,15.542252,19.774864,...,5.10,32.70,19.7,226.40,1013.00,19.50,3323.8,3376.5,23.4,17.7
ntagfu0006,85,20120418,0,7.467314,16.554122,11.514610,11.319622,2.477482,7.783339,12.490026,...,8.30,33.50,19.2,156.90,1011.30,16.80,3309.0,3352.0,16.8,25.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vrd23,143,20120729,406.56,0.406700,1.486260,0.795091,0.752944,0.320448,3.327431,6.337692,...,6.50,24.70,5.8,78.30,1021.40,21.80,3296.8,3344.8,6.3,17.0
ntagfu0020,99,20120605,7711.29,12.109756,27.614885,19.936276,20.286156,4.268669,9.187775,13.847046,...,7.15,24.65,7.5,112.85,1018.35,20.25,3304.2,3359.5,8.6,15.3
ntagfu0031,110,20120706,5173.71,1.486260,15.538734,7.885186,6.726861,3.912009,5.522598,12.467270,...,5.40,25.90,9.6,148.30,1016.40,20.90,3299.6,3341.0,7.7,18.4
lim01,59,20120529,4872.78,7.036602,14.648533,9.989439,9.782678,2.069015,8.362190,11.851967,...,5.60,22.80,10.3,97.80,1017.30,10.20,3319.8,3372.3,12.0,10.9


In [23]:
# call the identify null values function
df_bm_dmmmms_null = identify_null_values(dmmmms_mean)


In [24]:
dmmmms_mean

Unnamed: 0,uid,date,bio_agb_kg1ha,b1_fpca2_0509_min,b1_fpca2_0509_max,b1_fpca2_0509_mean,b1_fpca2_0509_med,b1_fpca2_0509_std,b1_h99a_01122_min,b1_h99a_01122_max,...,evp_s_mean,tmax_mean,tmin_mean,rain_m_mean,mslp_mean,rad_mean,rh_tmax_mean,rh_tmin_mean,vp_mean,vp_d_mean
agb02,1,20120417,0,14.648533,25.453907,20.006735,20.787782,3.180215,10.246597,16.737152,...,7.30,35.60,22.3,202.70,1009.40,13.60,3319.0,3368.3,24.7,23.6
ntapck0001,126,20160504,8984.91,12.933355,38.921173,28.233608,28.756306,8.265865,15.406975,20.898926,...,5.60,35.40,25.6,168.70,1013.20,17.60,3330.6,3371.3,31.1,19.1
ntagfu0005,84,20120416,1549.09,7.467314,16.554122,11.514610,11.319622,2.477482,7.783339,12.490026,...,8.10,34.20,18.5,150.00,1015.20,21.70,3305.7,3350.3,15.7,27.4
hsf02,43,20120710,26636.3,33.189796,41.232662,36.739747,36.043419,2.569199,15.542252,19.774864,...,5.10,32.70,19.7,226.40,1013.00,19.50,3323.8,3376.5,23.4,17.7
ntagfu0006,85,20120418,0,7.467314,16.554122,11.514610,11.319622,2.477482,7.783339,12.490026,...,8.30,33.50,19.2,156.90,1011.30,16.80,3309.0,3352.0,16.8,25.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vrd23,143,20120729,406.56,0.406700,1.486260,0.795091,0.752944,0.320448,3.327431,6.337692,...,6.50,24.70,5.8,78.30,1021.40,21.80,3296.8,3344.8,6.3,17.0
ntagfu0020,99,20120605,7711.29,12.109756,27.614885,19.936276,20.286156,4.268669,9.187775,13.847046,...,7.15,24.65,7.5,112.85,1018.35,20.25,3304.2,3359.5,8.6,15.3
ntagfu0031,110,20120706,5173.71,1.486260,15.538734,7.885186,6.726861,3.912009,5.522598,12.467270,...,5.40,25.90,9.6,148.30,1016.40,20.90,3299.6,3341.0,7.7,18.4
lim01,59,20120529,4872.78,7.036602,14.648533,9.989439,9.782678,2.069015,8.362190,11.851967,...,5.60,22.80,10.3,97.80,1017.30,10.20,3319.8,3372.3,12.0,10.9


In [25]:
dmmmms_mean.dropna(inplace=True)

In [26]:
dmmmms_mean

Unnamed: 0,uid,date,bio_agb_kg1ha,b1_fpca2_0509_min,b1_fpca2_0509_max,b1_fpca2_0509_mean,b1_fpca2_0509_med,b1_fpca2_0509_std,b1_h99a_01122_min,b1_h99a_01122_max,...,evp_s_mean,tmax_mean,tmin_mean,rain_m_mean,mslp_mean,rad_mean,rh_tmax_mean,rh_tmin_mean,vp_mean,vp_d_mean
agb02,1,20120417,0,14.648533,25.453907,20.006735,20.787782,3.180215,10.246597,16.737152,...,7.30,35.60,22.3,202.70,1009.40,13.60,3319.0,3368.3,24.7,23.6
ntapck0001,126,20160504,8984.91,12.933355,38.921173,28.233608,28.756306,8.265865,15.406975,20.898926,...,5.60,35.40,25.6,168.70,1013.20,17.60,3330.6,3371.3,31.1,19.1
ntagfu0005,84,20120416,1549.09,7.467314,16.554122,11.514610,11.319622,2.477482,7.783339,12.490026,...,8.10,34.20,18.5,150.00,1015.20,21.70,3305.7,3350.3,15.7,27.4
hsf02,43,20120710,26636.3,33.189796,41.232662,36.739747,36.043419,2.569199,15.542252,19.774864,...,5.10,32.70,19.7,226.40,1013.00,19.50,3323.8,3376.5,23.4,17.7
ntagfu0006,85,20120418,0,7.467314,16.554122,11.514610,11.319622,2.477482,7.783339,12.490026,...,8.30,33.50,19.2,156.90,1011.30,16.80,3309.0,3352.0,16.8,25.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vrd23,143,20120729,406.56,0.406700,1.486260,0.795091,0.752944,0.320448,3.327431,6.337692,...,6.50,24.70,5.8,78.30,1021.40,21.80,3296.8,3344.8,6.3,17.0
ntagfu0020,99,20120605,7711.29,12.109756,27.614885,19.936276,20.286156,4.268669,9.187775,13.847046,...,7.15,24.65,7.5,112.85,1018.35,20.25,3304.2,3359.5,8.6,15.3
ntagfu0031,110,20120706,5173.71,1.486260,15.538734,7.885186,6.726861,3.912009,5.522598,12.467270,...,5.40,25.90,9.6,148.30,1016.40,20.90,3299.6,3341.0,7.7,18.4
lim01,59,20120529,4872.78,7.036602,14.648533,9.989439,9.782678,2.069015,8.362190,11.851967,...,5.60,22.80,10.3,97.80,1017.30,10.20,3319.8,3372.3,12.0,10.9


In [27]:
dmmmms_mean.reset_index(drop=False, inplace=True)
dmmmms_mean.rename(columns={"index": "site"}, inplace=True)
dmmmms_mean.dropna(inplace=True)

In [28]:
dmmmms_mean

Unnamed: 0,site,uid,date,bio_agb_kg1ha,b1_fpca2_0509_min,b1_fpca2_0509_max,b1_fpca2_0509_mean,b1_fpca2_0509_med,b1_fpca2_0509_std,b1_h99a_01122_min,...,evp_s_mean,tmax_mean,tmin_mean,rain_m_mean,mslp_mean,rad_mean,rh_tmax_mean,rh_tmin_mean,vp_mean,vp_d_mean
0,agb02,1,20120417,0,14.648533,25.453907,20.006735,20.787782,3.180215,10.246597,...,7.30,35.60,22.3,202.70,1009.40,13.60,3319.0,3368.3,24.7,23.6
1,ntapck0001,126,20160504,8984.91,12.933355,38.921173,28.233608,28.756306,8.265865,15.406975,...,5.60,35.40,25.6,168.70,1013.20,17.60,3330.6,3371.3,31.1,19.1
2,ntagfu0005,84,20120416,1549.09,7.467314,16.554122,11.514610,11.319622,2.477482,7.783339,...,8.10,34.20,18.5,150.00,1015.20,21.70,3305.7,3350.3,15.7,27.4
3,hsf02,43,20120710,26636.3,33.189796,41.232662,36.739747,36.043419,2.569199,15.542252,...,5.10,32.70,19.7,226.40,1013.00,19.50,3323.8,3376.5,23.4,17.7
4,ntagfu0006,85,20120418,0,7.467314,16.554122,11.514610,11.319622,2.477482,7.783339,...,8.30,33.50,19.2,156.90,1011.30,16.80,3309.0,3352.0,16.8,25.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,vrd23,143,20120729,406.56,0.406700,1.486260,0.795091,0.752944,0.320448,3.327431,...,6.50,24.70,5.8,78.30,1021.40,21.80,3296.8,3344.8,6.3,17.0
163,ntagfu0020,99,20120605,7711.29,12.109756,27.614885,19.936276,20.286156,4.268669,9.187775,...,7.15,24.65,7.5,112.85,1018.35,20.25,3304.2,3359.5,8.6,15.3
164,ntagfu0031,110,20120706,5173.71,1.486260,15.538734,7.885186,6.726861,3.912009,5.522598,...,5.40,25.90,9.6,148.30,1016.40,20.90,3299.6,3341.0,7.7,18.4
165,lim01,59,20120529,4872.78,7.036602,14.648533,9.989439,9.782678,2.069015,8.362190,...,5.60,22.80,10.3,97.80,1017.30,10.20,3319.8,3372.3,12.0,10.9


In [29]:
# dmmmms_mean.to_csv(os.path.join(ml_data_dir, f"season_met_fillna_mean_{fsm}_applied.csv"))

In [30]:
path_ = os.path.join(ml_data_dir, f"season_met_fillna_mean_{fsm}.csv")      
dmmmms_mean.to_csv(path_, index=False)
print(path_)

D:\cdu\data\zonal_stats\output\20230202\ml_data_dir\season_met_fillna_mean_fms_applied.csv


In [31]:
print(list(dmmmms_mean))

['site', 'uid', 'date', 'bio_agb_kg1ha', 'b1_fpca2_0509_min', 'b1_fpca2_0509_max', 'b1_fpca2_0509_mean', 'b1_fpca2_0509_med', 'b1_fpca2_0509_std', 'b1_h99a_01122_min', 'b1_h99a_01122_max', 'b1_h99a_01122_mean', 'b1_h99a_01122_med', 'b1_h99a_01122_std', 'b1_dja_0305_min', 'b1_dja_0305_max', 'b1_dja_0305_mean', 'b1_dja_0305_med', 'b1_dja_0305_std', 'b1_dja_0608_min', 'b1_dja_0608_max', 'b1_dja_0608_mean', 'b1_dja_0608_med', 'b1_dja_0608_std', 'b1_dja_0911_min', 'b1_dja_0911_max', 'b1_dja_0911_mean', 'b1_dja_0911_med', 'b1_dja_0911_std', 'b1_dja_1202_min', 'b1_dja_1202_max', 'b1_dja_1202_mean', 'b1_dja_1202_med', 'b1_dja_1202_std', 'b1_dim_0305_min', 'b1_dim_0305_max', 'b1_dim_0305_mean', 'b1_dim_0305_med', 'b1_dim_0305_std', 'b2_dim_0305_min', 'b2_dim_0305_max', 'b2_dim_0305_mean', 'b2_dim_0305_med', 'b2_dim_0305_std', 'b3_dim_0305_min', 'b3_dim_0305_max', 'b3_dim_0305_mean', 'b3_dim_0305_med', 'b3_dim_0305_std', 'b1_dim_0608_min', 'b1_dim_0608_max', 'b1_dim_0608_mean', 'b1_dim_0608_med'

In [32]:
def drop_low_r2_seasons_fn(df):
    """ Remove features from dataframe """
    df_columns = df.columns.tolist()
    
    drop_list = [columns for columns in df_columns if '0305' in columns]
    drop_list2 = [columns for columns in df_columns if '1202' in columns]
    drop_list3 = [columns for columns in df_columns if 'dp1_0509' in columns]
    
    drop_list.extend(drop_list2)
    drop_list.extend(drop_list3)
    df.drop(drop_list, axis = 1, inplace=True)
    
        
    return df
 
    
selected_features_and_seasons = drop_feature_fn(dmmmms_mean)

In [33]:
path_ = os.path.join(ml_data_dir, f"r2_best_season_met_fillna_{fsm}.csv")      
selected_features_and_seasons.to_csv(path_, index=False)
print(path_)

D:\cdu\data\zonal_stats\output\20230202\ml_data_dir\r2_best_season_met_fillna_fms_applied.csv


In [34]:
selected_features_and_seasons

Unnamed: 0,site,uid,date,bio_agb_kg1ha,b1_fpca2_0509_min,b1_fpca2_0509_max,b1_fpca2_0509_mean,b1_fpca2_0509_med,b1_fpca2_0509_std,b1_h99a_01122_min,...,evp_s_mean,tmax_mean,tmin_mean,rain_m_mean,mslp_mean,rad_mean,rh_tmax_mean,rh_tmin_mean,vp_mean,vp_d_mean
0,agb02,1,20120417,0,14.648533,25.453907,20.006735,20.787782,3.180215,10.246597,...,7.30,35.60,22.3,202.70,1009.40,13.60,3319.0,3368.3,24.7,23.6
1,ntapck0001,126,20160504,8984.91,12.933355,38.921173,28.233608,28.756306,8.265865,15.406975,...,5.60,35.40,25.6,168.70,1013.20,17.60,3330.6,3371.3,31.1,19.1
2,ntagfu0005,84,20120416,1549.09,7.467314,16.554122,11.514610,11.319622,2.477482,7.783339,...,8.10,34.20,18.5,150.00,1015.20,21.70,3305.7,3350.3,15.7,27.4
3,hsf02,43,20120710,26636.3,33.189796,41.232662,36.739747,36.043419,2.569199,15.542252,...,5.10,32.70,19.7,226.40,1013.00,19.50,3323.8,3376.5,23.4,17.7
4,ntagfu0006,85,20120418,0,7.467314,16.554122,11.514610,11.319622,2.477482,7.783339,...,8.30,33.50,19.2,156.90,1011.30,16.80,3309.0,3352.0,16.8,25.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,vrd23,143,20120729,406.56,0.406700,1.486260,0.795091,0.752944,0.320448,3.327431,...,6.50,24.70,5.8,78.30,1021.40,21.80,3296.8,3344.8,6.3,17.0
163,ntagfu0020,99,20120605,7711.29,12.109756,27.614885,19.936276,20.286156,4.268669,9.187775,...,7.15,24.65,7.5,112.85,1018.35,20.25,3304.2,3359.5,8.6,15.3
164,ntagfu0031,110,20120706,5173.71,1.486260,15.538734,7.885186,6.726861,3.912009,5.522598,...,5.40,25.90,9.6,148.30,1016.40,20.90,3299.6,3341.0,7.7,18.4
165,lim01,59,20120529,4872.78,7.036602,14.648533,9.989439,9.782678,2.069015,8.362190,...,5.60,22.80,10.3,97.80,1017.30,10.20,3319.8,3372.3,12.0,10.9


In [35]:
print(list(selected_features_and_seasons))

['site', 'uid', 'date', 'bio_agb_kg1ha', 'b1_fpca2_0509_min', 'b1_fpca2_0509_max', 'b1_fpca2_0509_mean', 'b1_fpca2_0509_med', 'b1_fpca2_0509_std', 'b1_h99a_01122_min', 'b1_h99a_01122_max', 'b1_h99a_01122_mean', 'b1_h99a_01122_med', 'b1_h99a_01122_std', 'b1_dja_0305_min', 'b1_dja_0305_max', 'b1_dja_0305_mean', 'b1_dja_0305_med', 'b1_dja_0305_std', 'b1_dja_0608_min', 'b1_dja_0608_max', 'b1_dja_0608_mean', 'b1_dja_0608_med', 'b1_dja_0608_std', 'b1_dja_0911_min', 'b1_dja_0911_max', 'b1_dja_0911_mean', 'b1_dja_0911_med', 'b1_dja_0911_std', 'b1_dja_1202_min', 'b1_dja_1202_max', 'b1_dja_1202_mean', 'b1_dja_1202_med', 'b1_dja_1202_std', 'b1_dim_0305_min', 'b1_dim_0305_max', 'b1_dim_0305_mean', 'b1_dim_0305_med', 'b1_dim_0305_std', 'b2_dim_0305_min', 'b2_dim_0305_max', 'b2_dim_0305_mean', 'b2_dim_0305_med', 'b2_dim_0305_std', 'b3_dim_0305_min', 'b3_dim_0305_max', 'b3_dim_0305_mean', 'b3_dim_0305_med', 'b3_dim_0305_std', 'b1_dim_0608_min', 'b1_dim_0608_max', 'b1_dim_0608_mean', 'b1_dim_0608_med'