# overview

This Jupyter notebook example shows how to use tools from the `fluxdataqaqc` Python package to produce energy balance correction ratios by adjusting turbulent energy fluxes in order to close the surface energy balance for a daily time series from the FLUXNET 2015 dataset. The data used herein is provided with the software package and can be downloaded [here](https://github.com/Open-ET/flux-data-qaqc/blob/master/example_data/FLX_US-AR1_FLUXNET2015_SUBSET_DD_2009-2012_1-3.xlsx).

This example can be reproduced by downloading `fluxdataqaqc` from GitHub with git:

```bash
git clone https://github.com/Open-ET/flux-data-qaqc.git
```

or you can download a compressed folder. Dependencies currently include pandas, refet, numpy, and bokeh. Automatic installation and dependency handling will soon be provided along with online documentation.

In [1]:
# reload modules automatically before running code so that edits to the
# local fluxdataqaqc source are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
import sys
# currently not installable so import from parent dir
# (the notebook lives in example_data/, so '..' is the repository root)
sys.path.append('..')
from fluxdataqaqc.data import Data
from fluxdataqaqc.qaqc import QaQc 
from bokeh.plotting import figure, show
from bokeh.models.formatters import DatetimeTickFormatter
from bokeh.io import output_notebook
# display bokeh figures inline in the notebook when show() is called
output_notebook()

## create a ``Data`` object to read in time series data using a config file

In [2]:
config_path = 'fluxnet_config.ini'
d = Data(config_path)

In [3]:
# you can access all metadata and data in the config file as a list of
# (name, value) tuples; note that the values are returned as strings
d.config.items('METADATA') # can access the DATA section the same way

[('climate_file_path', 'FLX_US-AR1_FLUXNET2015_SUBSET_DD_2009-2012_1-3.xlsx'),
 ('station_latitude', '36.4267'),
 ('station_longitude', '-99.42'),
 ('station_elevation', '611'),
 ('anemometer_height', '3'),
 ('missing_data_value', '-9999')]

In [4]:
# or as a dict, e.g. to access specific values by name
d.config.get('METADATA','station_elevation')

'611'

In [5]:
# path to climate time series input and config files
print(d.climate_file, '\n', d.config_file)

/home/john/flux-data-qaqc/example_data/FLX_US-AR1_FLUXNET2015_SUBSET_DD_2009-2012_1-3.xlsx 
 /home/john/flux-data-qaqc/example_data/fluxnet_config.ini


In [6]:
# view full header of input time series file
d.header

Index(['TIMESTAMP', 'TA_F', 'TA_F_QC', 'SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC',
       'LW_IN_F', 'LW_IN_F_QC', 'VPD_F', 'VPD_F_QC', 'PA_F', 'PA_F_QC', 'P_F',
       'P_F_QC', 'WS_F', 'WS_F_QC', 'USTAR', 'USTAR_QC', 'NETRAD', 'NETRAD_QC',
       'PPFD_IN', 'PPFD_IN_QC', 'PPFD_OUT', 'PPFD_OUT_QC', 'SW_OUT',
       'SW_OUT_QC', 'LW_OUT', 'LW_OUT_QC', 'CO2_F_MDS', 'CO2_F_MDS_QC',
       'TS_F_MDS_1', 'TS_F_MDS_1_QC', 'SWC_F_MDS_1', 'SWC_F_MDS_1_QC',
       'G_F_MDS', 'G_F_MDS_QC', 'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
       'LE_CORR_25', 'LE_CORR_75', 'LE_RANDUNC', 'H_F_MDS', 'H_F_MDS_QC',
       'H_CORR', 'H_CORR_25', 'H_CORR_75', 'H_RANDUNC', 'NEE_VUT_REF',
       'NEE_VUT_REF_QC', 'NEE_VUT_REF_RANDUNC', 'NEE_VUT_25', 'NEE_VUT_50',
       'NEE_VUT_75', 'NEE_VUT_25_QC', 'NEE_VUT_50_QC', 'NEE_VUT_75_QC',
       'RECO_NT_VUT_REF', 'RECO_NT_VUT_25', 'RECO_NT_VUT_50', 'RECO_NT_VUT_75',
       'GPP_NT_VUT_REF', 'GPP_NT_VUT_25', 'GPP_NT_VUT_50', 'GPP_NT_VUT_75',
       'RECO_DT_VUT_REF', 'RECO_D

# load date-indexed DataFrame using ``.df``

* note, if there are variables stated in the config file but not found in the header of the input file, they will be filled with NaN (null) values in the dataframe

In [7]:
d.df.head()



Unnamed: 0_level_0,t_avg,sw_pot,sw_in,lw_in,vpd,ppt,ws,Rn,sw_out,lw_out,G,LE,LE_corr,H,H_corr
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2009-01-01,2.803,186.71,123.108,261.302,1.919,0.0,3.143,,,,,67.1459,43.8414,20.3876,13.3116
2009-01-02,2.518,187.329,121.842,268.946,0.992,0.0,2.093,,,,,92.8616,60.9673,32.6505,21.4364
2009-01-03,5.518,188.008,124.241,268.004,2.795,0.0,4.403,,,,,75.8029,50.3151,20.0569,13.313
2009-01-04,-3.753,188.742,113.793,246.675,0.892,0.0,4.336,,,,,67.1459,45.0539,20.3876,13.6798
2009-01-05,-2.214,189.534,124.332,244.478,1.304,0.0,2.417,,,,,92.8616,62.6443,32.6505,22.026


## you can modify the dataframe or assign new columns or even assign a new dataframe within Python

In [8]:
# arbitrary arithmetic, only to demonstrate that the dataframe can be
# modified in place and reassigned: every value ends up as (original + 100) * 5
# NOTE: this overwrites the data held by `d`, so re-create the Data object
# before doing any real analysis
x = d.df
x += 100
d.df = x
d.df *= 5
d.df.head()

Unnamed: 0_level_0,t_avg,sw_pot,sw_in,lw_in,vpd,ppt,ws,Rn,sw_out,lw_out,G,LE,LE_corr,H,H_corr
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2009-01-01,514.015,1433.55,1115.54,1806.51,509.595,500.0,515.715,,,,,835.7295,719.207,601.938,566.558
2009-01-02,512.59,1436.645,1109.21,1844.73,504.96,500.0,510.465,,,,,964.308,804.8365,663.2525,607.182
2009-01-03,527.59,1440.04,1121.205,1840.02,513.975,500.0,522.015,,,,,879.0145,751.5755,600.2845,566.565
2009-01-04,481.235,1443.71,1068.965,1733.375,504.46,500.0,521.68,,,,,835.7295,725.2695,601.938,568.399
2009-01-05,488.93,1447.67,1121.66,1722.39,506.52,500.0,512.085,,,,,964.308,813.2215,663.2525,610.13


---
# using the `QaQc` class to correct latent energy and sensible heat

* note, the method used for corrections will be documented soon

In [9]:
# read in data fresh and use it to create a QaQc instance
data = Data(config_path)
q = QaQc(data)



In [10]:
# data is not corrected yet:
q.corrected

False

In [11]:
# data has not changed...
q.df.head()

Unnamed: 0_level_0,t_avg,sw_pot,sw_in,lw_in,vpd,ppt,ws,Rn,sw_out,lw_out,G,LE,LE_corr,H,H_corr
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2009-01-01,2.803,186.71,123.108,261.302,1.919,0.0,3.143,,,,,67.1459,43.8414,20.3876,13.3116
2009-01-02,2.518,187.329,121.842,268.946,0.992,0.0,2.093,,,,,92.8616,60.9673,32.6505,21.4364
2009-01-03,5.518,188.008,124.241,268.004,2.795,0.0,4.403,,,,,75.8029,50.3151,20.0569,13.313
2009-01-04,-3.753,188.742,113.793,246.675,0.892,0.0,4.336,,,,,67.1459,45.0539,20.3876,13.6798
2009-01-05,-2.214,189.534,124.332,244.478,1.304,0.0,2.417,,,,,92.8616,62.6443,32.6505,22.026


In [12]:
# note the original columns
import pprint
pprint.pprint(', '.join(q.df.columns))

('t_avg, sw_pot, sw_in, lw_in, vpd, ppt, ws, Rn, sw_out, lw_out, G, LE, '
 'LE_corr, H, H_corr')


In [13]:
q.elevation, q.latitude # necessary for computing clear sky radiation

(611, 36.4267)

In [14]:
# run correction
q.correct_data()
q.df.head()

  return np.arccos(-np.tan(lat) * np.tan(delta))


Unnamed: 0_level_0,t_avg,sw_pot,sw_in,lw_in,vpd,ppt,ws,Rn,sw_out,lw_out,...,H_adj,flux_adj,flux_corr,et_corr,ebc_corr,et_reg,et_adj,ebc_reg,ebc_adj,rso
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-01-01,2.803,186.71,123.108,261.302,1.919,0.0,3.143,,,,...,20.3876,87.5335,57.153,1.515159,,2.320562,2.320562,,,
2009-01-02,2.518,187.329,121.842,268.946,0.992,0.0,2.093,,,,...,32.6505,125.5121,82.4037,2.10703,,3.209297,3.209297,,,
2009-01-03,5.518,188.008,124.241,268.004,2.795,0.0,4.403,,,,...,20.0569,95.8598,63.6281,1.73889,,2.619748,2.619748,,,
2009-01-04,-3.753,188.742,113.793,246.675,0.892,0.0,4.336,,,,...,20.3876,87.5335,58.7337,1.557063,,2.320562,2.320562,,,
2009-01-05,-2.214,189.534,124.332,244.478,1.304,0.0,2.417,,,,...,32.6505,125.5121,84.6703,2.164987,,3.209297,3.209297,,,


# correct energy balance using `flux-data-qaqc` methods

In [15]:
q.correct_data()
q.corrected

True

In [16]:
# now we have original data plus adjusted variables, energy balance ratios, and others
pprint.pprint(', '.join(q.df.columns))

('t_avg, sw_pot, sw_in, lw_in, vpd, ppt, ws, Rn, sw_out, lw_out, G, LE, '
 'LE_corr, H, H_corr, energy, flux, bowen_ratio, LE_adj, H_adj, flux_adj, '
 'flux_corr, et_corr, ebc_corr, et_reg, et_adj, ebc_reg, ebc_adj, rso')


In [17]:
q.df.head()

Unnamed: 0_level_0,t_avg,sw_pot,sw_in,lw_in,vpd,ppt,ws,Rn,sw_out,lw_out,...,H_adj,flux_adj,flux_corr,et_corr,ebc_corr,et_reg,et_adj,ebc_reg,ebc_adj,rso
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-01-01,2.803,186.71,123.108,261.302,1.919,0.0,3.143,,,,...,20.3876,87.5335,57.153,1.515159,,2.320562,2.320562,,,
2009-01-02,2.518,187.329,121.842,268.946,0.992,0.0,2.093,,,,...,32.6505,125.5121,82.4037,2.10703,,3.209297,3.209297,,,
2009-01-03,5.518,188.008,124.241,268.004,2.795,0.0,4.403,,,,...,20.0569,95.8598,63.6281,1.73889,,2.619748,2.619748,,,
2009-01-04,-3.753,188.742,113.793,246.675,0.892,0.0,4.336,,,,...,20.3876,87.5335,58.7337,1.557063,,2.320562,2.320562,,,
2009-01-05,-2.214,189.534,124.332,244.478,1.304,0.0,2.417,,,,...,32.6505,125.5121,84.6703,2.164987,,3.209297,3.209297,,,


In [18]:
# view time series of select variable (daily net radiation)
p = figure(
    title='Daily net radiation (Rn)',
    x_axis_label='date',
    # unit symbol for watt is an upper-case W
    y_axis_label='net radiation W/m2',
)
p.line(q.df.index, q.df.Rn, line_width=2)
# show full dates on the x axis tick labels
p.xaxis.formatter = DatetimeTickFormatter(days="%d-%b-%Y")
show(p)

## temporally aggregate to monthly data using sums for ET and P, and means for all others

In [19]:
q.monthly_df.head()

Unnamed: 0_level_0,vpd,ws,H_corr,energy,flux_adj,Rn,ebc_adj,lw_out,sw_pot,G,...,LE_corr,bowen_ratio,LE_adj,lw_in,flux,t_avg,et_reg,et_adj,et_corr,ppt
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-01-31,1.959129,3.534355,17.626603,,102.470565,,,,203.904032,20.204263,...,57.066065,0.306513,78.233868,261.216323,102.470565,1.424161,,,,
2009-02-28,3.757714,3.828571,19.408454,,103.773596,,,,262.85725,12.974747,...,62.864507,0.308216,79.112686,274.406607,103.773596,6.423714,,,,
2009-03-31,6.457871,4.35871,18.794103,,102.739155,,,,341.04129,12.975887,...,60.914145,0.305254,78.513126,305.189258,102.739155,10.815452,,,,
2009-04-30,4.942533,4.482,19.565137,,103.674777,,,438.996567,414.747833,13.440414,...,62.844563,0.307787,79.06194,323.534467,103.674777,13.427233,,,,
2009-05-31,6.90071,3.529613,21.571916,,131.125547,,,425.21829,464.501226,7.620833,...,84.779639,0.267686,104.509681,358.490258,131.125547,17.991774,,,,


## alternatively create a QaQc instance from a pandas.DataFrame using `QaQc.from_dataframe`

Be sure the dataframe contains the main energy balance components at daily time steps; the dataframe index should be a daily datetime index as well. The components should be named: 
* Rn, G, H, LE  

Otherwise you will not be able to run the energy balance correction routine. The example below shows that only the four energy balance components are needed to run the routine. **Note:** we need to assign station elevation (m) and latitude (decimal degrees), which are normally in the config file; however, this method gives the ability to use arbitrary daily time series data within Python.

In [20]:
# load a fresh copy of the example data; any dataframe that uses the
# expected variable names could be substituted here
data = Data(config_path)
df = data.df
# strip the frame down to the four energy balance components only
df = df.drop(
    [name for name in df.columns if name not in ['Rn', 'G', 'H', 'LE']],
    axis=1,
)
# station elevation (m) and latitude (decimal degrees) are given explicitly
# because there is no config file to read them from in this workflow
q = QaQc.from_dataframe(df, elev_m=611, lat_dec_deg=36.4267)
q.df.head()



Unnamed: 0_level_0,Rn,G,LE,H
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-01-01,,,67.1459,20.3876
2009-01-02,,,92.8616,32.6505
2009-01-03,,,75.8029,20.0569
2009-01-04,,,67.1459,20.3876
2009-01-05,,,92.8616,32.6505


## compare monthly energy balance correction ratio with raw data and corrected

In [22]:
# `q` was rebuilt from a plain dataframe in the previous code cell; the ratio
# columns plotted here are only added by the correction routine, so make sure
# it has been run before reading them
if not q.corrected:
    q.correct_data()

p = figure(
    title='Monthly energy balance correction ratio',
    x_axis_label='date',
    y_axis_label='energy balance correction ratio',
)
# `legend_label` is the current bokeh keyword for a fixed legend entry; the
# older `legend` keyword is deprecated and rejected by recent releases
p.line(q.monthly_df.index, q.monthly_df['ebc_reg'], color='red', legend_label="Raw", line_width=2)
p.line(q.monthly_df.index, q.monthly_df['ebc_adj'], legend_label="Corrected", line_width=2)
p.xaxis.formatter = DatetimeTickFormatter(days="%d-%b-%Y")
show(p)