In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime

import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr
from matplotlib import cm
from matplotlib import pyplot as plt
from Turbulence_processing.Scaling import *

## open data

In [4]:
# Directory holding the MetCrax NetCDF files (relative to this notebook)
path = "../../../../Dataset/MetCrax/"

# Open the full and the low-frequency 30-min datasets from that directory
ds, ds_low = (
    xr.open_dataset(path + file_name)
    for file_name in ("MetCrax_full_30min.nc", "MetCrax_lowfreq_30min.nc")
)

In [5]:
# Boundary-layer height: open the BLH file and keep only its `blh` variable
BLH = xr.open_dataset(path + "Analysis/MetCrax_BLH_ERA5.nc").blh

## select daytime

In [7]:
def select_times(ds, day_hours=(15, 23), night_hours=(1, 14)):
    """Build boolean night and day masks from the hour of ``ds.time``.

    Hours are compared as stored in ``ds.time`` (UTC for this dataset), so the
    windows must be given in that same clock.

    Parameters
    ----------
    ds : object exposing ``ds.time.dt.hour``
        Typically an xarray Dataset/DataArray with a datetime ``time``
        coordinate.
    day_hours : tuple of int, optional
        Inclusive ``(first_hour, last_hour)`` window counted as daytime.
        Defaults to ``(15, 23)``.
    night_hours : tuple of int, optional
        Inclusive ``(first_hour, last_hour)`` window counted as nighttime.
        Defaults to ``(1, 14)``.

    Returns
    -------
    night, day
        Two boolean masks with the same shape as ``ds.time`` -- note the
        order: night first, day second.

    Notes
    -----
    With the default windows, hour 0 belongs to neither mask.
    A window whose first hour is larger than its last hour (e.g. ``(22, 5)``)
    is interpreted as wrapping past midnight.
    """
    hour = ds.time.dt.hour

    def _in_window(window):
        # Inclusive membership test; supports windows that wrap past midnight.
        first, last = window
        if first <= last:
            return (hour >= first) & (hour <= last)
        return (hour >= first) | (hour <= last)

    day = _in_window(day_hours)
    night = _in_window(night_hours)
    return night, day

In [8]:
# Keep only the daytime samples (second mask returned by select_times) in
# every dataset; time steps outside the mask are dropped.
ds, ds_low, BLH = (
    data.where(select_times(data)[1], drop=True) for data in (ds, ds_low, BLH)
)

## add scaled variables

In [10]:
# Both helpers come from the star import of Turbulence_processing.Scaling and
# each returns the dataset that is rebound to `ds`.
# NOTE(review): presumably this adds the anisotropy variables (e.g. `xb`, `yb`)
# used further down -- confirm against the helper's implementation.
ds = Gradients_Anisotropy.Anisotropy_calculation(ds, RGB=False)

# NOTE(review): presumably this adds the vertical gradients (`gradU`, `gradT`)
# using the low-frequency dataset; the unit of roughness_length is not visible
# here -- confirm.
ds = Gradients_Anisotropy.Gradients_calculation_splines_1ds(
    ds, ds_low, roughness_length=0.02
)

In [11]:
# Add the boundary-layer height and the TKE / stress budget terms to `ds`.
# NOTE(review): shear production is conventionally written -uw * dU/dz; the
# sign below is kept exactly as in the original notebook -- confirm intent.
ds = ds.assign(
    # boundary-layer height
    BLH=BLH,
    # transport term: derivative of wtke along the "heights" coordinate
    dz_wtke=ds["wtke"].differentiate(coord="heights"),
    # production terms (shear and buoyancy)
    tkeprod_shear=ds["uw"] * ds["gradU"],
    tkeprod_buoy=9.81 / ds["meanT"] * ds["wT"],
    # turbulent diffusion: derivative along "heights" of each third-order moment
    **{
        f"dz_{moment}": ds[moment].differentiate(coord="heights")
        for moment in ("uuw", "uvw", "uww", "vvw", "vww", "www")
    },
)

In [12]:
# Remove the variables that are not exported as features.
# `Dataset.drop` is kept by xarray only for backward compatibility and warns
# on use; `drop_vars` is the documented replacement and behaves the same for a
# list of variable names (it raises if a name is missing).
ds = ds.drop_vars(
    [
        "StatU",
        "StatUW",
        "StatWT",
        "uu",
        "vv",
        "ww",
        "uv",
        "uw",
        "vw",
        "xb",
        "LLJ",
        "uuu",
        "vvv",
        "www",
        "TTT",
        "uuuu",
        "vvvv",
        "wwww",
        "TTTT",
        "epsV",
        "epsW",
        "epsT",
        "epsUsf",
        "epsVsf",
        "epsWsf",
        "slopeHU",
        "slopeHV",
        "slopeHW",
        "slopeHT",
        "slopeHUsf",
        "uwT",
        "vwT",
        "wwT",
        "uTT",
        "vTT",
        "wTT",
        "slopeHVsf",
        "slopeHWsf",
        "Ri",
        "Rif",
        "uuv",
        "uvv",
        "uuw",
        "uww",
        "uvw",
        "vvu",
        "vvw",
        "vww",
        "utke",
        "vtke",
        "wtke",
        "cutoff",
        "cutoffW",
        "StatV",
        "StatW",
        "StatT",
        "StatVW",
        "StatUT",
        "StatVT",
    ]
)

In [22]:
# Display the names of the data variables currently held in `ds`
list(ds.data_vars)

['meanT',
 'meanU',
 'dir',
 'TT',
 'uT',
 'vT',
 'wT',
 'ustar',
 'tke',
 'slopeLW',
 'slopeLU',
 'epsU',
 'slopeLV',
 'slopeLT',
 'intlenU',
 'intlenV',
 'intlenW',
 'yb',
 'gradU',
 'gradT',
 'BLH',
 'dz_wtke',
 'tkeprod_shear',
 'tkeprod_buoy',
 'dz_uuw',
 'dz_uvw',
 'dz_uww',
 'dz_vvw',
 'dz_vww',
 'dz_www']

### merge and save

In [24]:
# Write `ds` to a NetCDF file in the current working directory
ds.to_netcdf("Metcrax_forest_data.nc")

## pass to pandas and select groups
Selected as test days: 8 May (B1, middle clouds), 23 May (S3, clear sky) and 11 June (W3, clear sky).

In [27]:
# Open the NetCDF file from the current working directory, rebinding `ds`
ds = xr.open_dataset("Metcrax_forest_data.nc")

In [29]:
# Flatten (time, heights) into a single "index" dimension ...
ds = ds.stack(index=("time", "heights"))
# ... reset the stacked index so "index" is no longer a MultiIndex ...
ds = ds.reset_index("index")
# ... and discard every sample that has a NaN in any variable.
ds = ds.dropna("index", how="any")

In [30]:
# Hour at which a new day-group begins (same clock as ds.time)
day_start = 14

# Group label = number of whole days elapsed since `day_start` o'clock on the
# date of the first sample (0-based).
dates = pd.to_datetime(ds.time)
first_date = dates[0]
groups = (
    dates
    - pd.to_datetime(
        datetime(first_date.year, first_date.month, first_date.day, day_start)
    )
).days

# The label stored in the dataset is shifted by one (1-based) ...
ds = ds.assign(groups=(["index"], groups + 1))

# ... whereas the test groups below are matched against the 0-based `groups`.
test_groups = [7, 22]

# Split: samples belonging to a test group go to ds_test, the rest to ds_train
is_test = np.isin(groups, test_groups)
train_bool = ~is_test
groups_train = groups[train_bool]
ds_train = ds.where(train_bool).dropna(dim="index")
ds_test = ds.where(is_test).dropna(dim="index")

In [32]:
# Pandas
# Feature columns: every data variable except the target "yb".
# NOTE: `vars` shadows the Python builtin of the same name; the name is kept
# unchanged here in case other cells refer to it.
vars = list(ds.data_vars)
vars.remove("yb")

# Convert each split to a DataFrame once and reuse it for features and target
# (the original converted every dataset twice).
df_train = ds_train.to_dataframe()
df_test = ds_test.to_dataframe()

X_train = df_train[vars]
y_train = df_train["yb"]
y_test = df_test["yb"]

# "groups" is kept in X_train only; it is dropped from the test features
X_test = df_test[vars].drop(columns="groups")

In [34]:
# Write the four train/test tables to CSV files in the working directory
for csv_name, table in (
    ("xtrain.csv", X_train),
    ("ytrain.csv", y_train),
    ("xtest.csv", X_test),
    ("ytest.csv", y_test),
):
    table.to_csv(csv_name)

In [36]:
# X_train = pd.read_csv("./xtrain.csv", index_col=0)
# y_train = pd.read_csv("./ytrain.csv", index_col=0)
# X_test = pd.read_csv("./xtest.csv", index_col=0)
# y_test = pd.read_csv("./ytest.csv", index_col=0)
# groups = X_train['groups']
# X_train = X_train.drop(columns='groups')