# Step 1: Preprocess PDSI and GLDAS data into tabular format

In [None]:
import utils
import os
import data_classes
import netCDF4 as nc
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# Convert each PDSI variable stored in the NetCDF source files into a
# DataFrame indexed by `dates` with one column per grid cell, then pickle one
# file per variable into `target_data_root`.
pdsi = data_classes.PDSIData()
variables = pdsi.variables
files = pdsi.get_file_names()

# NOTE(review): machine-specific absolute paths -- adjust when running elsewhere.
source_data_root = r'C:\Users\saulg\Desktop\Remote_Data\pdsi'
target_data_root = r'C:\Users\saulg\Desktop\Remote_Data\pdsi_tabular'
dates = utils.get_date_range(date_start='01/01/1850', date_end='12/31/2020', parse=False)

cell_names = ['Cell_' + str(i) for i in range(pdsi.n_cells)]
data_collection = {variable: [] for variable in variables}

for i, variable in enumerate(variables):
    # step 1: open each dataset, read the variable fully into memory and flip
    # it along axis 1 so that the orientation is correct. The `with` block
    # closes the NetCDF handle as soon as the values have been copied out;
    # iterating `files` directly lets tqdm report the total when it is sized.
    for file in tqdm(files):
        with nc.Dataset(os.path.join(source_data_root, file)) as dataset:
            array = np.flip(dataset.variables[variable][:].data, axis=1)
        data_collection[variable].append(array)

    # step 2: stack the per-file arrays and drop the leading (per-file) axis.
    # np.squeeze(axis=0) raises ValueError unless that axis has length 1, so
    # this step only works when `files` holds exactly one file.
    variable_array = np.squeeze(np.array(data_collection[variable]), axis=0)

    # step 3: flatten every remaining axis after the first into `n_cells`
    # columns (row-major order), leaving one row per entry of the first axis
    # -- presumably time; confirm against the source files.
    flat_variable_array = variable_array.reshape(-1, pdsi.n_cells)

    # step 4: create a dataframe with the flattened array
    variable_dataframe = pd.DataFrame(
        flat_variable_array,
        index=dates,  # one date label per row; length must match the row count
        columns=cell_names,  # one 'Cell_<n>' label per column
    )

    print(f"Created {variable} number {i + 1} / {len(variables)}")
    utils.save_pickle(data=variable_dataframe, file_name=f"{variable}.pickle", path=target_data_root)

    # Drop references to the large intermediates before the next variable is
    # loaded to keep peak memory down.
    del variable_array
    del variable_dataframe
    del data_collection[variable]

In [None]:
# Convert each GLDAS variable stored across the NetCDF source files into a
# DataFrame with one row per file and one column per grid cell, then pickle
# one file per variable into `target_data_root`.
gldas = data_classes.GLDASData()
variables = gldas.variables
files = gldas.get_file_names()

# NOTE(review): machine-specific absolute paths -- adjust when running elsewhere.
source_data_root = r'C:\Users\saulg\Desktop\Remote_Data\GLDAS'
target_data_root = r'C:\Users\saulg\Desktop\Remote_Data\gldas_tabular'
# The date range is derived by utils.get_date_range from the first and last
# file names.
dates = utils.get_date_range(files[0], files[-1])

cell_names = ['Cell_' + str(i) for i in range(gldas.n_cells)]
data_collection = {variable: [] for variable in variables}

for i, variable in enumerate(variables):
    # step 1: open each dataset, drop the length-1 leading axis of the
    # variable, and flip the result along axis 0 so that the orientation is
    # correct. The `with` block closes every NetCDF handle right after its
    # values are read; without it one handle per file per variable stays open.
    # Iterating `files` directly lets tqdm report the total when it is sized.
    for file in tqdm(files):
        with nc.Dataset(os.path.join(source_data_root, file)) as dataset:
            array = np.flip(np.squeeze(dataset.variables[variable][:], axis=0).data, axis=0)
        data_collection[variable].append(array)

    # step 2: stack the per-file arrays along a new leading axis (one entry
    # per file)
    variable_array = np.array(data_collection[variable])

    # step 3: flatten each file's grid (row-major order) into `n_cells`
    # columns, giving one row per file when every grid holds n_cells values
    flat_variable_array = variable_array.reshape(-1, gldas.n_cells)

    # step 4: create a dataframe with the flattened array; only the first
    # len(files) dates are used as row labels
    variable_dataframe = pd.DataFrame(
        flat_variable_array,
        index=dates[0:len(files)],
        columns=cell_names,
    )

    print(f"Created {variable} number {i + 1} / {len(variables)}")
    utils.save_pickle(data=variable_dataframe, file_name=f"{variable}.pickle", path=target_data_root)

    # Drop references to the large intermediates before the next variable is
    # loaded to keep peak memory down.
    del variable_array
    del variable_dataframe
    del data_collection[variable]

# Step 2: Transform PDSI, GLDAS, and well observations into a format for ML

In [None]:
import utils
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the shapefile at `path_shape` (file name: Beryl_Enterprise.shp) via
# utils.load_shapefile. The resulting `aquifer_shape` is passed to
# utils.pull_relevant_data in the PDSI and GLDAS cells of this step.
# NOTE(review): machine-specific absolute path -- adjust when running elsewhere.
path_shape = '/home/saul/workspace/Well_Imputation/Aquifer Shapes/Beryl_Enterprise.shp'
aquifer_shape = utils.load_shapefile(path_shape)

In [None]:
# Hand the aquifer shape and the PDSI directory to utils.pull_relevant_data
# (dataset_name="PDSI"), then pickle whatever it returns as
# Datasets/pdsi_data.pickle.
# NOTE(review): `directory_pdsi` looks like the WSL path of the tabular PDSI
# output folder -- confirm it matches where the preprocessing step wrote to.
directory_pdsi = r"/mnt/c/Users/saulg/Desktop/Remote_Data/pdsi_tabular"
pdsi = utils.pull_relevant_data(
    aquifer_shape,
    dataset_name="PDSI",
    dataset_directory=directory_pdsi,
)
utils.save_pickle(data=pdsi, file_name="pdsi_data.pickle", path="Datasets/")

In [None]:
# Hand the aquifer shape and the GLDAS directory to utils.pull_relevant_data
# (dataset_name="GLDAS"), then pickle whatever it returns as
# Datasets/gldas_data.pickle.
# NOTE(review): `directory_gldas` looks like the WSL path of the tabular GLDAS
# output folder -- confirm it matches where the preprocessing step wrote to.
directory_gldas = r"/mnt/c/Users/saulg/Desktop/Remote_Data/gldas_tabular"
gldas = utils.pull_relevant_data(
    aquifer_shape,
    dataset_name="GLDAS",
    dataset_directory=directory_gldas,
)
utils.save_pickle(data=gldas, file_name="gldas_data.pickle", path="Datasets/")

In [None]:
# Read the well location and well time-series CSV files, combine them with
# utils.transform_well_data, and pickle the result as
# ./Datasets/BerylEnterpriseData.pickle.
well_locations = pd.read_csv("Aquifers Data/EscalanteBerylLocation.csv")
well_timeseries = pd.read_csv("Aquifers Data/EscalanteBerylTimeseries.csv")

# transform_well_data receives the time series first, then the locations.
data = utils.transform_well_data(well_timeseries, well_locations)
utils.save_pickle(
    data=data,
    file_name="BerylEnterpriseData.pickle",
    path="./Datasets/",
)


In [None]:
# Visual sanity check: draw the "timeseries" entry of the transformed well
# data with matplotlib's dash-dot ('-.') line style and display the figure.
timeseries_to_plot = data["timeseries"]
plt.plot(timeseries_to_plot, '-.')
plt.show()

# Step 3: Develop initial imputation model

# Step 4: Develop iterative refinement model

# Step 5: Analyze spatial characteristics of imputation model

# Step 6: Calculate Storage Change