## Import dependencies

In [None]:
import utils
import os
import data_classes
import netCDF4 as nc
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
def date_parse(date:str) -> str:
    """Build a first-of-month ``YYYY-MM-01`` string from a dotted name.

    The second dot-separated field of *date* is used: its first character
    is skipped, the next four characters are taken as the year and the two
    after that as the month. The day component is always ``01``.

    Raises ``IndexError`` when *date* contains no ``"."``.
    """
    stamp = date.split(".")[1]
    return f"{stamp[1:5]}-{stamp[5:7]}-01"

def get_date_range(date_start:str, date_end:str, parse:bool=True):
    """Return the month-start timestamps between two endpoints.

    When *parse* is true, both endpoints are first converted with
    ``date_parse``; otherwise they are handed to pandas unchanged.
    The result is a ``pd.date_range`` with ``freq='MS'``.
    """
    if not parse:
        return pd.date_range(start=date_start, end=date_end, freq='MS')

    first_month = date_parse(date_start)
    last_month = date_parse(date_end)
    return pd.date_range(start=first_month, end=last_month, freq='MS')

In [None]:
# Convert the PDSI netCDF data into one (time x cell) DataFrame per variable
# and pickle each DataFrame under target_data_root.
pdsi = data_classes.PDSIData()
variables = pdsi.variables
files = pdsi.get_file_names()

source_data_root = r'C:\Users\saulg\Desktop\Remote_Data\pdsi'
target_data_root = r'C:\Users\saulg\Desktop\Remote_Data\pdsi_tabular'
# Month-start timestamps used as the row index; parse=False because these are
# plain date strings rather than file names.
dates = get_date_range(date_start='01/01/1850', date_end='12/31/2020', parse=False)

cell_names = [f'Cell_{i}' for i in range(pdsi.n_cells)]
data_collection = {variable: [] for variable in variables}

for i, variable in enumerate(variables):
    # step 1: open each dataset, read the variable and flip it along axis 1 so
    # that the orientation is correct. The context manager closes the file as
    # soon as the values are in memory, so handles are not leaked.
    for file in tqdm(files):
        with nc.Dataset(os.path.join(source_data_root, file)) as dataset:
            array = np.flip(dataset.variables[variable][:].data, axis=1)
        data_collection[variable].append(array)

    # step 2: drop the leading per-file axis. np.squeeze(axis=0) raises
    # ValueError unless exactly one array was collected, i.e. one source file.
    variable_array = np.squeeze(np.array(data_collection[variable]), axis=0)

    # step 3: flatten array by rows, then reshape it based on time (rows) and cells (columns)
    flat_variable_array = variable_array.reshape(-1, pdsi.n_cells)

    # step 4: create a dataframe with the flattened array; pandas raises if the
    # number of rows/columns does not match len(dates)/len(cell_names)
    variable_dataframe = pd.DataFrame(
        flat_variable_array,
        index=dates,  # one timestamp per row
        columns=cell_names,  # one label per grid cell
    )

    print(f"Created {variable} number {i + 1} / {len(variables)}")
    utils.save_pickle(data=variable_dataframe, file_name=f"{variable}.pickle", path=target_data_root)
    # Drop references to this variable's data before moving on to the next one.
    del variable_array
    del variable_dataframe
    del data_collection[variable]

In [None]:
# Convert the GLDAS netCDF files into one (time x cell) DataFrame per variable
# and pickle each DataFrame under target_data_root.
gldas = data_classes.GLDASData()
variables = gldas.variables
files = gldas.get_file_names()

source_data_root = r'C:\Users\saulg\Desktop\Remote_Data\GLDAS'
target_data_root = r'C:\Users\saulg\Desktop\Remote_Data\gldas_tabular'
# Month-start timestamps spanning the date stamps parsed from the first and
# last file names.
dates = get_date_range(files[0], files[-1])

cell_names = [f'Cell_{i}' for i in range(gldas.n_cells)]
data_collection = {variable: [] for variable in variables}

for i, variable in enumerate(variables):
    # step 1: open each dataset, drop the length-1 leading axis of the variable
    # and flip it along axis 0 so that the orientation is correct. The context
    # manager closes every file once its values are in memory, so handles do
    # not accumulate over the file list.
    for file in tqdm(files):
        with nc.Dataset(os.path.join(source_data_root, file)) as dataset:
            array = np.flip(np.squeeze(dataset.variables[variable][:], axis=0).data, axis=0)
        data_collection[variable].append(array)

    # step 2: stack the per-file arrays along a new leading axis (one entry per file)
    variable_array = np.array(data_collection[variable])

    # step 3: flatten array by rows, then reshape it based on time (rows) and cells (columns)
    flat_variable_array = variable_array.reshape(-1, gldas.n_cells)

    # step 4: create a dataframe with the flattened array
    # NOTE(review): rows follow the order of `files`, while `dates` is a
    # continuous monthly range truncated to len(files). If a monthly file is
    # missing, later rows would be labelled with the wrong month - confirm the
    # file list is gap-free and sorted.
    variable_dataframe = pd.DataFrame(
        flat_variable_array,
        index=dates[:len(files)],
        columns=cell_names,
    )

    print(f"Created {variable} number {i + 1} / {len(variables)}")
    utils.save_pickle(data=variable_dataframe, file_name=f"{variable}.pickle", path=target_data_root)
    # Drop references to this variable's data before moving on to the next one.
    del variable_array
    del variable_dataframe
    del data_collection[variable]