In [126]:
import numpy as np 
import pandas as pd 
import os 
from typing import Dict, List, Tuple
import shutil

In [127]:
def _get_list_of_files(folder_path) -> Tuple[List, List]:
    folder: os.DirEntry = os.scandir(folder_path)          # Reads the directory
    for file in folder:                                         # Loop over files
        if file.name[-len(filetype):] == filetype:    # Checks for files of filetype
            folder_list_data.append(file.name)             # Adds filenames with correct filetype to list
        else:
            folder_list_supplementary.append(file.name)    # Add all other files to a supplementary list
    folder.close()                                              # Close the folder to prevent mishaps
    return (folder_list_data, folder_list_supplementary)

In [128]:
def sample_data(sample):
    """Read one discharge file and trim it to whole hydrological years.

    The file at ``sample`` is parsed as header-less, whitespace-separated
    ``date discharge`` rows. The result is indexed by ``(year, month, day)``
    and cut so that it starts on 1 September and ends on 31 August. The start
    is moved forward and the end moved backward one year at a time for as long
    as the boundary day is absent or holds the missing-value marker ``-9999``.

    Parameters
    ----------
    sample : path of the file to read (anything ``pd.read_csv`` accepts).

    Returns
    -------
    pd.DataFrame
        One ``discharge`` column with a ``(year, month, day)`` MultiIndex.
        Empty when no span with valid boundary days exists.
    """
    missing_value = -9999  # marker used in the data for "no observation"

    def _read_file_unparsed(path) -> pd.DataFrame:
        # sep=r"\s+" is the documented replacement for the deprecated
        # delim_whitespace=True.
        return pd.read_csv(path, sep=r"\s+", parse_dates=[0], names=["date", "discharge"])

    def _has_observation(df, key) -> bool:
        # True when the day `key` exists and none of its values is the marker.
        if key not in df.index:
            return False
        values = np.atleast_1d(df.loc[key, "discharge"])
        return bool((values != missing_value).all())

    def _slice_dataframe(df_input) -> pd.DataFrame:
        dates = df_input["date"]
        index = pd.MultiIndex.from_arrays(
            [dates.dt.year, dates.dt.month, dates.dt.day],
            names=["year", "month", "day"],
        )
        df_output = pd.DataFrame({"discharge": df_input["discharge"].to_numpy()}, index=index)
        # Label slicing on a MultiIndex requires a lexsorted index.
        df_output = df_output.sort_index()

        first_year = int(df_output.index[0][0])
        last_year = int(df_output.index[-1][0])

        # Start of the hydrological year in Norway is 1 September: advance the
        # start until that day carries a real observation.
        while first_year < last_year and not _has_observation(df_output, (first_year, 9, 1)):
            first_year += 1
        # End of the hydrological year is 31 August: pull the end back until
        # that day carries a real observation. Both loops are bounded by the
        # year range, so they always terminate.
        while last_year > first_year and not _has_observation(df_output, (last_year, 8, 31)):
            last_year -= 1

        if first_year >= last_year:
            # No 1 September .. 31 August span with valid boundary days is left.
            return df_output.iloc[0:0]
        return df_output.loc[(first_year, 9, 1):(last_year, 8, 31)]

    df = _read_file_unparsed(sample)
    df = _slice_dataframe(df)
    return df

In [145]:
def _create_metadata_df(file_path):
    df = pd.read_csv(file_path, delim_whitespace=True, header=0, index_col=[0,1])
    return df

In [161]:
def _write_to_file_unparsed(file, path, df, sample_data, supp_data=(-9999,) * 3):
    """Write one output file: a metadata header followed by the data as CSV.

    Parameters
    ----------
    file : name of the file; also passed to ``_create_string_metadata`` to
        look up the metadata row.
    path : directory the file is written into (with or without a trailing
        separator).
    df : metadata table handed to ``_create_string_metadata``.
    sample_data : DataFrame appended below the header, space-separated.
        NOTE: inside this function the name shadows the ``sample_data``
        function defined in this notebook.
    supp_data : three extra values appended to the header line; defaults to
        three ``-9999`` placeholders.
    """
    string_final = _create_string_metadata(file, df, supp_data)
    # os.path.join instead of `path + file`, so a directory given without a
    # trailing separator still yields a file inside that directory.
    path_final = os.path.join(path, file)
    with open(path_final, "w") as o:
        o.write("# " + string_final + "\n")
        o.write("#" * 50 + "\n")
    # Appended in a second step so pandas manages its own file handle.
    sample_data.to_csv(path_final, sep=" ", mode="a")
    

In [151]:
def _folder_management(folder_path):
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
        os.mkdir(folder_path)
    else:
        os.mkdir(folder_path)

In [148]:
def _create_string_metadata(file, df, data):
#     dict_of_string = {}
    reg, main = file[:-2].split(".") # ["1", "2"]
    meta_data = df.loc[(int(reg), int(main))]
    meta_data = meta_data.apply(str)
    index = meta_data.index
    values = meta_data.values
    temp_list = ["regno " + reg, "mainno " + main]
    for x,y in zip(index, values):
        temp_list.append(" ".join([x,y]))
    temp_list.append("AAAA " + str(data[0]))
    temp_list.append("BBBB " + str(data[1]))
    temp_list.append("CCCC " + str(data[2]))
#     dict_of_string[file] = " ".join(temp_list)
    return " ".join(temp_list)

In [149]:
def main():
    """Rewrite every data file of the input folder with a metadata header.

    Steps:
    1. List the input folder and split it into data files and other files.
    2. Parse the first data file once; that frame is reused as the body of
       every output file (a sample, to save time).
    3. Recreate the ``*_parsed`` output folder from scratch.
    4. Load the station metadata from the first non-data file and write one
       output file per data file.

    Raises
    ------
    FileNotFoundError
        If the input folder holds no data file or no metadata file. This is
        checked before the output folder is deleted.
    """
    folder_path = "./data/"
    data_folder = "discharge_data_100/"
    source_dir = folder_path + data_folder

    data, supp = _get_list_of_files(source_dir)
    # Validate before touching the output folder so that a missing input does
    # not cost the results of a previous run.
    if not data:
        raise FileNotFoundError("No discharge data files found in " + source_dir)
    if not supp:
        raise FileNotFoundError("No station metadata file found in " + source_dir)

    sample_df = sample_data(source_dir + data[0])  # A single dataframe used as a sample to save time

    # Trailing slash kept: the writer may build the file path by concatenation.
    mgn_path = folder_path + data_folder[:-1] + "_parsed/"
    _folder_management(mgn_path)

    supp_file_path = source_dir + supp[0]
    station_df = _create_metadata_df(supp_file_path)  # df of the .txt file with index (reg, main)
    for file in data:
        _write_to_file_unparsed(file, mgn_path, station_df, sample_df)


In [162]:
# Run the pipeline defined in main(); note that it recreates the output folder.
main()