In [2]:
import numpy as np 
import pandas as pd
import os
from typing import Dict, List, Tuple


In [3]:
def _get_list_of_files(folder_path) -> Tuple[List, List]:
    folder: os.DirEntry = os.scandir(folder_path)          # Reads the directory
    for file in folder:                                         # Loop over files
        if file.name[-len(filetype):] == filetype:    # Checks for files of filetype
            folder_list_data.append(file.name)             # Adds filenames with correct filetype to list
        else:
            folder_list_supplementary.append(file.name)    # Add all other files to a supplementary list
    folder.close()                                              # Close the folder to prevent mishaps
    return (folder_list_data, folder_list_supplementary)

In [4]:
def _read_file_unparsed(path) -> pd.DataFrame:
        df_input = pd.read_csv(path, delim_whitespace=True, parse_dates=[0], names=["date", "discharge"]) # Creates a dataframe with space as delim
        return df_input

In [5]:
def _slice_dataframe(df_input) -> pd.DataFrame:
        df_input["day"] = df_input["date"].dt.day
        df_input["month"] = df_input["date"].dt.month
        df_input["year"] = df_input["date"].dt.year
        indexes = list(zip(df_input["year"], df_input["month"], df_input["day"]))
        index = pd.MultiIndex.from_tuples(indexes, names=["year", "month", "day"])
        df_output: pd.DataFrame = pd.DataFrame(df_input["discharge"], copy=True)
        df_output.set_index(index, inplace=True)

        hyearstart = (df_output.iloc[0].name[0], 9, 1)             # Start of the hydrological year in Norway
        hyearend = (df_output.iloc[-1].name[0], 8, 31)             # End of the hydrological year in Norway
        df_output = df_output.loc[hyearstart:hyearend]        # Slice the df to fit within the hydrological years
        
        # This trims the head and tail of data sets to make them span a hydrological year, without having nan-values at start and end
        
        parse = True
        while parse: 
            start = df_output.iloc[0].name[0]
            slutt = df_output.iloc[-1].name[0]
            slice_head = df_output.loc[(start, 9, 1)].values == -9999
            slice_tail = df_output.loc[(slutt, 8, 31)].values == -9999
            
            if slice_head:
                hyearstart = (start + 1, 9, 1)
            if slice_tail:
                hyearend = (slutt - 1, 8, 31) 
            df_output = df_output.loc[hyearstart:hyearend]
            end_loop = df_output.loc[(start + 1, 9, 1)].values != -9999
            if end_loop:
                parse = False
            
        return df_output

In [6]:
def _check_nan(nan_ident, df_output) -> np.ndarray:
    # If it finds nan values in the data set, it returns the location as a tuple of the nan values
    # Potential issues is long periods of NaN values
    loc_nan: np.ndarray                                                         # This seems not needed again
    nan_values = df_output["discharge"] == nan_ident                  # Creates a series
    nr_nan_values = nan_values.value_counts()                                   # Counts values in a truth array
    if True in nr_nan_values.index:                                            # Check if any value is true
        loc_nan = nan_values[nan_values == True].index.values                   # Creates a list of tuples of NaN locations
        return loc_nan                                                          # Returns above list
    return 0                                                                    # Returns zero if no nan values


In [7]:
def main():
    """Process every data file in ``folder_path`` and report missing values.

    Reads the module-level settings ``folder_path`` and ``nan_ident``. For each
    data file returned by ``_get_list_of_files`` the file is read, trimmed with
    ``_slice_dataframe`` and scanned with ``_check_nan``. The file name and
    trimmed frame are printed as each file is processed, followed by one
    summary line per file with the result of ``_check_nan``.
    """
    data_files, _supp_files = _get_list_of_files(folder_path)

    files = []
    nan_loc = []
    for file in data_files:
        # os.path.join works whether or not folder_path ends with a separator.
        df = _read_file_unparsed(os.path.join(folder_path, file))
        df = _slice_dataframe(df)
        nan_loc.append(_check_nan(nan_ident, df))
        files.append(file)
        print(file, df)

    for x, y in zip(files, nan_loc):
        print(x, y)
    
    

In [8]:
# Run configuration. The functions defined in the cells above read these names
# as module-level globals, so they must be assigned before main() is called.

# Directory scanned by main(); main() also prefixes it to each file name.
folder_path = "./data/discharge_data_100/"
# Filename suffix that _get_list_of_files treats as a data file.
filetype = ".q"
# Module-level lists that _get_list_of_files appends to and returns.
folder_list_data = []
folder_list_supplementary = []
# Bare annotations only: no value is assigned to either name here.
df_input: pd.DataFrame
df_output: pd.DataFrame
# Sentinel that main() passes to _check_nan to flag missing observations.
nan_ident = -9999
main()

2.634.q                 discharge
year month day           
1991 9     1     0.083963
           2     0.083963
           3     0.083963
           4     0.083963
           5     0.070818
...                   ...
2020 8     27    0.178247
           28    0.127344
           29    0.121626
           30    0.111551
           31    0.100905

[10593 rows x 1 columns]
12.209.q                 discharge
year month day           
1984 9     1     2.069690
           2     1.958015
           3     1.903579
           4     2.303943
           5     2.013355
...                   ...
2020 8     27    4.405151
           28    4.221302
           29    4.607733
           30    4.164824
           31    3.837400

[13149 rows x 1 columns]
105.1.q                 discharge
year month day           
1923 9     1     3.686123
           2     3.838511
           3     4.158132
           4     4.498317
           5     4.325597
...                   ...
2020 8     27    9.179423
           28

2.463.q                 discharge
year month day           
1986 9     1     4.769576
           2     3.644458
           3     3.036317
           4     2.165801
           5     2.053843
...                   ...
2020 8     27    1.160909
           28    1.329244
           29    1.288970
           30    1.173427
           31    1.085466

[12419 rows x 1 columns]
2.32.q                 discharge
year month day           
1917 9     1    24.859102
           2    24.859102
           3    21.771330
           4    21.347218
           5    19.290291
...                   ...
2020 8     27   10.800002
           28   10.510736
           29   10.186209
           30   10.074120
           31   10.000978

[37621 rows x 1 columns]
223.2.q                 discharge
year month day           
1923 9     1    14.287393
           2    14.287393
           3    13.537949
           4    13.173653
           5    12.816208
...                   ...
2020 8     27   18.200975
           28  

26.20.q                 discharge
year month day           
1970 9     1     4.828401
           2     8.938517
           3    14.464278
           4    12.632706
           5     9.936237
...                   ...
2020 8     27    3.533714
           28    2.792996
           29    2.261498
           30    1.847548
           31    1.540013

[18263 rows x 1 columns]
2.268.q                  discharge
year month day            
1934 9     1    204.844864
           2    211.811157
           3    165.450668
           4    124.714813
           5     89.633781
...                    ...
2020 8     27    23.328819
           28    21.097692
           29    19.974604
           30    19.860235
           31    16.932688

[31412 rows x 1 columns]
18.10.q                 discharge
year month day           
1981 9     1     0.190401
           2     0.164325
           3     0.152284
           4     0.164325
           5     0.190401
...                   ...
2020 8     27    1.016738
 

12.70.q                 discharge
year month day           
1919 9     1     8.162271
           2     6.559898
           3     6.233891
           4     5.764703
           5     5.318841
...                   ...
2020 8     27    2.655645
           28    2.898075
           29    2.661204
           30    2.550519
           31    2.463237

[36891 rows x 1 columns]
230.1.q                 discharge
year month day           
1961 9     1     0.497132
           2     0.497132
           3     0.497132
           4     0.770185
           5     1.135817
...                   ...
2020 8     27    0.284354
           28    0.265331
           29    0.239424
           30    0.222016
           31    0.208085

[21550 rows x 1 columns]
25.24.q                 discharge
year month day           
1971 9     1     1.123540
           2     1.249116
           3     1.249116
           4     1.249116
           5     1.382203
...                   ...
2020 8     27    6.270470
           28 

138.1.q                 discharge
year month day           
1917 9     1     3.803447
           2     2.048162
           3     3.333289
           4     8.381592
           5     8.661993
...                   ...
2020 8     27   16.915182
           28   11.407338
           29    7.986937
           30    5.926696
           31    4.670636

[37621 rows x 1 columns]
22.4.q                 discharge
year month day           
1896 9     1    19.224262
           2    20.088886
           3    23.166435
           4    25.853607
           5    27.275265
...                   ...
2020 8     27   36.005836
           28   32.610046
           29   29.182953
           30   30.295301
           31   41.659996

[45290 rows x 1 columns]
168.2.q                 discharge
year month day           
1985 9     1     1.892823
           2     1.282090
           3     1.033577
           4     0.869942
           5     1.420075
...                   ...
2020 8     27    1.124278
           28  

In [121]:
# NOTE(review): leftover scratch cell. `b` is not defined anywhere in the
# visible notebook, so this line raises NameError when the notebook is run
# from a fresh kernel. Define `b` in an earlier cell or delete this cell.
b.A.loc[0:8].value_counts().loc[-9999]


NameError: name 'b' is not defined