#### This file is used to obtain the training, validation & test data (only relevant to models that have been run on FullSWOF-2D)

In [1]:
import pandas as pd
import re
import os
import glob
import json

#### Loading Data

In [2]:
# Name of the sub-folder of `data_path` to process; the output CSV is named after it.
split = 'train'  ######## change this to test & val ########
data_path = '/Users/ragini/Desktop/Thesis/FullSWOF_2D-1.09.01_win_2/Examples/Model_6_extended_flow/'
split_dir = os.path.join(data_path, split)

# The parameter file is the first *.txt file found directly inside the split
# folder. Fail with an explicit message instead of an opaque IndexError when
# it is missing.
param_candidates = glob.glob(os.path.join(split_dir, '*.txt'))     ######## check for param file in the folder ######## location
if not param_candidates:
    raise FileNotFoundError(
        'No parameter (*.txt) file found in {}'.format(split_dir))
params_path = param_candidates[0]

# Every directory inside the split folder is treated as one example. Plain
# files (the parameter file, stray notes, ...) are skipped so they are never
# turned into a bogus ".../Outputs/huz_evolution.dat" path.
examples_path = [path for path in glob.glob(os.path.join(split_dir, '*'))
                 if os.path.isdir(path)]
evol_paths = [os.path.join(example_dir, 'Outputs/huz_evolution.dat')
              for example_dir in examples_path]

# The parameter file holds JSON even though its extension is .txt; it is later
# indexed by example directory name.
with open(params_path, 'r') as f:
    params = json.load(f)

#### DataFrame Parser

In [3]:
# First number on a "# time" line; the optional exponent keeps values such as
# "1.5e-05" in one piece instead of splitting them into two matches.
_TIME_VALUE_RE = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")

# Layout of one parsed row: the time stamp followed by the 12 tab-separated
# fields of a data line.
_EVOL_COLUMNS = ['time', 'i', '(j-0.5)*dy', 'h', 'u', 'v[i][j]',
                 'h[i][j]+z[i][j]', 'z[i][j]', 'norm_U[i][j]',
                 'Fr[i][j] (Froude)', 'qx[i][j]', 'qy[i][j]', 'q']


def parse_evolution_file_to_datafram(evol_path, params, identifier):
    """Parse one ``huz_evolution.dat`` file into a tidy DataFrame.

    The first five lines of the file are skipped. After that, every line
    starting with ``# time`` sets the time stamp for the data rows that
    follow it, blank lines are ignored, and every other line is split on
    tabs into the 12 fields listed in ``_EVOL_COLUMNS`` (after ``time``).

    Args:
        evol_path: Path of the form ``<example_dir>/Outputs/huz_evolution.dat``.
            The name of ``<example_dir>`` is used as the key into ``params``.
        params: Mapping from example directory name to a mapping that
            contains at least ``'slope'`` and ``'friction_coeff'``.
        identifier: Value stored in the ``identifier`` column of every row,
            used to tell apart the rows of different files later on.

    Returns:
        A DataFrame sorted by time with the columns ``time``, ``h``, ``u``,
        ``q`` (floats), ``x`` (the ``i`` field as int), ``slope``,
        ``friction_coeff`` and ``identifier``.

    Raises:
        KeyError: If the example directory name is not a key of ``params``
            or the parameters lack ``slope`` / ``friction_coeff``.
        ValueError: If a ``# time`` line holds no number, if a data row
            appears before the first ``# time`` line, or if a data row does
            not have the expected number of fields / numeric values.
    """
    # <example_dir>/Outputs/huz_evolution.dat -> <example_dir>; os.path is
    # used so the lookup does not depend on '/' being the only separator.
    dir_name = os.path.basename(os.path.dirname(os.path.dirname(evol_path)))
    exp_parameters = params[dir_name]
    slope = exp_parameters['slope']
    friction_coeff = exp_parameters['friction_coeff']

    rows = []
    current_time = None
    # The context manager closes the file even if parsing raises.
    with open(evol_path) as fp:
        for idx, line in enumerate(fp):
            # Skipping first 5 lines
            if idx < 5:
                continue
            # Skipping empty (or whitespace-only) lines
            if not line.strip():
                continue
            # A "# time" line carries the time stamp of the following rows
            if line.startswith('# time'):
                match = _TIME_VALUE_RE.search(line)
                if match is None:
                    raise ValueError(
                        'No time value found on line {} of {}'.format(
                            idx + 1, evol_path))
                current_time = match.group()
                continue
            if current_time is None:
                raise ValueError(
                    'Data row on line {} of {} appears before any '
                    '"# time" line'.format(idx + 1, evol_path))
            # Data row: time first, then the tab-separated fields
            rows.append([current_time]
                        + re.split(r'\t\s*', line.rstrip('\n')))

    evol_df = pd.DataFrame(rows, columns=_EVOL_COLUMNS)

    # Keep only the columns that are needed. The 'Fr[i][j] (Froude)' column is
    # dropped as well but can be retained here for debugging. All grid points
    # are kept; no filtering on 'i' is applied.
    evol_df = evol_df[['time', 'h', 'u', 'q', 'i']].rename(columns={'i': 'x'})
    evol_df = evol_df.astype(
        {'time': float, 'h': float, 'u': float, 'q': float, 'x': int})

    # Stable sort: rows that share a time stamp keep their order from the file.
    evol_df = evol_df.sort_values('time', kind='mergesort')
    evol_df['slope'] = slope
    evol_df['friction_coeff'] = friction_coeff
    evol_df['identifier'] = identifier  # To identify different hydrographs later
    return evol_df

#### Dataset Generation

In [4]:
# Parse every evolution file once and collect the resulting frames; they are
# concatenated in a single call below instead of growing a DataFrame inside
# the loop, which would re-copy all accumulated rows on every iteration.
# NOTE(review): identifiers start at 8 rather than 0 -- the offset is
# hard-coded in the original; confirm it is still the intended starting value.
frames = [
    parse_evolution_file_to_datafram(evol_path, params, identifier)
    for identifier, evol_path in enumerate(evol_paths, start=8)
]

# Concatenating dataframes with a fresh 0..n-1 index. pd.concat rejects an
# empty list, so fall back to an empty DataFrame when there are no examples.
if frames:
    all_evol_hydrographs = pd.concat(frames, ignore_index=True)
else:
    all_evol_hydrographs = pd.DataFrame()

#### Saving File as CSV

In [5]:
# Write the combined dataset to <data_path>/<split>.csv without the index
# column. os.path.join keeps the target inside data_path even when data_path
# has no trailing separator.
all_evol_hydrographs.to_csv(os.path.join(data_path, split + '.csv'), index=False)