# THIS IS A JUPYTER NOTEBOOK FOR TESTING DATA PROCESSING FUNCTIONS

In [30]:
import os
# Data directory that sits next to the current working directory's parent.
# os.path.join with a trailing empty component keeps the trailing separator
# (later code builds file names as `path + filename`) and avoids the invalid
# '\d' escape sequence and hard-coded Windows separator of a plain literal.
path = os.path.join(os.path.dirname(os.getcwd()), 'data', '')
print(path)
import numpy as np
import pandas as pd
import seaborn as sns
from os import listdir

import matplotlib.pyplot as plt

c:\Users\kubap\Documents\THESIS\gitrepo\data\


## Initial formatting and merging of multiple CSV files into one data file

In [28]:
def LoadData(path):
    """Merge all "Dataset" CSV files found in *path* and keep the converged runs.

    Every directory entry whose name contains "Dataset" is read with
    ``pd.read_csv`` and the frames are concatenated. NaN values are replaced
    by 0, exact duplicate rows are dropped (first occurrence kept), and only
    rows with ``criterion == 1`` and ``num_rays != 20000`` are retained. The
    columns 'runID', 'residual', 'runtime' and 'criterion' are removed and the
    index is reset to 0..n-1.

    Parameters
    ----------
    path : str
        Directory holding the CSV files (with or without a trailing
        separator).

    Returns
    -------
    pandas.DataFrame
        The merged, filtered data.

    Raises
    ------
    ValueError
        If no "Dataset" file is present in *path*.
    AssertionError
        If the files do not all share the same column list.
    """

    def CheckCol(lst):
        # True when every entry equals its neighbour, i.e. all are identical.
        return lst[1:] == lst[:-1]

    # Sorted so that the merge order (and hence which duplicate survives
    # keep='first') does not depend on the OS directory listing order.
    flist = sorted(f for f in listdir(path) if "Dataset" in f)
    if not flist:
        raise ValueError(f'No "Dataset" files found in {path!r}; nothing to merge.')

    col = []
    frames = []

    # First check that the columns of all files are consistent; the check runs
    # after each file so the merge stops at the first mismatching one.
    for file in flist:
        # os.path.join works whether or not `path` ends with a separator.
        raw_data = pd.read_csv(os.path.join(path, file), index_col=None, header=0)
        col.append(raw_data.columns.tolist())
        frames.append(raw_data)
        assert CheckCol(col), "Column names are inconsistent! I can't merge the files."

    mergedData = pd.concat(frames, ignore_index=True)
    # Fill NaN, then drop ALL fully duplicated rows (first occurrence kept).
    mergedData = mergedData.fillna(0).drop_duplicates(subset=None, keep='first')

    # NOTE(review): an earlier, unused selection of non-converged scenarios
    # tested num_rays == 15000, whereas the filter below excludes 20000.
    # Confirm which ray cap marks a non-converged run.
    converged = (mergedData['num_rays'] != 20000) & (mergedData['criterion'] == 1)

    # Keep converged scenarios only, drop the non-important columns and reset
    # the index (the original row positions are lost).
    convData = (
        mergedData[converged]
        .drop(columns=['runID', 'residual', 'runtime', 'criterion'])
        .reset_index(drop=True)
    )
    print(f' Files {flist} were merged and initally formatted.')
    return convData
# Load the "Dataset" CSV files found under `path` and merge them into one frame
data = LoadData(path)

 Files ['Dataset_GIGA.csv', 'Dataset_GIGA_2.csv', 'Dataset_GIGA_3.csv', 'Dataset_UnseenTest.csv'] were merged and initally formatted.


# Surface & Bottom duct features

In [34]:
def FeatDuct(data, Input_Only = True):
    """Merge the surface-duct (SD) / bottom-duct (BD) columns into shared features.

    Three columns are derived from 'duct_type':

    * 'duct_prop_type': 1 for 'SD' rows, -1 for 'BD' rows, 0 otherwise.
    * 'duct_width_if_sourceinduct': 'surface_duct_depth' for 'SD' rows,
      'bottom_duct_width' for 'BD' rows, 0 otherwise.
    * 'duct_SSP_if_sourceinduct': 'surface_duct_SSP' for 'SD' rows,
      'bottom_duct_SSP' for 'BD' rows, 0 otherwise.

    The nine original duct columns are then dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the duct columns listed above; when *Input_Only* is
        truthy it must also contain the channel/waveguide columns that are
        dropped below. The input frame is not modified.
    Input_Only : bool, default True
        When truthy, additionally drop the channel/waveguide/source-SSP
        columns and the three derived duct columns.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying the same index as *data*.
    """
    duct_cols = ['duct_prop_type', 'duct_width_if_sourceinduct', 'duct_SSP_if_sourceinduct']

    # Evaluate each mask once; plain arrays keep everything positional, so the
    # result is correct for any index on `data`, not only 0..n-1.
    conditions = [
        (data['duct_type'] == 'SD').to_numpy(),
        (data['duct_type'] == 'BD').to_numpy(),
    ]

    # np.select builds each column in one pass with the dtype of its sources,
    # so no float values are written into integer-initialised columns.
    merged = {
        'duct_prop_type': np.select(conditions, [1, -1], default=0),
        'duct_width_if_sourceinduct': np.select(
            conditions,
            [data['surface_duct_depth'].to_numpy(), data['bottom_duct_width'].to_numpy()],
            default=0,
        ),
        'duct_SSP_if_sourceinduct': np.select(
            conditions,
            [data['surface_duct_SSP'].to_numpy(), data['bottom_duct_SSP'].to_numpy()],
            default=0,
        ),
    }
    # assign() returns a new frame, so the caller's frame stays untouched.
    data = data.assign(**merged)

    data = data.drop(columns = ['duct_type', 'surface_duct', 'bottom_duct', 'source_in_duct',
                                'surface_duct_depth', 'surface_duct_SSP', 'bottom_duct_width',
                                'bottom_duct_depth', 'bottom_duct_SSP'])

    # DROPPING LOTS OF COLUMNS HERE TO LEAVE OUT PLAIN SIMULATION I/O DATA
    # These features were mostly inaccurate and will be re-created in the later process
    if Input_Only:
        data = data.drop(columns = ['deep_CH_axis', 'deep_CH_SSP', 'shallow_CH_axis', 'shallow_CH_SSP',
                                    'waveguide', 'CHmax_axis', 'SSP_CHmax',
                                    'SSP_source'] + duct_cols)
    return data
# Keep only the plain input columns (Input_Only=True also drops the derived duct columns)
data = FeatDuct(data, Input_Only = True)
data.head()

Unnamed: 0,water_depth_min,water_depth_max,wedge_slope,source_depth,profile,bottom_type,num_rays
0,50,1500,-2,15,Mediterranean Sea Winter,1,9000
1,50,1500,-2,30,Mediterranean Sea Winter,1,10000
2,50,1500,-2,15,Mediterranean Sea Winter,2,9000
3,50,1500,-2,30,Mediterranean Sea Winter,2,8000
4,50,50,0,15,Mediterranean Sea Winter,1,2500


## Bathymetry features

In [35]:
def FeatBathy(data, path, bathy=None):
    """Append the wedge slope length ('len_slope') looked up from the bathymetry table.

    For every row the pair (d_start, d_end) is formed from 'water_depth_min'
    and 'water_depth_max': (min, max) when 'wedge_slope' is 0 or -2,
    otherwise (max, min). The first bathymetry entry whose 'd_start' and
    'd_end' equal that pair supplies 'len_slope'. The table's 'len_flat'
    column is not used because it is redundant for the feature set.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'water_depth_min', 'water_depth_max' and
        'wedge_slope'.
    path : str
        Directory containing "env.xlsx"; only used when *bathy* is None.
    bathy : pandas.DataFrame, optional
        Pre-loaded bathymetry table with columns 'd_start', 'd_end' and
        'len_slope'. When omitted, sheet "BATHY" of "env.xlsx" is read.

    Returns
    -------
    pandas.DataFrame
        *data* with a float 'len_slope' column appended (same index).

    Raises
    ------
    IndexError
        If no bathymetry entry matches a row's (d_start, d_end) pair.
    """
    if bathy is None:
        # os.path.join works whether or not `path` ends with a separator.
        bathy = pd.read_excel(os.path.join(path, "env.xlsx"), sheet_name="BATHY")

    len_slope = np.zeros(len(data))
    found = {}  # (d_start, d_end) -> len_slope, so each pair is searched once

    depth_rows = zip(data['water_depth_min'], data['water_depth_max'], data['wedge_slope'])
    for row, (dmin, dmax, slope) in enumerate(depth_rows):
        # NOTE(review): only slopes 0 and -2 run from the minimum to the
        # maximum depth; every other slope value is treated as the reverse
        # direction. Confirm that this covers all slopes in the data.
        if slope == 0 or slope == -2:
            key = (dmin, dmax)
        else:
            key = (dmax, dmin)

        if key not in found:
            match = bathy.loc[(bathy['d_start'] == key[0]) & (bathy['d_end'] == key[1]), 'len_slope']
            if match.empty:
                raise IndexError(
                    f"No bathymetry entry with d_start={key[0]} and d_end={key[1]}"
                )
            found[key] = match.values[0]
        len_slope[row] = found[key]

    # Carry data's own index so the axis=1 concat lines up row by row even
    # when `data` does not have a default 0..n-1 index.
    slope_col = pd.Series(len_slope, index=data.index, name='len_slope')
    return pd.concat([data, slope_col], axis=1, sort=False)
# Append the 'len_slope' bathymetry feature read from env.xlsx under `path`
data = FeatBathy(data,path)
data.head()

Unnamed: 0,water_depth_min,water_depth_max,wedge_slope,source_depth,profile,bottom_type,num_rays,len_slope
0,50,1500,-2,15,Mediterranean Sea Winter,1,9000,41523.0
1,50,1500,-2,30,Mediterranean Sea Winter,1,10000,41523.0
2,50,1500,-2,15,Mediterranean Sea Winter,2,9000,41523.0
3,50,1500,-2,30,Mediterranean Sea Winter,2,8000,41523.0
4,50,50,0,15,Mediterranean Sea Winter,1,2500,0.0


## SSP value at critical depths

In [36]:
def FeatSSPvec(data, path, ssp=None, gamma=0.0):
    """Append one sound-speed column per sampled depth ('SSPd-<depth>').

    For each row the sound speed profile named in 'profile' is taken from the
    SSP table:

    * Flat bottom ('wedge_slope' == 0): only the samples down to the table
      depth closest to 'water_depth_min' are copied; deeper columns stay 0.
    * Otherwise: every sample is copied, multiplied by a range-based weight.
      With ``gamma == 0`` that weight is 1.0 everywhere, which switches the
      weighting off.

    The 'profile' column is kept in the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'water_depth_min', 'profile' and 'wedge_slope'.
    path : str
        Directory containing "env.xlsx"; only used when *ssp* is None.
    ssp : pandas.DataFrame, optional
        Pre-loaded SSP table with a 'DEPTH' column and one column per profile
        name. When omitted, sheet "SSP" of "env.xlsx" is read.
    gamma : float, default 0.0
        Strength of the weighting applied on sloped-bottom rows.

    Returns
    -------
    pandas.DataFrame
        *data* with the 'SSPd-*' columns appended (same index).
    """
    if ssp is None:
        # os.path.join works whether or not `path` ends with a separator.
        ssp = pd.read_excel(os.path.join(path, "env.xlsx"), sheet_name="SSP")

    depth = ssp['DEPTH'].values.tolist()
    depth_arr = ssp['DEPTH'].to_numpy()

    # Constants of the weighting formula.
    # NOTE(review): the wedge maximum depth is fixed at 1500 and the row's own
    # 'water_depth_max' is not used here; confirm that this is intended.
    rmax = 44000
    wedge_dmax = 1500

    # Segmented sound speed profile vector, one row per scenario.
    cmat = np.zeros([len(data), len(depth)])
    profile_values = {}  # profile name -> sound speed samples as an array

    scenarios = zip(data['water_depth_min'], data['profile'], data['wedge_slope'])
    for row, (dmin, profile, slope) in enumerate(scenarios):
        if profile not in profile_values:
            profile_values[profile] = ssp[profile].to_numpy()
        speeds = profile_values[profile]

        if slope == 0:
            # The closest table depth approximates dmin in case the SSP
            # sampling does not contain it exactly; only the part of the
            # profile down to that depth is used for a flat bottom.
            idx = int(np.argmin(np.abs(depth_arr - dmin))) + 1
            cmat[row, :idx] = speeds[:idx]
        else:
            # Weight on the SSP values based on the wedge geometry, computed
            # for all depth samples at once; its influence scales with gamma.
            tan_slope = np.tan(np.deg2rad(abs(slope)))
            wedge_range = np.round((wedge_dmax - dmin) / tan_slope)
            start_wedge = 0.5 * (rmax - wedge_range)
            ds = np.round((wedge_dmax - depth_arr) / tan_slope)
            weight = (rmax - gamma * (start_wedge + wedge_range - ds)) / rmax
            cmat[row, :] = speeds * weight

    colnames = ["SSPd-" + str(d) for d in depth]
    # Carry data's own index so the axis=1 concat lines up row by row even
    # when `data` does not have a default 0..n-1 index.
    df_cmat = pd.DataFrame(cmat, index=data.index, columns=colnames)
    return pd.concat([data, df_cmat], axis=1, sort=False)
# Append the per-depth 'SSPd-*' sound speed columns read from env.xlsx under `path`
data = FeatSSPvec(data, path)
data.head()

Unnamed: 0,water_depth_min,water_depth_max,wedge_slope,source_depth,profile,bottom_type,num_rays,len_slope,SSPd-0,SSPd-5,...,SSPd-1050,SSPd-1100,SSPd-1150,SSPd-1200,SSPd-1250,SSPd-1300,SSPd-1350,SSPd-1400,SSPd-1450,SSPd-1500
0,50,1500,-2,15,Mediterranean Sea Winter,1,9000,41523.0,1512.037617,1512.088496,...,1524.302246,1525.097813,1525.933066,1526.76832,1527.590625,1528.41293,1529.237207,1530.061484,1530.895606,1531.729727
1,50,1500,-2,30,Mediterranean Sea Winter,1,10000,41523.0,1512.037617,1512.088496,...,1524.302246,1525.097813,1525.933066,1526.76832,1527.590625,1528.41293,1529.237207,1530.061484,1530.895606,1531.729727
2,50,1500,-2,15,Mediterranean Sea Winter,2,9000,41523.0,1512.037617,1512.088496,...,1524.302246,1525.097813,1525.933066,1526.76832,1527.590625,1528.41293,1529.237207,1530.061484,1530.895606,1531.729727
3,50,1500,-2,30,Mediterranean Sea Winter,2,8000,41523.0,1512.037617,1512.088496,...,1524.302246,1525.097813,1525.933066,1526.76832,1527.590625,1528.41293,1529.237207,1530.061484,1530.895606,1531.729727
4,50,50,0,15,Mediterranean Sea Winter,1,2500,0.0,1512.037617,1512.088496,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
