In [90]:
import numpy as np
import os
import xarray as xr
import pandas as pd
import os

In [102]:
output_dir = '/home/ubuntu/Sonar_data/Herring_training_data'
input_dir = '/home/ubuntu/Sonar_data/Herring_xarray'
# 38 kHz, but has to be checked each time
channels = [0, 1, 2]

image_dir = os.path.join(output_dir, 'images')
masks_dir = os.path.join(output_dir, 'masks')

In [103]:
# normalize can be done with mean and std from the whole dataset or just from one array
def zscore(data):
    nanmin = np.nanmin(data)
    #replace NaN values with min value-10
    data = np.where(np.isnan(data), nanmin-10, data)
    return (data - data.mean()) / data.std()

def min_max(data):
    nanmin = np.nanmin(data)
    #replace NaN values with min value-10
    data = np.where(np.isnan(data), nanmin-10, data)
    
    data_min = data.min()
    data_max = data.max()
    return (data - data_min) / (data_max - data_min)

In [104]:
nc_files = [file for file in os.listdir(input_dir) if file.endswith('.nc')]
for nc_file in nc_files:
    raw_path = os.path.join(input_dir, nc_file)
    #ds = xr.open_zarr(raw_path)
    ds = xr.open_dataset(raw_path)
    
    #raw_data = ds.Sv.isel(channel=channel).values.T  
    #raw_data = normalize(raw_data)
    #np.save(os.path.join(image_dir, nc_file.replace('.nc', '.npy')), raw_data)

    channel_data = []
    for channel in channels:
        data = ds.Sv.isel(channel=channel).values.T
        normalized_data = min_max(data)
        channel_data.append(normalized_data)
    # Stack the channel data along a new axis (similar to RGB channels)
    multi_channel_data = np.stack(channel_data, axis=-1)
    np.save(os.path.join(image_dir, nc_file.replace('.nc', '.npy')), multi_channel_data)
    
    # Extract the time and depth coordinates from the xarray dataset
    time_coords = ds['ping_time'].values
    depth_coords = ds['range_sample'].values
    
    mask_path = os.path.join(input_dir, nc_file.replace('.nc', '.csv'))
    mask_df = pd.read_csv(mask_path)
    mask_df['time'] = pd.to_datetime(mask_df['time'])
    
    # Create the pivot table
    pivot_table = mask_df.assign(value=1).pivot_table(index='time', columns='depth', values='value', fill_value=0)
    
    # Align the binary mask with the vals array
    time_index = pd.Index(time_coords)
    depth_index = pd.Index(depth_coords)
    
    # Reindex the pivot table to match the time and depth coordinates from the xarray dataset
    binary_mask_df = pivot_table.reindex(index=time_index, columns=depth_index, fill_value=0)
    
    # Convert the binary mask DataFrame to a NumPy array
    binary_mask = binary_mask_df.to_numpy().astype(int).T

    np.save(os.path.join(masks_dir, nc_file.replace('.nc', '.npy')), binary_mask)
    

In [105]:
binary_mask_df.sum().sum()

0.0

In [68]:
mask_df.time.min()

Timestamp('2007-07-05 03:35:57.157000')

In [69]:
mask_df[mask_df.depth.isin(depth_coords)]

Unnamed: 0,time,depth


In [106]:
depth_coords = ds['range_sample'].values
condition = (depth_coords > 72) & (depth_coords < 73)
depth_coords[condition]

array([72.00058468, 72.19270767, 72.38483065, 72.57695364, 72.76907662,
       72.9611996 ])

In [107]:
mask_df[(mask_df.depth>72) & (mask_df.depth<73)].depth.values

array([72.04015536, 72.04015536, 72.04493383, 72.22651546, 72.23129392,
       72.23129392, 72.61357104, 72.61357104, 72.6183495 , 72.99106969,
       72.99584816, 72.99584816, 72.8047096 , 72.8047096 , 72.22651546,
       72.23129392, 72.23129392, 72.42243248, 72.42243248, 72.42721094,
       72.8047096 , 72.8047096 , 72.23129392, 72.23129392, 72.60879258,
       72.61357104, 72.61357104, 72.04015536, 72.04015536])

In [108]:
time_coords = ds['ping_time'].values
condition = (time_coords > pd.to_datetime('2007-07-05 19:22:53')) & (time_coords < pd.to_datetime('2007-07-05 19:22:54'))
time_coords[condition]

array(['2007-07-05T19:22:53.657573888'], dtype='datetime64[ns]')

In [109]:
mask_df[(mask_df['time'] > pd.to_datetime('2007-07-05 19:22:53')) & (mask_df['time'] < pd.to_datetime('2007-07-05 19:22:54'))].time.values

array(['2007-07-05T19:22:53.157000000', '2007-07-05T19:22:53.157000000',
       '2007-07-05T19:22:53.157000000', '2007-07-05T19:22:53.157000000',
       '2007-07-05T19:22:53.157000000', '2007-07-05T19:22:53.157000000',
       '2007-07-05T19:22:53.157000000'], dtype='datetime64[ns]')

In [110]:
multi_channel_data.shape

(1301, 21472, 3)