# Data Acquisition

Goal: Extract images from DAS raw data

# 0. Setup and Imports

In [1]:
from azure.storage.blob import BlobServiceClient
import zarr
import xarray as xr
from pathlib import Path
import os

import numpy as np
from datetime import datetime
from scipy import signal as sp
from PIL import Image

from dask.distributed import Client, progress

Setup azure blob and container

In [2]:
account_url = "https://dasdata.blob.core.windows.net/"

blob_service_client = BlobServiceClient(account_url)
container_client = blob_service_client.get_container_client("zarr")

container_client.get_container_properties()

{'name': 'zarr', 'last_modified': datetime.datetime(2023, 2, 2, 14, 24, 54, tzinfo=datetime.timezone.utc), 'etag': '"0x8DB052940E27EBB"', 'lease': {'status': 'unlocked', 'state': 'available', 'duration': None}, 'public_access': 'container', 'has_immutability_policy': False, 'deleted': None, 'version': None, 'has_legal_hold': False, 'metadata': {}, 'encryption_scope': <azure.storage.blob._models.ContainerEncryptionScope object at 0x7fdbfd0965c0>, 'immutable_storage_with_versioning_enabled': False}

Access South cable data and load it into xarray

In [3]:
store = zarr.ABSStore(client=container_client, prefix='ooi_South_Tx.zarr/ooi_South_Tx.zarr')
root = zarr.group(store=store)  
ds = xr.open_zarr(store)

# 1. Configuring variables to interact with data

Variables that will be useful to process data

In [4]:
# Constant parameters of raw data
fs = 200
channel_spacing = 2.0419

# Channels skip in time and distance axis. 1 means no skip, 2 means skip every other channel, etc.
dt = 2
dx = 1

# Parameters for bandpass filter
fs_new = fs//dt
low = 14
high = 35

# Duration of each image in seconds
time_dur_seconds = 15

In [5]:
#Skipping the first 10,000 channels
dist = range(10000,ds.dims["distance"],dx)
time = range(0,ds.dims["time"],dt)

In [6]:
 # Slicing based on previously defined distance range
ds = ds.loc[{"distance": dist, "time": time}]

Helper functions to process DAS data chunks

In [7]:
def median_subtract(data):
    """
    Subtract median of each timepoint from all traces
    """
    # x =  np.tile(np.median(data,axis=1),(data.shape[1],1)).T
    x = np.median(data,axis=1)
    
    return (data.T-x).T


def bandpass_filter(data, fs, low, high):
    """
    Apply a bandpass filter to the data.
    """
    assert fs > 2*high, "High frequency must be less than half the sampling frequency."

    nyq = fs/2
    low = low/nyq
    high = high/nyq
    
    b, a = sp.butter(8, [low, high], 'bandpass')
    
    return sp.filtfilt(b, a, data, axis=1)

def calc_envelope(data):
    """
    Calculate the envelope of the data and return its log transform.
    """
    
    return 20*np.log10(np.abs(sp.hilbert(data, axis=1))/1e-6)

def get_timestamp(times):
    """
    Convert the time to a timestamp.
    """

    return xr.DataArray([datetime.utcfromtimestamp(time*1e-6) for time in times.values], dims=times.dims, coords=times.coords)

def save_to_image(data, filename):
    """
    Save the data to an image file.
    """

    d = data
    filename = str(filename)
    d = (d - np.min(d))/(np.max(d) - np.min(d))*255 #Normalize
    
    im = Image.fromarray(np.uint8(d.T))
    im.resize((128, 128)).save(filename+".png")
    im.close()

    return data


Setting up Dask client to observe progress

In [8]:
client = Client()
client

2023-03-03 20:06:36,927 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-qh0mjor_', purging
2023-03-03 20:06:36,928 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-2y9slcz2', purging
2023-03-03 20:06:36,928 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-vipcpo6u', purging
2023-03-03 20:06:36,928 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-7xddchub', purging
2023-03-03 20:06:36,928 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-x2lr6ocx', purging
2023-03-03 20:06:36,928 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-npz7ie_o', purging
2023-03-03 20:06:36,928 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-twzcnuam', purging

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 8
Total threads: 64,Total memory: 251.70 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:44907,Workers: 8
Dashboard: http://127.0.0.1:8787/status,Total threads: 64
Started: Just now,Total memory: 251.70 GiB

0,1
Comm: tcp://127.0.0.1:36191,Total threads: 8
Dashboard: http://127.0.0.1:38893/status,Memory: 31.46 GiB
Nanny: tcp://127.0.0.1:44807,
Local directory: /tmp/dask-worker-space/worker-g_ndjsh9,Local directory: /tmp/dask-worker-space/worker-g_ndjsh9

0,1
Comm: tcp://127.0.0.1:34093,Total threads: 8
Dashboard: http://127.0.0.1:37041/status,Memory: 31.46 GiB
Nanny: tcp://127.0.0.1:46125,
Local directory: /tmp/dask-worker-space/worker-renaaqlc,Local directory: /tmp/dask-worker-space/worker-renaaqlc

0,1
Comm: tcp://127.0.0.1:42449,Total threads: 8
Dashboard: http://127.0.0.1:34975/status,Memory: 31.46 GiB
Nanny: tcp://127.0.0.1:43465,
Local directory: /tmp/dask-worker-space/worker-7lnqjvn3,Local directory: /tmp/dask-worker-space/worker-7lnqjvn3

0,1
Comm: tcp://127.0.0.1:36219,Total threads: 8
Dashboard: http://127.0.0.1:46881/status,Memory: 31.46 GiB
Nanny: tcp://127.0.0.1:45863,
Local directory: /tmp/dask-worker-space/worker-4a5wi7fs,Local directory: /tmp/dask-worker-space/worker-4a5wi7fs

0,1
Comm: tcp://127.0.0.1:39211,Total threads: 8
Dashboard: http://127.0.0.1:39779/status,Memory: 31.46 GiB
Nanny: tcp://127.0.0.1:44523,
Local directory: /tmp/dask-worker-space/worker-dmobt56z,Local directory: /tmp/dask-worker-space/worker-dmobt56z

0,1
Comm: tcp://127.0.0.1:33095,Total threads: 8
Dashboard: http://127.0.0.1:44231/status,Memory: 31.46 GiB
Nanny: tcp://127.0.0.1:42515,
Local directory: /tmp/dask-worker-space/worker-4tdldic8,Local directory: /tmp/dask-worker-space/worker-4tdldic8

0,1
Comm: tcp://127.0.0.1:36853,Total threads: 8
Dashboard: http://127.0.0.1:37875/status,Memory: 31.46 GiB
Nanny: tcp://127.0.0.1:35497,
Local directory: /tmp/dask-worker-space/worker-8bkhj9ov,Local directory: /tmp/dask-worker-space/worker-8bkhj9ov

0,1
Comm: tcp://127.0.0.1:35475,Total threads: 8
Dashboard: http://127.0.0.1:32927/status,Memory: 31.46 GiB
Nanny: tcp://127.0.0.1:42153,
Local directory: /tmp/dask-worker-space/worker-75omr977,Local directory: /tmp/dask-worker-space/worker-75omr977


In [9]:
ds

Unnamed: 0,Array,Chunk
Bytes,18.68 MiB,1.46 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 18.68 MiB 1.46 kiB Shape (19584000,) (1500,) Count 3 Graph Layers 13056 Chunks Type uint8 numpy.ndarray",19584000  1,

Unnamed: 0,Array,Chunk
Bytes,18.68 MiB,1.46 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,uint8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,18.68 MiB,1.46 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 18.68 MiB 1.46 kiB Shape (19584000,) (1500,) Count 3 Graph Layers 13056 Chunks Type uint8 numpy.ndarray",19584000  1,

Unnamed: 0,Array,Chunk
Bytes,18.68 MiB,1.46 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,uint8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,74.71 MiB,5.86 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,uint32,numpy.ndarray
"Array Chunk Bytes 74.71 MiB 5.86 kiB Shape (19584000,) (1500,) Count 3 Graph Layers 13056 Chunks Type uint32 numpy.ndarray",19584000  1,

Unnamed: 0,Array,Chunk
Bytes,74.71 MiB,5.86 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,uint32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.67 TiB,17.17 MiB
Shape,"(37500, 19584000)","(3000, 1500)"
Count,4 Graph Layers,169728 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 2.67 TiB 17.17 MiB Shape (37500, 19584000) (3000, 1500) Count 4 Graph Layers 169728 Chunks Type int32 numpy.ndarray",19584000  37500,

Unnamed: 0,Array,Chunk
Bytes,2.67 TiB,17.17 MiB
Shape,"(37500, 19584000)","(3000, 1500)"
Count,4 Graph Layers,169728 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,149.41 MiB,11.72 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 149.41 MiB 11.72 kiB Shape (19584000,) (1500,) Count 3 Graph Layers 13056 Chunks Type int64 numpy.ndarray",19584000  1,

Unnamed: 0,Array,Chunk
Bytes,149.41 MiB,11.72 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,149.41 MiB,11.72 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 149.41 MiB 11.72 kiB Shape (19584000,) (1500,) Count 3 Graph Layers 13056 Chunks Type int64 numpy.ndarray",19584000  1,

Unnamed: 0,Array,Chunk
Bytes,149.41 MiB,11.72 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,int64,numpy.ndarray


In [10]:
# Assigning timestamp to coordinates for easy querying of data
times = ds.RawDataTime.map_blocks(get_timestamp)
ds = ds.assign_coords({"time": times.compute()})

In [11]:
ds

Unnamed: 0,Array,Chunk
Bytes,18.68 MiB,1.46 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 18.68 MiB 1.46 kiB Shape (19584000,) (1500,) Count 3 Graph Layers 13056 Chunks Type uint8 numpy.ndarray",19584000  1,

Unnamed: 0,Array,Chunk
Bytes,18.68 MiB,1.46 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,uint8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,18.68 MiB,1.46 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 18.68 MiB 1.46 kiB Shape (19584000,) (1500,) Count 3 Graph Layers 13056 Chunks Type uint8 numpy.ndarray",19584000  1,

Unnamed: 0,Array,Chunk
Bytes,18.68 MiB,1.46 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,uint8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,74.71 MiB,5.86 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,uint32,numpy.ndarray
"Array Chunk Bytes 74.71 MiB 5.86 kiB Shape (19584000,) (1500,) Count 3 Graph Layers 13056 Chunks Type uint32 numpy.ndarray",19584000  1,

Unnamed: 0,Array,Chunk
Bytes,74.71 MiB,5.86 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,uint32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.67 TiB,17.17 MiB
Shape,"(37500, 19584000)","(3000, 1500)"
Count,4 Graph Layers,169728 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 2.67 TiB 17.17 MiB Shape (37500, 19584000) (3000, 1500) Count 4 Graph Layers 169728 Chunks Type int32 numpy.ndarray",19584000  37500,

Unnamed: 0,Array,Chunk
Bytes,2.67 TiB,17.17 MiB
Shape,"(37500, 19584000)","(3000, 1500)"
Count,4 Graph Layers,169728 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,149.41 MiB,11.72 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 149.41 MiB 11.72 kiB Shape (19584000,) (1500,) Count 3 Graph Layers 13056 Chunks Type int64 numpy.ndarray",19584000  1,

Unnamed: 0,Array,Chunk
Bytes,149.41 MiB,11.72 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,149.41 MiB,11.72 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 149.41 MiB 11.72 kiB Shape (19584000,) (1500,) Count 3 Graph Layers 13056 Chunks Type int64 numpy.ndarray",19584000  1,

Unnamed: 0,Array,Chunk
Bytes,149.41 MiB,11.72 kiB
Shape,"(19584000,)","(1500,)"
Count,3 Graph Layers,13056 Chunks
Type,int64,numpy.ndarray


# 2. Setting up list of functions to execute and running them

In [12]:
# Setting up computation graph for Dask. Operations are performed in the order they are defined.
rawData = ds.RawData
out = xr.apply_ufunc(median_subtract, rawData, dask="parallelized", output_dtypes=[rawData.dtype])
out = xr.apply_ufunc(bandpass_filter, out, fs_new, low, high, dask="parallelized", output_dtypes=[out.dtype])
out = xr.apply_ufunc(calc_envelope, out, dask="parallelized", output_dtypes=[out.dtype])
out = out.chunk({"time": fs_new*time_dur_seconds, "distance": 37500})
#Slicing and processing only 24hr of data
# out = out.sel(time=slice("2021-11-01T23:11:14.834000000", "2021-11-04T05:35:14.824000000"))

In [13]:
out

Unnamed: 0,Array,Chunk
Bytes,2.67 TiB,214.58 MiB
Shape,"(37500, 19584000)","(37500, 1500)"
Count,17 Graph Layers,13056 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 2.67 TiB 214.58 MiB Shape (37500, 19584000) (37500, 1500) Count 17 Graph Layers 13056 Chunks Type int32 numpy.ndarray",19584000  37500,

Unnamed: 0,Array,Chunk
Bytes,2.67 TiB,214.58 MiB
Shape,"(37500, 19584000)","(37500, 1500)"
Count,17 Graph Layers,13056 Chunks
Type,int32,numpy.ndarray


In [14]:
filenames = out.coords["time"].data
filenames = filenames[::fs_new*time_dur_seconds]

In [15]:
time_chunks = out.coords["time"].data
time_chunks = time_chunks[::fs_new*60*15][:-1]

In [None]:
import dask.array as da
import dask
import dask.bag as db

# Setting up the folder structure to save the images
root_folder = f"south_ch_{dist[0]}to{dist[-1]}_fs{fs_new}_bpf{low}to{high}Hz"
Path(root_folder).mkdir(parents=True, exist_ok=True)

# We process 15 minutes of data at a time to not overload the memory
for i,time in enumerate(time_chunks[11:]):
    print(f"Processing {i+1+11} of {len(time_chunks)}")
    
    # Slicing the data to 15 minutes
    dat = out.sel(time=slice(time, time+np.timedelta64(15, 'm')))
    
    # Converting to dask objects to parallelize the computation
    dat = dat.data.to_delayed().flatten()
    
    # Getting the filenames for the images to be saved in this slice of data
    start = np.where(filenames == time)[0][0]
    end = np.where(filenames == time+np.timedelta64(15, 'm'))[0][0]
    files = filenames[start:end]
    
    # Saving the images
    save_imgs = [dask.delayed(save_to_image)(d, root_folder+'/'+str(f)) for d, f in zip(dat, files)]
    bag = db.from_delayed(save_imgs)
    bag.compute()
    
    # Clearing the memory
    del bag
    del save_imgs
    del dat