# Data Acquisition and Processing

Purpose: Write a script that facilitates data acquisition from the website and processes the downloaded files to convert them to the required envelope images.

## 0. Imports and dependencies

In [1]:
import h5py
import numpy as np
import scipy.signal as sp
import matplotlib.pyplot as plt
from datetime import datetime
from pyproj import Proj
import requests
import bs4
import wget
import os
import sys
from PIL import Image

ModuleNotFoundError: No module named 'wget'

## 1. Adding helper functions

### 1.1 Signal Processing

In [2]:
def get_file_metadata(fp):
    """
    Collect a few acquisition properties from an opened file into a dictionary.

    Keys returned:
        firstSample / lastSample -- naive datetimes built with utcfromtimestamp from the
                                    first and last 'RawDataTime' entries scaled by 1e-6
        fs                       -- the 'OutputDataRate' attribute of 'Raw[0]'
        nChannels                -- size of the first axis of 'RawData'
        channelSpacing           -- the 'SpatialSamplingInterval' attribute of 'Acquisition'
        gaugeLength              -- the 'GaugeLength' attribute of 'Acquisition'
    """
    acquisition = fp['Acquisition']
    raw_group = acquisition['Raw[0]']
    timestamps = raw_group['RawDataTime']

    def as_utc(stamp):
        # Stamps are multiplied by 1e-6 before conversion (presumably microseconds -> seconds).
        return datetime.utcfromtimestamp(stamp*1e-6)

    info = {}
    info['firstSample'] = as_utc(timestamps[0])
    info['lastSample'] = as_utc(timestamps[-1])
    info['fs'] = raw_group.attrs['OutputDataRate']
    info['nChannels'] = raw_group['RawData'].shape[0]
    info['channelSpacing'] = acquisition.attrs['SpatialSamplingInterval']
    info['gaugeLength'] = acquisition.attrs['GaugeLength']
    return info

def clip_signal(rawData, metadata, start, end):
    """
    Keep only rows start (inclusive) to end (exclusive) of the data.

    `metadata` is accepted for signature parity with the other helpers but is not read.
    An AssertionError is raised when `end` exceeds the size of the first axis.
    """
    n_rows = rawData.shape[0]
    assert end <= n_rows, "End channel must be less than the number of channels in the data."

    row_window = slice(start, end)
    return rawData[row_window, :]

def downsample(rawData, metadata, fs_new):
    """
    Downsample the data along axis 1 by keeping every ds-th sample.

    Parameters
    ----------
    rawData : 2-D array; samples are taken along the second axis.
    metadata : dict providing the current sampling rate under 'fs'.
    fs_new : requested sampling rate.

    Returns
    -------
    The strided view/array rawData[:, ::ds] with ds = fs / fs_new.

    Raises
    ------
    ValueError if fs_new is larger than the current rate (the step would be < 1).

    Notes
    -----
    * This is plain sample skipping: no low-pass (anti-alias) filter is applied here.
    * A non-integer ratio is truncated, so the effective output rate is fs/ds, which
      may differ from fs_new.
    """

    fs = metadata['fs']
    ratio = fs/fs_new
    nearest = round(ratio)

    # fs/fs_new can land a hair below an integer (e.g. 0.3/0.1 == 2.9999999999999996);
    # truncating that would silently pick a step that is one too small.
    if abs(ratio - nearest) <= 1e-9*max(1.0, abs(ratio)):
        ds = int(nearest)
    else:
        ds = int(ratio) # Number of samples to skip

    if ds < 1:
        raise ValueError(
            "fs_new ({}) must be positive and not exceed the current sampling rate ({}).".format(fs_new, fs)
        )

    return rawData[:, ::ds]

def spatial_downsampling(rawData, ds=None, metadata=None):
    """
    Skip rows (channels) in the data to reduce the spatial resolution.

    Parameters
    ----------
    rawData : 2-D array; rows are thinned, columns are untouched.
    ds : int, optional. Keep every ds-th row. When None, the step is derived as
         int(metadata['gaugeLength'] / metadata['channelSpacing']).
    metadata : dict, optional. Only read when ds is None.

    Returns
    -------
    rawData[::ds, :]

    Raises
    ------
    ValueError if neither ds nor metadata is given, or if the resulting step is < 1.
    """
    if ds is None:
        if metadata is None:
            raise ValueError("Either ds or metadata must be supplied.")
        ds = int(metadata['gaugeLength']/metadata['channelSpacing'])

    # A step of 0 (gauge length shorter than the channel spacing) or a negative step
    # cannot describe a downsampling; fail with a message that names the cause.
    if ds < 1:
        raise ValueError("Spatial downsampling step must be >= 1, got {}.".format(ds))

    return rawData[::ds, :]

def median_subtract(rawData):
    """
    Remove, from every row, the median of that row.

    The median is computed along axis 1 and broadcast back across the columns,
    so the result has the same shape as the input.

    NOTE(review): the previous description read "median of each timepoint"; with rows
    as channels that would be a median along axis 0 instead -- confirm which is intended.
    """
    row_medians = np.median(rawData, axis=1, keepdims=True)
    return rawData - row_medians

def bandpass_filter(rawData, metadata, low, high):
    """
    Band-pass the data along axis 1 between `low` and `high`.

    A 4th-order Butterworth design is applied forwards and backwards (filtfilt).
    The sampling rate is read from metadata['fs']; an AssertionError is raised
    unless it is strictly greater than twice `high`.
    """
    sample_rate = metadata['fs']
    assert sample_rate > 2*high, "High frequency must be less than half the sampling frequency."

    # butter expects the band edges as fractions of the Nyquist frequency.
    nyquist = sample_rate/2
    band_edges = [low/nyquist, high/nyquist]

    numerator, denominator = sp.butter(4, band_edges, 'bandpass')
    return sp.filtfilt(numerator, denominator, rawData, axis=1)

def calc_envelope(rawData):
    """
    Calculate the envelope of the data and return its log transform.

    The envelope is the magnitude of the analytic signal computed along axis 1.
    The value returned is log(1 + envelope), so it is always finite and >= 0.

    The offset must be added to the magnitude, not to the complex analytic signal:
    adding it before taking the magnitude shifts the real part, no longer yields the
    envelope, and produces log(0) = -inf wherever the analytic signal equals -1.
    """
    envelope = np.abs(sp.hilbert(rawData, axis=1))
    return np.log1p(envelope)


### 1.2 Data Acquisition and File Handling

In [3]:
def get_h5_files_from_url(url):
    """
    Return the URLs of all '.h5' files linked from the directory listing at `url`.

    Parameters
    ----------
    url : address of an HTML page whose anchors link to the files.

    Returns
    -------
    list of str -- one absolute URL per anchor whose href ends with '.h5', in page order.

    Raises
    ------
    requests.HTTPError if the server answers with an error status (instead of silently
    returning an empty list), and requests' timeout errors if the server does not respond.
    """
    from urllib.parse import urljoin

    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.text, 'html.parser')

    # Treat `url` as a directory so relative hrefs resolve below it even when the
    # trailing slash was omitted.
    base = url if url.endswith('/') else url + '/'

    h5files = []
    for node in soup.find_all('a'):
        href = node.get('href')
        # Anchors without an href (e.g. named anchors) return None and must be skipped.
        if href and href.endswith('.h5'):
            # urljoin also copes with absolute or root-relative hrefs.
            h5files.append(urljoin(base, href))

    return h5files

def download_file(file, out='../data/'):
    """
    Download the file at URL `file` into `out` and return the path it was saved to.

    Parameters
    ----------
    file : URL of the file to fetch.
    out : destination passed straight through to wget.download.

    Returns
    -------
    Whatever wget.download returns (the name of the saved file). Callers that
    ignore the return value behave exactly as before.

    NOTE(review): wget may choose a different file name when the target already
    exists -- use the returned path if that matters; confirm against the wget docs.
    """
    return wget.download(file, out=out)

def delete_file(file, data_dir='../data/'):
    """
    Remove `file` from `data_dir`.

    Parameters
    ----------
    file : name of the file, relative to `data_dir`.
    data_dir : directory holding the file. Defaults to '../data/', the same default
               download_file uses for its output, so the two stay in step.

    Raises
    ------
    FileNotFoundError (from os.remove) if the file does not exist.
    """
    os.remove(os.path.join(data_dir, file))

def save_to_image(filename, data, out_dir='../data/processed_clip1k_fs100_bpf1435/'):
    """
    Save the data to a PNG image file.

    The data is min-max normalised to 0..255, transposed, converted to uint8,
    shrunk by a factor of 16 along both axes and written to
    out_dir + filename + '.png'.

    Parameters
    ----------
    filename : base name of the image; '.png' is appended.
    data : 2-D array of values to render.
    out_dir : directory to write into. Defaults to the folder used so far.

    Notes
    -----
    Constant input (max == min) is rendered as an all-zero image; the plain
    normalisation would divide by zero and cast NaN to uint8.
    """
    lowest = np.min(data)
    span = np.max(data) - lowest

    if span > 0:
        scaled = (data - lowest)/span*255 #Normalize
    else:
        scaled = np.zeros_like(data, dtype=float)

    # The context manager closes the full-size image even if resize/save raises.
    with Image.fromarray(np.uint8(scaled.T)) as im:
        im.resize((im.width//16, im.height//16)).save(os.path.join(out_dir, filename + '.png'))

In [4]:
# Get list of h5 files: scrape the directory listing below for every linked '.h5' file.

# Directory listing to scrape (must end with '/' because file names are appended to it).
url = 'http://piweb.ooirsn.uw.edu/das/data/Optasense/SouthCable/TransmitFiber/South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-01T16_09_15-0700/'

# Full URLs of the files found on that page.
h5files = get_h5_files_from_url(url)

In [5]:
print('Number of files: ', len(h5files))

# List the output directory once and keep the names in a set; calling os.listdir
# inside the comprehension would rescan the directory for every single file.
processed_pngs = set(os.listdir('../data/processed_clip1k_fs100_bpf1435/'))

# Images are saved as '<h5 file name>.png', so a file is done when that name exists.
h5files = [file for file in h5files if file.split('/')[-1]+'.png' not in processed_pngs]
print('Number of files to download: ', len(h5files))

Number of files:  3981
Number of files to download:  3675


In [6]:
# Processing parameters
# NOTE(review): images are written to a folder named 'processed_clip1k_...' while
# clip is 5000 -- confirm the folder name still describes these settings.
clip = 5000   # first channel kept; everything before it is dropped
fs_new = 100  # sampling rate after temporal downsampling
ds = 5        # keep every ds-th channel
low = 14      # band-pass lower edge
high = 35     # band-pass upper edge

for i,file in enumerate(h5files[:300]):
    #Download file
    file_name = file.split('/')[-1]
    print(i, file_name)
    download_file(file, out='../data/')

    try:
        #Open file; the context manager closes the handle before the file is deleted
        with h5py.File('../data/' + file_name, 'r') as fp:
            metadata = get_file_metadata(fp)
            rawData = fp['Acquisition']['Raw[0]']['RawData']

            # Slicing here is the last access to the open file; every later step
            # works on the clipped result.
            rawData = clip_signal(rawData, metadata, clip, rawData.shape[0]) #Clip signal

        #Signal Processing
        # NOTE(review): downsample() only skips samples, so content above fs_new/2 is
        # not filtered out before the rate is reduced -- confirm this is acceptable.
        rawData = downsample(rawData, metadata, fs_new) #Downsample
        metadata['fs'] = fs_new

        rawData = spatial_downsampling(rawData, ds, metadata) #Spatial downsampling
        metadata['nChannels'] = rawData.shape[0]

        rawData = median_subtract(rawData) #Median subtraction
        rawData = bandpass_filter(rawData, metadata, low, high) #Bandpass filter

        envelope = calc_envelope(rawData) #Calculate envelope

        #Save to image
        save_to_image(file_name, envelope)
    finally:
        #Delete file, also when a processing step failed, so downloads do not pile up
        delete_file(file_name)



0 South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-02T041714Z.h5
1 South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-02T041814Z.h5
2 South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-02T041914Z.h5
3 South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-02T042014Z.h5
4 South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-02T042114Z.h5
5 South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-02T042214Z.h5
6 South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-02T042314Z.h5
7 South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-02T042414Z.h5
8 South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-02T042514Z.h5
9 South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-02T042614Z.h5
10 South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-02T042714Z.h5
11 South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-02T042814Z.h5
12 South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-02T042914Z.h5
13 South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-02T043014Z.h5
14 South-C1-LR-95km-P1kHz-GL50m-SP2m-FS200Hz_2021-11-02T043114Z.h5
15 So