# ML-ready data generation
We read pressure data from historical balloon flights, pre-process them, and store them as waveform snippets of the same length in .h5 files

In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
from obspy.core.utcdatetime import UTCDateTime
import importlib
import matplotlib.pyplot as plt
import obspy
import os

In [4]:
def create_one_obspy_trace(times, amp, balloon, starttime, dt, target_sampling=1.):
                        
    #f = interpolate.interp1d(times, amp, kind='linear', )
    #times_interp = np.arange(one_tec_data.epoch.values.min()*dt, one_tec_data.epoch.values.max()*dt+dt, dt)
    #vTEC = f(times_interp)
    tr = obspy.Trace()
    tr.data = amp
    tr.stats.delta = dt
    tr.interpolate(sampling_rate=target_sampling)
    #tr.stats.network = station
    #tr.stats.station = satellite+'ZZZ'+station
    tr.stats.station = balloon
    tr.stats.starttime = starttime+times[0]
    return tr

def load_balloon_data(dir_data, starttimes, target_sampling=1.):
    
    datas = {'GPS': obspy.Stream(), 'Baro': obspy.Stream()}
    for subdir, dirs, files in os.walk(dir_data):
        #print(files)
        for file in files:
            filepath = subdir + os.sep + file
            if not '.csv' in file:
                continue
                
            #if not 'Tortoise' in file:
            #    continue
            
            balloon = file.split('_')[0]
            if not balloon in starttimes.keys():
                continue
            print(balloon)
            
            data = pd.read_csv(filepath, header=[0])
            
            type_data = file.split('_')[1].split('.')[0]
            
            starttime = starttimes[balloon]
            times = data['GPSTime(s)'].values
            #if balloon == 'Tortoise':
            #    print(file)
            #    print(times/3600)
            dt = times[1]-times[0]
            #print(data.columns)
            try:
                amp = data['WGS84Altitude(m)'].values
            except:
                amp = data[data.columns[-1]].values
            tr_data = create_one_obspy_trace(times, amp, balloon, starttime, dt, target_sampling=target_sampling)
            datas[type_data] += tr_data
            
    return datas

starttimes = {
    'Hare': UTCDateTime(2019, 7, 22),
    'Tortoise': UTCDateTime(2019, 7, 22),
    'Hare2': UTCDateTime(2019, 8, 9),
    'CrazyCatLower': UTCDateTime(2019, 8, 9),
    'CrazyCatUpper': UTCDateTime(2019, 8, 9),
}
target_sampling = 3.
dir_data = '/staff/quentin/Documents/Projects/2020_Ridgecrest/data_balloons/Siddharth_balloon/'
st_all = load_balloon_data(dir_data, starttimes, target_sampling=target_sampling)

Hare2
CrazyCatLower
Hare
CrazyCatUpper
Hare
Hare2
Tortoise
Tortoise
CrazyCatUpper
CrazyCatLower


In [5]:
def trim_st(st_all, duration, overlap=1., get_GPS=True):
    
    new_st = {'GPS': obspy.Stream(), 'Baro': obspy.Stream()}
    for tr_Baro in st_all['Baro']:
        
        balloon = tr_Baro.stats.station
        print(balloon)
        if get_GPS:
            tr_GPS = st_all['GPS'].select(station=balloon)[0].copy()
            tr_GPS.trim(starttime=tr_Baro.stats.starttime, endtime=tr_Baro.stats.endtime)
            starttimes = np.arange(0., tr_GPS.times()[-1], duration*overlap) 
        else:
            starttimes = np.arange(0., tr_Baro.times()[-1], duration*overlap)
            
        print('Length to process:', starttimes.size)
        for id, starttime in enumerate(starttimes):
            #print(id)
            tr_loc_Baro = tr_Baro.copy().trim(starttime=tr_Baro.stats.starttime+starttime, endtime=tr_Baro.stats.starttime+starttime+duration)
            tr_loc_Baro.stats.station = tr_loc_Baro.stats.station+'-'+str(id)
            if get_GPS:
                tr_loc_GPS = tr_GPS.copy().trim(starttime=tr_Baro.stats.starttime+starttime, endtime=tr_Baro.stats.starttime+starttime+duration)
                tr_loc_GPS.stats.station = tr_loc_GPS.stats.station+'-'+str(id)
            new_st['Baro'] += tr_loc_Baro
            if get_GPS:
                new_st['GPS'] += tr_loc_GPS
    return new_st
    
from math import log, ceil, floor
def closest_power(x, power=8):
    possible_results = floor(log(x, power)), ceil(log(x, power))
    return min(possible_results, key= lambda z: abs(x-power**z))
   
def find_closest_duration(target_duration, target_sampling):
    nsize = int(target_sampling*target_duration)
    power = closest_power(nsize, power=2)
    nsize = 2**power
    duration = nsize/target_sampling
    return duration
    
target_duration = 50.
overlap = 1
duration = find_closest_duration(target_duration, target_sampling)
new_st = trim_st(st_all, duration, overlap=overlap, get_GPS=False)

CrazyCatLower
Length to process: 1344
Hare
Length to process: 388
CrazyCatUpper
Length to process: 2897
Hare2
Length to process: 1179
Tortoise
Length to process: 403


In [7]:
import h5py

def store_all_as_hdf5(datasets, st_all, target_size, crop_percent=0.6, freq_min=0.1):

    ## Initialization dataset
    size_crop = st_all['Baro'][0].data.size-int(st_all['Baro'][0].data.size*crop_percent)
    
    there_are_GPS_data = True if len(st_all['GPS']) > 0 else False
    
    # Create a subgroup for each stream with the corresponding event ID as the subgroup name
    event_type = 'balloon'
    for dataset in datasets:
    
        results = {'X': [], 'label': [], 'event_type': [], 'window': [], 'id': [], 'tr_id': [], 'station': [], 'satellite': []}
        print('---', dataset)
        
        idmin, idmax = datasets[dataset]
        idmin, idmax = int(idmin), int(idmax)
        st_Baro_loc = st_all['Baro'][idmin:idmax]
        if there_are_GPS_data:
            st_GPS_loc = st_all['GPS'][idmin:idmax]
        else:
            st_GPS_loc = st_Baro_loc
        #id = 0
        #st_Baro_loc = st_all['Baro'][id:].copy()
        #st_GPS_loc = st_all['GPS'][id:].copy()
        for tr_Baro_loc_in, tr_GPS_loc in zip(st_Baro_loc, st_GPS_loc):
            
            
            tr_Baro_loc = tr_Baro_loc_in.copy()
            window = (str(tr_Baro_loc.stats.starttime), str(tr_Baro_loc.stats.endtime))
            tr_Baro_loc.detrend()
            tr_Baro_loc.filter('highpass', freq=freq_min)
            tr_Baro_loc_cropped = tr_Baro_loc.copy()
            if size_crop > 0:
                tr_Baro_loc_cropped.data[size_crop//2:-size_crop//2] = 0.
           
            if False:
                fig = plt.figure(figsize=(10,5))
                grid = fig.add_gridspec(3, 1)

                ax = fig.add_subplot(grid[:2, 0])
                ax.plot(tr_Baro_loc.times(), tr_Baro_loc.data)
                ax.plot(tr_Baro_loc_cropped.times(), tr_Baro_loc_cropped.data)
                
                ax = fig.add_subplot(grid[2, 0])
                ax.plot(tr_GPS_loc.times(), tr_GPS_loc.data)
                return
            
            """
            obs_time = UTCDateTime('2019-07-22T16:27:26')
            if (tr_Baro_loc.stats.starttime<=obs_time) & (tr_Baro_loc.stats.endtime>=obs_time):
                print('cool')
                print(tr_Baro_loc.stats.station)
                print(tr_Baro_loc.data)
            """
            #X0 = np.expand_dims(tr_Baro_loc_cropped.data/abs(tr_Baro_loc_cropped.data).max(), axis=-1)
            #X1 = np.expand_dims(tr_GPS_loc.data/abs(tr_GPS_loc.data).max(), axis=-1)
            if abs(tr_Baro_loc.data).max() == 0.:
                #print(tr_Baro_loc.stats.station)
                #print(X0.shape, X1.shape)
                continue
            
            X0 = np.expand_dims(tr_Baro_loc_cropped.data, axis=-1)
            if there_are_GPS_data:
                X1 = np.expand_dims(tr_GPS_loc.data, axis=-1)
                if not X0.shape[0] == X1.shape[0]:
                    #print(tr_Baro_loc.stats.station)
                    #print(X0.shape, X1.shape)
                    continue
                X = np.concatenate((X0, X1), axis=-1)
            else:
                X = X0
            X = X[:target_size,:]
            label = np.expand_dims(tr_Baro_loc.data, axis=-1)
            label = label[:target_size,:]
            
            if X.shape[0] < target_size or label.shape[0] < target_size:
                print(tr_Baro_loc.stats.station)
                print(X.shape)
                continue
            
            if np.isnan(X).any():
                #print('problem nan')
                #print(tr_Baro_loc.stats.station)
                #print(X.shape)
                continue
                """
                fig = plt.figure(figsize=(10,5))
                grid = fig.add_gridspec(3, 1)

                ax = fig.add_subplot(grid[:2, 0])
                ax.plot(tr_Baro_loc.times(), tr_Baro_loc.data)
                ax.plot(tr_Baro_loc_cropped.times(), tr_Baro_loc_cropped.data)
                
                ax = fig.add_subplot(grid[2, 0])
                ax.plot(tr_GPS_loc.times(), tr_GPS_loc.data)
                print(tr_Baro_loc.data)
                #print(X1)
                return
                """
            
            results['X'].append( X )
            results['label'].append( label )
            results['event_type'].append( event_type )
            results['window'].append( window )
            results['id'].append( tr_Baro_loc.stats.station )
        
        # Open the HDF5 file in "write" mode
        with h5py.File(filename.format(dataset=dataset), "w") as f:
            f.create_dataset('X', data=results['X'], dtype='float32')
            f.create_dataset('label', data=results['label'], dtype='float32')
            f.create_dataset('event_type', data=str(results['event_type']))
            f.create_dataset('window', data=results['window'])
            f.create_dataset('id', data=results['id'])

def prepare_datasets_dates(list_quantiles, metadata):
    l_dates = metadata['starttime'].astype('int64').quantile(list_quantiles).astype('datetime64[ns]').values
    datasets = {'training': (metadata['starttime'].min(), l_dates[0]), 'validation': (l_dates[0], l_dates[1]), 'testing': (l_dates[1], l_dates[2])}
    for dataset in datasets:
        tmin, tmax = datasets[dataset]
        nb_elem = metadata.loc[(metadata.starttime>=tmin)&(metadata.starttime<=tmax)].shape[0]
        print('dataset "{}" ({} inputs): {} - {}'.format(dataset, nb_elem, tmin, tmax))
    return datasets

def prepare_datasets_ids(list_quantiles, new_st):
    l_ids = np.arange(len(new_st['Baro']))
    l_bound_ids = np.quantile(l_ids, q=list_quantiles)
    datasets = {'training': (0, l_bound_ids[0]), 'validation': (l_bound_ids[0], l_bound_ids[1]), 'testing': (l_bound_ids[1], l_bound_ids[2])}
    for dataset in datasets:
        idmin, idmax = datasets[dataset]
        nb_elem = idmax-idmin
        print('dataset "{}" ({} inputs): {} - {}'.format(dataset, nb_elem, idmin, idmax))
    return datasets
        
filename = '/projects/active/infrasound/data/infrasound/2023_ML_balloon/data/{dataset}_waveform_dataset.h5'
list_quantiles = [0.6, 0.9, 1.]
datasets = prepare_datasets_ids(list_quantiles, new_st)
target_size=int(duration*target_sampling)
results = store_all_as_hdf5(datasets, new_st, target_size, crop_percent=0., freq_min=0.5)

dataset "training" (3726.0 inputs): 0 - 3726.0
dataset "validation" (1863.0 inputs): 3726.0 - 5589.0
dataset "testing" (621.0 inputs): 5589.0 - 6210.0
--- training
CrazyCatLower-1343
(41, 1)
Hare-387
(25, 1)
--- validation
CrazyCatUpper-2896
(122, 1)
--- testing
Hare2-1178
(62, 1)
