# ML-ready data generation
We read pressure data from historical balloon flights, pre-process them, and store them as waveform snippets of the same length in .h5 files

In [523]:
%matplotlib notebook
import pandas as pd
import numpy as np
from obspy.core.utcdatetime import UTCDateTime
import importlib
import matplotlib.pyplot as plt
import obspy
import os
import netCDF4 as nc

### Load data

In [332]:
def create_one_obspy_trace(times, amp, balloon, starttime, dt, name_maps):
                        
    #f = interpolate.interp1d(times, amp, kind='linear', )
    #times_interp = np.arange(one_tec_data.epoch.values.min()*dt, one_tec_data.epoch.values.max()*dt+dt, dt)
    #vTEC = f(times_interp)
    tr = obspy.Trace()
    tr.data = amp
    tr.stats.delta = dt
    #tr.stats.network = station
    #tr.stats.station = satellite+'ZZZ'+station
    tr.stats.station = name_maps[balloon]
    tr.stats.starttime = starttime+times[0]
    return tr

def load_balloon_data(dir_data, starttimes, name_maps):
    
    datas = {'GPS': obspy.Stream(), 'Baro': obspy.Stream()}
    for subdir, dirs, files in os.walk(dir_data):
        #print(files)
        for file in files:
            filepath = subdir + os.sep + file
            if not '.csv' in file:
                continue
                
            #if not 'Tortoise' in file:
            #    continue
            
            balloon = file.split('_')[0]
            if not balloon in starttimes.keys():
                continue
            print(balloon)
            
            data = pd.read_csv(filepath, header=[0])
            
            type_data = file.split('_')[1].split('.')[0]
            
            starttime = starttimes[balloon]
            times = data['GPSTime(s)'].values
            #if balloon == 'Tortoise':
            #    print(file)
            #    print(times/3600)
            dt = times[1]-times[0]
            #print(data.columns)
            try:
                amp = data['WGS84Altitude(m)'].astype(float).values
            except:
                amp = data[data.columns[-1]].values
            tr_data = create_one_obspy_trace(times, amp, balloon, starttime, dt, name_maps)
            datas[type_data] += tr_data
            
    return datas

starttimes = {
    'Hare': UTCDateTime(2019, 7, 22),
    'Tortoise': UTCDateTime(2019, 7, 22),
    'Hare2': UTCDateTime(2019, 8, 9),
    'CrazyCatLower': UTCDateTime(2019, 8, 9),
    'CrazyCatUpper': UTCDateTime(2019, 8, 9),
}
name_maps = {
    'Hare': 'hare',
    'Tortoise': 'tort',
    'Hare2': 'hare2',
    'CrazyCatLower': 'CraLo',
    'CrazyCatUpper': 'CraUp',
}
dir_data = '/projects/infrasound/data/infrasound/2023_ML_balloon/data/Siddharth_balloon/'
st_crazycat = load_balloon_data(dir_data, starttimes, name_maps)

CrazyCatLower
CrazyCatLower
CrazyCatUpper
CrazyCatUpper
Hare2
Hare2
Hare
Hare
Tortoise
Tortoise


In [333]:
folder = '/projects/infrasound/data/infrasound/2023_ML_balloon/data/Siddharth_balloon/'
st_crazycat['Baro'].write(f"{folder}st_all.mseed", format="MSEED")
st_crazycat['GPS'].write(f"{folder}st_all_gps.mseed", format="MSEED")

In [340]:
from tqdm import tqdm

folder = '/projects/infrasound/data/infrasound/2023_ML_balloon/data/Strateole2/'

files = []
files.append( f'{folder}ST2_C0_01_STR1_TSEN_P1s_v03b.nc' )
files.append( f'{folder}ST2_C1_04_STR2_TSEN_P1s_v01.nc' )
files.append( f'{folder}ST2_C1_03_TTL4_TSEN_P1s_v01.nc' )
files.append( f'{folder}ST2_C1_01_TTL5_TSEN_P1s_v01.nc' )
files.append( f'{folder}ST2_C1_16_TTL5_TSEN_P1s_v01.nc' )
files.append( f'{folder}ST2_C1_17_TTL3_TSEN_P1s_v01.nc' )

st_strateole = obspy.Stream()
st_strateole_gps = obspy.Stream()
for file in tqdm(files):
    dataset = nc.Dataset(file)
    file = file.replace('P1s_', '')
    dataset_location = nc.Dataset(file)

    starttime = UTCDateTime(dataset_location.date_start)
    time_pressure = dataset.variables['time'][:].filled()
    #time_pressure -= time_pressure[0]
    pressure = dataset.variables['pressure'][:].filled()
    
    dt = np.diff(time_pressure)[0]
    
    tr = obspy.Trace()
    tr.data = pressure
    tr.stats.station = 'b' + file.split('/')[-1].split('_')[2]
    tr.stats.delta = dt
    tr.stats.starttime = UTCDateTime(dataset_location.date_start)
    
    st_strateole += tr
    
    time_gps = dataset_location.variables['time'][:].filled()
    alt = dataset_location.variables['alt'][:].filled()
    
    dt = np.diff(time_gps)[0]
    
    tr = obspy.Trace()
    tr.data = alt
    tr.stats.station = 'b' + file.split('/')[-1].split('_')[2]
    tr.stats.delta = dt
    tr.stats.starttime = UTCDateTime(dataset_location.date_start)
    
    st_strateole_gps += tr

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
100%|██████████| 6/6 [00:02<00:00,  2.31it/s]


In [341]:
folder = '/projects/infrasound/data/infrasound/2023_ML_balloon/data/Strateole2/'
st_strateole.write(f"{folder}st_all.mseed", format="MSEED")
st_strateole_gps.write(f"{folder}st_all_gps.mseed", format="MSEED")

In [466]:
from tqdm import tqdm

folder = '/projects/infrasound/data/infrasound/2023_ML_balloon/data/MINIBOOSTER_balloon/signal_ascii/'

st_minibooster = obspy.Stream()
for subdir, dirs, files in os.walk(folder):
    for file in tqdm(files):
        filepath = subdir + file
        
        station = file.split('.')[1]
        file = open(filepath, 'r')
        lines = file.readlines()
        
        year, doy, hour, minute, second, dt = lines[0].split()
        
        tr = obspy.Trace()
        tr.data = np.array(lines[1:]).astype(float)
        tr.stats.station = station
        tr.stats.delta = dt
        tr.stats.starttime = UTCDateTime(f'{year}-{doy}T{hour}:{minute}:{second}')
        st_minibooster += tr

 98%|█████████▊| 54/55 [01:02<00:01,  1.16s/it]


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe4 in position 21: invalid continuation byte

In [468]:
folder = '/projects/infrasound/data/infrasound/2023_ML_balloon/data/MINIBOOSTER_balloon/signal_ascii/'
st_minibooster.write(f"{folder}st_all.mseed", format="MSEED")

In [359]:
from tqdm import tqdm
from obspy.core.utcdatetime import UTCDateTime

folder = '/projects/infrasound/data/infrasound/2023_ML_balloon/data/MINIBOOSTER_balloon/balloon_trajectories/'

st_minibooster_gps = obspy.Stream()
for subdir, dirs, files in os.walk(folder):
    for file in tqdm(files):
        filepath = subdir + file
        
        station = file.split('_')[0]
        #print(station)
        df = pd.read_csv(filepath, delimiter='\n', header=None, names=['text'])

        # Define a regular expression pattern to extract the required fields
        pattern = r'\s+gps-time=(\S+)\s+gps-leap=\S+\s+iers-leap=\S+\s+lat=([\+\-]?\d+\.\d+)\s+lon=([\+\-]?\d+\.\d+)\s+elev=([\d\.]+)'
        
        # Use the str.extract method with the pattern to extract the fields into new columns
        df[['gps-time', 'lat', 'lon', 'elev']] = df['text'].str.extract(pattern)
        df.loc[:,'gps-time'] = pd.to_datetime(df.loc[:,'gps-time'])
        df.loc[:,'lat'] = df.loc[:,'lat'].str[1:].astype(float)
        df.loc[:,'lon'] = df.loc[:,'lon'].str[1:].astype(float)
        df.loc[:,'elev'] = df.loc[:,'elev'].astype(float)
        df = df.drop(columns=['text'])
        
        dt = (df.iloc[1]['gps-time'] - df.iloc[0]['gps-time']).total_seconds()
        starttime = UTCDateTime(df['gps-time'].iloc[0])
        
        tr = obspy.Trace()
        tr.data = df.elev.values
        #print(tr.data)
        tr.stats.station = station
        tr.stats.delta = dt
        tr.stats.starttime = starttime
        
        st_minibooster_gps += tr

100%|██████████| 5/5 [00:03<00:00,  1.37it/s]


In [360]:
folder = '/projects/infrasound/data/infrasound/2023_ML_balloon/data/MINIBOOSTER_balloon/balloon_trajectories/'
st_minibooster_gps.write(f"{folder}st_all_gps.mseed", format="MSEED")

In [510]:
folder = '/projects/infrasound/data/infrasound/2023_ML_balloon/data/starliner_scrub/20210803_starliner_scrub/balloon_data/'

st_starliner = obspy.Stream()
datas = {}
for subdir, dirs, files in os.walk(folder):
    for file in tqdm(files):
        filepath = subdir + os.sep + file
        
        try:
            st_starliner += obspy.read(filepath)
        except:
            pass

100%|██████████| 775/775 [00:08<00:00, 92.89it/s] 


In [439]:
folder = '/projects/infrasound/data/infrasound/2023_ML_balloon/data/starliner_scrub/20210803_starliner_scrub/balloon_data/'
st_starliner.write(f"{folder}st_all.mseed", format="MSEED")

In [437]:
folder = '/projects/infrasound/data/infrasound/2023_ML_balloon/data/starliner_scrub/20210803_starliner_scrub/trajectory_data/'

st_starliner_gps = obspy.Stream()
datas = {}
for subdir, dirs, files in os.walk(folder):
    for file in tqdm(files):
        file_name = file
        filepath = subdir + os.sep + file#BNG01
        balloon = subdir.split('/')[-2].split('_')[1]
        balloon = 'BNG0{}'.format(balloon)
        
        channel = file.split('.')[0].split('_')
        print(channel)
        if len(channel) > 1:
            channel = channel[0][:4]+channel[1][:1]
        else:
            channel = channel[0][:5]
        
        print(filepath)
        if 'bounder' in file:
            #    pd.read_csv(filepath, skiprows=34, header=[0])
            #else:
            data = pd.read_csv(filepath, skiprows=67, header=None, names=['AA', 'SEF', 'HTR', 'BW0', 'BW1', 'MET','MAP','PAR','GAR','GF1','GF2','EXT','Pres (hPa)','dP (hPa/line)','Temp (C)','Batt (V)','Cap (V)','GPS Flag (HEX)','GPS SV','Lon','Lat','Alt (m)','Date','Time','gSpeed (m/s)','aRate (m/s)','aFilt (m/s)','heading (deg)', 'N/A'])
            data = data.loc[:,['Lon','Lat','Alt (m)','Date','Time']]
            year = data['Date'].astype(str).str[:4]
            month = data['Date'].astype(str).str[4:6]
            day = data['Date'].astype(str).str[6:]
            data.loc[:,'Date'] = pd.to_datetime(year+'-'+month+'-'+day+'T'+data['Time'])
            
        elif 'gps' in file:
            df = pd.read_csv(filepath, delimiter='\n', header=None, names=['text'])

            # Define a regular expression pattern to extract the required fields
            pattern = r'utc-time=(\S+) lat=([\+\-]?\d+\.\d+) lon=([\+\-]?\d+\.\d+) elev=([\d\.]+)'
            pattern = r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}|[+-]?\d+\.\d+'
            pattern = r'utc-time=(\S+)\s+gps-time=\S+\s+gps-leap=\S+\s+iers-leap=\S+\s+lat=([\+\-]?\d+\.\d+)\s+lon=([\+\-]?\d+\.\d+)\s+elev=([\d\.]+)'
            #print(df['text'].str.extract(pattern))
            # Use the str.extract method with the pattern to extract the fields into new columns
            df[['gps-time', 'lat', 'lon', 'elev']] = df['text'].str.extract(pattern)

            # Convert the columns to appropriate data types
            df['lat'] = df['lat'].astype(float)
            df['lon'] = df['lon'].astype(float)
            df['elev'] = df['elev'].astype(float)
            df['gps-time'] = pd.to_datetime(df['gps-time'])

            # Drop the original text column
            data = df.drop(columns=['text'])
            data.columns = ['Date', 'Lat', 'Lon', 'Alt (m)',]
        else:
            file = open(filepath, 'r')
            lines = file.readlines()
            i = 0
            not_found = True
            while not_found:
                if 'Time' in lines[i]:
                    not_found = False
                i+=1
                
            locs = []
            for j in range (0, len(lines)):
                if 'Longitude' in lines[j]:
                    locs.append(j)
            locs += [len(lines)+10]
            #data = pd.read_csv(filepath, skiprows=i-1, header=[0])
            #data.dropna(subset=['Longitude '], inplace=True)
            #print(data)
            
            #cols = ['Date    ','Time    ','Latitude  ','Longitude ','Head','Km/h','Alt-m  ','Lock','N/A1','Temp C','Pa    ','N/A0','Temp C','Pa    ','N/A']
            for isep in range(1,len(locs)):
                
                data = pd.DataFrame(lines[locs[isep-1]:locs[isep]-10])
                data = data[0].str.split(',', expand=True)
                cols = data.iloc[0].values
                data.columns = cols
                data = data.iloc[1:]
                #print(data)
                try:
                    #data['Date    '] = data['Date    '].str.replace('/', '-')
                    data['Date    '] = '2021-08-'+data['Date    '].str.split('/').str[1]
                    data['Date    '] = pd.to_datetime(data['Date    ']+'T'+data['Time    '])
                    data = data.loc[:,['Date    ', 'Latitude  ', 'Longitude ', 'Alt-m  ']]
                    data.columns = ['Date', 'Lat', 'Lon', 'Alt (m)',]
                    data.loc[:,'Alt (m)'] = data.loc[:,'Alt (m)'].str[1:].astype(float)
                except:
                    #data['UTC Date'] = data['UTC Date'].str.replace('/', '-')
                    data['UTC Date'] = '2021-08-'+data['UTC Date'].str.split('/').str[1]
                    data['UTC Date'] = pd.to_datetime(data['UTC Date']+'T'+data['UTC Time'])
                    data = data.loc[:,['UTC Date', 'Latitude  ', 'Longitude ', 'Alt-m  ']]
                    data.columns = ['Date', 'Lat', 'Lon', 'Alt (m)',]
                    #print(data.loc[:,'Lat'].str[2:])
                    data.loc[:,'Lat'] = data.loc[:,'Lat'].str[2:].astype(float)
                    data.loc[:,'Alt (m)'] = data.loc[:,'Alt (m)'].str[1:].astype(float)

                #data['Alt (m)'] = data['Alt (m)'].str[1:].astype(float)
            
        dt = (data.iloc[1]['Date'] - data.iloc[0]['Date']).total_seconds()
        starttime = UTCDateTime(data['Date'].iloc[0])
            
        tr = obspy.Trace()
        tr.data = data['Alt (m)'].values
        tr.stats.station = balloon
        tr.stats.channel = channel
        tr.stats.delta = dt
        tr.stats.starttime = starttime
            
        st_starliner_gps += tr

0it [00:00, ?it/s]
0it [00:00, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

['eagle', 'a']
/projects/infrasound/data/infrasound/2023_ML_balloon/data/starliner_scrub/20210803_starliner_scrub/trajectory_data/balloon_1/eagle/eagle_a.txt


100%|██████████| 2/2 [00:00<00:00,  4.06it/s]


['eagle', 'b']
/projects/infrasound/data/infrasound/2023_ML_balloon/data/starliner_scrub/20210803_starliner_scrub/trajectory_data/balloon_1/eagle/eagle_b.txt


0it [00:00, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

['eagle', '1']
/projects/infrasound/data/infrasound/2023_ML_balloon/data/starliner_scrub/20210803_starliner_scrub/trajectory_data/balloon_2/eagle/eagle_1.txt


 50%|█████     | 1/2 [00:00<00:00,  8.64it/s]

['eagle', '2']
/projects/infrasound/data/infrasound/2023_ML_balloon/data/starliner_scrub/20210803_starliner_scrub/trajectory_data/balloon_2/eagle/eagle_2.txt


100%|██████████| 2/2 [00:00<00:00,  3.73it/s]
0it [00:00, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

['bounder']
/projects/infrasound/data/infrasound/2023_ML_balloon/data/starliner_scrub/20210803_starliner_scrub/trajectory_data/balloon_3/bounder_1/bounder.csv


100%|██████████| 1/1 [00:00<00:00,  2.08it/s]
0it [00:00, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

['gps']
/projects/infrasound/data/infrasound/2023_ML_balloon/data/starliner_scrub/20210803_starliner_scrub/trajectory_data/balloon_4/cube/gps.txt


100%|██████████| 1/1 [00:02<00:00,  2.07s/it]


In [440]:
folder = '/projects/infrasound/data/infrasound/2023_ML_balloon/data/starliner_scrub/20210803_starliner_scrub/trajectory_data/'
st_starliner_gps.write(f"{folder}st_all_gps.mseed", format="MSEED")

In [488]:
import scipy

tr = st_minibooster_gps[0].copy()
starttime = tr.stats.starttime
tr = st_minibooster[20].copy()
offset = tr.stats.starttime-starttime
tr.data[np.argwhere(np.isnan(tr.data))]=0.
tr.detrend()

order = 2
lowcut = 0.05
sos = scipy.signal.butter(order, [lowcut], fs=1./tr.stats.delta, btype='high', output='sos')
data_filt = scipy.signal.sosfilt(sos, tr.data)

plt.figure()
#plt.plot(tr.times()+offset, tr.data)
plt.plot(tr.times()+offset, data_filt)

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7f0d430796d0>]

### Building validation/training datasets

In [None]:
def trim_st(st_all, duration, return_times, overlap=1., freq_min=0.01, freq_max=20.,):
    
    new_st = obspy.Stream()
    for itr, tr_in in tqdm(enumerate(st_all), total=len(st_all)):
        
        starttime, endtime = return_times[itr]
        
        tr = tr_in.copy()
        tr.detrend()
        tr.filter('bandpass', freqmin=freq_min, freqmax=freq_max, zerophase=True, corners=6)
        tr.resample(freq_max*2)
        tr.trim(starttime=starttime, endtime=endtime)
        balloon = tr.stats.station
        starttimes = np.arange(0., tr.times()[-1], duration*overlap)
            
        for id, starttime in enumerate(starttimes):
            tr_loc_Baro = tr.copy()
            tr_loc_Baro.trim(starttime=tr.stats.starttime+starttime, endtime=tr.stats.starttime+starttime+duration)
            tr_loc_Baro.stats.station = tr.stats.station + '-' + str(id)
            new_st += tr_loc_Baro
            
    return new_st
    
from math import log, ceil, floor
def closest_power(x, power=8):
    possible_results = floor(log(x, power)), ceil(log(x, power))
    return min(possible_results, key= lambda z: abs(x-power**z))
   
def find_closest_duration(target_duration, target_sampling):
    nsize = int(target_sampling*target_duration)
    power = closest_power(nsize, power=2)
    nsize = 2**power
    duration = nsize/target_sampling
    return duration
    
def get_times(st, flight_id):
    
    l_starttimes, l_endtimes = [tr.stats.starttime for tr in st], [tr.stats.endtime for tr in st.copy()]
    starttime, endtime = min(l_starttimes), max(l_endtimes)
    default_times = (starttime, endtime)
    
    available_times = dict(
        ridgecrest=dict(
            CraLo = (UTCDateTime('2019-08-09T18:52:00'), UTCDateTime('2019-08-10T02:43:31')),
            CraUp = (UTCDateTime('2019-08-09T18:52:00'), UTCDateTime('2019-08-10T02:43:31')),
            tort = (UTCDateTime('2019-07-22T14:09:00'), endtime)
        )
    )
    
    return_times = []
    for tr in st:
        return_times.append( default_times )
    if flight_id in available_times:
        for itr, tr in enumerate(st):
            if tr.stats.station in available_times[flight_id]:
                return_times[itr] = available_times[flight_id][tr.stats.station]
    
    return return_times
    
target_duration = 150.
overlap = 0.25
freq_min=0.05
freq_max=5.
target_sampling = freq_max*2.
st = st_crazycat['Baro'].copy()
flight_id = 'ridgecrest'
"""
start_constant_alt = UTCDateTime('2019-08-09T18:52:00') # Crazycat
end_constant_alt = UTCDateTime('2019-08-10T02:43:31') # Crazycat
start_constant_alt = UTCDateTime('2019-07-22T14:09:00') # Tortoise
end_constant_alt = st_all['Baro'][0].stats.endtime # Tortoise
"""
return_times = get_times(st, flight_id)
duration = find_closest_duration(target_duration, target_sampling)
new_st = trim_st(st, duration, return_times, overlap=overlap, freq_min=freq_min, freq_max=freq_max)

  0%|          | 0/5 [00:00<?, ?it/s]

Length to process: 1106


 20%|██        | 1/5 [00:06<00:25,  6.33s/it]

In [535]:
return_times

[(2019-08-09T18:52:00.000000Z, 2019-08-10T02:43:31.000000Z),
 (2019-08-09T18:52:00.000000Z, 2019-08-10T02:43:31.000000Z),
 (2019-07-22T13:33:21.028400Z, 2019-08-10T23:59:56.300025Z),
 (2019-07-22T13:33:21.028400Z, 2019-08-10T23:59:56.300025Z),
 (2019-07-22T14:09:00.000000Z, 2019-08-10T23:59:56.300025Z)]

In [521]:
import h5py

def store_all_as_hdf5(datasets, st_all, target_size, flight_id, event_type='earthquake', st_label=None):

    """
    ## Initialization dataset
    size_crop = st_all['Baro'][0].data.size-int(st_all['Baro'][0].data.size*crop_percent)
    there_are_GPS_data = True if len(st_all['GPS']) > 0 else False
    """
    
    # Create a subgroup for each stream with the corresponding event ID as the subgroup name
    for dataset in tqdm(datasets):
    
        results = {'X': [], 'label': [], 'event_type': [], 'window': [], 'id': [], 'station': [],}
        print('---', dataset)
        
        """
        idmin, idmax = datasets[dataset]
        idmin, idmax = int(idmin), int(idmax)
        st_Baro_loc = st_all['Baro'][idmin:idmax]
        if there_are_GPS_data:
            st_GPS_loc = st_all['GPS'][idmin:idmax]
        else:
            st_GPS_loc = st_Baro_loc
        for tr_Baro_loc_in, tr_GPS_loc in zip(st_Baro_loc, st_GPS_loc):
        """
        for itr, tr_in in enumerate(st_all):
            
            tr_label = None
            if st_label is not None:
                tr_label = st_label[itr]
            idmin, idmax = datasets[dataset][itr]
            
            if idmin == -1:
                continue
            
            tr = tr_in.copy()
            tr.data = tr.data[idmin:idmax]
            window = (str(tr.stats.starttime), str(tr.stats.endtime))
            
            """
            tr_Baro_loc_cropped = tr_Baro_loc.copy()
            if size_crop > 0:
                tr_Baro_loc_cropped.data[size_crop//2:-size_crop//2] = 0.
            """
            if abs(tr.data).max() == 0.:
                print('Problem amplitude')
                #print(tr_Baro_loc.stats.station)
                #print(X0.shape, X1.shape)
                continue
            
            X = np.expand_dims(tr.data, axis=-1)
            """
            X0 = np.expand_dims(tr_Baro_loc_cropped.data, axis=-1)
            if there_are_GPS_data:
                X1 = np.expand_dims(tr_GPS_loc.data, axis=-1)
                if not X0.shape[0] == X1.shape[0]:
                    #print(tr_Baro_loc.stats.station)
                    #print(X0.shape, X1.shape)
                    continue
                X = np.concatenate((X0, X1), axis=-1)
            else:
                X = X0
            """
            X = X[:target_size,:]
            
            if tr_label is None:
                label = np.zeros_like(X)
            else:
                label = np.expand_dims(tr_label.data, axis=-1)
            label = label[:target_size,:]
            
            if X.shape[0] < target_size or label.shape[0] < target_size:
                print('problem size')
                print(tr.stats.station)
                print(X.shape)
                continue
            
            if np.isnan(X).any():
                print('problem nan')
                print(tr.stats.station)
                continue
            
            results['X'].append( X )
            results['label'].append( label )
            results['event_type'].append( event_type )
            results['window'].append( window )
            results['id'].append( flight_id )
            results['station'].append( tr.stats.station )
        
        # Open the HDF5 file in "write" mode
        continue
        with h5py.File(filename.format(dataset=dataset), "w") as f:
            f.create_dataset('X', data=results['X'], dtype='float32')
            f.create_dataset('label', data=results['label'], dtype='float32')
            f.create_dataset('event_type', data=str(results['event_type']))
            f.create_dataset('window', data=results['window'])
            f.create_dataset('id', data=results['id'])
            
    return results

def prepare_datasets_dates(new_st, list_quantiles, l_dates, target_size, min_size=10):
    
    datasets = dict()
    nb_elem = dict()
    for dataset in tqdm(l_dates):
        tmin, tmax = l_dates[dataset]
        nb_elem[dataset] = 0
        datasets[dataset] = []
        for tr in new_st:
            times = tr.times()
            tr_c = tr.copy()
            tr_c.trim(starttime=tmin, endtime=tmax)
            nb_elem_loc = tr_c.data.size
            if nb_elem_loc < min_size:
                datasets[dataset].append( (-1, -1) )
                continue
            
            """
            print(tr.stats.station)
            print(tmin, tr.stats.starttime)
            print(tmax, tr.stats.starttime)
            print(tmin-tr.stats.starttime, tmax-tr.stats.starttime)
            """
            id_min, id_max = np.argmin(abs(times-(tmin-tr.stats.starttime))), np.argmin(abs(times-(tmax-tr.stats.starttime)))
            datasets[dataset].append( (id_min, id_max) )
            nb_elem[dataset] += nb_elem_loc
    
    for dataset in l_dates:
        tmin, tmax = l_dates[dataset]
        print('dataset "{}" ({} inputs): {} - {}'.format(dataset, nb_elem[dataset]/target_size, tmin, tmax))
    
    return datasets

"""
def prepare_datasets_ids(list_quantiles, new_st):
    l_ids = np.arange(len(new_st['Baro']))
    l_bound_ids = np.quantile(l_ids, q=list_quantiles)
    datasets = {'training': (0, l_bound_ids[0]), 'validation': (l_bound_ids[0], l_bound_ids[1]), 'testing': (l_bound_ids[1], l_bound_ids[2])}
    for dataset in datasets:
        idmin, idmax = datasets[dataset]
        nb_elem = idmax-idmin
        print('dataset "{}" ({} inputs): {} - {}'.format(dataset, nb_elem, idmin, idmax))
    return datasets
"""

def prepare_l_dates(st, list_quantiles, starttime, endtime):
    
    l_starttimes, l_endtimes = [tr.stats.starttime for tr in st], [tr.stats.endtime for tr in st.copy()]
    starttime, endtime = min(l_starttimes), max(l_endtimes)
    
    l_dates = dict()
    l_names = ['training', 'validation', 'testing']
    times_seconds = endtime-starttime
    prev_seconds = 0.
    for i_dataset in range(len(list_quantiles)):
        dataset = l_names[i_dataset]
        quantile = list_quantiles[i_dataset]
        limit_seconds = quantile*times_seconds
        l_dates[dataset] = (starttime+prev_seconds, starttime+prev_seconds+limit_seconds)
        prev_seconds += limit_seconds
        
    return l_dates

st = st_starliner.copy()
st_label = None
flight_id = 'starliner'
filename = '/projects/infrasound/data/infrasound/2023_ML_balloon/data/{dataset}_waveform_dataset_Tortoise.h5'
list_quantiles = [0.3, ]
duration, target_sampling = 100., 10.
target_size=int(duration*target_sampling)

l_dates = prepare_l_dates(st, list_quantiles, starttime, endtime)
datasets = prepare_datasets_dates(st, list_quantiles, l_dates, target_size)
results = store_all_as_hdf5(datasets, st, target_size, flight_id, event_type='earthquake', st_label=st_label)

100%|██████████| 1/1 [00:04<00:00,  4.71s/it]


dataset "training" (248230.326 inputs): 2021-08-03T11:16:14.000000Z - 2021-08-05T04:59:12.605000Z


  0%|          | 0/1 [00:00<?, ?it/s]

--- training
problem nan
BNG01
problem nan
BNG03
problem nan
BNG01


100%|██████████| 1/1 [00:01<00:00,  1.09s/it]


In [527]:
results['X'][0]

array([[14.127342 ],
       [14.1133375],
       [14.12034  ],
       [14.106336 ],
       [14.099333 ],
       [14.095832 ],
       [14.085328 ],
       [14.081827 ],
       [14.081827 ],
       [14.078325 ],
       [14.078325 ],
       [14.074824 ],
       [14.06082  ],
       [14.064321 ],
       [14.06082  ],
       [14.053817 ],
       [14.057319 ],
       [14.053817 ],
       [14.039812 ],
       [14.039812 ],
       [14.043314 ],
       [14.03281  ],
       [14.0188055],
       [13.997798 ],
       [13.980292 ],
       [13.97679  ],
       [13.997798 ],
       [14.015304 ],
       [14.0188055],
       [14.011803 ],
       [14.004801 ],
       [14.001299 ],
       [13.994297 ],
       [13.990796 ],
       [13.983793 ],
       [13.97679  ],
       [13.969789 ],
       [13.969789 ],
       [13.966288 ],
       [13.959285 ],
       [13.959285 ],
       [13.962786 ],
       [13.959285 ],
       [13.952283 ],
       [13.955784 ],
       [13.962786 ],
       [13.966288 ],
       [13.96

## Test tsfresh

In [12]:
from tqdm import tqdm
def split_time_series(df, duration, overlap, sample_rate):
    # Convert 'time' to datetime if not already converted and ensure consistent index
    #df['time'] = pd.to_datetime(df['time'])
    #df.set_index('time', inplace=True)
    df = df.sort_index()

    # Determine the sampling interval from the first two timestamps
    #sample_rate = (df.index[1] - df.index[0]).total_seconds()
    samples_per_chunk = int(duration / sample_rate)
    
    overlap_samples = int(samples_per_chunk * overlap )
    step_size = samples_per_chunk - overlap_samples

    # Create an array of start indices for each sub-series
    start_indices = np.arange(0, len(df) - samples_per_chunk + 1, step_size)
    
    # Generate sub-series by slicing df based on start_indices
    def slice_chunk(start_idx, chunk_id):
        chunk = df.iloc[start_idx:start_idx + samples_per_chunk].copy()
        chunk['id'] = chunk_id
        return chunk
    
    # Use list comprehension to create sub-series efficiently
    sub_series = []
    for i, idx in tqdm(enumerate(start_indices), total=len(start_indices)):
        sub_series.append( slice_chunk(idx, i + 1) )

    # Combine all sub-series into a single DataFrame
    return pd.concat(sub_series)

## Test tsfresh
start_constant_alt = UTCDateTime('2019-07-22T14:09:00') # Tortoise
end_constant_alt = st_all['Baro'][0].stats.endtime # Tortoise
tr = st_all['Baro'][0].copy()
tr.resample(10.)
tr.filter('highpass', freq=0.25)
tr.trim(starttime=start_constant_alt, endtime=end_constant_alt)

pd_Baro = pd.DataFrame(np.c_[tr.times(), tr.data], columns=['time', 'pressure'])
duration = 50.
overlap = 0.25
sample_rate = tr.stats.delta
pd_Baro_total = split_time_series(pd_Baro, duration, overlap, sample_rate)

100%|██████████| 399/399 [00:00<00:00, 1016.20it/s]


## Celine data

In [None]:
start_constant_alt = UTCDateTime('2019-08-09T18:52:00')
end_constant_alt = UTCDateTime('2019-08-10T02:43:31')
tr_upper = st_all['GPS'].copy()
for tr in tr_upper:
    tr.data = tr.data.astype(float)
tr_upper[0].stats.station = 'low'
tr_upper[1].stats.station = 'up'

plt.figure()
for tr in tr_upper:
    plt.plot(tr.times(), tr.data, label=tr.stats.station)
plt.legend(frameon=False)
plt.axvline(start_constant_alt-tr.stats.starttime, linestyle=':', color='black')
plt.axvline(end_constant_alt-tr.stats.starttime, linestyle=':', color='black')
plt.xlabel(f'Time since {tr.stats.starttime}')
plt.ylabel('Altitude (m)')

tr_upper.trim(starttime=start_constant_alt, endtime=end_constant_alt)


tr_upper.write("../2023_Celine_internship/msc_celine_specfem/utils/utils_NORSAR/test_data_Venus/2019_noise_balloon_GPS.mseed", format="MSEED")