### Preparing AI Ready Data
This notebook extracts statistics from 100 days of seismic data for use in classical machine learning. The environment requires ObsPy to be installed in addition to the standard mlgeo environment.

In [22]:
#!pip install obspy
import obspy
from obspy import read
from obspy import UTCDateTime as utc
from scipy.signal import stft
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd

In [23]:
#Make folder for AI ready data
# The three folders live under the current working directory; the trailing
# slash is kept because later cells build file names by string concatenation.
filepath = os.getcwd() + '/data/ai_ready/'
clean_filepath = os.getcwd() + '/data/clean/'
raw_filepath = os.getcwd() + '/data/raw/'

# os.makedirs with exist_ok=True replaces the `! mkdir` shell call: re-running
# the cell no longer reports "File exists", and no POSIX shell is required.
os.makedirs(filepath, exist_ok=True)

In [24]:
#set eruption time for given sensor/volcano
# Later cells compare window times against this instant to assign the
# 'before' / 'after' state label.
eruption_time = utc('2005-01-16T00:00')

In [25]:
#read in dates file from Download_Data notebook and convert to UTC objects
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it on a date_list.npy file you produced yourself.
raw_dates = np.load(raw_filepath + 'date_list.npy', allow_pickle=True)
dates = list(map(utc, raw_dates))
print(dates[0])

2004-11-27T00:00:00.000000Z


### Define Functions for Calculating Stats

In [26]:
# General Stats Used
def stats(data):
    """Summarise an array with seven descriptive statistics.

    Every statistic is computed by NumPy without an axis argument, i.e. over
    all elements of `data`.

    Returns
    -------
    tuple
        (mean, max, min, median, range, std, var), where range is the
        absolute difference between max and min.
    """
    lowest = np.min(data)
    highest = np.max(data)
    spread = np.abs(highest - lowest)

    return (
        np.mean(data),
        highest,
        lowest,
        np.median(data),
        spread,
        np.std(data),
        np.var(data),
    )

# Spectral Domain, use STFT using the same parameters as Zahra et al. 2024
def compute_spec(data, fs=8, nwindow=256, noverlap=32):
    """Compute the STFT magnitude of a time series and its base-10 logarithm.

    The defaults reproduce the values previously hard-coded here (attributed
    to Zahra et al. 2024), so existing calls `compute_spec(data)` are unchanged.

    Parameters
    ----------
    data : array_like
        Time series passed to scipy.signal.stft.
    fs : float, optional
        Sampling frequency handed to stft.
        NOTE(review): with stft's default scaling this appears to influence
        only the frequency/time axes, which are discarded here - confirm.
    nwindow : int, optional
        Segment length, passed to stft as `nperseg`.
    noverlap : int, optional
        Number of samples shared by consecutive segments.

    Returns
    -------
    spectra : ndarray
        Magnitude of the complex STFT.
    log_spectra : ndarray
        log10 of `spectra`. Bins whose magnitude is exactly zero become -inf.
    """
    # Only the complex STFT matrix is needed; the frequency and time axes
    # returned alongside it are not used.
    _, _, complex_spectra = stft(data, fs=fs, nperseg=nwindow, noverlap=noverlap)

    #find magnitude of complex spectra
    spectra = np.abs(complex_spectra)

    #find log 10 of spectra
    log_spectra = np.log10(spectra)

    return (spectra, log_spectra)

### Cell Below Will Save Seismograms in Day Chunks and also Combine Them into 1 Large File

In [27]:
#read in cleaned mseed files, save as daily npy files and one giant 100 day file
# Each day's samples are collected in a list and joined once at the end.
# Calling np.append inside the loop would re-copy the whole accumulated array
# on every iteration.
daily_data = []

for day in range(1, 101):  # cleaned files are numbered 1..100
    #read mseed
    st = read(clean_filepath+str(day)+'_cleaned.mseed')

    #save data to day long npys (only the first trace of the stream is used)
    data = st[0].data
    np.save(filepath+str(day)+'_ready.npy', data)

    #keep for the combined array
    daily_data.append(data)

#save hundred day data npy
# Cast to float64 so the combined file keeps the dtype produced by the earlier
# approach of appending onto an empty float array.
hundred_data = np.concatenate(daily_data).astype(np.float64, copy=False)
np.save(filepath+'100_days.npy', hundred_data)

### Cells Below Will Compute and Save Time Series and Spectral Statistics For Each 12-Hour Half of Every Day of Seismogram Data
Each row will also be given a Before or After Eruption label in the stats file.

In [28]:
#initialize statistics dataframe
# Column layout: the same seven statistics for three inputs (time series,
# STFT magnitude, log10 STFT magnitude), followed by the eruption-state label.
stat_names = ['mean','max','min','median','range','std','var']
statistics = (stat_names
              + ['spec_' + name for name in stat_names]
              + ['speclog_' + name for name in stat_names]
              + ['state'])

stats_df = pd.DataFrame(columns=statistics)

In [29]:
# Compute time-series and spectral statistics for each 12-hour half of every
# day and label each half as 'before' or 'after' the eruption.
HALF_DAY_SECONDS = 12*60*60

# Rows are collected in a list and the dataframe is built once at the end, so
# re-running this cell replaces stats_df instead of appending duplicate rows.
rows = []

for day_index in range(100):
    day_start = dates[day_index]

    #read mseed (cleaned files are numbered from 1)
    st = read(clean_filepath+str(day_index + 1)+'_cleaned.mseed')
    day_data = st[0].data

    #THIS ASSUMES EACH FILE CONTAINS 24 HOURS OF DATA
    split_data = np.array_split(day_data, 2)

    for half, half_data in enumerate(split_data):
        # Label each half by the midpoint of the window it covers (6 h and
        # 18 h after day_start). The previous code advanced the clock by 12 h
        # before comparing, so a half was classified by a time after its own
        # end, and the last half before the eruption was labelled 'after'.
        midpoint = day_start + (half + 0.5) * HALF_DAY_SECONDS
        state = 'after' if midpoint >= eruption_time else 'before'

        spectra, log_spectra = compute_spec(half_data)

        # Seven statistics each for the time series, the STFT magnitude and
        # its log10, in the column order defined by `statistics`.
        rows.append((*stats(half_data), *stats(spectra), *stats(log_spectra), state))

stats_df = pd.DataFrame(rows, columns=statistics)
        
        


In [30]:
#inspect the first three rows of the statistics dataframe
stats_df.head(3)

Unnamed: 0,mean,max,min,median,range,std,var,spec_mean,spec_max,spec_min,...,spec_std,spec_var,speclog_mean,speclog_max,speclog_min,speclog_median,speclog_range,speclog_std,speclog_var,state
0,2.405779,359.358296,-436.681086,2.457358,796.039383,27.819862,773.944698,1.349449,42.351115,0.000565,...,1.72172,2.964321,-0.075843,1.626865,-3.247877,-0.069392,4.874742,0.428355,0.183488,before
1,-0.950467,2792.007085,-3626.343103,-0.890512,6418.350188,43.829779,1921.049533,1.565712,421.398378,0.000368,...,3.249615,10.56,-0.019853,2.624693,-3.434664,-0.009073,6.059357,0.433242,0.187699,before
2,1.954878,446.572562,-513.103438,2.172117,959.676,32.268532,1041.258151,1.622005,56.014858,0.00053,...,1.895827,3.594158,0.008601,1.748303,-3.275908,0.022684,5.024212,0.434314,0.188629,before


In [31]:
#save stats to csv file in the AI-ready folder, without the row index column
csv_path = filepath + 'statistics.csv'
stats_df.to_csv(csv_path, index=False)