In [5]:
%matplotlib inline
import obspy
from obspy import read
import glob
import os, sys
import librosa
import numpy as np
from scipy import stats
import scipy as sp
import scipy.signal as signal
import pandas as pd
import matplotlib.pyplot as plt
import librosa.display
from sklearn.preprocessing import StandardScaler

random_state = 6
np.random.seed(random_state)

In [2]:
def extract_features(seismogram, signal_label):
    data = stats.zscore(seismogram.data)
    sample_rate = seismogram.stats.sampling_rate
    stft = np.abs(librosa.stft(data))
    mfccs = librosa.feature.mfcc(y = data, sr=sample_rate, n_mfcc=40)
    mfccs_mean = np.mean(mfccs.T, axis = 0)
    mfccs_delta = np.mean(librosa.feature.delta(mfccs).T, axis=0)
    
    # 12 chroma
    chroma = librosa.feature.chroma_stft(S = stft, sr = sample_rate)
    chroma_mean = np.mean(chroma.T, axis = 0)
    
    ## mel from melspectrogram
    # this feature does not add anything to improve model. Therefore I delete this features from the 
    # feature list
    
    # mel = np.mean(librosa.feature.melspectrogram(data, sr = sample_rate).T, axis= 0)
    
    ## spectral centroid
    S, phase = librosa.magphase(stft)
    centiroid =np.mean(librosa.feature.spectral_centroid(S=S)) # 1 feature
    max_amplitude = np.amax(S)
    mean_amplitude = np.mean(S)
    dbamplitude= np.mean(librosa.amplitude_to_db(S, ref=np.max).T, axis = 0) # 1025 features
    
    ## statistical parameters
    moment = sp.stats.moment(data)
    variation = sp.stats.variation(data)
    skew = sp.stats.skew(data)
    var = np.var(data)
    autocr = np.correlate(data, data)
    kurto = sp.stats.kurtosis(data)
    
    features = np.hstack([mfccs_mean, chroma_mean, centiroid, max_amplitude, mean_amplitude, moment, variation, skew, var, autocr, kurto, signal_label])
    
    return features

def parse_and_stack_seismograms(seismograms, label):
    
    features = np.empty((0,62))
    
    if label == 'earthquake':
        target = 0
    elif label == 'explosion':
        target = 1

    for i, seismogram in enumerate(seismograms):
        single_feature= extract_features(seismogram, target)
        features = np.vstack([features, single_feature])
        
    dataFrame = pd.DataFrame(features)
    return dataFrame

In [3]:
parent_dir_explosions = 'seismogram_v2/explosions/'
sub_dirs_explosions = ['1998-05-11-mb52-india','1998-05-28-mb48-pakistan', '1998-05-30-mb46-pakistan', '2013-02-12-mb51-north-korea', '2016-01-06-mb51-north-korea', '2017-09-03-mb63-north-korea']
parent_dir_earthquake = 'seismogram_v2/earthquakes/'
sub_dirs_earthquake = ['2004-12-26-mw90-sumatra', '2010-03-12-mw55-myanmar-india-border-region', '2017-08-15-mb49-southeast-of-ryukyu-islands', '2017-09-08-mww81-near-coast-of-chiapas-mexico', '2017-10-24-mww67-banda-sea']

## Explosions
print('--- Earthquake-----: ')
total = 0.0
for sub_dir in sub_dirs_earthquake:
    counter = len(glob.glob(os.path.join(parent_dir_earthquake, sub_dir, '*.SAC')))
    total += counter
    print('{}: {}'.format(sub_dir, counter))
print('Total seismograms: {}'. format(total))


## Explosions
print('--- Explosions-----: ')
total = 0.0
for sub_dir in sub_dirs_explosions:
    counter = len(glob.glob(os.path.join(parent_dir_explosions, sub_dir, '*.SAC')))
    total += counter
    print('{}: {}'.format(sub_dir, counter))
print('Total seismograms: {}'. format(total))

--- Earthquake-----: 
2004-12-26-mw90-sumatra: 1193
2010-03-12-mw55-myanmar-india-border-region: 3021
2017-08-15-mb49-southeast-of-ryukyu-islands: 2644
2017-09-08-mww81-near-coast-of-chiapas-mexico: 2722
2017-10-24-mww67-banda-sea: 3382
Total seismograms: 12962.0
--- Explosions-----: 
1998-05-11-mb52-india: 459
1998-05-28-mb48-pakistan: 470
1998-05-30-mb46-pakistan: 398
2013-02-12-mb51-north-korea: 3763
2016-01-06-mb51-north-korea: 2640
2017-09-03-mb63-north-korea: 2804
Total seismograms: 10534.0


In [4]:
earthquakes = read('./seismogram_v2/earthquakes/**/*.SAC')
explosions = read('./seismogram_v2/explosions/**/*.SAC')

df_explosions  = parse_and_stack_seismograms(explosions, 'explosion')
df_earthquakes  = parse_and_stack_seismograms(earthquakes, 'earthquake')

  return a.std(axis) / a.mean(axis)
  return (a - mns) / sstd


ParameterError: Audio buffer is not finite everywhere

In [7]:
def get_column_names():
    features = {
        'mfccs': 40,
        'chroma': 12,
        'centiroid': 1,
        'max_amplitude':1, 
        'mean_amplitude':1,
        'moment': 1,
        'variation': 1, 
        'skew': 1, 
        'var': 1, 
        'autocr': 1, 
        'kurto': 1, 
        'target': 1
    }
    
    names = list(features.keys())
    val = list(features.values())
    
    columns = []
    
    for i in range(len(features)):
        if val[i] > 1:
            for j in range(val[i]):
                columns.append(str(names[i])+'_'+ str(j))
                
        else:
            columns.append(str(names[i]))
    
    return columns

In [8]:
frames = [df_explosions, df_earthquakes]
df = pd.concat(frames)
df.columns = get_column_names()
df.describe()

Unnamed: 0,mfccs_0,mfccs_1,mfccs_2,mfccs_3,mfccs_4,mfccs_5,mfccs_6,mfccs_7,mfccs_8,mfccs_9,...,centiroid,max_amplitude,mean_amplitude,moment,variation,skew,var,autocr,kurto,target
count,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,...,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0
mean,841.535354,72.717304,-1.120552,52.234151,2.483228,34.444831,7.67164,22.622343,12.085533,15.528755,...,971.44996,239865400.0,296083.1,0.0,20.183022,-0.027121,555637100000.0,9.640448e+16,6.574073,0.450801
std,191.235047,38.199436,39.099088,19.40694,24.244137,12.98745,15.622192,9.220608,10.365557,7.056197,...,1035.548479,1618594000.0,4228556.0,0.0,4024.576118,1.39349,26259370000000.0,3.678673e+18,197.093498,0.497585
min,480.726115,-87.232311,-198.422667,-32.555981,-88.160009,-41.837174,-48.987699,-26.721758,-35.454891,-23.973469,...,3.588867,4490.561,41.40245,0.0,-427348.3125,-148.525772,0.0,1098498.0,-3.0,0.0
25%,699.059195,44.646795,-28.657048,39.179819,-16.10059,25.82346,-3.387524,16.973561,5.998598,11.365791,...,198.914251,585058.0,1836.427,0.0,-0.326106,-0.084359,36995.06,10651730000.0,-0.067151,0.0
50%,823.266826,73.334659,2.453667,53.082876,3.731802,34.055244,9.972462,22.04225,14.128136,15.497889,...,550.956333,1193249.0,3733.953,0.0,0.20473,-0.002923,148552.3,44483880000.0,0.197497,0.0
75%,959.873963,99.011031,28.34615,64.883821,22.434741,42.256966,20.350147,27.610037,19.563121,19.315831,...,1465.813517,7911418.0,20248.87,0.0,0.90134,0.064298,3477907.0,1656055000000.0,2.471488,1.0
max,2065.265843,233.043814,112.815778,132.919902,61.106265,118.624193,58.102843,71.468831,53.939397,65.529958,...,6894.203309,71428600000.0,543755100.0,0.0,287691.59375,67.194862,3695033000000000.0,4.833089e+20,27424.24321,1.0


In [9]:
df.to_csv('seismogram_data_62_new.csv', index=False)