In [9]:
%matplotlib inline
import obspy
import glob
import os, sys
import librosa
import numpy as np
import scipy as sp
import scipy.signal as signal
import pandas as pd
import matplotlib.pyplot as plt
import librosa.display
from sklearn.preprocessing import StandardScaler

random_state = 6
np.random.seed(random_state)

In [13]:
# total length of a seismogram is 1260 seconds or 21 minutes
def read_seismogram(filename):
    st = obspy.read(filename)
    return st

def read_seismogram_filtered(filename):
    st = obspy.read(filename)
    st.filter('bandpass', freqmin=0.05, freqmax = 5.0)
    return st

def extract_features(seismogram, signal_label):
    data = seismogram[0].data
    sample_rate = seismogram[0].stats.sampling_rate
    stft = np.abs(librosa.stft(data))
    mfccs = librosa.feature.mfcc(y = data, sr=sample_rate, n_mfcc=40)
    mfccs_mean = np.mean(mfccs.T, axis = 0)
    mfccs_delta = np.mean(librosa.feature.delta(mfccs).T, axis=0)
    
    # 12 chroma
    chroma = librosa.feature.chroma_stft(S = stft, sr = sample_rate)
    chroma_mean = np.mean(chroma.T, axis = 0)
    
    ## mel from melspectrogram
    # this feature does not add anything to improve model. Therefore I delete this features from the 
    # feature list
    
    # mel = np.mean(librosa.feature.melspectrogram(data, sr = sample_rate).T, axis= 0)
    
    ## spectral centroid
    S, phase = librosa.magphase(stft)
    centiroid =np.mean(librosa.feature.spectral_centroid(S=S)) # 1 feature
    max_amplitude = np.amax(S)
    mean_amplitude = np.mean(S)
    dbamplitude= np.mean(librosa.amplitude_to_db(S, ref=np.max).T, axis = 0) # 1025 features
    
    
    ## statistical parameters
    moment = sp.stats.moment(data)
    variation = sp.stats.variation(data)
    skew = sp.stats.skew(data)
    var = np.var(data)
    autocr = np.correlate(data, data)
    kurto = sp.stats.kurtosis(data)
    
    
    features = np.hstack([mfccs_mean, chroma_mean, centiroid, max_amplitude, mean_amplitude, moment, variation, skew, var, autocr, kurto, signal_label])
    
    return features

def parse_and_stack_seismograms(parent_dir, sub_dirs):
    
    features = np.empty((0,62))
    if parent_dir == 'seismogram_v2/explosions/':
        signal_label = 1
    elif parent_dir == 'seismogram_v2/earthquakes/':
        signal_label = 0
        
    for indx, sub_dir in enumerate(sub_dirs):
        for filename in glob.glob(os.path.join(parent_dir, sub_dir, '*.SAC')):
            seismogram = read_seismogram(filename)
            single_feature= extract_features(seismogram, signal_label)
            features = np.vstack([features, single_feature])
        
    dataFrame = pd.DataFrame(features)
    return dataFrame

In [14]:
parent_dir_explosions = 'seismogram_v2/explosions/'
sub_dirs_explosions = ['1998-05-11-mb52-india','1998-05-28-mb48-pakistan', '1998-05-30-mb46-pakistan', '2013-02-12-mb51-north-korea', '2016-01-06-mb51-north-korea', '2017-09-03-mb63-north-korea']
parent_dir_earthquake = 'seismogram_v2/earthquakes/'
sub_dirs_earthquake = ['2004-12-26-mw90-sumatra', '2010-03-12-mw55-myanmar-india-border-region', '2017-08-15-mb49-southeast-of-ryukyu-islands', '2017-09-08-mww81-near-coast-of-chiapas-mexico', '2017-10-24-mww67-banda-sea']

## Explosions
print('--- Earthquake-----: ')
total = 0.0
for sub_dir in sub_dirs_earthquake:
    counter = len(glob.glob(os.path.join(parent_dir_earthquake, sub_dir, '*.SAC')))
    total += counter
    print('{}: {}'.format(sub_dir, counter))
print('Total seismograms: {}'. format(total))


## Explosions
print('--- Explosions-----: ')
total = 0.0
for sub_dir in sub_dirs_explosions:
    counter = len(glob.glob(os.path.join(parent_dir_explosions, sub_dir, '*.SAC')))
    total += counter
    print('{}: {}'.format(sub_dir, counter))
print('Total seismograms: {}'. format(total))



--- Earthquake-----: 
2004-12-26-mw90-sumatra: 1193
2010-03-12-mw55-myanmar-india-border-region: 3021
2017-08-15-mb49-southeast-of-ryukyu-islands: 2644
2017-09-08-mww81-near-coast-of-chiapas-mexico: 2722
2017-10-24-mww67-banda-sea: 3382
Total seismograms: 12962.0
--- Explosions-----: 
1998-05-11-mb52-india: 459
1998-05-28-mb48-pakistan: 470
1998-05-30-mb46-pakistan: 398
2013-02-12-mb51-north-korea: 3763
2016-01-06-mb51-north-korea: 2640
2017-09-03-mb63-north-korea: 2804
Total seismograms: 10534.0


In [15]:
df_explosions  = parse_and_stack_seismograms(parent_dir_explosions, sub_dirs_explosions)
df_earthquakes  = parse_and_stack_seismograms(parent_dir_earthquake, sub_dirs_earthquake)



In [16]:
def get_column_names():
    features = {
        'mfccs': 40,
        'chroma': 12,
        'centiroid': 1,
        'max_amplitude': 1, 
        'mean_amplitude': 1,
        'moment': 1,
        'variation': 1, 
        'skew': 1, 
        'var': 1, 
        'autocr': 1, 
        'kurto': 1, 
        'target': 1
    }
    
    names = list(features.keys())
    val = list(features.values())
    
    columns = []
    
    for i in range(len(features)):
        if val[i] > 1:
            for j in range(val[i]):
                columns.append(str(names[i])+'_'+ str(j))
                
        else:
            columns.append(str(names[i]))
    
    return columns

In [17]:
frames = [df_explosions, df_earthquakes]
df = pd.concat(frames)
df.columns = get_column_names()
df.describe()

Unnamed: 0,mfccs_0,mfccs_1,mfccs_2,mfccs_3,mfccs_4,mfccs_5,mfccs_6,mfccs_7,mfccs_8,mfccs_9,...,centiroid,max_amplitude,mean_amplitude,moment,variation,skew,var,autocr,kurto,target
count,23496.0,23496.0,23496.0,23496.0,23496.0,23496.0,23496.0,23496.0,23496.0,23496.0,...,23496.0,23496.0,23496.0,23496.0,23496.0,23496.0,23496.0,23496.0,23496.0,23496.0
mean,842.505003,73.359982,-0.894606,52.395407,2.601928,34.513327,7.728752,22.589093,12.131532,15.490778,...,959.045374,247125200.0,309476.9,0.0,18.969707,-0.032753,527421300000.0,1.096999e+17,7.173602,0.448332
std,189.732512,38.111069,38.966119,19.352436,24.226235,12.969548,15.588723,9.172668,10.312101,6.988078,...,1024.497248,1740120000.0,4249657.0,0.0,3913.459504,1.590172,25535650000000.0,3.697947e+18,211.110656,0.497334
min,480.726115,-87.232311,-198.422667,-32.555981,-88.160009,-41.837174,-48.987699,-26.721758,-35.454891,-23.973469,...,3.588867,4490.561,41.40245,0.0,-427348.3125,-148.525772,0.0,1098498.0,-3.0,0.0
25%,701.650737,45.531983,-28.158299,39.503115,-15.94022,25.90437,-3.266232,16.958161,6.091282,11.349311,...,199.003485,596888.1,1863.994,0.0,-0.332406,-0.082917,37504.36,11242910000.0,-0.06924,0.0
50%,825.624647,74.008466,2.385506,53.200929,3.833513,34.087116,9.972833,22.015254,14.141603,15.476138,...,546.48518,1234899.0,3855.819,0.0,0.196938,-0.002934,156536.7,47304430000.0,0.186421,0.0
75%,959.726289,99.577733,28.229987,65.021862,22.487635,42.324745,20.381357,27.575977,19.56752,19.274661,...,1442.009632,8348212.0,21159.13,0.0,0.891293,0.063645,3800722.0,1896499000000.0,2.320519,1.0
max,2065.265843,233.043814,112.815778,132.919902,64.457973,118.624193,58.102843,71.468831,53.939397,65.529958,...,6894.203309,71428600000.0,543755100.0,0.0,287691.59375,67.194862,3695033000000000.0,4.833089e+20,27424.24321,1.0


In [18]:
df.to_csv('seismogram_data_62_new.csv', index=False)