In [1]:
!python --version

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sb
import os
import h5py
import dask.dataframe as dd
import dask
import tsfresh
import plotly 
from tqdm import tqdm_notebook as tqdm
plotly.tools.set_credentials_file(username='ptolmachev', api_key='Fs5sBFAg7YuBn52rzy6n')

def downsample(df, take_every):
    return df.loc[::take_every]

def nice_plot(series):
    fig = plt.figure(figsize = (16,4))
    plt.grid(True)
    try:
        plt.plot(series.compute().tolist(), 'r-',linewidth = 2, alpha = 0.7)
    except:
        plt.plot(series.tolist(), 'r-',linewidth = 2, alpha = 0.7)
    plt.show()

Python 3.7.2


In [None]:
pd.set_option('precision', 15)

In [None]:
df = pd.read_csv('./LANL-Earthquake-Prediction/train_downcasted.csv', index_col = False)
print(df.info(memory_usage='deep'))

# Analysis

In [None]:
%%time
ddfs = []
for i in range(3):
    ddfs.append(dd.read_csv('./LANL-Earthquake-Prediction/EQ_'+str(i+1)+'.csv'))
#     print(ddfs[-1].s.pow(2).mean().compute())

In [None]:
import plotly
import plotly.graph_objs as go

data = [go.Scatter(y=downsample(ddfs[i], 10).compute().s+i*1000, opacity = 0.7) for i in range(3)]
layout = dict(
    title='Three Earthquakes'
)

fig = dict(data=data, layout=layout)
plotly.offline.plot(fig, filename = "Signals before the earthquakes.html", auto_open=True)

In [None]:
ddfs_downsampled_10 =  [downsample(ddfs[i], 100).compute() for i in range(3)]

In [None]:
# # Mean and std over windows

data = [go.Scatter(y=windowed_operation(ddfs_downsampled_10[i].s, 100, "np.std")+i*100, opacity = 0.7) for i in range(3)]
layout = dict(
    title='Three Earthquakes (std)'
)

fig = dict(data=data, layout=layout)
plotly.offline.plot(fig, filename = "Signals before the earthquakes.html", auto_open=True)


# DASK featurization

In [None]:
# # FUNCTIONS FOR COMPUTING FEATURES

# def ema_sum(x, alpha = 0.9): #custom made function
#     coeff = np.array([alpha**i for i in range(len(x))])*(1-alpha)/(1-alpha**(len(x)))
#     return np.sum([x*c for x,c in zip(x,coeff)]).astype(np.float32)

In [59]:
df = (pd.read_hdf('/home/pavel/Documents/0Research/Projects/LANL-Earthquake/data/sample.h5', key = 'df'))
df.columns = ["s","y"]

def rolling_window(series, window_size, stride, function, params = None):
    len_series = len(series)
    num_iter = int(np.ceil((len_series-window_size)/stride)+1) 
    
    if params is None:
        kwargs = {}
    else:
        kwargs = params
    
    shape = series.shape[:-1] + (series.shape[-1] - window_size + 1, window_size)
    strides = series.strides + (series.strides[-1],)
    iterator = iter(np.lib.stride_tricks.as_strided(series, shape=shape, strides=strides)[::stride])
    res = 0*np.empty(num_iter, dtype = np.float)
    
    if hasattr(tsfresh.feature_extraction.feature_calculators, function):
        modifier = "tsfresh.feature_extraction.feature_calculators."
    else:
        modifier = ""

    expression = modifier + function + "(next(iterator), **kwargs)"
    
    for i in range(num_iter):
        try:
            res[i] = np.float(eval(expression))
        except StopIteration:
            return res[:i]
    
    return res

In [60]:
def calc_data(df, list_of_functions, list_of_params, window_sizes, stride, save_to):
    '''
    Input: pandas array with columns 's' and 'y' 
    ('s' corresponds to time series and 'y' - to the regression label)
    
    saves new pandas dataframe in hdf5 extension to "save_to" location
    where the columns are the calculated over the time series features from the "list_of_function"
    using windows from "window_sizes" and having a spesified stride
    '''
    
    #checks
    if (len(list_of_functions) != len(list_of_params)) or (len(list_of_params) != len(window_sizes)):
        raise ValueError("Parameters \"list_of_functions\", \
        \"list_of_params\" and \"window_sizes\" must have the same lengths!")
    if stride <=0 :
        raise ValueError("The \"stride\" has to be a postivie number!")
    
    try:
        feature_df = pd.read_hdf(save_to) # if there already exists file with features
    except:
        feature_df = pd.DataFrame()
    
    num_features = len(list_of_functions)
    series = np.array(df["s"], dtype = np.float)
    for i in range(num_features):
        function = list_of_functions[i].split("*")[0]
        window = window_sizes[i]
        print("Calculating function \"{}\" with params: \"{}\" over the window: \"{}\""\
              .format(function, list_of_params[i], window))
        
        name_of_col = list_of_functions[i] + "_" + str(window)
        if name_of_col in list(feature_df.columns):
            pass
        else:
            res = rolling_window(series, window_sizes[i], stride, function, params = list_of_params[i])
            feature_df[name_of_col] = res
            
    feature_df.to_hdf(save_to, key='df')
        
        
    return None

In [61]:
df = (pd.read_hdf('/home/pavel/Documents/0Research/Projects/LANL-Earthquake/data/sample.h5', key = 'df'))
df.columns = ["s","y"]

list_of_functions = ["np.max",'np.min', "abs_energy","np.std", \
                     "quantile*1", "quantile*2", "mean_second_derivative_central"]
list_of_params = [None, None, None, None, {"q" : 0.6}, {"q" : 0.8}, None]
window_sizes = len(list_of_functions)*[1000]

stride = 500
save_to = "/home/pavel/Documents/0Research/Projects/LANL-Earthquake/data/featurised_sample.h5"
calc_data(df, list_of_functions, list_of_params, window_sizes, stride, save_to)

Calculating function "np.max" with params: "None" over the window: "1000"
Calculating function "np.min" with params: "None" over the window: "1000"
Calculating function "abs_energy" with params: "None" over the window: "1000"
Calculating function "np.std" with params: "None" over the window: "1000"
Calculating function "quantile" with params: "{'q': 0.6}" over the window: "1000"
Calculating function "quantile" with params: "{'q': 0.8}" over the window: "1000"
Calculating function "mean_second_derivative_central" with params: "None" over the window: "1000"


In [62]:
featurised_df = pd.read_hdf("/home/pavel/Documents/0Research/Projects/LANL-Earthquake/data/featurised_sample.h5", key = 'df')

In [63]:
featurised_df.head()

Unnamed: 0,abs_energy_1000,np.max_1000,np.min_1000,np.std_1000,quantile_1000,mean_second_derivative_central_1000,quantile*1_1000,quantile*2_1000
0,48807.0,31.0,-26.0,4.734789,8.0,0.001503,6.0,8.0
1,68491.0,31.0,-26.0,6.378595,9.0,-0.00501,6.0,9.0
2,63616.0,31.0,-17.0,5.91553,10.0,-0.002004,7.0,10.0
3,47781.0,21.0,-8.0,4.539343,9.0,-0.001503,6.0,9.0
4,737293.0,104.0,-98.0,26.686003,16.0,0.008016,7.0,16.0
