In [102]:
!python --version

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sb
import os
import h5py
import dask.dataframe as dd
import dask
import tsfresh
import plotly 
from tqdm import tqdm_notebook as tqdm
# Plotly cloud credentials are read from the environment instead of being stored in
# the notebook. SECURITY: an API key used to be hardcoded on this line and must be
# treated as leaked -- rotate it in the Plotly account settings.
# When the variables are not set the call is skipped.
_plotly_username = os.environ.get("PLOTLY_USERNAME")
_plotly_api_key = os.environ.get("PLOTLY_API_KEY")
if _plotly_username and _plotly_api_key:
    plotly.tools.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)

def downsample(df, take_every):
    """Return every ``take_every``-th row of ``df``, starting with the first row.

    The rows are selected with a stepped ``.loc`` slice, so the result keeps the
    original index labels of the rows that survive.
    """
    stepped = slice(None, None, take_every)
    return df.loc[stepped]

def nice_plot(series):
    """Draw ``series`` as a semi-transparent red line on a 16x4 figure with a grid.

    Parameters
    ----------
    series : object exposing ``.tolist()``
        If the object also has a callable ``.compute()`` (lazy collections such as
        dask series provide one), it is called first and its result is plotted.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # Decide up front whether the input has to be materialised. The previous
    # try/bare-except retried the plot on *any* failure, which hid real errors.
    compute = getattr(series, "compute", None)
    values = compute() if callable(compute) else series

    plt.figure(figsize=(16, 4))
    plt.grid(True)
    plt.plot(values.tolist(), 'r-', linewidth=2, alpha=0.7)
    plt.show()

Python 3.7.2


In [103]:
# Render floats with up to 15 decimal places. The fully qualified option name is
# used because the bare 'precision' pattern is ambiguous on newer pandas (it also
# matches 'styler.format.precision') and raises OptionError there.
pd.set_option('display.precision', 15)

In [None]:
# Load the downcasted training set from CSV.
df = pd.read_csv('./LANL-Earthquake-Prediction/train_downcasted.csv', index_col = False)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line after the report.
df.info(memory_usage='deep')

# Visualization

In [109]:
%%time
# Build a list of three frames for the comparison plot.
# NOTE(review): every entry is read from the same file and key, so the three
# frames are identical -- confirm whether separate files were intended.
dfs = list()
for i in range(3):
    dfs += [pd.read_hdf('./sample.h5', key = 'df')]

CPU times: user 28.9 ms, sys: 20.4 ms, total: 49.3 ms
Wall time: 49.1 ms


In [112]:
import plotly
import plotly.graph_objs as go


def Three_EQ_comparison(featurised_df):
    """Plot the three frames held in ``dfs`` as stacked traces in one plotly figure.

    For each ``i`` in 0..2 the ``s`` column of ``dfs[i]`` is thinned to every 10th
    row with ``downsample`` and shifted by ``i * 1000`` along the y axis. The figure
    is written to "Signals before the earthquakes.html" and opened
    (``auto_open=True``).

    NOTE(review): ``featurised_df`` is accepted but not used; the traces are built
    from the notebook-level list ``dfs``. Confirm what the parameter is meant for.

    Returns
    -------
    Whatever ``plotly.offline.plot`` returns for the written figure.
    """
    data = [
        go.Scatter(y=downsample(dfs[i], 10).s + i * 1000, opacity=0.7)
        for i in range(3)
    ]
    layout = dict(title='Three Earthquakes')
    fig = dict(data=data, layout=layout)
    return plotly.offline.plot(fig, filename="Signals before the earthquakes.html", auto_open=True)


# The argument is not used by the function, so None is enough to reproduce the
# figure this cell generated before the code was wrapped in a function.
Three_EQ_comparison(None)

'file:///home/pavel/Documents/0Research/Projects/Kaggle/Signals before the earthquakes.html'

# DASK featurization

In [71]:
# # FUNCTIONS FOR COMPUTING FEATURES

# def ema_sum(x, alpha = 0.9): #custom made function
#     coeff = np.array([alpha**i for i in range(len(x))])*(1-alpha)/(1-alpha**(len(x)))
#     return np.sum([x*c for x,c in zip(x,coeff)]).astype(np.float32)

In [85]:
# Load the sample frame stored under key 'df' and name its two columns "s" and "y".
df = pd.read_hdf(
    '/home/pavel/Documents/0Research/Projects/LANL-Earthquake/data/sample.h5',
    key='df',
)
df.columns = ["s", "y"]

def _resolve_feature_function(function):
    """Turn ``function`` (a callable or a name) into the callable applied per window."""
    if callable(function):
        return function
    calculators = tsfresh.feature_extraction.feature_calculators
    # tsfresh calculators take precedence over any other meaning of the name.
    if hasattr(calculators, function):
        return getattr(calculators, function)
    # Any other string is evaluated as a Python expression in this module's
    # namespace (e.g. "np.max"). SECURITY: eval runs arbitrary code -- only pass
    # names written by the notebook author, never external input.
    return eval(function)


def rolling_window(series, window_size, stride, function, params = None):
    """Apply a feature function to strided windows of a 1-D signal.

    Windows of ``window_size`` consecutive samples are taken at offsets
    0, ``stride``, 2*``stride``, ... ; only full windows are used, so the result has
    ``(len(series) - window_size) // stride + 1`` entries.

    Parameters
    ----------
    series : array-like
        The signal; converted with ``np.asarray``.
    window_size : int
        Number of samples per window; must be between 1 and ``len(series)``.
    stride : int
        Offset between the starts of consecutive windows; must be positive.
    function : str or callable
        Either a callable, the name of a function in
        ``tsfresh.feature_extraction.feature_calculators``, or a Python expression
        that evaluates to a callable (e.g. ``"np.max"``).
    params : dict, optional
        Extra keyword arguments passed to the function on every call.

    Returns
    -------
    numpy.ndarray
        float64 array with one value per window.

    Raises
    ------
    ValueError
        If ``stride`` is not positive or ``window_size`` does not fit the series.
    """
    if stride <= 0:
        raise ValueError("stride must be a positive integer, got {}".format(stride))

    values = np.asarray(series)
    if window_size <= 0 or window_size > values.shape[-1]:
        raise ValueError(
            "window_size must be between 1 and the series length ({}), got {}"
            .format(values.shape[-1], window_size)
        )

    kwargs = {} if params is None else params

    # Zero-copy view with one row per window start; rows overlap in memory, so the
    # feature function must not modify the window it receives.
    window_shape = values.shape[:-1] + (values.shape[-1] - window_size + 1, window_size)
    window_strides = values.strides + (values.strides[-1],)
    windows = np.lib.stride_tricks.as_strided(
        values, shape=window_shape, strides=window_strides
    )[::stride]

    # Resolve the function once instead of eval-ing a call expression per window.
    feature = _resolve_feature_function(function)

    # `np.float` was only an alias of the builtin float and no longer exists in
    # current NumPy; float/np.float64 give the same values.
    return np.array([float(feature(window, **kwargs)) for window in windows],
                    dtype=np.float64)

In [95]:
def calc_data(df,col_name, list_of_functions, list_of_params, window_sizes, stride, save_to, rewrite = True):
    '''
    Compute rolling-window features of one signal column and save them as HDF5.

    Input:
        df: pandas dataframe holding the signal
        col_name: name of the column of "df" to featurise
        list_of_functions: feature labels, one per feature. The part before an
            optional "*" is the function name handed to "rolling_window"; the
            suffix (e.g. "quantile*1", "quantile*2") only serves to keep the
            column names of repeated functions distinct
        list_of_params: keyword-argument dict (or None) for each feature
        window_sizes: window length for each feature
        stride: step between consecutive windows, shared by all features
        save_to: path of the HDF5 file; the frame is stored under key "df"
        rewrite: if True, start from an empty feature frame and recompute
            everything; if False, load the frame already stored at "save_to"
            (when there is one) and only add the columns that are missing

    Each feature is stored in a column named "<col_name>_<label>_<window>".

    Raises ValueError if the three lists differ in length or "stride" is not
    positive.

    Returns None.
    '''

    #checks
    if not (len(list_of_functions) == len(list_of_params) == len(window_sizes)):
        # Adjacent literals instead of a backslash continuation, which used to
        # embed the source indentation in the message.
        raise ValueError('Parameters "list_of_functions", "list_of_params" '
                         'and "window_sizes" must have the same lengths!')
    if stride <= 0:
        raise ValueError('The "stride" has to be a positive number!')

    feature_df = pd.DataFrame()
    if not rewrite:
        try:
            # Read the same key that is written at the end of this function.
            feature_df = pd.read_hdf(save_to, key='df')
        except (OSError, KeyError):
            # No file (or no "df" entry in it) yet: start from an empty frame.
            # Other errors are no longer swallowed, so an unreadable feature file
            # is not silently replaced.
            feature_df = pd.DataFrame()

    # np.float was an alias of the builtin float and is gone from current NumPy.
    series = np.array(df[col_name], dtype = np.float64)
    for label, params, window in zip(list_of_functions, list_of_params, window_sizes):
        function = label.split("*")[0]
        name_of_new_col = col_name + "_" + label + "_" + str(window)
        if name_of_new_col not in feature_df.columns or rewrite:
            # Announce only features that are actually computed, not skipped ones.
            print("Calculating function \"{}\" with params: \"{}\" over the window: \"{}\""\
                  .format(function, params, window))
            feature_df[name_of_new_col] = rolling_window(series, window, stride, function, params = params)

    feature_df.to_hdf(save_to, key='df')

    return None

In [96]:
# Reload the sample frame and name its two columns "s" and "y".
df = pd.read_hdf(
    '/home/pavel/Documents/0Research/Projects/LANL-Earthquake/data/sample.h5',
    key='df',
)
df.columns = ["s", "y"]

# One label per feature; calc_data strips everything from "*" onwards to get the
# function name, so "quantile*1" and "quantile*2" are the same function used with
# different params.
list_of_functions = [
    "np.max",
    "np.min",
    "abs_energy",
    "np.std",
    "quantile*1",
    "quantile*2",
    "mean_second_derivative_central",
]
# Keyword arguments for the feature at the same position (None = no extra arguments).
list_of_params = [None, None, None, None, {"q": 0.6}, {"q": 0.8}, None]
# Every feature uses the same window length.
window_sizes = [1000] * len(list_of_functions)

stride = 500
save_to = "/home/pavel/Documents/0Research/Projects/LANL-Earthquake/data/featurised_sample.h5"
calc_data(df, "s", list_of_functions, list_of_params, window_sizes, stride, save_to)

Calculating function "np.max" with params: "None" over the window: "1000"
Calculating function "np.min" with params: "None" over the window: "1000"
Calculating function "abs_energy" with params: "None" over the window: "1000"
Calculating function "np.std" with params: "None" over the window: "1000"
Calculating function "quantile" with params: "{'q': 0.6}" over the window: "1000"
Calculating function "quantile" with params: "{'q': 0.8}" over the window: "1000"
Calculating function "mean_second_derivative_central" with params: "None" over the window: "1000"


In [99]:
# Read the feature table stored under key 'df' in the featurised sample file.
featurised_df = pd.read_hdf(
    "/home/pavel/Documents/0Research/Projects/LANL-Earthquake/data/featurised_sample.h5",
    key='df',
)

In [100]:
# Show the first five rows of the feature table.
featurised_df.head(n=5)

Unnamed: 0,s_np.max_1000,s_np.min_1000,s_abs_energy_1000,s_np.std_1000,s_quantile*1_1000,s_quantile*2_1000,s_mean_second_derivative_central_1000
0,31.0,-26.0,48807.0,4.734789,6.0,8.0,0.001503
1,31.0,-26.0,68491.0,6.378595,6.0,9.0,-0.00501
2,31.0,-17.0,63616.0,5.91553,7.0,10.0,-0.002004
3,21.0,-8.0,47781.0,4.539343,6.0,9.0,-0.001503
4,104.0,-98.0,737293.0,26.686003,7.0,16.0,0.008016
