In [128]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import plotly.graph_objs as go
import plotly
from EQ_chunking import chunk_data_on_EQs
import os, fnmatch
# Plotly credentials must never be committed with the notebook: read them from
# the environment instead. They are only needed for plotly's online service;
# the plots in this notebook are rendered with plotly.offline, so the call is
# skipped when the variables are not set.
_plotly_username = os.environ.get("PLOTLY_USERNAME")
_plotly_api_key = os.environ.get("PLOTLY_API_KEY")
if _plotly_username and _plotly_api_key:
    plotly.tools.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)
from dp_utils import *

def nice_plot(series):
    """Plot a series as a semi-transparent red line on a wide figure with a grid.

    Parameters
    ----------
    series : object exposing ``tolist()`` and, optionally, ``compute()``
        When the object has a callable ``compute`` attribute (presumably a
        lazily evaluated collection such as a dask series -- TODO confirm), the
        result of ``compute()`` is plotted; otherwise the object itself is.

    Notes
    -----
    Errors raised while computing or plotting propagate to the caller instead
    of being silently swallowed and retried on the raw object.
    """
    # Decide explicitly whether the data has to be materialised first; a bare
    # ``except`` here would also hide KeyboardInterrupt and genuine failures.
    compute = getattr(series, "compute", None)
    values = series.compute() if callable(compute) else series

    plt.figure(figsize=(16, 4))
    plt.grid(True)
    plt.plot(values.tolist(), 'r-', linewidth=2, alpha=0.7)
    plt.show()

In [136]:
def find_dir_contains_EQs(data_path, EQs_num):
    """Locate a sub-directory of ``data_path`` that holds all requested chunks.

    Each immediate entry of ``data_path`` is inspected in the order returned by
    ``os.listdir``. The first entry that is a directory and contains a file
    ``EQ_<num>.h5`` for every ``num`` in ``EQs_num`` is returned as
    ``data_path + "/" + entry``. Returns ``None`` when no entry qualifies.
    """
    for entry in os.listdir(data_path):
        candidate = "/".join((data_path, entry))
        if not os.path.isdir(candidate):
            continue
        chunk_files = (candidate + "/" + "EQ_{}.h5".format(num) for num in EQs_num)
        if all(os.path.isfile(chunk) for chunk in chunk_files):
            return candidate
    return None

def find(pattern, path):
    """Return the first file below ``path`` whose name matches ``pattern``.

    Parameters
    ----------
    pattern : str
        Shell-style wildcard pattern matched against bare file names with
        ``fnmatch.fnmatch``.
    path : str
        Root directory; it is traversed recursively with ``os.walk``.

    Returns
    -------
    str or None
        ``os.path.join(root, name)`` of the first match in ``os.walk`` order,
        or ``None`` when nothing matches.
    """
    for root, _dirs, files in os.walk(path):
        for name in files:
            if fnmatch.fnmatch(name, pattern):
                # Only the first hit is ever needed, so stop walking here
                # instead of collecting every match in the tree.
                return os.path.join(root, name)
    return None
            

def _stacked_traces(arrays, names):
    """Build one plotly ``Scatter`` trace per array, shifted down so traces do not overlap.

    Trace ``i`` is lowered by ``0.7`` times the sum of the absolute maxima of
    arrays ``0..i`` (its own maximum included).
    """
    traces = []
    offset = 0
    for values, name in zip(arrays, names):
        offset += abs(values).max()
        traces.append(go.Scatter(y=(values - 0.7 * offset), opacity=0.7, name=name))
    return traces


def plot_data(function, params, **kwargs):
    """Plot a featurised signal for the selected earthquakes with plotly (offline).

    Two HTML figures are written and opened:

    1. ``Earthquakes.html`` -- for the concatenation of all requested
       earthquakes: ``function`` applied to the signal, ``w_labels`` applied to
       the signal and ``100 * w_labels`` applied to the time-to-failure column.
    2. ``Earthquakes samples (first, middle, last <size_of_slice> dp).html`` --
       ``function`` applied to the first, middle and last ``size_of_slice``
       samples of the first requested earthquake.

    If no sub-directory of ``path_to_data`` contains all requested
    ``EQ_<num>.h5`` files, the full training set (``train.h5``, else
    ``train.csv``, searched below ``path_to_data``) is split with
    ``chunk_data_on_EQs`` into ``path_to_data + "/EQs"`` first.

    Parameters
    ----------
    function : callable
        Called as ``function(series, **params)``; the result must expose
        ``.values``. Its ``__name__`` is used for the legend and titles.
    params : dict
        Keyword arguments forwarded to ``function`` and to ``w_labels``.
    **kwargs
        path_to_data : str (required)
            Directory holding the chunks or the full training set.
        EQs_num : sequence (required)
            Numbers of the earthquakes to plot; must not be empty.
        size_of_slice : int (optional, default 150000)
            Number of samples in each slice of the second figure.

    Raises
    ------
    KeyError
        If ``path_to_data`` or ``EQs_num`` is missing.
    ValueError
        If ``EQs_num`` is empty.
    IOError
        If chunking is required but no training set can be found.
    """
    size_of_slice = kwargs.get('size_of_slice', 150000)
    data_path = kwargs['path_to_data']
    # Materialise once: the list is iterated several times below.
    list_of_EQs_to_plot = list(kwargs['EQs_num'])
    if len(list_of_EQs_to_plot) == 0:
        raise ValueError("EQs_num must contain at least one earthquake number.")

    # Preparation part (if you haven't done chunking yourself it does it for you)
    EQ_dir = find_dir_contains_EQs(data_path, list_of_EQs_to_plot)
    if EQ_dir is None:
        print("Hold on for a sec...Need to split the data into the chunks first! \n")
        save_chunks_to = data_path + "/EQs"
        print("Splitting the data, and saving it to {}\n".format(save_chunks_to))
        full_data_file = find('train.h5', data_path)
        if full_data_file is None:
            full_data_file = find('train.csv', data_path)
        if full_data_file is None:
            raise IOError("No train dataset on your computer! Please download it prior to running this function.")

        chunk_data_on_EQs(full_data_file, save_chunks_to)

        print("Finished splitting the data.")
        EQ_dir = save_chunks_to

    # Plotting part.
    # Read the chunks from the directory that was actually located (or just
    # created) rather than assuming they always live in "<data_path>/EQs".
    dataframes = []
    for EQ in list_of_EQs_to_plot:
        frame = pd.read_hdf(os.path.join(EQ_dir, "EQ_" + str(EQ) + ".h5"), key='table')
        frame.columns = ["s", "ttf"]
        dataframes.append(frame)
    df = pd.concat(dataframes)

    names = [function.__name__, "downsampled_signal", 'ttf']
    # w_labels comes from dp_utils; per the original comments it downsamples by
    # taking the last element of each window -- TODO confirm against dp_utils.
    signals = [function(df.s, **params).values.ravel(),        # featurised signal
               w_labels(df.s, **params).values.ravel(),        # downsampled signal
               100 * w_labels(df.ttf, **params).values.ravel()]  # downsampled ttf (scaled by 100)

    fig1 = dict(data=_stacked_traces(signals, names),
                layout=dict(title='Earthquakes ' + function.__name__))
    plotly.offline.plot(fig1, filename="Earthquakes.html", auto_open=True)

    #####################################################################################################
    # Takes the first of the specified EQs and plots data related to its
    # first, middle and last `size_of_slice` samples.
    first_signal = dataframes[0].s
    n_samples = len(first_signal)
    # Clamp at 0 so an earthquake shorter than the slice yields the whole
    # signal instead of a slice with a negative (wrapped-around) start.
    middle_start = max(0, (n_samples - size_of_slice) // 2)
    middle_end = (n_samples + size_of_slice) // 2
    last_start = max(0, n_samples - size_of_slice)

    samples = [function(first_signal.iloc[:size_of_slice], **params).values.ravel(),
               function(first_signal.iloc[middle_start:middle_end], **params).values.ravel(),
               function(first_signal.iloc[last_start:], **params).values.ravel()]

    sample_names = ["first " + str(size_of_slice),
                    "middle " + str(size_of_slice),
                    "last " + str(size_of_slice)]

    fig2 = dict(data=_stacked_traces(samples, sample_names),
                layout=dict(title='Earthquakes ' + function.__name__))
    plotly.offline.plot(fig2, filename="Earthquakes samples (first, middle, last " + str(size_of_slice) + " dp).html", auto_open=True)
    

In [137]:
# Example usage.
params = {"window_size" : 1000}
# `path_to_data` has to be one of the following:
# 1) the path to a directory which contains the train.h5 or train.csv file; the program will then
#    split the dataset into separate chunks and save them in "specified_path/EQs/";
# 2) (if you have done the chunking prior to running this code) the path to the directory which
#    contains the "EQs" folder with the chunks in it.
# NOTE(review): the path below is an absolute path on the author's machine -- adjust it before running.

plot_data(w_std, params, path_to_data="/home/pavel/Documents/0Research/Projects/LANL-Earthquake/data", EQs_num = [2,3] )

w_std(df, window_size=1000): 100%|██████████| 99021/99021 [00:14<00:00, 6931.87it/s]
	 window decorator: 
	 - window size: 1000
w_labels(df, window_size=1000): 100%|██████████| 99021/99021 [00:06<00:00, 14145.88it/s]
	 window decorator: 
	 - window size: 1000
w_labels(df, window_size=1000): 100%|██████████| 99021/99021 [00:06<00:00, 14955.61it/s]
	 window decorator: 
	 - window size: 1000
w_std(df, window_size=1000): 100%|██████████| 150/150 [00:00<00:00, 4687.07it/s]
	 window decorator: 
	 - window size: 1000
w_std(df, window_size=1000): 100%|██████████| 150/150 [00:00<00:00, 5833.69it/s]
	 window decorator: 
	 - window size: 1000
w_std(df, window_size=1000): 100%|██████████| 150/150 [00:00<00:00, 6583.78it/s]
	 window decorator: 
	 - window size: 1000
