In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import scipy.io as sio
import scipy.interpolate
import matplotlib.pyplot as plt
import os

In [None]:
# Folder layout used by this notebook -- update the names as needed.

RAW_DATA_FOLDER = 'raw_data/'
RAW_MAT_FOLDER = RAW_DATA_FOLDER + 'raw_mat/'
RAW_NPY_FOLDER = RAW_DATA_FOLDER + 'raw_npy/'
NPY_FOLDER = 'processed_input_data/'

# Train/eval/test split settings (consumed by create_dataset further down):
## - every experiment listed in G1 is attributed entirely to training
## - for each experiment in G2, `split_ratio` randomly chosen windows go to eval,
##   one further random window goes to test, and the remaining windows go to training
# ML_exp is the name of the sub-folder (inside NPY_FOLDER) that receives the split lists.
ML_exp = 'split_master'
split_ratio=2

In [None]:
# EXP_LIST contains the names of the raw experiment files (<name>.mat in RAW_MAT_FOLDER).
# Pipeline: raw .mat --> raw .npy --> sliced & labeled .npy

EXP_LIST = ['EB_025_1','EB_025_2','EB_025_3',
           'EB_050_1','EB_050_2','EB_050_3',
           'EB_150_1','EB_150_2','EB_150_3',
           'PP_025_1','PP_025_2',
           'PP_050_1','PP_050_2',
           'PP_100_1','PP_100_2',
           'PP_150_1']



# SLICE_PARAM = [initial_index, strike, win_H, win_W]
#   initial_index : first index kept along axis 0 of the raw array
#   strike        : step between the left edges of consecutive windows along axis 1
#   win_H, win_W  : extent of each window along axis 0 and axis 1
SLICE_PARAM = [0, 50, 128, 64]

# Constant for step size from image processing; passed as `dx` to the
# trapezoidal integration in data_slice_KE.
DX = 1.14

In [None]:
# Basic functions to generate the raw .npy files

## The leading dimension (4) has to be adjusted if more fields of the same shape (x, y, t) are stored
def compile_cfuv(c,f,u,v):
    """Combine four equally shaped 3-D fields into one (x, y, channel, t) array.

    Args:
        c, f, u, v: arrays sharing the shape of ``c``; they become channels
            0, 1, 2 and 3 respectively.

    Returns:
        A float array of shape ``(c.shape[0], c.shape[1], 4, c.shape[2])``.
        The resulting shape is also printed.
    """
    fields = (c, f, u, v)
    # Channel-first buffer: (cfuv, x, y, t)
    stacked = np.zeros((len(fields),) + tuple(c.shape))
    for channel, field in enumerate(fields):
        stacked[channel] = field
    # Move the channel axis behind the two spatial axes: (x, y, cfuv, t)
    stacked = stacked.transpose(1, 2, 0, 3)
    print(stacked.shape)
    return stacked


def create_npy(exp):
    """Convert one experiment's MATLAB file into a raw ``.npy`` file.

    Reads ``RAW_MAT_FOLDER/<exp>.mat``, extracts the variables ``curl_map``,
    ``altBW_filt``, ``store_u`` and ``store_v``, arranges them with
    ``compile_cfuv`` as (x, y, cfuv_channel, t) and saves the result to
    ``RAW_NPY_FOLDER/<exp>.npy``.

    Args:
        exp: experiment name (file stem of the ``.mat`` file).

    Raises:
        FileExistsError: if the target ``.npy`` file already exists, so that
            existing data is never overwritten.
        KeyError: if one of the expected variables is missing from the file.
    """
    matlab_data_path = RAW_MAT_FOLDER + str(exp) + '.mat'
    npy_data_path = RAW_NPY_FOLDER + str(exp) + '.npy'

    if os.path.exists(npy_data_path):
        raise FileExistsError("raw .npy file already exist")

    # makedirs also creates missing parent folders and tolerates an existing one
    os.makedirs(RAW_NPY_FOLDER, exist_ok=True)

    # Parse the .mat file once; the keys are the variable names used in MATLAB
    mat = sio.loadmat(matlab_data_path)
    curl_map = mat["curl_map"]
    altBW_filt = mat["altBW_filt"]
    store_u = mat["store_u"]
    store_v = mat["store_v"]

    # compile and structure to (x, y, cfuv_channel, t) and save
    data = compile_cfuv(curl_map, altBW_filt, store_u, store_v)
    np.save(npy_data_path, data)

In [None]:
# Generate and save the raw .npy file for every experiment in EXP_LIST.
# create_npy raises when its output file already exists, so experiments that
# were converted before are skipped here; this keeps the cell safe to re-run.
for e in EXP_LIST:
    if os.path.exists(RAW_NPY_FOLDER + str(e) + '.npy'):
        print(e, '- raw .npy already exists, skipping')
        continue
    print(e)
    create_npy(e)

In [None]:
# Function that slices a raw .npy file into windows of size win_H x win_W
## SLICE_PARAM = [ini_H=0, strike=50, win_H=128, win_W=64]
## The raw .npy is of size H=135, W=313 with 4 channels (cfuv) stacked over time
## 'strike' is the step between consecutive windows; a strike smaller than win_W makes the windows overlap


def slicing_hw(RAW_NPY_FOLDER, exp, slice_param):
    """Cut one raw experiment array into windows along axis 1.

    Loads ``RAW_NPY_FOLDER/<exp>.npy`` (4-D, indexed as (x, y, cfuv, t)),
    keeps indices ``ii:ii+H`` along axis 0 and then takes windows of extent
    ``W`` along axis 1 whose left edges are ``strike`` apart.

    Args:
        RAW_NPY_FOLDER: folder prefix (including the trailing separator) that
            holds the raw ``.npy`` files.
        exp: experiment name (file stem).
        slice_param: ``[initial_index, strike, win_H, win_W]``.

    Returns:
        ``(data_hw, num_win, t_tot)`` where ``data_hw`` is a list of
        ``(window_array, window_index)`` tuples, ``num_win`` is the number of
        windows and ``t_tot`` is the size of axis 3. When the array is narrower
        than one window the list is empty and ``num_win`` is 0.

    Raises:
        ValueError: if ``strike`` is not positive.
    """
    npy_data_path = RAW_NPY_FOLDER + str(exp) + '.npy'
    data = np.load(npy_data_path)

    # data has dimension of (x, y, cfuv, t)
    width_tot = data.shape[1]
    t_tot = data.shape[3]

    # unpack slice_param into initial_index, strike length, window_H and window_W
    ii, strike, H, W = slice_param[0], slice_param[1], slice_param[2], slice_param[3]
    if strike <= 0:
        raise ValueError("strike must be a positive integer, got %r" % (strike,))

    # Number of full windows that fit; clamped so that an array narrower than
    # one window yields no windows instead of a zero/negative count.
    num_win = max(0, 1 + (width_tot - W)//strike)

    data_h = data[ii: ii+H]
    # One (window, index) tuple per window; e.g. the first element has shape
    # (128, 64, 4, t) for the default SLICE_PARAM.
    data_hw = [(data_h[:, w_i*strike: w_i*strike + W], w_i) for w_i in range(num_win)]

    if data_hw:
        print("slice_shape =", data_hw[0][0].shape)
    else:
        print("slice_shape = None (array is narrower than one window)")
    return data_hw, num_win, t_tot

# define total velocity for each frame
def vel_tot_cal(RAW_NPY_FOLDER, exp):
    """Return one total-velocity value per time frame of an experiment.

    For every frame, the median of channel 2 (u) and of channel 3 (v) is taken
    over the first and over the last index of axis 0; the result is the
    Euclidean norm of the (first - last) differences of those medians.

    Args:
        RAW_NPY_FOLDER: folder prefix (including the trailing separator) that
            holds the raw ``.npy`` files.
        exp: experiment name (file stem).

    Returns:
        A list with one value per frame (axis 3 of the stored array).
    """
    frames = np.load(RAW_NPY_FOLDER + str(exp) + '.npy')

    def edge_median(edge, channel):
        # median across axis 1 for every frame at once -> shape (t,)
        return np.median(frames[edge, :, channel, :], axis=0)

    # difference between the first and the last index of axis 0, per frame
    du = edge_median(0, 2) - edge_median(-1, 2)
    dv = edge_median(0, 3) - edge_median(-1, 3)

    # roughly in the unit of 0.25mm/30s
    return list(np.sqrt(du**2 + dv**2))

# Function that provides the KE label for each slice of size 64 (width) x 128 (height)
# KE is calculated as total_CurlSlip/vel

def data_slice_KE(RAW_NPY_FOLDER, NPY_FOLDER, exp, slice_param,step):
    """Slice one experiment into windows, label every slice and save it.

    For each window returned by ``slicing_hw`` and each time frame, the slice
    is processed only if its channel 1 (``f``) does not sum to zero. For such a
    slice ``2 * trapezoid(c * f, dx=step, axis=0)`` is computed (``c`` being
    channel 0); its median and standard deviation, each divided by the frame's
    value from ``vel_tot_cal``, become the two labels in the file name:

        NPY_FOLDER/slice_npy/<exp>/<KE>_<SD>_win_<w>_t_<ttt>_<exp>_cfuv.npy

    The relative names of all saved slices are written, one per line, to
    ``NPY_FOLDER/file_master/<exp>.txt``.

    Args:
        RAW_NPY_FOLDER: folder prefix holding the raw ``.npy`` files.
        NPY_FOLDER: folder prefix for the processed output.
        exp: experiment name (file stem).
        slice_param: ``[initial_index, strike, win_H, win_W]`` for ``slicing_hw``.
        step: sample spacing used as ``dx`` in the trapezoidal integration.

    Raises:
        FileExistsError: if the slice folder of this experiment already exists.
    """
    # output locations for the slices and for the per-experiment file list
    npy_exp_folder_path = NPY_FOLDER + 'slice_npy/' + str(exp)
    master_folder_path = NPY_FOLDER + 'file_master/'

    if os.path.exists(npy_exp_folder_path):
        raise FileExistsError("slice_exp.npy already exist")
    os.makedirs(npy_exp_folder_path)
    os.makedirs(master_folder_path, exist_ok=True)

    data_hw, num_win, t_tot = slicing_hw(RAW_NPY_FOLDER, exp, slice_param)
    # Converted to an array once instead of twice per slice
    vel_tot = np.asarray(vel_tot_cal(RAW_NPY_FOLDER, exp))

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever exists
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    file_master = []

    # looping through all windows and times
    for data_s, win in data_hw:
        for t_i in range(t_tot):
            SLICE = data_s[:, :, :, t_i]
            c = SLICE[:, :, 0]
            f = SLICE[:, :, 1]

            # only process slices with non-zero fault traces
            if np.sum(f) != 0:
                curl_slip = 2*trapezoid(c*f, dx=step, axis=0)
                # NOTE(review): a frame velocity of 0 yields 'inf'/'nan' labels
                # in the file name -- confirm this cannot occur in the data.
                label_KE = format(np.median(curl_slip)/vel_tot[t_i], '.2f')
                label_SD = format(np.std(curl_slip)/vel_tot[t_i], '.2f')

                name_path = ('/' + label_KE + '_' + label_SD + '_win_' + str(win)
                             + '_t_' + '{:03d}'.format(t_i) + '_' + str(exp) + '_cfuv.npy')
                # save sliced and labeled dataset in slice_npy folder
                np.save(npy_exp_folder_path + name_path, SLICE)

                # add to file master
                file_master.append(str(exp) + name_path)

    master_data_path = master_folder_path + str(exp) + ".txt"
    print('non_zero_slice =', len(file_master))
    np.savetxt(master_data_path, file_master, fmt="%s")

In [None]:
# Slice and label every experiment in EXP_LIST.
# data_slice_KE raises when an experiment's slice folder already exists, so
# such experiments are skipped here; this keeps the cell safe to re-run.
# Delete the folder (and its file_master entry) to regenerate an experiment.
slice_param = SLICE_PARAM
step = DX
for exp in EXP_LIST:
    if os.path.exists(NPY_FOLDER + 'slice_npy/' + str(exp)):
        print(exp, '- slice folder already exists, skipping')
        continue
    data_slice_KE(RAW_NPY_FOLDER, NPY_FOLDER, exp, slice_param,step)

In [None]:
def _read_master(path):
    """Return the slice names listed in one file-master text file as a list of str."""
    # np.loadtxt collapses a single-line file to a 0-d array; atleast_1d undoes that
    return [str(name) for name in np.atleast_1d(np.loadtxt(path, dtype=str))]


def _group_by_window(entries):
    """Group slice names by the window index encoded after 'win_' (file order kept)."""
    groups = {}
    for entry in entries:
        # names look like '<exp>/<KE>_<SD>_win_<w>_t_<ttt>_<exp>_cfuv.npy';
        # read every digit up to the next '_' so indices >= 10 are parsed too
        win = int(entry.split('win_')[-1].split('_', 1)[0])
        groups.setdefault(win, []).append(entry)
    return groups


def create_dataset(DATA_FOLDER,ML_exp, G1, G2, split_ratio, seed=None):
    """Split the labeled slices into train / eval / test lists and save them.

    Every experiment in ``G1`` goes entirely to training. For every experiment
    in ``G2`` the windows are shuffled; the first ``split_ratio`` windows go to
    eval, the next one to test and the remaining ones to training. Slices are
    assigned by the window index parsed from their file name, so a window is
    never shared between two subsets even when windows hold different numbers
    of slices.

    Reads ``DATA_FOLDER/file_master/<exp>.txt`` and writes, inside
    ``DATA_FOLDER/<ML_exp>/``: ``train_master.txt``, ``eval_master.txt``,
    ``test_master.txt`` and ``data_stat.txt``.

    Args:
        DATA_FOLDER: folder holding ``file_master/``; also receives the output.
        ML_exp: name of the output sub-folder.
        G1: experiment names attributed entirely to training.
        G2: experiment names whose windows are split between the three subsets.
        split_ratio: number of windows per G2 experiment that go to eval.
        seed: optional seed for a reproducible shuffle. With ``None`` the
            global ``np.random`` state is used.

    Returns:
        dict with the (experiment, window) pairs of each subset
        (``train_ew``, ``eval_ew``, ``test_ew``), the total number of slices
        (``tot_dataset``) and the subset ratios formatted with two decimals.

    Raises:
        ValueError: if ``split_ratio`` is negative or a G2 experiment has fewer
            than ``split_ratio + 1`` windows.
    """
    n_eval = int(split_ratio)
    if n_eval < 0:
        raise ValueError("split_ratio must be >= 0, got %r" % (split_ratio,))

    master_dir = os.path.join(DATA_FOLDER, 'file_master')
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Plain lists avoid repeated array concatenation while collecting names
    train_combine, eval_combine, test_combine = [], [], []
    train_exp_win_stat, eval_exp_win_stat, test_exp_win_stat = [], [], []

    # G1: everything goes to training
    for exp in G1:
        entries = _read_master(os.path.join(master_dir, str(exp) + '.txt'))
        train_combine.extend(entries)
        # housekeeping for the exp/window pairs included in training
        for win in sorted(_group_by_window(entries)):
            train_exp_win_stat.append([exp, win])

    # G2: windows are distributed randomly between eval, test and train
    for exp in G2:
        entries = _read_master(os.path.join(master_dir, str(exp) + '.txt'))
        by_win = _group_by_window(entries)

        shf_idx = np.array(sorted(by_win), dtype=int)
        if len(shf_idx) < n_eval + 1:
            raise ValueError(
                "experiment %r has %d window(s); at least %d are needed for "
                "split_ratio=%d" % (exp, len(shf_idx), n_eval + 1, n_eval))
        rng.shuffle(shf_idx)
        print(exp, 'shuffled windows:', shf_idx)
        # plain ints keep the statistics readable when printed or saved
        order = [int(win) for win in shf_idx]

        # the first `split_ratio` shuffled windows go to eval
        for win in order[:n_eval]:
            eval_combine.extend(by_win[win])
            eval_exp_win_stat.append([exp, win])

        # the next shuffled window goes to test
        test_win = order[n_eval]
        test_combine.extend(by_win[test_win])
        test_exp_win_stat.append([exp, test_win])

        # the rest of the shuffled windows go to train
        for win in order[n_eval + 1:]:
            train_combine.extend(by_win[win])
            train_exp_win_stat.append([exp, win])

    tot_dataset = len(train_combine) + len(eval_combine) + len(test_combine)

    def _ratio(subset):
        # guard against an empty dataset instead of dividing by zero
        return format(len(subset)/tot_dataset if tot_dataset else 0.0, '.2f')

    stat_dict = {'train_ew': train_exp_win_stat,
                 'eval_ew': eval_exp_win_stat,
                 'test_ew': test_exp_win_stat,
                 'tot_dataset': tot_dataset,
                 'train_ratio': _ratio(train_combine),
                 'eval_ratio': _ratio(eval_combine),
                 'test_ratio': _ratio(test_combine)}
    print(stat_dict)

    ML_EXP_PATH = os.path.join(DATA_FOLDER, str(ML_exp))
    os.makedirs(ML_EXP_PATH, exist_ok=True)

    with open(os.path.join(ML_EXP_PATH, 'data_stat.txt'), 'w') as f:
        print(stat_dict, file=f)

    for subset_name, subset in (('train', train_combine),
                                ('eval', eval_combine),
                                ('test', test_combine)):
        subset_path = os.path.join(ML_EXP_PATH, subset_name + '_master.txt')
        np.savetxt(subset_path, subset, fmt="%s")
    return stat_dict

In [None]:
# NOTE(review): this list is identical to the EXP_LIST assigned in an earlier cell;
# presumably repeated so the split cells can be run without the slicing cells --
# TODO confirm and keep a single definition.
EXP_LIST = ['EB_025_1','EB_025_2','EB_025_3',
           'EB_050_1','EB_050_2','EB_050_3',
           'EB_150_1','EB_150_2','EB_150_3',
           'PP_025_1','PP_025_2',
           'PP_050_1','PP_050_2',
           'PP_100_1','PP_100_2',
           'PP_150_1']

In [None]:
# G1: experiments attributed entirely to training.
# G2: experiments whose windows are split between eval, test and training.
# NOTE(review): 'PP_150_1' is in EXP_LIST but in neither G1 nor G2 -- confirm
# that leaving it out of the split is intentional.
G1 = ['EB_025_1', 'EB_050_1', 'EB_150_1','PP_025_1','PP_050_1','PP_100_1']
G2 = ['EB_025_2', 'EB_025_3', 'EB_050_2','EB_050_3','EB_150_2','EB_150_3','PP_025_2','PP_050_2','PP_100_2' ]

# create_dataset shuffles windows with NumPy's global RNG; seed it so that the
# train/eval/test assignment is the same on every run of this cell.
SPLIT_SEED = 0
np.random.seed(SPLIT_SEED)
create_dataset(NPY_FOLDER, ML_exp, G1, G2,split_ratio)