In [1]:
    efit_type = 'EFIT02'

    scenario = {'actuator_names': ['pinj', 'curr', 'tinj'],
                        'profile_names': ['thomson_temp_{}'.format(efit_type),
                                          'thomson_dens_{}'.format(efit_type),
                                          'ffprime_{}'.format(efit_type),
                                          'press_{}'.format(efit_type),
                                          'q_{}'.format(efit_type)],
                        'scalar_names': [],
                        'profile_downsample': 2,
                        'state_encoder_type': 'dense',
                        'state_decoder_type': 'dense',
                        'control_encoder_type': 'dense',
                        'control_decoder_type': 'dense',
                        'state_encoder_kwargs': {'num_layers': 6,
                                                 'layer_scale': 2,
                                                 'std_activation':'relu'},
                        'state_decoder_kwargs': {'num_layers': 6,
                                                 'layer_scale': 2,
                                                 'std_activation':'relu'},
                        'control_encoder_kwargs': {'num_layers': 10,
                                                   'layer_scale': 2,
                                                   'std_activation':'relu'},
                        'control_decoder_kwargs': {'num_layers': 10,
                                                   'layer_scale': 2,
                                                   'std_activation':'relu'},
                        'state_latent_dim':50,
                        'control_latent_dim':5,
                        'x_weight':1,
                        'u_weight':1,
                        'discount_factor':1,
                        'batch_size': 128,
                        'epochs': 100,
                        'flattop_only': True,
                        'raw_data_path': '/scratch/gpfs/jabbate/mixed_data/final_data.pkl',
                        'process_data': True,
                        'processed_filename_base': '/scratch/gpfs/jabbate/data_60_ms_randomized_',
                        'optimizer': 'adagrad',
                        'optimizer_kwargs': {},
                        'shuffle_generators': True,
                        'pruning_functions': ['remove_nan', 'remove_dudtrip', 'remove_I_coil'],
                        'normalization_method': 'RobustScaler',
                        'window_length': 1,
                        'window_overlap': 0,
                        'lookback': 0,
                        'lookahead': 3,
                        'sample_step': 1,
                        'uniform_normalization': True,
                        'train_frac': 0.8,
                        'val_frac': 0.2,
                        'nshots': 12000,
                        'excluded_shots': ['topology_TOP', 'topology_OUT', 'topology_MAR', 'topology_IN', 'topology_DN', 'topology_BOT']}


In [2]:
import pickle
import numpy as np
import gc
import sys
import time
from pathlib import Path
from keras.utils import Sequence
from keras.callbacks import TensorBoard
from helpers.normalization import normalize
from helpers.pruning_functions import remove_dudtrip, remove_I_coil, remove_ECH, remove_gas, remove_nan
from tqdm import tqdm
from helpers import exclude_shots
import numba

def process_data(rawdata, sig_names, normalization_method, window_length=1,
                 window_overlap=0, lookbacks={}, lookahead=3, sample_step=5,
                 uniform_normalization=True, train_frac=0.7, val_frac=0.2,
                 nshots=None,
                 verbose=1, flattop_only=True, randomize=True, **kwargs):
    """Organize data into correct format for training

    Gathers raw data into bins, group into training sequences, normalize,
    and split into training and validation sets.

    Args:
        rawdata (dict): Nested dictionary of raw signal data, or path to pickle.
            Should be of the form rawdata[shot][signal_name] = signal_data.
        sig_names (list): List of signal names as strings.
        normalization_method (str): One of `StandardScaler`, `MinMax`, `MaxAbs`,
            `RobustScaler`, `PowerTransform`.
        window_length (int): Number of samples to average over in each bin/window.
        window_overlap (int): How many timesteps to overlap windows.
        lookbacks (dict of int): How many window lengths for lookback for each sig.
        lookahead (int): How many window lengths to predict into the future.
        sample_step (int): How much to offset sequential training sequences.
            Step of 1 means sample[i] and sample[i+1] will be offset by 1, with
            the rest overlapping.
        uniform_normalization (bool): 'True' uses the same normalization
            parameters over a whole profile, 'False' normalizes each spatial
            point separately.
        val_frac (float): Fraction of samples to use for validation.
        nshots (int): How many shots to use. If None, all available will be used.
        verbose (int): verbosity level. 0 is no CL output, 1 shows progress, 2 is abbreviated.
        flattop_only (bool): Whether to only include data from flattop.

    Returns:
        traindata (dict): Dictionary of numpy arrays, one entry for each signal.
            Each array has shape [nsamples,lookback+lookahead,signal_shape]
        valdata (dict): Dictionary of numpy arrays, one entry for each signal.
            Each array has shape [nsamples,lookback+lookahead,signal_shape]
        param_dict (dict): Dictionary of parameters used during normalization,
            to be used for denormalizing later. Eg, mean, stddev, method, etc.
    """
    ##############################
    # Load data
    ##############################
    if type(rawdata) is not dict:
        if verbose:
            print('Loading')
        abs_path = Path(rawdata).resolve()
        if abs_path.exists():
            with open(abs_path, 'rb') as f:
                rawdata = pickle.load(f, encoding='latin1')
        else:
            print(abs_path)
            raise IOError("No such path to data file")
            
    ##############################
    # get pruning functions
    ##############################
    pruning_functions = kwargs.get('pruning_functions', [])
    if 'ech' not in sig_names:
        pruning_functions.append('remove_ECH')
    if not {'gasB', 'gasC', 'gasD', 'gasE'}.issubset(set(sig_names)):
        pruning_functions.append('remove_gas')
    prun_dict = {'remove_nan': remove_nan,
                 'remove_ECH': remove_ECH,
                 'remove_I_coil': remove_I_coil,
                 'remove_gas': remove_gas,
                 'remove_dudtrip': remove_dudtrip}
    for i, elem in enumerate(pruning_functions):
        if isinstance(elem, str):
            pruning_functions[i] = prun_dict[elem]

    ##############################
    # get excluded shots
    ##############################
    excluded_shots = kwargs.get('excluded_shots', [])
    exclude_dict = {'topology_TOP': exclude_shots.topology_TOP,
                    'topology_SNT': exclude_shots.topology_SNT,
                    'topology_SNB': exclude_shots.topology_SNB,
                    'topology_OUT': exclude_shots.topology_OUT,
                    'topology_MAR': exclude_shots.topology_MAR,
                    'topology_IN': exclude_shots.topology_IN,
                    'topology_DN': exclude_shots.topology_DN,
                    'topology_BOT': exclude_shots.topology_BOT}
    for i, elem in enumerate(excluded_shots):
        if isinstance(elem, str):
            excluded_shots[i] = exclude_dict[elem]
        if not isinstance(elem, list):
            excluded_shots[i] = [elem]
    excluded_shots = [item for sublist in excluded_shots for item in sublist]

    ##############################
    # get sig names
    ##############################
    extra_sigs = ['time', 'shotnum']
    if remove_dudtrip in pruning_functions:
        extra_sigs += ['dud_trip']
    if remove_I_coil in pruning_functions:
        extra_sigs += ['bt', 'curr', 'C_coil_method', 'I_coil_method']
    if remove_gas in pruning_functions:
        extra_sigs += ['gasB', 'gasC', 'gasD', 'gasE', 'pfx1', 'pfx2']
    if remove_ECH in pruning_functions:
        extra_sigs += ['ech']
    sig_names = list(np.unique(sig_names))
    sigsplustime = list(np.unique(sig_names + extra_sigs))
    if verbose:
        print('Signals: ' + ', '.join(sig_names))

    ##############################
    # figure out lookbacks
    ##############################
    if isinstance(lookbacks, int):
        max_lookback = lookbacks
        lookbacks = {sig: max_lookback for sig in sigsplustime}
    else:
        max_lookback = 0
        for val in lookbacks.values():
            if val > max_lookback:
                max_lookback = val
        for sig in sigsplustime:
            if sig not in lookbacks.keys():
                lookbacks[sig] = max_lookback
            
    ##############################
    # find which shots have all the signals needed
    ##############################
    usabledata = []
    all_shots = sorted(list(rawdata.keys()))
    for shot in all_shots:
        rawdata[shot]['shotnum'] = np.ones(rawdata[shot]['time'].shape[0])*shot
        if set(sigsplustime).issubset(set(rawdata[shot].keys())) \
           and rawdata[shot]['time'].size > (max_lookback+lookahead) \
           and shot not in excluded_shots:
            usabledata.append(rawdata[shot])
    usabledata = np.array(usabledata)
    del rawdata
    gc.collect()
    if nshots is not None:
        nshots = np.minimum(nshots, len(usabledata))
        usabledata = usabledata[:nshots]
    else:
        nshots = len(usabledata)
    if verbose:
        print('Number of useable shots: ', str(len(usabledata)))
        print('Number of shots used: ', str(nshots))
        sys.stdout.flush()
    if verbose:
        t = 0
        for shot in usabledata:
            t += shot['time'].size
        print('Total number of timesteps: ', str(t))
        sys.stdout.flush()
        
    ##############################
    # some helper functions
    ##############################          
    def moving_average(a, n):
        """moving average of array a with window size n"""
        ret = np.nancumsum(a, axis=0)
        ret[n:] = ret[n:] - ret[:-n]
        return ret[n - 1:] / n

    def is_valid(shot):
        """checks if a shot is completely NaN or if it never reached flattop"""
        for sig in sigsplustime:
            if np.isnan(shot[sig]).all():  # or np.isinf(shot[sig]).any():
                return False
        if (flattop_only):
            if (shot['t_ip_flat'] == None or shot['ip_flat_duration'] == None):
                return False
        return True

    def get_non_nan_inds(arr):
        """gets indices of array where value is not NaN"""
        if len(arr.shape) == 1:
            return np.where(~np.isnan(arr))[0]
        else:
            return np.where(np.any(~np.isnan(arr), axis=1))[0]

    def get_first_index(shot):
        """gets index of first valid timeslice for a shot"""
        input_max = max([get_non_nan_inds(shot[sig])[0] +
                         lookbacks[sig] for sig in sig_names])
        output_max = max([get_non_nan_inds(shot[sig])[0] -
                          lookahead for sig in sig_names])
        if (flattop_only) and (shot['t_ip_flat'] != None):
            current_max = np.searchsorted(
                shot['time'], shot['t_ip_flat'], side='left')
            return np.ceil(max(input_max, output_max, current_max)).astype(int)
        else:
            return np.ceil(max(input_max, output_max)).astype(int)

    def get_last_index(shot):
        """gets index of last valid timeslice for a shot"""
        partial_min = min([get_non_nan_inds(shot[sig])[-1]
                           for sig in sig_names])
        full_min = min([get_non_nan_inds(shot[sig])[-1] -
                        lookahead for sig in sig_names])
        if (flattop_only) and (shot['t_ip_flat'] != None) and (shot['ip_flat_duration'] != None):
            current_min = np.searchsorted(
                shot['time'], shot['t_ip_flat']+shot['ip_flat_duration'], side='right')
            return np.floor(min(full_min, partial_min, current_min)).astype(int)
        else:
            return np.floor(min(full_min, partial_min)).astype(int)
    
    @numba.njit
    def group_data(array,first,last,sample_step,lookback, lookahead):
        """groups shot data into i/o chunks"""
        data = []
        for i in range(first,last,sample_step):
            data.append(array[i-lookback:i+lookahead+1])
        return data
    
    ##############################
    # loop through shots and do stuff
    ##############################
    alldata = {}
    shots_with_complete_nan = []
    for sig in sigsplustime:
        alldata[sig] = []  # initalize empty lists
    for shot in tqdm(usabledata, desc='Gathering', ascii=True, dynamic_ncols=True,
                     disable=not verbose == 1):
        ##############################
        # take moving average of data and bin it
        ##############################
        binned_shot = {}
        for sig in sigsplustime:
            if np.any(np.isinf(shot[sig])):
                shot[sig][np.isinf(shot[sig])] = np.nan            
            binned_shot[sig] = moving_average(shot[sig],window_length)[::window_length-window_overlap]
        binned_shot['t_ip_flat'] = shot['t_ip_flat']
        binned_shot['ip_flat_duration'] = shot['ip_flat_duration']
        if not is_valid(binned_shot):
            shots_with_complete_nan.append(np.unique(shot["shotnum"]))
            continue

        ##############################
        # group into arrays of input/output pairs
        ##############################
        first = get_first_index(binned_shot)
        last = get_last_index(binned_shot)
        for sig in sigsplustime:
            alldata[sig] += group_data(binned_shot[sig],first,last,sample_step,lookbacks[sig],lookahead)

    if verbose:
        print("Shots with Complete NaN: " + ', '.join(str(e)
                                                      for e in shots_with_complete_nan))
    sys.stdout.flush()
    del usabledata
    gc.collect()
    
    ##############################
    # stack data from all shots together
    ##############################    
    for sig in tqdm(sigsplustime, desc='Stacking', ascii=True, dynamic_ncols=True,
                    disable=not verbose == 1):
        alldata[sig] = np.stack(alldata[sig])
    print("{} samples total".format(len(alldata['time'])))
    sys.stdout.flush()
    ##############################
    # apply pruning functions
    ##############################
    # call fns in the right order to speed things up
    if remove_ECH in pruning_functions:
        alldata = remove_ECH(alldata,verbose)
    if remove_gas in pruning_functions:
        alldata = remove_gas(alldata,verbose)
    if remove_I_coil in pruning_functions:
        alldata = remove_I_coil(alldata,verbose)
    if remove_nan in pruning_functions:
        alldata = remove_nan(alldata,verbose)
    if remove_dudtrip in pruning_functions:
        alldata = remove_dudtrip(alldata,verbose)

    print("{} samples remaining after pruning".format(len(alldata['time'])))
    sys.stdout.flush()
    ##############################
    # normalize data
    ##############################    
    alldata, normalization_params = normalize(
        alldata, normalization_method, uniform_normalization, verbose)
    
    ##############################
    # split into train and validation sets
    ##############################    
    nsamples = alldata['time'].shape[0]
    inds = np.random.permutation(nsamples) if randomize else np.arange(nsamples)
    traininds = inds[:int(nsamples*train_frac)]
    valinds = inds[int(nsamples*train_frac)
                       :int(nsamples*(val_frac+train_frac))]
    traindata = {}
    valdata = {}
    for sig in tqdm(sigsplustime, desc='Splitting', ascii=True, dynamic_ncols=True,
                    disable=not verbose == 1):
        traindata[sig] = alldata[sig][traininds]
        valdata[sig] = alldata[sig][valinds]
    time.sleep(0.1)
    if verbose:
        print('Total number of samples: ', str(nsamples))
        print('Number of training samples: ', str(traininds.size))
        print('Number of validation samples: ', str(valinds.size))
    return traindata, valdata, normalization_params

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import numpy as np
import numba


@numba.njit
def prune_loop(inds,shotnumarr,timearr):
    remove_inds = set()
    for ind in inds:
        shot = shotnumarr[ind]
        time = timearr[ind]
        i = ind
        while np.any(shotnumarr[i] == shot) and np.any(timearr[i] >= time):
            remove_inds.add(i)
            i += 1
            if i>=len(timearr):
                break
    return remove_inds


def remove_dudtrip(data, verbose):
    if verbose:
        print('Removing dudtrip')
    dud_trip_inds = np.nonzero(data['dud_trip'])[0]
    if len(dud_trip_inds)==0:
        return data
    remove_inds = prune_loop(dud_trip_inds,data['shotnum'],data['time'])
    if verbose:
        print("Removed {} samples".format(len(remove_inds)))
    keep_inds = set(range(len(data['time']))).difference(remove_inds)
    if verbose:
        print("{} samples remaining".format(len(keep_inds)))
    for sig in data.keys():
        data[sig] = data[sig][list(keep_inds)]
    return data


def remove_I_coil(data, verbose):
    if verbose:
        print('Removing weird I-coils')
    
    @numba.njit
    def find_Icoil_inds(n,bt,curr,C_coil_method,I_coil_method):
        c_coil = list()
        i_coil = list()
        EFC = list()
        for i in range(n):
            if np.mean(bt[i]*curr[i]) < 0:
                # left-handed
                if not set(np.unique(C_coil_method[i])).issubset({5, 0, -1}):
                    c_coil.append(i)
                if not set(np.unique(I_coil_method[i])).issubset({5, 0, -1}):
                    i_coil.append(i)
                if not np.all(np.logical_xor(C_coil_method[i] == 5, I_coil_method[i] == 5)):
                    EFC.append(i)
            else:
                # right-handed
                if not set(np.unique(C_coil_method[i])).issubset({6, 0, -1}):
                    c_coil.append(i)
                if not set(np.unique(I_coil_method[i])).issubset({7, 0, -1}):
                    i_coil.append(i)
                if not np.any(np.logical_or(np.logical_and(C_coil_method[i] == 6, 
                                                           I_coil_method[i] != 7), 
                                            np.logical_and(C_coil_method[i] != 6, 
                                                           I_coil_method[i] == 7))):
                    EFC.append(i)
                    
        coil_inds = c_coil + i_coil + EFC
        return coil_inds

    coil_inds = np.unique(find_Icoil_inds(len(data['time']),
                                          data['bt'],
                                          data['curr'],
                                          data['C_coil_method'].astype(int),
                                          data['I_coil_method'].astype(int)))
    if len(coil_inds)==0:
        return data
    remove_inds = prune_loop(coil_inds,data['shotnum'],data['time'])
    if verbose:
        print("Removed {} samples".format(len(remove_inds)))
    keep_inds = set(range(len(data['time']))).difference(remove_inds)
    if verbose:
        print("{} samples remaining".format(len(keep_inds)))
    for sig in data.keys():
        data[sig] = data[sig][list(keep_inds)]
    return data


def remove_gas(data, verbose):
    if verbose:
        print('Removing weird gas')
    from functools import reduce
    threshold=2
    gasB_inds = np.nonzero(np.any(data['gasB'] > threshold, axis=1))[0]
    gasC_inds = np.nonzero(np.any(data['gasC'] > threshold, axis=1))[0]
    gasD_inds = np.nonzero(np.any(data['gasD'] > threshold, axis=1))[0]
    gasE_inds = np.nonzero(np.any(data['gasE'] > threshold, axis=1))[0]
    pfx1_inds = np.nonzero(np.any(data['pfx1'] > threshold, axis=1))[0]
    pfx2_inds = np.nonzero(np.any(data['pfx2'] > threshold, axis=1))[0]
    gas_inds = reduce(np.union1d, (gasB_inds, gasC_inds,
                                   gasD_inds, gasE_inds, pfx1_inds, pfx2_inds))
    if len(gas_inds)==0:
        return data
    remove_inds = prune_loop(gas_inds,data['shotnum'],data['time'])
    if verbose:
        print("Removed {} samples".format(len(remove_inds)))
    keep_inds = set(range(len(data['time']))).difference(remove_inds)
    if verbose:
        print("{} samples remaining".format(len(keep_inds)))
    for sig in data.keys():
        data[sig] = data[sig][list(keep_inds)]
    return data


def remove_ECH(data, verbose):
    if verbose:
        print('Removing ECH')
    ech_inds = np.nonzero(np.any(data['ech'] > .5, axis=1))[0]
    if len(ech_inds)==0:
        return data
    remove_inds = prune_loop(ech_inds,data['shotnum'],data['time'])
    if verbose:
        print("Removed {} samples".format(len(remove_inds)))
    keep_inds = set(range(len(data['time']))).difference(remove_inds)
    if verbose:
        print("{} samples remaining".format(len(keep_inds)))
    for sig in data.keys():
        data[sig] = data[sig][list(keep_inds)]
    return data


def remove_nan(data, verbose):
    if verbose:
        print('Removing NaN')
    remove_inds = []
    for sig in data.keys():
        if data[sig].ndim==1:
            remove_inds += np.where(np.isnan(data[sig]))[0].tolist()
        else:
            ax = tuple(np.arange(1,data[sig].ndim).astype(int))
            remove_inds += np.where(np.any(np.isnan(data[sig]),axis=ax))[0].tolist()
    remove_inds = np.unique(remove_inds)
    if verbose:
        print("Removed {} samples".format(len(remove_inds)))
    keep_inds = set(range(len(data['time']))).difference(remove_inds)
    if verbose:
        print("{} samples remaining".format(len(keep_inds)))
    for sig in data.keys():
        data[sig] = data[sig][list(keep_inds)]
    return data


In [4]:
import numpy as np
from scipy import optimize
from tqdm import tqdm
import keras.backend as K


def normalize_arr(data, method, uniform_over_profile=True):
    """Normalizes data before training

    Args:
        data: Numpy array. Array.shape[0] = samples
        method (str): One of `StandardScaler`, `MinMax`, `MaxAbs`,
            `RobustScaler`, `PowerTransform`.
        uniform_over_profile (bool): 'True' uses the same normalization
            parameters over a whole profile, 'False' normalizes each spatial
            point separately.

    Returns:
        data: Numpy array of normalized data.
        param_dict (dict): Dictionary of parameters used during normalization,
            to be used for denormalizing later. Eg, mean, stddev, method, etc.
    """
    param_dict = {}
    # first replace all infs and nans with mean value
    data[np.isinf(data)] = np.nan
    nanmean = np.nanmean(data, axis=(0, 1))
    param_dict['nanmean'] = nanmean
    if data.ndim > 2:
        for i in range(data.shape[2]):
            data[np.isnan(data[:, :, i]), i] = nanmean[i]
    else:
        data[np.isnan(data)] = nanmean
    # then normalize
    if method == 'StandardScaler':
        if uniform_over_profile or data.ndim < 3:
            mean = np.mean(data)
            std = np.std(data)
        else:
            mean = np.mean(data, axis=(0, 1), keepdims=True)
            std = np.std(data, axis=(0, 1), keepdims=True)
        param_dict.update({'method': method,
                           'mean': mean,
                           'std': std})
        return (data-mean)/np.maximum(std, np.finfo(np.float32).eps), param_dict

    elif method == 'MinMax':
        if uniform_over_profile or data.ndim < 3:
            armin = np.amin(data)
            armax = np.amax(data)
        else:
            armin = np.amin(data, axis=(0, 1), keepdims=True)
            armax = np.amax(data, axis=(0, 1), keepdims=True)
        param_dict.update({'method': method,
                           'armin': armin,
                           'armax': armax})
        return (data-armin)/np.maximum((armax-armin), np.finfo(np.float32).eps), param_dict

    elif method == 'MaxAbs':
        if uniform_over_profile or data.ndim < 3:
            maxabs = np.amax(np.abs(data))
        else:
            maxabs = np.amax(np.abs(data), axis=(0, 1), keepdims=True)
        param_dict.update({'method': method,
                           'maxabs': maxabs})
        return data/np.maximum(maxabs, np.finfo(np.float32).eps), param_dict

    elif method == 'RobustScaler':
        if uniform_over_profile or data.ndim < 3:
            median = np.median(data)
            iqr = np.subtract(*np.percentile(data, [75, 25]))
        else:
            median = np.median(data, axis=0)
            iqr = np.subtract(*np.percentile(data, [75, 25], axis=(0, 1)))
        param_dict.update({'method': method,
                           'median': median,
                           'iqr': iqr})
        return (data-median)/np.maximum(iqr, np.finfo(np.float32).eps), param_dict

    elif method == 'PowerTransform':
        def yeo_johnson_transform(x, lmbda):
            """Return transformed input x following Yeo-Johnson transform with
            parameter lambda.
            """
            out = np.zeros_like(x)
            pos = x >= 0  # binary mask
            # when x >= 0
            if abs(lmbda) < np.finfo(np.float32).eps:
                out[pos] = np.log1p(x[pos])
            else:  # lmbda != 0
                out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
            # when x < 0
            if abs(lmbda - 2) > np.finfo(np.float32).eps:
                out[~pos] = - \
                    (np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
            else:  # lmbda == 2
                out[~pos] = -np.log1p(-x[~pos])
            return out

        def yeo_johnson_optimize(x):
            """Find and return optimal lambda parameter of the Yeo-Johnson
            transform by MLE, for observed data x.
            Like for Box-Cox, MLE is done via the brent optimizer. From Scipy
            """
            def _neg_log_likelihood(lmbda):
                """Return the negative log likelihood of the observed data x as a
                function of lambda. From Scipy"""
                x_trans = yeo_johnson_transform(x, lmbda)
                n_samples = x.shape[0]
                loglike = -n_samples / 2 * np.log(x_trans.var())
                loglike += (lmbda - 1) * (np.sign(x) *
                                          np.log1p(np.abs(x))).sum()
                return -loglike
            # choosing bracket -2, 2 like for boxcox
            return optimize.brent(_neg_log_likelihood, brack=(-2, 2))
        if uniform_over_profile or data.ndim < 3:
            lmbda = yeo_johnson_optimize(data.flatten())
            y = yeo_johnson_transform(
                data.flatten(), lmbda).reshape(data.shape)
            mean = np.mean(y)
            std = np.std(y)
        else:
            y = np.zeros_like(data)
            lmbda = np.array([yeo_johnson_optimize(data[:, :, i])
                              for i in range(data.shape[2])])
            for i, l in enumerate(lmbda):
                y[:, :, i] = yeo_johnson_transform(data[:, :, i], l)
            mean = np.mean(y, axis=(0, 1))
            std = np.std(y, axis=(0, 1))
        param_dict.update({'method': method,
                           'lambda': lmbda,
                           'mean': mean,
                           'std': std})
        return (y-mean)/np.maximum(std, np.finfo(np.float32).eps), param_dict
    elif method is None or method == 'None':
        param_dict.update({'method': method})
        return data, param_dict
    else:
        raise ValueError("Unknown normalization method")


def normalize(data, method, uniform_over_profile=True, verbose=1):
    """Normalizes data before training

    Args:
        data: Numpy array or dictionary of numpy arrays. If a dictionary, all
            arrays are normalized using the same method, but each array with
            respect to itself. Array.shape[0] = batches
        method (str): One of `StandardScaler`, `MinMax`, `MaxAbs`,
            `RobustScaler`, `PowerTransform`.
        uniform_over_profile (bool): 'True' uses the same normalization
            parameters over a whole profile, 'False' normalizes each spatial
            point separately.
        verbose (int): verbosity level. 0 is no CL output, 1 shows progress, 2 abbreviates.

    Returns:
        data: Numpy array or dictionary of numpy arrays. Normalized data.
        param_dict (dict): Dictionary of parameters used during normalization,
            to be used for denormalizing later. Eg, mean, stddev, method, etc.
    """
    if type(data) is dict:
        param_dict = {}
        for key in tqdm(data.keys(), desc='Normalizing', ascii=True, dynamic_ncols=True,
                        disable=not verbose==1):
            if key not in ['time', 'shotnum']:
                data[key], p = normalize_arr(
                    data[key], method, uniform_over_profile)
                param_dict[key] = p
        return data, param_dict
    else:
        return normalize_arr(data, method, uniform_over_profile)


def denormalize_arr(data, param_dict):
    """Denormalizes data after training

    Args:
        data: Numpy array of data to denorm.
        param_dict (dict): Dictionary of parameters used during normalization,
            to be used for denormalizing. Eg, mean, stddev, method, etc.

    Returns:
        data: Numpy array of denormalized data.
    """
    eps = np.finfo('float32').eps
    #for key, val in param_dict.items():
    #    if K.is_tensor(val):
    #        val = np.array(K.eval(val))
    if param_dict['method'] == 'StandardScaler':
        return data*np.maximum(param_dict['std'], eps) + param_dict['mean']
    elif param_dict['method'] == 'MinMax':
        return data*np.maximum((param_dict['armax']-param_dict['armin']), eps)
        + param_dict['armin']
    elif param_dict['method'] == 'MaxAbs':
        return data*np.maximum(param_dict['maxabs'], eps)
    elif param_dict['method'] == 'RobustScaler':
        return data*np.maximum(param_dict['iqr'], eps) + param_dict['median']
    elif param_dict['method'] == 'PowerTransform':
        y = data*np.maximum(param_dict['std'], eps) + param_dict['mean']

        def np_yeo_johnson_inverse_transform(x, lmbda):
            """Return inverse-transformed input x following Yeo-Johnson inverse
            transform with parameter lambda. From Scipy
            """
            x_inv = np.zeros_like(x)
            pos = x >= 0
            # when x >= 0
            if np.abs(lmbda) < np.finfo(np.float32).eps:
                x_inv[pos] = np.exp(x[pos]) - 1
            else:  # lmbda != 0
                x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1
            # when x < 0
            if np.abs(lmbda - 2) > np.finfo(np.float32).eps:
                x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1,
                                           1 / (2 - lmbda))
            else:  # lmbda == 2
                x_inv[~pos] = 1 - np.exp(-x[~pos])
            return x_inv
        if param_dict['lambda'].size > 1:
            for i, l in enumerate(param_dict['lambda']):
                y[:, i] = np_yeo_johnson_inverse_transform(y[:, i], l)
        else:
            y = np_yeo_johnson_inverse_transform(
                y.flatten(), param_dict['lambda']).reshape(y.shape)
        return y
    elif param_dict['method'] is None or param_dict['method'] == 'None':
        return data
    else:
        raise ValueError("Unknown normalization method")


def denormalize(data, param_dict, verbose=1):
    """Denormalizes data after training

    Args:
        data: Numpy array or dictionary of numpy arrays.
        param_dict (dict): Dictionary of parameters used during normalization,
            to be used for denormalizing. Eg, mean, stddev, method, etc.
        verbose (int): verbosity level. 0 is no CL output, 1 shows progress, 2 abbreviates.

    Returns:
        data: Numpy array or dictionary of numpy arrays. Denormalized data.
    """
    if type(data) is dict:
        data=data.copy() # don't make changes in place
        for key in tqdm(data.keys(), desc='Denormalizing', ascii=True, dynamic_ncols=True,
                        disable=not verbose==1):
            if key not in ['time', 'shotnum']:
                data[key] = denormalize_arr(data[key], param_dict[key])
        return data
    else:
        return denormalize_arr(data, param_dict)


def renormalize(data, param_dict, verbose=1):
    """Normalizes data using already determined parameters

    Args:
        data: Numpy array or dictionary of numpy arrays of raw data.
        param_dict (dict): Dictionary of parameters used during normalization,
            Eg, mean, stddev, method, etc.
        verbose (int): verbosity level. 0 is no CL output, 1 shows progress, 2 abbreviates

    Returns:
        data: Numpy array or dictionary of numpy arrays. Normalized data.
    """
    if type(data) is dict:
        for key in tqdm(data.keys(), desc='Normalizing', ascii=True, dynamic_ncols=True,
                        disable=not verbose==1):
            if key not in ['time', 'shotnum']:
                data[key] = renormalize(data[key], param_dict[key])
        return data
    else:
        # first remove all inf/nan
        data[np.isinf(data)] = np.nan
        if data.ndim > 2:
            for i in range(data.shape[2]):
                data[np.isnan(data[:, :, i]), i] = param_dict['nanmean'][i]
        else:
            data[np.isnan(data)] = param_dict['nanmean']
        # then normalize
        if param_dict['method'] == 'StandardScaler':
            return (data - param_dict['mean'])/np.maximum(
                param_dict['std'], np.finfo(np.float32).eps)
        elif param_dict['method'] == 'MinMax':
            return (data - param_dict['armin'])/np.maximum(
                (param_dict['armax']-param_dict['armin']), np.finfo(np.float32).eps)
        elif param_dict['method'] == 'MaxAbs':
            return data/np.maximum(param_dict['maxabs'], np.finfo(np.float32).eps)
        elif param_dict['method'] == 'RobustScaler':
            return (data - param_dict['median'])/np.maximum(
                param_dict['iqr'], np.finfo(np.float32).eps)
        elif param_dict['method'] == 'PowerTransform':
            def yeo_johnson_transform(x, lmbda):
                """Return transformed input x following Yeo-Johnson transform with
                parameter lambda.
                """
                out = np.zeros_like(x)
                pos = x >= 0  # binary mask
                # when x >= 0
                if abs(lmbda) < np.finfo(np.float32).eps:
                    out[pos] = np.log1p(x[pos])
                else:  # lmbda != 0
                    out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
                # when x < 0
                if abs(lmbda - 2) > np.finfo(np.float32).eps:
                    out[~pos] = - \
                        (np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
                else:  # lmbda == 2
                    out[~pos] = -np.log1p(-x[~pos])
                return out
            y = data
            if param_dict['lambda'].size > 1:
                for i, l in enumerate(param_dict['lambda']):
                    y[:, i] = yeo_johnson_transform(y[:, i], l)
            else:
                y = yeo_johnson_transform(
                    y.flatten(), param_dict['lambda']).reshape(y.shape)
            y = (y - param_dict['mean'])/np.maximum(
                param_dict['std'], np.finfo(np.float32).eps)
            return y
        elif param_dict['method'] is None or param_dict['method'] == 'None':
            return data
        else:
            raise ValueError("Unknown normalization method")


In [5]:
%load_ext line_profiler
scenario['sig_names'] = scenario['profile_names'] + scenario['scalar_names'] + scenario['actuator_names']
verbose=1

In [6]:
%lprun -f process_data -f remove_dudtrip -f remove_I_coil -f remove_gas -f remove_ECH -f remove_nan process_data(scenario['raw_data_path'],scenario['sig_names'],scenario['normalization_method'],scenario['window_length'],scenario['window_overlap'],scenario['lookback'],scenario['lookahead'],scenario['sample_step'],scenario['uniform_normalization'],scenario['train_frac'],scenario['val_frac'],scenario['nshots'],verbose,scenario['flattop_only'],pruning_functions=scenario['pruning_functions'],excluded_shots=scenario['excluded_shots'])

Loading
Signals: curr, ffprime_EFIT02, pinj, press_EFIT02, q_EFIT02, thomson_dens_EFIT02, thomson_temp_EFIT02, tinj
Number of useable shots:  5740
Number of shots used:  5740
Total number of timesteps:  1517720


Gathering: 100%|##########| 5740/5740 [00:51<00:00, 111.41it/s]

Shots with Complete NaN: [156351.], [156352.], [156403.], [156478.], [156481.], [156793.], [156796.], [157744.], [157747.], [158007.], [158008.], [159579.], [161179.], [164471.], [164473.], [164474.], [165187.], [165343.], [165714.], [165864.], [165885.], [165891.], [166428.], [167599.], [171541.], [172008.], [172051.], [172052.], [172053.], [172055.], [172065.], [172066.], [172067.], [172068.], [172069.], [172083.], [172084.], [172085.], [172090.], [172292.], [172293.], [172294.], [172295.], [172296.], [172297.], [172302.], [173989.], [174750.], [174755.], [174762.], [175708.], [175710.], [175711.], [176905.], [176997.], [177001.], [177002.], [177011.], [177020.], [178718.]



Stacking: 100%|##########| 21/21 [01:15<00:00,  3.59s/it]

1052920 samples total





Removing ECH
Removed 395616 samples
657304 samples remaining
Removing weird gas
Removed 35929 samples
621375 samples remaining
Removing weird I-coils
Removed 118826 samples
502549 samples remaining
Removing NaN
Removed 0 samples
502549 samples remaining
Removing dudtrip
Removed 16273 samples
486276 samples remaining
486276 samples remaining after pruning


Normalizing: 100%|##########| 21/21 [00:30<00:00,  1.44s/it]
Splitting: 100%|##########| 21/21 [00:01<00:00, 13.22it/s]


Total number of samples:  486276
Number of training samples:  389020
Number of validation samples:  97256


Timer unit: 1e-06 s

Total time: 254.833 s
File: <ipython-input-2-529bb85e402a>
Function: process_data at line 15

Line #      Hits         Time  Per Hit   % Time  Line Contents
    15                                           def process_data(rawdata, sig_names, normalization_method, window_length=1,
    16                                                            window_overlap=0, lookbacks={}, lookahead=3, sample_step=5,
    17                                                            uniform_normalization=True, train_frac=0.7, val_frac=0.2,
    18                                                            nshots=None,
    19                                                            verbose=1, flattop_only=True, randomize=True, **kwargs):
    20                                               """Organize data into correct format for training
    21                                           
    22                                               Gathers raw data into bins, group into 

In [265]:
%lprun -f process_data -f remove_dudtrip -f remove_I_coil -f remove_gas -f remove_ECH -f remove_nan process_data(scenario['raw_data_path'],scenario['sig_names'],scenario['normalization_method'],scenario['window_length'],scenario['window_overlap'],scenario['lookback'],scenario['lookahead'],scenario['sample_step'],scenario['uniform_normalization'],scenario['train_frac'],scenario['val_frac'],scenario['nshots'],verbose,scenario['flattop_only'],pruning_functions=scenario['pruning_functions'],excluded_shots=scenario['excluded_shots'])

1052920 samples total
486276 samples remaining after pruning


Timer unit: 1e-06 s

Total time: 251.249 s
File: <ipython-input-261-9ed60b859aca>
Function: process_data at line 13

Line #      Hits         Time  Per Hit   % Time  Line Contents
    13                                           def process_data(rawdata, sig_names, normalization_method, window_length=1,
    14                                                            window_overlap=0, lookbacks={}, lookahead=3, sample_step=5,
    15                                                            uniform_normalization=True, train_frac=0.7, val_frac=0.2,
    16                                                            nshots=None,
    17                                                            verbose=1, flattop_only=True, randomize=True, **kwargs):
    18                                               """Organize data into correct format for training
    19                                           
    20                                               Gathers raw data into bins, group int

In [253]:
%lprun -f process_data -f remove_dudtrip -f remove_I_coil -f remove_gas -f remove_ECH -f remove_nan process_data(scenario['raw_data_path'],scenario['sig_names'],scenario['normalization_method'],scenario['window_length'],scenario['window_overlap'],scenario['lookback'],scenario['lookahead'],scenario['sample_step'],scenario['uniform_normalization'],scenario['train_frac'],scenario['val_frac'],scenario['nshots'],verbose,scenario['flattop_only'],pruning_functions=scenario['pruning_functions'],excluded_shots=scenario['excluded_shots'])

1052920 samples total
486276 samples remaining after pruning


Timer unit: 1e-06 s

Total time: 252.923 s
File: <ipython-input-249-9c984d9629d5>
Function: process_data at line 13

Line #      Hits         Time  Per Hit   % Time  Line Contents
    13                                           def process_data(rawdata, sig_names, normalization_method, window_length=1,
    14                                                            window_overlap=0, lookbacks={}, lookahead=3, sample_step=5,
    15                                                            uniform_normalization=True, train_frac=0.7, val_frac=0.2,
    16                                                            nshots=None,
    17                                                            verbose=1, flattop_only=True, randomize=True, **kwargs):
    18                                               """Organize data into correct format for training
    19                                           
    20                                               Gathers raw data into bins, group int

In [146]:
%lprun?

[0;31mDocstring:[0m
Execute a statement under the line-by-line profiler from the
line_profiler module.

Usage:
  %lprun -f func1 -f func2 <statement>

The given statement (which doesn't require quote marks) is run via the
LineProfiler. Profiling is enabled for the functions specified by the -f
options. The statistics will be shown side-by-side with the code through the
pager once the statement has completed.

Options:

-f <function>: LineProfiler only profiles functions and methods it is told
to profile.  This option tells the profiler about these functions. Multiple
-f options may be used. The argument may be any expression that gives
a Python function or method object. However, one must be careful to avoid
spaces that may confuse the option parser.

-m <module>: Get all the functions/methods in a module

One or more -f or -m options are required to get any useful results.

-D <filename>: dump the raw statistics out to a pickle file on disk. The
usual extension for this is ".lprof".

In [149]:
a = np.arange(5)
b = [False,True,True,False,False]
a[b]

array([1, 2])

In [11]:
a = [np.inf,1,2,np.nan]
a = np.array(a)
a[np.isinf(a)] = np.nan
a[np.isinf(a)] = np.nan
a

array([nan,  1.,  2., nan])

In [218]:
np.any(np.random.random((5,6,7))>.5,axis=tuple(np.arange(1,3).astype(int)))

array([ True,  True,  True,  True,  True])

In [216]:
tuple(np.arange(1,2).astype(int))

(1,)

In [164]:
a.tolist()

[0, 1, 2, 3, 4]

In [198]:
a = list()

In [199]:
a

[]