In [7]:
# This cell is included to show what libraries are imported and used in the project
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
import random
import sys
import os
import pickle
import math
import itertools

from glob import glob
from scipy.linalg import expm
import bisect
from numpy import linalg as LA

import tensorflow as tf

from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity

from IPython.display import HTML
%matplotlib inline

In [8]:
# Functions that perform a random transformation (Based on Freifeld article)

# Generate L matrix from eq. 10
def generate_L(N_p):
    rows = N_p - 1
    cols = 2 * N_p
    
    delta = float(1 / N_p)
    
    L = np.zeros((rows, cols))
    
    for i in range(rows):
        L[i][2*i] = (i+1) * delta
        L[i][2*i+1] = 1
        L[i][2*i+2] = -(i+1) * delta
        L[i][2*i+3] = -1
    
    return L


# Find basis of null space of matrix via SVD
def nullspace(A, atol=1e-16, rtol=0):
    """Compute an approximate basis for the nullspace of A.

    The algorithm used by this function is based on the singular value
    decomposition of `A`.

    Parameters
    ----------
    A : ndarray
        A should be at most 2-D.  A 1-D array with length k will be treated
        as a 2-D with shape (1, k)
    atol : float
        The absolute tolerance for a zero singular value.  Singular values
        smaller than `atol` are considered to be zero.
    rtol : float
        The relative tolerance.  Singular values less than rtol*smax are
        considered to be zero, where smax is the largest singular value.

    If both `atol` and `rtol` are positive, the combined tolerance is the
    maximum of the two; that is::
        tol = max(atol, rtol * smax)
    Singular values smaller than `tol` are considered to be zero.

    Return value
    ------------
    ns : ndarray
        If `A` is an array with shape (m, k), then `ns` will be an array
        with shape (k, n), where n is the estimated dimension of the
        nullspace of `A`.  The columns of `ns` are a basis for the
        nullspace; each element in numpy.dot(A, ns) will be approximately
        zero.
    """

    A = np.atleast_2d(A)
    u, s, vh = np.linalg.svd(A)
    tol = max(atol, rtol * s[0])
    nnz = (s >= tol).sum()
    ns = vh[nnz:].conj().T
    return ns

# Psi computation, Eq. 20 in Freifeld
def psi_computation(x, a, b, t):
    if a == 0:
        psi = x + t*b
    else:
        psi = math.exp(t*a)*x + (b*(math.exp(t*a)-1))/a
    
    return psi

# Transformation v1! (Algorithm 1 from Freifeld)
def transformation_v1(P, A, U, N_step, N_p, t=1):
    N_pts = len(U)
    delta_t = float(t) / N_step
    
    phi = np.zeros(N_pts)
    
    for i in range(N_pts):
        phi[i] = U[i]
        
        for j in range(N_step):
            c = bisect.bisect_left(P[1:], phi[i])
            if c == N_p:
                c = c-1
            a = A[2*c]
            b = A[2*c+1]

            phi[i] = psi_computation(phi[i], a, b, delta_t)
        
    return phi

# Perform data augmentation
def generate_new_data(sigma, B, X_train, N_step, N_p):
    [D,d] = B.shape
    [train_size,ts_length] = X_train.shape
    
    # Sample new transformation from Gaussian distribution
    theta_new = np.random.multivariate_normal(mean=np.zeros(d), cov=sigma)

    # Compute A matrix for new transformation
    A = np.matmul(B, theta_new)

    # Sample a data point from uniform distribution
    i = random.randint(0, train_size-1)
    x_i = X_train[i]

    # Transform time series
    x = np.linspace(0,1,ts_length)
    x_trans = transformation_v1(tess, A, x, N_step, N_p)

    # Interpolate values to correct interval
    x_trans_resc = (x_trans - np.amin(x_trans)) / (np.amax(x_trans) - np.amin(x_trans))
    T_x_i = np.interp(x, x_trans_resc, x_i)
    
    return T_x_i


def remove_outliers(data):
    # Compute norm of every data point
    data_norm = LA.norm(data, axis=1)
    
    # Find mean and standard deviation
    sd = np.std(data_norm)
    mean = np.mean(data_norm)
    
    # Find indices for data points to be removed
    indices = []
    for i in range(len(data)):
        if (data_norm[i] < mean - 3*sd or mean + 3*sd < data_norm[i]):
            indices.append(i)
            
    # Remove outliers
    return np.delete(data, indices, axis=0)


# ------------------------------- Functions for Tensorflow -------------------------------
# Transformation v2! (TENSORFLOW IMPLEMENTATION)
def transformation_v2(A, U, N_step, N_p, t=1):
    delta_t = float(t) / N_step
    
    phi = U
    
    for j in range(N_step):
        
        # Find cell index
        idx = tf.floor(N_p * phi)
        idx = tf.clip_by_value(idx, clip_value_min=0, clip_value_max=N_p-1)
        idx = tf.cast(idx, tf.int32)
        
        # Fetch values from A (vector field)
        a = tf.reshape(tf.gather(A, 2*idx), [-1])
        b = tf.reshape(tf.gather(A, 2*idx+1), [-1])
        
        # Perform psi computation
        phi = tf.where(tf.equal(a, 0), psi_a_eq_zero(phi, a, b, delta_t), psi_a_noteq_zero(phi, a, b, delta_t))
        
    return phi

def psi_a_eq_zero(x, a, b, t):
    tb = tf.multiply(t,b)
    psi = tf.add(x, tb)
    return psi

def psi_a_noteq_zero(x, a, b, t):
    c1 = tf.exp(tf.multiply(t, a))
    c2 = tf.truediv(tf.multiply(b, tf.subtract(c1, 1)), a)
    psi = tf.add(tf.multiply(c1, x), c2)
    return psi

def tf_linear_interpolation(x, x_trans, y, ts_length):
    
    # POSSIBLY RESCALE VALUES IN X_TRANS TO RANGE [0,1] !!!!!!!!!!!!!
    
    # Find nearest smaller neighbor
    dist = tf.subtract(tf.reshape(x_trans, [-1, 1]), x)
    
    # Find index of interval in tessellation
    greater_than_zero = tf.greater_equal(dist, 0)
    idx = (ts_length-1) - tf.reduce_sum(tf.cast(greater_than_zero, tf.float32), axis=0)
    idx = tf.clip_by_value(idx, clip_value_min=0, clip_value_max=ts_length-2)
    idx = tf.cast(idx, tf.int32)
    
    # Fetch values from x_trans and y
    x0 = tf.gather(x_trans, idx)
    x1 = tf.gather(x_trans, idx+1)
    y0 = tf.gather(y, idx)
    y1 = tf.gather(y, idx+1)
    
    # Perform linear interpolation on points in x
    #frac = tf.truediv(tf.subtract(y1, y0), tf.subtract(x1, x0))
    #x_diff = tf.subtract(x, x0)
    #y_interp = tf.add(y0, tf.multiply(x_diff, frac))
    
    y_interp = y0 + (x-x0) * ((y1-y0)/(x1-x0))
    
    return y_interp

In [9]:
PATH = 'UCR_TS_Archive_2015/'
data_sets = {}

In [14]:
ds_list = []
for folder_PATH in glob('transformations/'+'*'):
    ds_trans = folder_PATH.split("/")[-1]
    ds = ds_trans.split("_")[:-1]
    ds_list.append("_".join(ds))

In [15]:
for folder_PATH in glob('Augmented_data_sets/'+'*'):
    ds_aug = folder_PATH.split("/")[-1]
    ds = ds_aug.split("_")[:-1]
    ds_list.remove('_'.join(ds))

In [16]:
ds_list # 'DiatomSizeReduction','50words'

# ds_list_skip_transformation = ['PhalangesOutlinesCorrect', 'ProximalPhalanxOutlineCorrect', 'FordA','ElectricDevices']
# ds_list_early_stopped = ['wafer']

[]

In [20]:
continue_run = False
skip_transformations = False
#ds_list = ['wafer']

In [21]:
for folder_PATH in glob(PATH+'*/'):
    
    ds = folder_PATH.split("/")[-2]
    if folder_PATH.split("/")[-2] not in ds_list:
        continue
    data_sets[ds] = {}
    
    print(ds)
    
    with open(folder_PATH + ds + '_TRAIN', 'r') as f:
        
        train = f.read().splitlines()
        data_sets[ds]['TRAIN'] = np.array([train[0].split(",")])
        
        for line in train[1:]:
            data_sets[ds]['TRAIN'] = np.append(data_sets[ds]['TRAIN'], [line.split(",")], axis=0)
            
    with open(folder_PATH + ds + '_TEST', 'r') as f:
        
        test = f.read().splitlines()
        data_sets[ds]['TEST'] = np.array([test[0].split(",")])
        
        for line in test[1:]:
            data_sets[ds]['TEST'] = np.append(data_sets[ds]['TEST'], [line.split(",")], axis=0)

DiatomSizeReduction
50words


In [22]:
for ds in ds_list:
    
    print('#################################' + ds + '#################################')
    
    # --------------------------------------------------------------------------------------------------

    # Set up training and test set
    train_size = len(data_sets[ds]['TRAIN'])
    test_size = len(data_sets[ds]['TEST'])
    ts_length = len(data_sets[ds]['TRAIN'][0])-1

    X_train = np.zeros((train_size, ts_length))
    y_train = np.zeros(train_size)

    X_test = np.zeros((test_size, ts_length))
    y_test = np.zeros(test_size)

    for i in range(ts_length+1):
        # Train
        for j in range(train_size):
            if i == 0:
                y_train[j] = int(data_sets[ds]['TRAIN'][j][0])
            else:
                X_train[j][i-1] = float(data_sets[ds]['TRAIN'][j][i])
        # Test
        for j in range(test_size):
            if i == 0:
                y_test[j] = int(data_sets[ds]['TEST'][j][0])
            else:
                X_test[j][i-1] = float(data_sets[ds]['TEST'][j][i])

    if not np.all(y_train):
        zero_idx = True
    else:
        zero_idx = False

    # Make sure the labels are integers
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)

    # Make sure the labels are zero indexed
    num_classes = len(np.unique(y_train))

    idx = 0
    for label in np.unique(y_train):
        y_train[np.where( y_train == label )] = idx
        y_test[np.where( y_test == label )] = idx
        idx += 1

    # Convert labels to one-hot encoding
    y_train_onehot = np.zeros((train_size, num_classes))
    y_train_onehot[np.arange(train_size), y_train] = 1



    # --------------------------------------------------------------------------------------------------



    # Get indices for different classes
    class_indices = {}

    for label in range(num_classes):
        class_indices[label] = np.where( y_train == label )[0]

        
    # Load transformations
    with open('transformations/' + ds + '_transformations', 'rb') as f:
        ds_transformations_pkl = pickle.load(f)
        

    # Seperate transformations into classes
    labels = []
    
    for label in range(num_classes):
        labels = labels + [label] * len(class_indices[label])**2
    
    # Remove pairs to reduce computations
    while (len(labels) > 150000 and skip_transformations):
        del labels[::4]
       
    
    split_indices = []
    for label in range(num_classes):
        split_indices.append(np.count_nonzero(np.array(labels) == label))
   
    split_indices = np.cumsum(split_indices)[:-1]
    
    theta_classes = np.split(ds_transformations_pkl, split_indices)

    
    # Clean set of transformations

    # Remove NaNs
    for i in range(num_classes):
        theta_classes[i] = theta_classes[i][~np.isnan(theta_classes[i]).any(axis=1)]




    # --------------------------------------------------------------------------------------------------





    # Number of intervals and number of cell intersections
    N_p = 10
    N_v = N_p + 1

    N_step = 100

    # Generate tesselation in 1D
    tess = np.linspace(0,1,N_v)

    # Generate L
    L = generate_L(N_p)

    # Find basis for null(L)
    B = nullspace(L)
    [D,d] = B.shape




    # --------------------------------------------------------------------------------------------------




    # Extract data points and perform PCA for all classes
    X_train_classes = {}
    class_sizes = []
    sigmas = {}

    for idx in range(num_classes):
        X_train_classes[idx] = X_train[np.where( y_train == idx )]
        class_sizes.append(len(X_train_classes[idx]))

        pca = PCA(n_components=None)
        pca.fit(remove_outliers(theta_classes[idx]))

        sigmas[idx] = pca.get_covariance()





    # --------------------------------------------------------------------------------------------------





    # Perform data augmentation on data sets

    # Set up numpy array to contain augmented data set
    if continue_run:
        with open('Augmented_data_sets/' + ds + '_augmented', 'rb') as f:
            augmented_data_set = pickle.load(f)
        idx = len(augmented_data_set)
        continue_run = False
    else:
        augmented_data_set = np.zeros((train_size, ts_length+1))
        augmented_data_set[:,1:] = X_train
        augmented_data_set[:,0] = y_train
        idx = 0

    # Run loop until stopped manually
    while idx < 100000:
        # Save transformation to file
        if (idx % 1000 == 0):
            with open('Augmented_data_sets/' + ds + '_augmented', 'wb') as f:
                pickle.dump(augmented_data_set, f)
            print('\nSaving ' + str(len(augmented_data_set)) + ' data points\n')

        class_idx = random.randint(0, num_classes-1)

        # Generate new data point
        new_data = generate_new_data(sigmas[class_idx], B, X_train_classes[class_idx], N_step, N_p)
        data_point = np.insert(new_data, 0, class_idx)

        # Append to augmented data set
        augmented_data_set = np.row_stack((augmented_data_set,data_point))

        idx += 1

#################################DiatomSizeReduction#################################

Saving 16 data points



  explained_variance_ = (S ** 2) / (n_samples - 1)


ValueError: array must not contain infs or NaNs