# Synthetic Testing Data Generator

In [1]:
import pandas as pd
import random
import numpy as np
import copy
import json
from itertools import combinations
from sklearn.gaussian_process import GaussianProcessRegressor as GPR

import pickle

In [2]:
def listify(array):
    """ Helper function to allow np arrays to be turned into json

    Recursively converts a (possibly nested) numpy array or sequence into
    nested plain Python lists.

    array: iterable whose leaves are scalars (numpy scalars of any dtype,
           Python ints/floats/strings) and whose inner nodes are iterables.

    returns: list mirroring the nesting of `array`; numpy scalars are unwrapped
             to their native Python equivalents so json can serialize them
             (numpy integer types are not json-serializable as-is).
    """

    tmp = list()

    for arr in array:
        if isinstance(arr, np.generic) or (isinstance(arr, np.ndarray) and arr.ndim == 0):
            # any numpy scalar dtype (not only float64) is a leaf; unwrap it
            tmp.append(arr.item())
        elif isinstance(arr, (int, float, str)):
            # native Python scalars are leaves too - recursing into them would fail
            tmp.append(arr)
        else:
            tmp.append(listify(arr)) # recursively turns arrays into lists

    return tmp

### Dimension Generator

In [3]:
def get_dims(d_val, d):
    """ builds the list of per-dimension value counts: `d_val` repeated once for each of the `d` dimensions """

    return [d_val] * d

In [4]:
def get_random_dims(d, max_d_val):
    """ builds a list of `d` per-dimension value counts, each drawn at random from 5 up to (but excluding) `max_d_val` """

    sizes = []
    for _ in range(d):
        # one independent draw per dimension
        sizes.append(random.choice(range(5, max_d_val)))

    return sizes

### Pattern Generator

In [5]:
def take_off(change_point, n):
    """ Flat at 0 for the first `change_point` values, then rising by 0.01 per step.

    take_off(5, 7) => [0, 0, 0, 0, 0, 0.01, 0.02]
    """

    runway = [0] * change_point
    climb = [step * 0.01 for step in range(1, n - change_point + 1)]

    return list(np.round(runway + climb, 2))

In [6]:
def flat(n):
    """ all-zero trend of length n, e.g. flat(5) => [0, 0, 0, 0, 0] """

    zeros = [0] * n

    return list(np.round(zeros, 2))

In [7]:
def v_shape(change_point, n):

    """ V-shaped trend whose minimum (0) sits at index change_point-1 and which
    grows by 0.01 per step away from it on both sides.

    v_shape(5, 7) => [0.04, 0.03, 0.02, 0.01, 0, 0.01, 0.02]
    """

    pivot = change_point - 1
    values = [0] * n
    values[pivot] = 0

    # walk outwards from the pivot, one step at a time, in both directions
    for step in range(1, n + 1):
        right = pivot + step
        left = pivot - step
        if right < n:
            values[right] = step * 0.01
        if left >= 0:
            values[left] = step * 0.01

    return list(np.round(values, 2))

In [8]:
def down_flat_up(change_point1, change_point2, n):

    """ Falls to 0 at index change_point1, stays at 0, then rises by 0.01 per
    step from index change_point2 onwards.

    down_flat_up(2, 4, 7) => [0.02, 0.01, 0.0, 0, 0.01, 0.02, 0.03]
    """

    values = [0] * n

    # rising right-hand arm
    for height, idx in enumerate(range(change_point2, n), start=1):
        values[idx] = height * 0.01

    # left-hand arm: height grows with the distance from change_point1
    for dist in range(change_point1 + 1):
        values[change_point1 - dist] = dist * 0.01

    return list(np.round(values, 2))

In [9]:
def up_flat_up(change_point1, change_point2, n):

    """ Rises from negative values to 0 at index change_point1, stays at 0, then
    keeps rising by 0.01 per step from index change_point2 onwards.

    up_flat_up(2, 4, 7) => [-0.02, -0.01, 0.0, 0, 0.01, 0.02, 0.03]
    """

    values = [0] * n

    # rising right-hand arm
    for height, idx in enumerate(range(change_point2, n), start=1):
        values[idx] = height * 0.01

    # left-hand arm: the further left of change_point1, the more negative
    for dist in range(change_point1 + 1):
        values[change_point1 - dist] = -dist * 0.01

    return list(np.round(values, 2))

In [10]:
def flat_up_flat(change_point1, change_point2, n):

    """ Flat, then rising by 0.01 per step between change_point1 and change_point2,
    then flat again one step above the end of the ramp. The whole trend is shifted
    down by int((change_point2 - change_point1) / 2) * 0.01.

    flat_up_flat(2, 4, 7) => [-0.01, -0.01, 0, 0.01, 0.02, 0.02, 0.02]
    """

    values = [0] * n

    # ramp
    for height, idx in enumerate(range(change_point1, change_point2), start=1):
        values[idx] = height * 0.01

    # upper plateau
    for idx in range(change_point2, n):
        values[idx] = values[change_point2 - 1] + 0.01

    # shift everything down by half the ramp length (in 0.01 steps)
    offset = int((change_point2 - change_point1) / 2) * 0.01
    shifted = [value - offset for value in values]

    return list(np.round(shifted, 2))

In [11]:
def flat_up_down_flat(change_point1, change_point2, change_point3, n):

    """ Flat at 0, rising by 0.01 per step from change_point1, falling by 0.01 per
    step from change_point2, then flat from change_point3 one step below the end
    of the descent.

    flat_up_down_flat(2, 5, 8, 10) => [0, 0, 0.01, 0.02, 0.03, 0.02, 0.01, 0, -0.01, -0.01]
    """

    values = [0] * n

    # ascent
    for height, idx in enumerate(range(change_point1, change_point2), start=1):
        values[idx] = height * 0.01

    # descent: each value is one step below its left neighbour
    for idx in range(change_point2, change_point3):
        values[idx] = values[idx - 1] - 0.01

    # final plateau
    for idx in range(change_point3, n):
        values[idx] = values[change_point3 - 1] - 0.01

    return list(np.round(values, 2))

### Mean Generator for each dimension (based on d_val and dimension)

In [12]:
# every pattern name that can be built for a single dimension
_PATTERN_NAMES = ('take_off', 'flat', 'v_shape', 'down_flat_up', 'up_flat_up',
                  'flat_up_flat', 'flat_up_down_flat')


def _build_pattern(pattern, n):
    """ Helper that builds the mu values of ONE dimension holding `n` values, drawing the change points at random """

    half = n // 2
    third = n // 3

    if pattern == 'take_off':
        return take_off(random.choice(range(2, n-1)), n)

    if pattern == 'flat':
        return flat(n)

    if pattern == 'v_shape':
        return v_shape(random.choice(range(2, n-1)), n)

    # two change points: one in the first half, one in the second half
    if pattern == 'down_flat_up':
        return down_flat_up(random.choice(range(1, half)), random.choice(range(half+1, n-1)), n)

    if pattern == 'up_flat_up':
        return up_flat_up(random.choice(range(1, half)), random.choice(range(half+1, n-1)), n)

    if pattern == 'flat_up_flat':
        return flat_up_flat(random.choice(range(1, half)), random.choice(range(half+1, n-1)), n)

    # three change points: one in each third
    if pattern == 'flat_up_down_flat':
        return flat_up_down_flat(random.choice(range(1, third)),
                                 random.choice(range(third+1, 2*third)),
                                 random.choice(range(2*third+1, n-1)), n)

    raise ValueError(f'unknown pattern: {pattern!r}')


def mean_generator(dims, pattern):
    """ Helper function that returns list of list, depicting the mu values for each value in each dimension

    dims: list with the number of values of each dimension.
    pattern: one of 'take_off', 'flat', 'v_shape', 'down_flat_up', 'up_flat_up',
             'flat_up_flat', 'flat_up_down_flat', or 'rand' to draw a pattern
             independently for every dimension.

    returns: list with one list of mu values per entry of `dims`.

    raises: ValueError for an unknown pattern name (instead of silently returning None).
            IndexError (from random.choice on an empty range) when a dimension is too
            small for the pattern: 'take_off'/'v_shape' need at least 4 values,
            'down_flat_up'/'up_flat_up'/'flat_up_flat' at least 5 and
            'flat_up_down_flat' at least 7.
    """

    if pattern != 'rand':
        if pattern not in _PATTERN_NAMES:
            raise ValueError(f'unknown pattern: {pattern!r}')
        return [_build_pattern(pattern, n) for n in dims]

    # Special case if using random
    each_dimension_mean_values = list()

    for n in dims:
        if n < 7:
            # 'flat_up_down_flat' is left out: it cannot be built with fewer than 7 values
            rand_pattern = random.choices(['take_off', 'flat', 'v_shape', 'down_flat_up', 'up_flat_up', 'flat_up_flat'], [0.19, 0.05, 0.19, 0.19, 0.19, 0.19], k=1)[0]
        else:
            rand_pattern = random.choices(['take_off', 'flat', 'v_shape', 'down_flat_up', 'up_flat_up', 'flat_up_flat', 'flat_up_down_flat'], [0.16, 0.04, 0.16, 0.16, 0.16, 0.16, 0.16], k=1)[0]

        each_dimension_mean_values.append(_build_pattern(rand_pattern, n))

    return each_dimension_mean_values
    

### Interaction

In [13]:
def get_two_way_interaction(d, interaction):
    """ Map every unordered pair of dimension indices (a, b), a < b, to its interaction weight.

    With interaction == 'rand' each pair independently gets one of -0.0001, 0 or
    0.0001; otherwise every pair gets `interaction` itself.
    """

    pairs = combinations(range(d), 2)

    if interaction != 'rand':
        # same weight for all pairs
        return dict.fromkeys(pairs, interaction)

    weights = dict()
    for pair in pairs:
        weights[pair] = random.choice([-0.0001, 0, 0.0001])

    return weights


### Generate Random Matrix

In [14]:
def generate_random_matrix(each_dimension_mean_values, sd, two_way_interactions):
    """ Generates the actual full synthetic data from the per-dimension mean values,
    the noise standard deviation and the two-way interaction weights.

    For every coordinate (one index per dimension) the theoretical score is
        0.5 + sum of the selected mean value of each dimension
            + sum over pairs (a, b) of coord[a] * coord[b] * two_way_interactions[(a, b)]
    and the synthetic score is one draw from a normal distribution centred on the
    theoretical score with standard deviation `sd`.

    each_dimension_mean_values: list with one list of mean values per dimension.
    sd: standard deviation of the noise added to every cell.
    two_way_interactions: dict mapping (dim_a, dim_b) index pairs to a weight.

    returns: tuple of
        random_matrix     - array of synthetic scores, one axis per dimension
        best_synth_combo  - coordinate (list of ints) of the highest synthetic score
        best_synth_score  - highest synthetic score
        theo_matrix       - array of theoretical scores, same shape as random_matrix
        best_theo_combo   - coordinate (list of ints) of the highest theoretical score
        best_theo_score   - highest theoretical score
        gpr_df            - DataFrame with one row per cell: columns '0', '1', ...
                            hold the coordinate and 'y' the synthetic score
                            (rows are numbered 0..N-1).
    If any dimension has no values the matrices and gpr_df are empty, both best
    combos are None and both best scores are -inf.
    """

    shape = tuple(len(dimension) for dimension in each_dimension_mean_values)

    random_matrix = np.zeros(shape)
    theo_matrix = np.zeros(shape)

    best_synth_score = best_theo_score = -np.inf
    # defined up front so an empty grid does not fail at the return statement
    best_synth_combo = best_theo_combo = None

    # rows are collected in a plain list and turned into a DataFrame once at the end;
    # growing a DataFrame row by row is quadratic and DataFrame.append no longer
    # exists in pandas >= 2.0
    gpr_rows = list()

    # np.ndindex walks all coordinates with the last dimension varying fastest
    for coord in np.ndindex(shape):

        combo_score_mean = sum(each_dimension_mean_values[axis][index] for axis, index in enumerate(coord)) + 0.5

        # add on interaction value to mean
        for pair, weight in two_way_interactions.items():
            values_product = coord[pair[0]] * coord[pair[1]]
            combo_score_mean += values_product * weight

        synth_score = np.random.normal(combo_score_mean, sd, size=1)[0]

        random_matrix[coord] = synth_score
        gpr_rows.append([*coord, synth_score])

        theo_score = combo_score_mean
        theo_matrix[coord] = theo_score

        if synth_score > best_synth_score:
            best_synth_score = synth_score
            best_synth_combo = list(coord)

        if theo_score > best_theo_score:
            best_theo_score = theo_score
            best_theo_combo = list(coord)

    columns = [str(axis) for axis in range(len(shape))] + ['y']
    gpr_df = pd.DataFrame(gpr_rows, columns=columns)

    return random_matrix, best_synth_combo, best_synth_score, theo_matrix, best_theo_combo, best_theo_score, gpr_df

# Generate Data

Regular data

In [15]:
# Batch generation cell: loops over every combination of number of dimensions,
# values per dimension, mean pattern, noise sd, interaction setting and repetition,
# builds one synthetic data set per combination and writes it out as
# <name>.json (full matrices + maxima) and <name>.csv (one row per cell).

# Keyed by the number of dimensions d; each tuple lists the d_val settings to run.
# A number means "that many values in every dimension"; 'rand' means the sizes are
# drawn per dimension by get_random_dims.
D_VAL = {2: (5, 7, 'rand'),
    3: (5, 7, 'rand'),
    4: (5, 7, 'rand'),
    5: (5, 7, 'rand')
}
# running counter over all generated data sets; printed and used as file name prefix
i = 0

for d in (2, 3, 4, 5):
    for d_val in D_VAL[d]:
        for pattern in ['take_off', 'v_shape', 'down_flat_up', 'up_flat_up', 'flat_up_flat', 'rand']:
            
            # NOTE(review): 'flat_up_down_flat' is not in the pattern list above, so this
            # guard never fires as written; it would only matter if that pattern were added.
            if pattern == 'flat_up_down_flat' and (d_val == 5 or d_val == 'rand'):
                continue

            for sd in (0.005,):
                for interaction in (-0.0001, 0, 0.0001, 'rand'):
                    for rep in (1, 2):
                        
                        print(i, d, d_val, pattern, sd, interaction, rep)

                        # number of values in each of the d dimensions
                        if d_val != 'rand':
                            d_vals = get_dims(d_val, d)
                        else:
                            d_vals = get_random_dims(d, max_d_val = 12)
                    
                        each_dimension_mean_values = mean_generator(d_vals, pattern)

                        two_way_interactions = get_two_way_interaction(d, interaction)

                        # noisy (synthetic) and noise-free (theoretical) score grids, their
                        # maxima, and the long-format table that goes into the csv
                        synthetic_data, best_synthetic_combo, best_synthetic_score, \
                            theoretical_data, best_theoretical_combo, best_theoretical_score,\
                                   gpr_df = generate_random_matrix(each_dimension_mean_values, sd, two_way_interactions)

                        # listify turns the numpy grids into nested lists so json can dump them
                        json_output = {
                            'num_arg_vals': d_vals,
                            'synthetic_data': listify(synthetic_data),
                            'theoretical_data': listify(theoretical_data),
                            'max': {'synth_max': best_synthetic_score, 'theor_max': best_theoretical_score,
                                    'synth_max_coord': best_synthetic_combo, 'theor_max_coord': best_theoretical_combo},
                        }
                            
                        
                        # file name encodes the counter and every setting of this run
                        name = f'{i}-D_{d}_{d_val}-M_{pattern}-SD_{sd}-I_{interaction}-{rep}'
                        i += 1

                        # NOTE(review): open(..., 'w') does not create directories, so
                        # '../../Batch 1/' presumably has to exist beforehand - confirm
                        with open(f'../../Batch 1/{name}.json', 'w') as f:
                            json.dump(json_output, f, indent=2)  

                        gpr_df.to_csv(f'../../Batch 1/{name}.csv', index = False)           

0 2 5 take_off 0.005 -0.0001 1
1 2 5 take_off 0.005 -0.0001 2
2 2 5 take_off 0.005 0 1
3 2 5 take_off 0.005 0 2
4 2 5 take_off 0.005 0.0001 1
5 2 5 take_off 0.005 0.0001 2
6 2 5 take_off 0.005 rand 1
7 2 5 take_off 0.005 rand 2
8 2 5 v_shape 0.005 -0.0001 1
9 2 5 v_shape 0.005 -0.0001 2
10 2 5 v_shape 0.005 0 1
11 2 5 v_shape 0.005 0 2
12 2 5 v_shape 0.005 0.0001 1
13 2 5 v_shape 0.005 0.0001 2
14 2 5 v_shape 0.005 rand 1
15 2 5 v_shape 0.005 rand 2
16 2 5 down_flat_up 0.005 -0.0001 1
17 2 5 down_flat_up 0.005 -0.0001 2
18 2 5 down_flat_up 0.005 0 1
19 2 5 down_flat_up 0.005 0 2
20 2 5 down_flat_up 0.005 0.0001 1
21 2 5 down_flat_up 0.005 0.0001 2
22 2 5 down_flat_up 0.005 rand 1
23 2 5 down_flat_up 0.005 rand 2
24 2 5 up_flat_up 0.005 -0.0001 1
25 2 5 up_flat_up 0.005 -0.0001 2
26 2 5 up_flat_up 0.005 0 1
27 2 5 up_flat_up 0.005 0 2
28 2 5 up_flat_up 0.005 0.0001 1
29 2 5 up_flat_up 0.005 0.0001 2
30 2 5 up_flat_up 0.005 rand 1
31 2 5 up_flat_up 0.005 rand 2
32 2 5 flat_up_flat 0.005 