In [1]:
import os
import itertools
import pandas as pd
import numpy as np
from IPython.display import Markdown, display
from simulator_utilities import *
np.random.seed(7)

In [9]:
# ========================
# Preparation
# ========================
# Configuration cell: output locations, context grid, parameter statistics,
# simulation switches and the derived sample sizes reported at the bottom.

# Output locations (all files live in the same folder)
folder_path = r'/w/repos/slate_sim/data/no_changes'
summary_file_path, context_file_path, all_file_path = (
    os.path.join(folder_path, file_name)
    for file_name in ('simulation_data_summary.csv',
                      'simulation_data_{0}.csv',
                      'simulation_data2.csv')
)

# Contexts: every combination of one value per key becomes a unique context
contexts = {
    'platform': ['Mac', 'Windows'],
    'network': ['wifi', 'wired'],
    'country': ['US', 'CA'],
}
unique_contexts = list(map(list, itertools.product(*contexts.values())))

# Parameter statistics: the parameters share mean, min and std_range and
# differ only in their upper bound.
params = {
    name: {'mean': 1, 'min': 0, 'max': upper, 'std_range': [0.1, 1.1]}
    for name, upper in zip('xyz', (4, 3, 2))
}

# Initialization
data, dist_context = [], {}
reward_range = [0.05, 0.35]
coefficient_range = [0.1, 1]
interaction2 = True                     # include pairwise interaction terms
known_n = None                          # set to a number to skip the CI-based sizing below
descritization_policy = {'x': 8, 'y': 6, 'z': 4}   # grid points per parameter
share_descritized_grid = True

# Confidence Interval statistics
ci_mean, ci_std = 0, 0.1
# Only the first element returned by gen_distribution is kept.
ci_dist = gen_distribution('normal', ci_mean, ci_std, 5000)[0]

# N for each unique context
if not known_n:
    ci_width = 0.005
    ci_mult = 1.96
    # (multiplier * std / half-width)^2, bumped to the next hundred above
    # its floor-to-hundred value.
    n_per_config = int(((ci_mult * ci_std / (ci_width / 2)) ** 2 // 100 + 1) * 100)
else:
    n_per_config = known_n

# A policy entry is either a count of grid points or an explicit list of them.
n_per_context = np.prod([
    levels if isinstance(levels, int) else len(levels)
    for levels in descritization_policy.values()
]) * n_per_config

print('\n'.join([
    'Data Size per Configuration: {:,}'.format(n_per_config),
    'Data Size per Context: {:,}'.format(n_per_context),
    'Total Data Size: {:,}'.format(n_per_context*len(unique_contexts)),
]))

Data Size per Configuration: 6,200
Data Size per Context: 1,190,400
Total Data Size: 9,523,200


In [10]:
# ========================
# Generate data
# ========================
# For every unique context: draw parameter/reward distributions and reward
# coefficients, discretize the parameters, generate the rows, then keep the
# shuffled per-context frame and its summary. The frames are concatenated
# once after the loop (DataFrame.append was removed in pandas 2.0 and is
# quadratic when called per iteration).
param_list = list(params.keys())
df_cols = list(contexts.keys()) + param_list + ['reward']
plot_pairs = list(itertools.combinations(range(len(param_list)), 2))

# Replace integer grid sizes with an explicit, evenly spaced grid so that all
# contexts share the same grid. Entries that are already lists are left
# alone, which keeps this cell safe to re-run.
if share_descritized_grid:
    for k, v in descritization_policy.items():
        if isinstance(v, int):
            descritization_policy[k] = [round(x, 4) for x in np.linspace(params[k]['min'], params[k]['max'], v)]

# Loop-invariant settings, hoisted out of the per-context loop.
n_dist = 5000
if interaction2:
    inter_terms = list(itertools.combinations(range(len(param_list)), 2))
else:
    inter_terms = []
n_coef = len(param_list) + len(inter_terms)
reward_formula = param_list + ['{0}{1}'.format(param_list[x[0]], param_list[x[1]]) for x in inter_terms]

# Reset the accumulators here so re-running this cell does not duplicate rows.
data = []
context_frames = []
summary_frames = []

for i, c in enumerate(unique_contexts):

    # [1] Generate Distributions
    dist_context[i] = gen_param_reward(params, reward_range, n_dist)
#     plot_1d_param_reward(dist_context[i])

    # [2] Coefficients (one per parameter and per interaction term)
    coefficients = np.random.uniform(coefficient_range[0], coefficient_range[1], n_coef)

    # [3] Discretize parameter
    discretize_parameters(dist_context[i], descritization_policy, equal_distance=True)
    dist_context[i]['configs'] = gen_config_reward(dist_context[i], param_list)

    # [4] Reward formula
    dist_context[i]['configs']['reward_equation'] = formulate_equation(reward_formula, coefficients)
    dist_context[i]['configs']['config_rterms'] = add_interactions(dist_context[i]['configs']['config_rterms'], inter_terms)
    display(Markdown('* {0}'.format(dist_context[i]['configs']['reward_equation'])))

    # [5] Generate data; the last element of each generated row is the reward
    num_values = gen_data(dist_context[i], n_per_config, ci_dist, coefficients, reward_range)
    dist_context[i]['configs']['config_reward'] = np.array([x[-1] for x in num_values])
    c_data = [c + list(x) for x in num_values]
    data.extend(c_data)  # in place; `data + c_data` copied the whole list each time

    # [6] Plot 2D
#     plot_data = pd.DataFrame(num_values, columns=param_list + ['reward'])
#     plot_2d_paris(plot_data, param_list, inter_terms, round_to=0.05)

    # [7] Shuffled per-context frame (original row index is kept)
    df_context = pd.DataFrame(c_data, columns=df_cols).sample(frac=1)
    context_frames.append(df_context)

    # [8] Summary
    df_mean = summarize_df(df_context, param_list, c)
    # summarize_df's return type is not visible from this notebook; if it is
    # a Series, keep it as a single row, which is what DataFrame.append did.
    summary_frames.append(df_mean.to_frame().T if isinstance(df_mean, pd.Series) else df_mean)

df_all = pd.concat(context_frames) if context_frames else pd.DataFrame()
df_summary = pd.concat(summary_frames) if summary_frames else pd.DataFrame()

* reward = 0.4536x + 0.4029y + 0.6907z + 0.629xy + 0.9425xz + 0.6135yz

* reward = 0.4717x + 0.942y + 0.7765z + 0.4469xy + 0.4216xz + 0.7715yz

* reward = 0.6787x + 0.8366y + 0.943z + 0.2928xy + 0.1167xz + 0.5696yz

* reward = 0.4244x + 0.9763y + 0.2896z + 0.7557xy + 0.3804xz + 0.3595yz

* reward = 0.1341x + 0.9369y + 0.6904z + 0.5507xy + 0.8814xz + 0.7389yz

* reward = 0.6955x + 0.4882y + 0.3424z + 0.23xy + 0.5295xz + 0.2228yz

* reward = 0.1011x + 0.1249y + 0.1083z + 0.714xy + 0.6099xz + 0.5863yz

* reward = 0.2235x + 0.2301y + 0.7272z + 0.6984xy + 0.8501xz + 0.7932yz

In [28]:
# Save summary file
# Create the output folder first so the write does not fail when the folder
# does not exist yet.
os.makedirs(os.path.dirname(summary_file_path), exist_ok=True)
df_summary.to_csv(summary_file_path, index=False)

In [35]:
# Save the full data set.
# NOTE: this rebinds `all_file_path`, replacing the 'simulation_data2.csv'
# path set in the preparation cell.
# NOTE(review): the file name says "scaledto1", but no scaling step is visible
# in this part of the notebook - confirm df_all holds the intended data.
all_file_path = os.path.join(folder_path, 'simulation_data_scaledto1.csv')
# Create the output folder first so the write does not fail when the folder
# does not exist yet.
os.makedirs(os.path.dirname(all_file_path), exist_ok=True)
df_all.to_csv(all_file_path, index=False)

In [12]:
# Sanity check: list the rewards above the configured upper bound, smallest
# first. The threshold is read from `reward_range` rather than repeating the
# literal 0.35, so the check follows the configuration.
df_all.loc[df_all["reward"] > reward_range[1], ["reward"]].sort_values("reward")

Unnamed: 0,reward
1101410,0.350000
1075106,0.350000
210722,0.350000
611042,0.350000
1015778,0.350000
...,...
898559,0.688074
887423,0.688074
714239,0.688074
1009727,0.688074


Index([        ('Mac', 'wifi', 'US', 0.0, 0.6, 0.6667, 0.09564447481313343),
               ('Mac', 'wifi', 'US', 4.0, 0.6, 1.3333, 0.19001861193672703),
             ('Mac', 'wifi', 'US', 2.8571, 3.0, 1.3333, 0.1826750691872218),
            ('Mac', 'wifi', 'US', 2.8571, 1.8, 1.3333, 0.16679204194345365),
               ('Mac', 'wifi', 'US', 2.8571, 3.0, 2.0, 0.32528477860581445),
                ('Mac', 'wifi', 'US', 0.5714, 0.6, 0.0, 0.2854394461158166),
               ('Mac', 'wifi', 'US', 1.1429, 1.8, 0.0, 0.31625533883648055),
            ('Mac', 'wifi', 'US', 0.5714, 0.6, 0.6667, 0.05014855937056123),
                   ('Mac', 'wifi', 'US', 4.0, 1.2, 2.0, 0.3432934002317119),
             ('Mac', 'wifi', 'US', 2.2857, 1.2, 0.6667, 0.1001668116969795),
       ...
        ('Windows', 'wired', 'CA', 3.4286, 0.0, 0.6667, 0.1257947831351207),
        ('Windows', 'wired', 'CA', 2.8571, 1.2, 1.3333, 0.2018403288895462),
       ('Windows', 'wired', 'CA', 3.4286, 2.4, 0.6667, 0.06438239