# Multi-d Contextual Bandit with Slates on Simulated Dataset

## Preparation

In [None]:
%%javascript
// Replace the output area's `_should_scroll` check with one that always
// answers false, regardless of the `lines` argument. Judging by the hook's
// name this stops long cell outputs from being put in a scrolling box
// (assumption based on the name; confirm against the notebook front end).
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
import pandas as pd
import numpy as np
from vowpalwabbit import pyvw
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import math
import slates
import os
from tqdm import tqdm

In [None]:
def setup_outcomes():
    """Return a fresh per-context outcome store.

    The result maps every (platform, network, country) combination of
    {'Mac', 'Windows'} x {'wifi', 'wired'} x {'CA', 'US'} to its own new,
    empty list, so callers can append to one context without touching another.
    """
    outcomes = {}
    for platform in ('Mac', 'Windows'):
        for network in ('wifi', 'wired'):
            for country in ('CA', 'US'):
                outcomes[(platform, network, country)] = []
    return outcomes

def optimal_policy_sample(context, action, name, sample_size=1):
    """Draw rewards recorded in the ground truth for one (context, action) pair.

    Parameters
    ----------
    context : sequence
        ``(platform, network, country)`` values to match.
    action : sequence
        ``(x, y, z)`` values to match.
    name : str
        Key into the module-level ``ground_truth_info`` dict selecting the dataset.
    sample_size : int, default 1
        Number of rewards to draw (with replacement).

    Returns
    -------
    numpy.ndarray
        ``sample_size`` values drawn from the ``reward`` cell of the first
        matching row of ``ground_truth_info[name]['ground_truth_rewards_df']``.

    Raises
    ------
    IndexError
        If no ground-truth row matches ``context`` and ``action``.
    """
    temp_df = ground_truth_info[name]['ground_truth_rewards_df']
    mask = ((temp_df['platform'] == context[0])
            & (temp_df['network'] == context[1])
            & (temp_df['country'] == context[2])
            & (temp_df['x'] == action[0])
            & (temp_df['y'] == action[1])
            & (temp_df['z'] == action[2]))
    # Select through the mask rather than feeding an index *label* to the
    # positional `.iloc`; the two only coincide for a default RangeIndex.
    matches = temp_df.loc[mask, 'reward']
    if matches.empty:
        raise IndexError(
            f"no ground-truth row in {name!r} for context={context!r}, action={action!r}"
        )
    possible_rewards = matches.iloc[0]
    return np.random.choice(possible_rewards, sample_size, replace=True)

def optimal_policy_median(context, action, name):
    """Return the median ground-truth reward for one (context, action) pair.

    Parameters
    ----------
    context : sequence
        ``(platform, network, country)`` values to match.
    action : sequence
        ``(x, y, z)`` values to match.
    name : str
        Key into the module-level ``ground_truth_info`` dict selecting the dataset.

    Returns
    -------
    float
        ``np.median`` of the ``reward`` cell of the first matching row of
        ``ground_truth_info[name]['ground_truth_rewards_df']``.

    Raises
    ------
    IndexError
        If no ground-truth row matches ``context`` and ``action``.
    """
    temp_df = ground_truth_info[name]['ground_truth_rewards_df']
    # Every condition must be evaluated on the frame selected by `name`; the
    # platform test previously read a leftover global frame instead.
    mask = ((temp_df['platform'] == context[0])
            & (temp_df['network'] == context[1])
            & (temp_df['country'] == context[2])
            & (temp_df['x'] == action[0])
            & (temp_df['y'] == action[1])
            & (temp_df['z'] == action[2]))
    # Select through the mask rather than feeding an index *label* to the
    # positional `.iloc`; the two only coincide for a default RangeIndex.
    matches = temp_df.loc[mask, 'reward']
    if matches.empty:
        raise IndexError(
            f"no ground-truth row in {name!r} for context={context!r}, action={action!r}"
        )
    possible_rewards = matches.iloc[0]
    return np.median(possible_rewards)

def plot_rewards(log=False, x_max=400, optimal_cost=0.05):
    """Plot, per context, the rolling-mean cost of every dataset in ``test_configs``.

    One figure is drawn for each context key found in the ``"outcomes"`` dict of
    the first ``test_configs`` entry. For every dataset the figure shows:

    * a solid line: rolling mean of the costs stored in
      ``test_configs[name]["outcomes"][context]``;
    * a dotted line in the same colour: rolling mean of rewards drawn by
      ``optimal_policy_sample`` for ``ground_truth_info[name]['min_actions'][context]``.

    A black dotted horizontal reference line is drawn at ``optimal_cost``.

    Parameters
    ----------
    log : bool, default False
        If true, plot the natural log of both curves and of the reference line.
    x_max : int or float, default 400
        Upper limit of the x-axis.
    optimal_cost : float, default 0.05
        Height of the horizontal reference line (before the optional log).
    """
    if not test_configs:
        # Nothing has been loaded, so there is no context list to derive.
        return
    contexts = sorted(test_configs[next(iter(test_configs))]["outcomes"].keys())
    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
    for context in contexts:
        plt.figure(figsize=(10, 5))
        for i, name in enumerate(test_configs):
            samples = test_configs[name]["outcomes"][context]
            # A tenth of the series, but never 0: with fewer than ten samples a
            # zero-length window cannot produce a meaningful rolling mean.
            window_size = max(1, len(samples) // 10)
            # Wrap around so more datasets than cycle colours cannot overrun the list.
            color = colors[i % len(colors)]
            plot_data = pd.Series(samples, dtype=float).rolling(window_size, min_periods=0).mean()
            optimal_policy_results = optimal_policy_sample(context, ground_truth_info[name]['min_actions'][context], name, len(samples))
            plot_opt = pd.Series(optimal_policy_results, dtype=float).rolling(window_size, min_periods=0).mean()
            if log:
                plot_data = np.log(plot_data)
                plot_opt = np.log(plot_opt)
            plt.plot(plot_data, color=color, linewidth=2, label="{} slate".format(name))
            plt.plot(plot_opt, color=color, linewidth=2, linestyle=':', label="Best Possible")
        ymin = np.log(optimal_cost) if log else optimal_cost
        plt.axhline(y=ymin, color='k', linestyle=':', label='Optimal(Groundtruth)')
        plt.xlim([0, x_max])
        plt.title("{}".format(context))
        plt.ylabel("Cost")
        plt.legend(loc="upper right")
        plt.show()

Specify files and contexts of the experiment. These data files are generated by using the Continuous Multi-D Simulator.

In [None]:
# Directory holding the simulator CSV files. Set the SLATES_DATA_PATH
# environment variable to point somewhere else without editing the notebook;
# when it is unset the original location is used.
DATA_PATH = os.environ.get('SLATES_DATA_PATH', r'E:\data\20200214_vector_plearning_msrnyc\data')

# Dataset name -> CSV file name. The names are used as plot legend labels and
# as keys of `ground_truth_info` / `test_configs`; `plot_rewards` looks up
# ground truth by the *test* dataset name, so both dicts need the same keys.
GROUND_TRUTH_DATASETS = {
    "4_3_2_0,1": "df_all_4_3_2_0.01.csv",
    "8_6_4_0,1": "df_all_8_6_4_0.01.csv",
    "16_12_8_0,1": "df_all_16_12_8_0.01.csv",
    # "32_24_16_0,1": "df_all_32_24_16_0.01.csv",
}

TEST_DATASETS = {
    "4_3_2_0,1": "df_all_4_3_2_0.01.csv",
    "8_6_4_0,1": "df_all_8_6_4_0.01.csv",
    "16_12_8_0,1": "df_all_16_12_8_0.01.csv",
    # "32_24_16_0,1": "df_all_32_24_16_0.01.csv",
}

In [None]:
# Every (platform, network, country) combination, in the order
# platform-major / country-minor. The training loop draws one per iteration.
contexts_to_choose_from = [
    (platform, network, country)
    for platform in ('Mac', 'Windows')
    for network in ('wifi', 'wired')
    for country in ('CA', 'US')
]


Training parameters: VW command-line arguments and the number of iterations

In [None]:
# Flags shared by both VW command lines below.
common_args = "--quiet --cb_type mtr --epsilon 0.2 --first_only"
# Command line handed to pyvw.vw() by the slates training loop.
slates_args = f"{common_args} --ccb_explore_adf --coin --interactions UUUA"
# Same flags with --cb_explore_adf in place of --ccb_explore_adf.
cb_args = f"{common_args} --cb_explore_adf --coin --interactions UUUA"
# Number of predict/learn rounds run per dataset.
num_iter = 6_000

## Read Data

Read all datasets to compare

In [None]:
# Per-dataset bundle: raw frame, reward lookup table, sorted action values,
# their "axis=value" string names, and the combined action lists.
test_configs = {}
for name, filename in TEST_DATASETS.items():
    config = {}
    test_configs[name] = config

    df = pd.read_csv(os.path.join(DATA_PATH, filename))
    config["data"] = df

    # One row per (context, action) with the array of distinct rewards seen for it.
    distinct_rewards = df.groupby(['platform', 'network', 'country','x','y','z'])['reward'].unique()
    config["rewards"] = pd.DataFrame(distinct_rewards).reset_index()

    # Sorted distinct values of each action dimension ...
    for axis in ("x", "y", "z"):
        config[axis] = sorted(df[axis].unique())
    # ... and their string names, e.g. "x=<value>".
    for axis in ("x", "y", "z"):
        config[axis + "_actions"] = [axis + "=" + str(a) for a in config[axis]]

    all_string_actions, all_actions = slates.combine_float_actions_categorical(
        config["x"], config["y"], config["z"]
    )
    config["all_string_actions"] = all_string_actions
    config["all_actions"] = all_actions

Find the optimal action/reward from the ground truth.

In [None]:
# For every ground-truth dataset record:
#   'min_reward'  - context -> {action -> mean reward of that action},
#   'min_actions' - context -> action with the smallest mean reward,
#   'ground_truth_rewards_df' - one row per (context, action) holding the
#                               array of distinct rewards observed for it.
ground_truth_info = {}

for name, filename in GROUND_TRUTH_DATASETS.items():
    info = {}
    ground_truth_info[name] = info

    ground_truth_df = pd.read_csv(os.path.join(DATA_PATH, filename))
    distinct_rewards = ground_truth_df.groupby(['platform', 'network', 'country','x','y','z'])['reward'].unique()
    ground_truth_rewards_df = pd.DataFrame(distinct_rewards).reset_index()

    min_reward = {}
    min_actions = {}

    grps_context = ground_truth_df.groupby(['platform', 'network', 'country'])
    for context in grps_context.groups.keys():
        grps_action = grps_context.get_group(context).groupby(['x', 'y', 'z'])
        mean_by_action = min_reward.setdefault(context, {})
        for action in grps_action.groups.keys():
            mean_by_action[action] = np.mean(grps_action.get_group(action)['reward'])

        # Rewards are costs here: the best action is the one with the lowest mean.
        min_actions[context] = min(mean_by_action, key=mean_by_action.get)

    info['min_reward'] = min_reward
    info['min_actions'] = min_actions
    info['ground_truth_rewards_df'] = ground_truth_rewards_df


# Slates

In [None]:
# Online training loop. For every test dataset a fresh VW model is created and
# `num_iter` rounds of predict -> pick action -> look up cost -> learn are run
# against the simulated reward table.
#
# `trajectory_strings` collects one CSV-style line per round of the form
# "(context)","(x,y,z)",1 ; lines from all datasets go into this single list,
# which a later cell writes to disk.
trajectory_strings = []
for name in test_configs:
    
    print('Running slates on {0}'.format(name))
    # Per-context result lists; they are only appended to on rounds counted as
    # exploitation in all three slots (see `exploit_a` below).
    test_configs[name]["outcomes"] = setup_outcomes()
    test_configs[name]["x_outcomes"] = setup_outcomes()
    test_configs[name]["y_outcomes"] = setup_outcomes()
    test_configs[name]["z_outcomes"] = setup_outcomes()
    
    # New model per dataset, so nothing learned on one dataset carries over.
    model = pyvw.vw(slates_args)

    # NOTE: `df` is assigned here but not read again in this cell.
    df = test_configs[name]["data"]
    rewards = test_configs[name]["rewards"]

    for i in tqdm(range(num_iter)):
        
        # Draw one context at random and build the unlabeled example
        # (shared context features plus the x, y and z action name lists).
        platform,network,country = contexts_to_choose_from[np.random.choice(len(contexts_to_choose_from))]
        # Feature names differ from the variable names: country -> region, network -> connection.
        shared_context = "platform={} region={} connection={}".format(platform, country, network)
        examples = slates.create_slates_example(model, shared_context, [test_configs[name]["x_actions"], test_configs[name]["y_actions"], test_configs[name]["z_actions"]])
        
        # Predict. `pred` is used below as one list per slot (x, y, z), each
        # holding pairs whose [0] indexes that slot's action list and whose [1]
        # is treated as a score - presumably (action index, probability);
        # confirm against slates.slate_pred_conv.
        pred = slates.slate_pred_conv(model.predict(examples, prediction_type=pyvw.pylibvw.vw.pDECISION_SCORES))
        model.finish_example(examples)
                
        # Choose the slot to sample: only this one slot gets a sampled action,
        # the other slots keep whichever entry is already first.
        chosen_slot = np.random.choice(len(pred))
        slot_to_sample = pred[chosen_slot]
        
        # Sample an index from this slot
        index = slates.sample_index(slot_to_sample)
        
        # Swap sampled action if it was not the 0th item.
        if index != 0:
            slot_to_sample[0], slot_to_sample[index] = slot_to_sample[index], slot_to_sample[0]
            
        # Assign the potentially modified slot back into the prediction
        pred[chosen_slot] = slot_to_sample
        
        # Count the slots whose first entry is also the highest-scoring one.
        # A slot whose scores are all equal is never counted:
        # `all_probs[1:] == all_probs[:-1]` holds only when every score is the same.
        exploit_a = 0
        for pred_a in pred:
            all_probs = [x[1] for x in pred_a]
            if pred_a[0][0] == max(pred_a,key=lambda x:x[1])[0] and not(all_probs[1:] == all_probs[:-1]):
                exploit_a +=1

        # Translate the first entry of each slot into the actual x / y / z value.
        chosen_x = test_configs[name]["x"][pred[0][0][0]]
        chosen_y = test_configs[name]["y"][pred[1][0][0]]
        chosen_z = test_configs[name]["z"][pred[2][0][0]]
        
        # Log every round (exploit or not); the trailing field is always 1.
        trajectory_strings.append(f"\"('{platform}', '{network}', '{country}')\",\"({chosen_x},{chosen_y},{chosen_z})\",1")
               
        # Find the reward-table row for this context and action. `rewards` was
        # built with reset_index(), so the index labels returned here equal row
        # positions and can be passed to `.iloc` below.
        row_index = rewards.index[(rewards['platform'] == platform)
                                       & (rewards['network'] == network)
                                       & (rewards['country'] == country)
                                       & (rewards['x'] == chosen_x)
                                       & (rewards['y'] == chosen_y)
                                       & (rewards['z'] == chosen_z)
                                      ]
        # Choose a reward from the set that matched this example
        possible_rewards = rewards.iloc[row_index[0]]["reward"]
        cost = np.random.choice(possible_rewards)

        # Position of each chosen value in its action-name list.
        x_index = test_configs[name]["x_actions"].index("x="+str(chosen_x))
        y_index = test_configs[name]["y_actions"].index("y="+str(chosen_y))
        z_index = test_configs[name]["z_actions"].index("z="+str(chosen_z))
        # One (action index, cost, score of the chosen entry) triple per slot;
        # all three slots share the same cost. Presumably this is the label
        # layout slates.create_slates_example expects - confirm.
        x_outcome = (x_index, cost, pred[0][0][1])
        y_outcome = (y_index, cost, pred[1][0][1])
        z_outcome = (z_index, cost, pred[2][0][1])
        
        # Only save the outcome for plotting if it was exploit
        # (i.e. all three slots were counted above).
        if exploit_a == 3:
            test_configs[name]["outcomes"][(platform,network,country)].append(cost)
            test_configs[name]["x_outcomes"][(platform,network,country)].append(test_configs[name]["x"][x_index])
            test_configs[name]["y_outcomes"][(platform,network,country)].append(test_configs[name]["y"][y_index])
            test_configs[name]["z_outcomes"][(platform,network,country)].append(test_configs[name]["z"][z_index])

        # Rebuild the example with the observed outcomes attached and learn from it.
        examples = slates.create_slates_example(model, shared_context, [test_configs[name]["x_actions"], test_configs[name]["y_actions"], test_configs[name]["z_actions"]], [x_outcome,y_outcome,z_outcome])
        model.learn(examples)
        model.finish_example(examples)

In [None]:
# Persist the logged trajectory, one record per line.
with open(os.path.join(DATA_PATH, 'slate_trajectory2.csv'), 'w') as f:
    for line in trajectory_strings:
        # The file is open in text mode, which already translates '\n' into the
        # platform line ending; writing os.linesep here would yield '\r\r\n'
        # on Windows.
        f.write(line + '\n')

## Comparison

### Linear scale plots

In [None]:
# One figure per context: rolling-mean cost curves on a linear y-axis.
plot_rewards(log=False)

### Log scale plots

In [None]:
# Same figures with the natural log applied to the curves and the reference line.
plot_rewards(log=True)