In [None]:
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Benchmark all the baseline agents
on a given CyberBattleSim environment and compare
them to the dumb 'random agent' baseline.

NOTE: You can run this `.py`-notebook directly from VSCode.
You can also generate a traditional Jupyter Notebook
using the VSCode command `Export Current Python File As Jupyter Notebook`.
"""

# pylint: disable=invalid-name

In [1]:
import sys
# NOTE(review): machine-specific absolute path; it is appended before the
# `cyberbattle` imports below, presumably so they resolve from this local
# checkout. Adjust it (or install the package) before running elsewhere.
sys.path.append("/home/kalic/Desktop/AI_P/CyberBattleSim")
import logging
import gym
import cyberbattle.agents.baseline.learner as learner
import cyberbattle.agents.baseline.plotting as p
import cyberbattle.agents.baseline.agent_wrapper as w
import cyberbattle.agents.baseline.agent_randomcredlookup as rca
import cyberbattle.agents.baseline.agent_tabularqlearning as tqa
import cyberbattle.agents.baseline.agent_dql as dqla
from cyberbattle.agents.baseline.agent_wrapper import Verbosity

# Send log records to stdout, showing only ERROR and above as "LEVEL: message".
logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format="%(levelname)s: %(message)s")

In [2]:
import pickle

In [3]:
# Papermill notebook parameters
# NOTE(review): kept as plain top-level assignments, presumably so that
# papermill can override them when the notebook is executed in batch.

#############
# gymid = 'CyberBattleTiny-v0'
#############
# Gym environment id passed to gym.make when the environment is created.
gymid = "CyberBattleToyCtf-v0"
# Forwarded to gym.make as `size=` when truthy; otherwise gym.make gets only the id.
env_size = None
# Passed as `iteration_count` to every learner.epsilon_greedy_search call in this notebook.
iteration_count = 1500
# `episode_count` for the runs that pass training_episode_count.
training_episode_count = 20
# `episode_count` for the runs that pass eval_episode_count.
eval_episode_count = 10
# Bounds handed to w.EnvironmentBounds.of_identifiers.
maximum_node_count = 12
maximum_total_credentials = 10
#############
# Alternative preset (disabled): chain environment with an explicit size.
# gymid = "CyberBattleChain-v0"
# env_size = 10
# iteration_count = 9000
# training_episode_count = 50
# eval_episode_count = 5
# maximum_node_count = 22
# maximum_total_credentials = 22

In [33]:
# Parameters
# NOTE(review): looks like a papermill-injected parameters cell -- confirm.
# It re-assigns the same values as the defaults cell and leaves `env_size`
# untouched, so `env_size` keeps whatever value the defaults cell gave it.
gymid = "CyberBattleToyCtf-v0"
iteration_count = 1500
training_episode_count = 20
eval_episode_count = 10
maximum_node_count = 12
maximum_total_credentials = 10


In [4]:
# Create the Gym environment; `size` is forwarded only when env_size is set
gym_env = gym.make(gymid, size=env_size) if env_size else gym.make(gymid)

# Environment bounds shared by all the agents below
ep = w.EnvironmentBounds.of_identifiers(
    identifiers=gym_env.identifiers,
    maximum_total_credentials=maximum_total_credentials,
    maximum_node_count=maximum_node_count,
)

In [None]:
# Optional manual inspection of the environment and of the feature encoders.
# Nothing inside the `if` runs unless `debugging` is switched to True.
debugging = False
if debugging:
    print(f"port_count = {ep.port_count}, property_count = {ep.property_count}")

    # The bare expressions below are evaluated and their values discarded;
    # NOTE(review): they were presumably meant to be run one at a time
    # interactively so that each value gets displayed.
    gym_env.environment
    # training_env.environment.plot_environment_graph()
    gym_env.environment.network.nodes
    gym_env.action_space
    gym_env.action_space.sample()
    gym_env.observation_space.sample()
    # Reset, take one sampled valid action, then reset again so that `o0`
    # holds the observation returned by the second reset.
    o0 = gym_env.reset()
    o_test, r, d, i = gym_env.step(gym_env.sample_valid_action())
    o0 = gym_env.reset()

    o0.keys()

    # Build an example encoder over two features and apply it to the
    # augmented reset observation, passing 0 as the second argument.
    fe_example = w.RavelEncoding(ep, [w.Feature_active_node_properties(ep), w.Feature_discovered_node_count(ep)])
    a = w.StateAugmentation(o0)
    w.Feature_discovered_ports(ep).get(a, None)
    fe_example.encode_at(a, 0)

In [6]:
# Restore the saved round-5 DQL run.
# pickle.load can execute arbitrary code: only load files you trust.
with open('toy_my_dql_run_round_5.pkl', 'rb') as fh:
    toy_my_dql_run_round_5 = pickle.load(fh)

In [7]:
# Run the learner restored from the round-5 pickle under an epsilon-greedy
# schedule (epsilon starts at 0.90, floor 0.10, exponential decay 5000).
dql_run_5_toytest = learner.epsilon_greedy_search(
    title="DQL",
    learner=toy_my_dql_run_round_5['learner'],
    cyberbattle_gym_env=gym_env,
    environment_properties=ep,
    # run length
    episode_count=training_episode_count,
    iteration_count=iteration_count,
    # exploration schedule
    epsilon=0.90,
    epsilon_minimum=0.10,
    epsilon_exponential_decay=5000,
    # output options
    verbosity=Verbosity.Quiet,
    plot_episodes_length=False,
    render=False,
)

###### DQL
Learning with: episode_count=20,iteration_count=1500,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=5000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=10


  state_batch = torch.tensor(states_to_consider).to(device)


  Episode 1 stopped at t=1500 
  Episode 2 stopped at t=1500 
  Episode 3 stopped at t=1500 
  Episode 4 stopped at t=1500 
  Episode 5 stopped at t=1500 
  Episode 6 stopped at t=1500 
  Episode 7 stopped at t=1500 
  Episode 8 stopped at t=1500 
  Episode 9 stopped at t=1500 
  Episode 10 stopped at t=1500 
  Episode 11 stopped at t=1500 
  Episode 12 stopped at t=1500 
  Episode 13 stopped at t=1500 
  Episode 14 stopped at t=1500 
  Episode 15 stopped at t=1500 
  Episode 16 stopped at t=1500 
  Episode 17 stopped at t=1500 
  Episode 18 stopped at t=1500 
  Episode 19 stopped at t=1500 
  Episode 20 stopped at t=1500 
simulation ended


In [18]:
# Restore the saved round-10 DQL run.
# pickle.load can execute arbitrary code: only load files you trust.
with open('toy_my_dql_run_round_10.pkl', 'rb') as fh:
    toy_my_dql_run_round_10 = pickle.load(fh)

In [19]:
# Run the learner restored from the round-10 pickle under the same
# epsilon-greedy schedule as the round-5 run.
dql_run_10_toytest = learner.epsilon_greedy_search(
    title="Round_10_DQL",
    learner=toy_my_dql_run_round_10['learner'],
    cyberbattle_gym_env=gym_env,
    environment_properties=ep,
    # run length
    episode_count=training_episode_count,
    iteration_count=iteration_count,
    # exploration schedule
    epsilon=0.90,
    epsilon_minimum=0.10,
    epsilon_exponential_decay=5000,
    # output options
    verbosity=Verbosity.Quiet,
    plot_episodes_length=False,
    render=False,
)

###### Round_10_DQL
Learning with: episode_count=20,iteration_count=1500,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=5000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=10


  Episode 1 stopped at t=1500 
  Episode 2 stopped at t=1500 
  Episode 3 stopped at t=1500 
  Episode 4 stopped at t=1500 
  Episode 5 stopped at t=1500 
  Episode 6 stopped at t=1500 
  Episode 7 stopped at t=1500 
  Episode 8 stopped at t=1500 
  Episode 9 stopped at t=1500 
  Episode 10 stopped at t=1500 
  Episode 11 stopped at t=1500 
  Episode 12 stopped at t=1500 
  Episode 13 stopped at t=1500 
  Episode 14 stopped at t=1500 
  Episode 15 stopped at t=1500 
  Episode 16 stopped at t=1500 
  Episode 17 stopped at t=1500 
  Episode 18 stopped at t=1500 
  Episode 19 stopped at t=1500 
  Episode 20 stopped at t=1500 
simulation ended


In [None]:
# Baseline: a random agent that opportunistically exploits the
# credentials gathered in its local cache
credlookup_run = learner.epsilon_greedy_search(
    cyberbattle_gym_env=gym_env,
    environment_properties=ep,
    title="Credential lookups (ϵ-greedy)",
    learner=rca.CredentialCacheExploiter(),
    # run length
    episode_count=10,
    iteration_count=iteration_count,
    # exploration schedule
    epsilon=0.90,
    epsilon_minimum=0.10,
    epsilon_exponential_decay=10000,
    # output options
    verbosity=Verbosity.Quiet,
    render=False,
)

In [None]:
# Baseline: tabular Q-learning agent
tabularq_run = learner.epsilon_greedy_search(
    cyberbattle_gym_env=gym_env,
    environment_properties=ep,
    title="Tabular Q-learning",
    learner=tqa.QTabularLearner(
        ep,
        exploit_percentile=100,
        learning_rate=0.01,
        gamma=0.015),
    # run length
    episode_count=training_episode_count,
    iteration_count=iteration_count,
    # exploration schedule
    epsilon=0.90,
    epsilon_minimum=0.01,
    epsilon_exponential_decay=5000,
    # output options
    verbosity=Verbosity.Quiet,
    plot_episodes_length=False,
    render=False,
)

In [None]:
# Baseline: agent that exploits the Q-table produced by `tabularq_run`.
# gamma, learning_rate and epsilon are all set to zero for this run.
tabularq_exploit_run = learner.epsilon_greedy_search(
    cyberbattle_gym_env=gym_env,
    environment_properties=ep,
    title="Exploiting Q-matrix",
    learner=tqa.QTabularLearner(
        ep,
        trained=tabularq_run['learner'],
        exploit_percentile=90,
        learning_rate=0.0,
        gamma=0.0),
    # run length
    episode_count=eval_episode_count,
    iteration_count=iteration_count,
    # no random exploration
    epsilon=0.0,
    # output options
    verbosity=Verbosity.Quiet,
    render=False,
)

In [5]:
# Deep Q-learning agent, built from scratch and run under an epsilon-greedy schedule
dql_run_yuan = learner.epsilon_greedy_search(
    title="DQL",
    learner=dqla.DeepQLearnerPolicy(
        ep=ep,
        # torch default learning rate is 1e-2
        # a large value helps converge in less episodes
        learning_rate=0.01,
        gamma=0.015,
        batch_size=512,
        target_update=10,
        replay_memory_size=10000
    ),
    cyberbattle_gym_env=gym_env,
    environment_properties=ep,
    # run length (eval_episode_count is used here instead of training_episode_count)
    # episode_count=training_episode_count,
    episode_count=eval_episode_count,
    iteration_count=iteration_count,
    # exploration schedule
    epsilon=0.90,
    epsilon_minimum=0.10,
    epsilon_exponential_decay=5000,
    # output options
    verbosity=Verbosity.Quiet,
    plot_episodes_length=False,
    render=False,
)

###### DQL
Learning with: episode_count=10,iteration_count=1500,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=5000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=10


  state_batch = torch.tensor(states_to_consider).to(device)


  Episode 1 stopped at t=1500 
  Episode 2 stopped at t=1500 
  Episode 3 stopped at t=1500 
  Episode 4 stopped at t=1500 
  Episode 5 stopped at t=1500 
  Episode 6 stopped at t=1500 
  Episode 7 stopped at t=1500 
  Episode 8 stopped at t=1500 
  Episode 9 stopped at t=1500 
  Episode 10 stopped at t=1500 
simulation ended


In [6]:
# Restore the saved `my_dql_test_run_40` run.
# pickle.load can execute arbitrary code: only load files you trust.
with open('Toy_results/my_dql_test_run_40.pkl', 'rb') as fh:
    my_dql_test_run_40 = pickle.load(fh)

In [7]:
# Run the learner restored in `my_dql_test_run_40` under an epsilon-greedy
# schedule (epsilon starts at 0.90, floor 0.10, exponential decay 5000)
toy_my_dql_run_after_40_with_special_epsilon = learner.epsilon_greedy_search(
    cyberbattle_gym_env=gym_env,
    environment_properties=ep,
    title="NVGIL DQL with special epsilon 40",
    learner=my_dql_test_run_40['learner'],
    # run length
    episode_count=eval_episode_count,
    iteration_count=iteration_count,
    # exploration schedule
    epsilon=0.90,
    epsilon_minimum=0.10,
    epsilon_exponential_decay=5000,
    # output options
    verbosity=Verbosity.Quiet,
    plot_episodes_length=False,
    render=False,
)

###### NVGIL DQL with special epsilon 40
Learning with: episode_count=10,iteration_count=1500,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=5000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=10
  Episode 1 stopped at t=1500 
  Episode 2 stopped at t=1500 
  Episode 3 stopped at t=1500 
  Episode 4 stopped at t=1500 
  Episode 5 stopped at t=1500 
  Episode 6 stopped at t=1500 
  Episode 7 stopped at t=1500 
  Episode 8 stopped at t=1500 
  Episode 9 stopped at t=1500 
  Episode 10 stopped at t=1500 
simulation ended


In [6]:
# Evaluate an agent that exploits the Q-function of the saved
# `my_dql_test_run_40` run (epsilon is 0, so no random exploration).
#
# `load_m8` is otherwise only assigned by a pickle-loading cell further down
# the notebook, which makes a top-to-bottom run fail with a NameError here.
# Load it from the same file before use.
# pickle.load can execute arbitrary code: only load files you trust.
with open('Toy_results/my_dql_test_run_40.pkl', 'rb') as file:
    load_m8 = pickle.load(file)

toy_my_dql_run_after_40 = learner.epsilon_greedy_search(
    gym_env,
    ep,
    learner=load_m8['learner'],
    episode_count=eval_episode_count,
    iteration_count=iteration_count,
    epsilon=0.0,
    epsilon_minimum=0.00,
    render=False,
    plot_episodes_length=False,
    verbosity=Verbosity.Quiet,
    title="NVGIL DQL"
)

###### NVGIL DQL
Learning with: episode_count=10,iteration_count=1500,ϵ=0.0,ϵ_min=0.0, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=10


  state_batch = torch.tensor(states_to_consider).to(device)


  Episode 1 stopped at t=1500 
  Episode 2 stopped at t=1500 
  Episode 3 stopped at t=1500 
  Episode 4 stopped at t=1500 
  Episode 5 stopped at t=1500 
  Episode 6 stopped at t=1500 
  Episode 7 stopped at t=1500 
  Episode 8 stopped at t=1500 
  Episode 9 stopped at t=1500 
  Episode 10 stopped at t=1500 
simulation ended


In [16]:
# Run the `load_m8` learner under an epsilon-greedy schedule
# (epsilon starts at 0.90, floor 0.10, exponential decay 5000).
# NOTE(review): `load_m8` must already be in the kernel namespace; confirm
# that the cell defining it has been run before this one.
toy_my_dql_run_after_40_with_epsilon = learner.epsilon_greedy_search(
    cyberbattle_gym_env=gym_env,
    environment_properties=ep,
    title="NVGIL DQL with epsilon 40",
    learner=load_m8['learner'],
    # run length
    episode_count=eval_episode_count,
    iteration_count=iteration_count,
    # exploration schedule
    epsilon=0.90,
    epsilon_minimum=0.10,
    epsilon_exponential_decay=5000,
    # output options
    verbosity=Verbosity.Quiet,
    plot_episodes_length=False,
    render=False,
)

###### NVGIL DQL with epsilon 40
Learning with: episode_count=10,iteration_count=1500,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=5000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=10
  Episode 1 stopped at t=1500 
  Episode 2 stopped at t=1500 
  Episode 3 stopped at t=1500 
  Episode 4 stopped at t=1500 
  Episode 5 stopped at t=1500 
  Episode 6 stopped at t=1500 
  Episode 7 stopped at t=1500 
  Episode 8 stopped at t=1500 
  Episode 9 stopped at t=1500 
  Episode 10 stopped at t=1500 
simulation ended


In [22]:
# Re-run the learner from `dql_run_yuan` with epsilon fixed at 0
# (no random exploration)
toy_my_dql_run_after_yuan = learner.epsilon_greedy_search(
    cyberbattle_gym_env=gym_env,
    environment_properties=ep,
    title="dql_run_after_yuan",
    learner=dql_run_yuan['learner'],
    # run length
    episode_count=eval_episode_count,
    iteration_count=iteration_count,
    # exploration schedule
    epsilon=0.0,
    epsilon_minimum=0.00,
    # output options
    verbosity=Verbosity.Quiet,
    plot_episodes_length=False,
    render=False,
)

###### dql_run_after_yuan
Learning with: episode_count=10,iteration_count=1500,ϵ=0.0,ϵ_min=0.0, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=10
  Episode 1 stopped at t=1500 
  Episode 2 stopped at t=1500 
  Episode 3 stopped at t=1500 
  Episode 4 stopped at t=1500 
  Episode 5 stopped at t=1500 
  Episode 6 stopped at t=1500 
  Episode 7 stopped at t=1500 
  Episode 8 stopped at t=1500 
  Episode 9 stopped at t=1500 
  Episode 10 stopped at t=1500 
simulation ended


In [35]:
# Baseline: the random agent
random_run_yuan = learner.epsilon_greedy_search(
    cyberbattle_gym_env=gym_env,
    environment_properties=ep,
    title="Random search",
    learner=learner.RandomPolicy(),
    # run length
    episode_count=training_episode_count,
    iteration_count=iteration_count,
    # purely random
    epsilon=1.0,
    # output options
    verbosity=Verbosity.Quiet,
    plot_episodes_length=False,
    render=False,
)

###### Random search
Learning with: episode_count=20,iteration_count=1500,ϵ=1.0,ϵ_min=0.0, 


  Episode 1 stopped at t=1500 
  Episode 2 stopped at t=1500 
  Episode 3 stopped at t=1500 
  Episode 4 stopped at t=1500 
  Episode 5 stopped at t=1500 
  Episode 6 stopped at t=1500 
  Episode 7 stopped at t=1500 
  Episode 8 stopped at t=1500 
  Episode 9 stopped at t=1500 
  Episode 10 stopped at t=1500 
  Episode 11 stopped at t=1500 
  Episode 12 stopped at t=1500 
  Episode 13 stopped at t=1500 
  Episode 14 stopped at t=1500 
  Episode 15 stopped at t=1500 
  Episode 16 stopped at t=1500 
  Episode 17 stopped at t=1500 
  Episode 18 stopped at t=1500 
  Episode 19 stopped at t=1500 
  Episode 20 stopped at t=1500 
simulation ended


In [45]:
# Compare and plot results for the selected agents
all_runs = [
    # random_run,
    # credlookup_run,
    # tabularq_run,
    # tabularq_exploit_run,
    # dql_run,
    random_run_yuan,
    dql_run_yuan
    # dql_exploit_run
]

# Plot averaged cumulative rewards for the runs listed in `all_runs`.
# The state/action model is built only to describe the feature set in the title.
# NOTE(review): the title reports eval_episode_count, while random_run_yuan was
# produced with training_episode_count -- confirm which value should be shown.
themodel = dqla.CyberBattleStateActionModel(ep)
p.plot_averaged_cummulative_rewards(
    all_runs=all_runs,
    title=f'Benchmark -- max_nodes={ep.maximum_node_count}, episodes={eval_episode_count},\n'
    f'State: {[f.name() for f in themodel.state_space.feature_selection]} '
    # closing parenthesis added: the feature count was rendered as "(N" without ")"
    f'({len(themodel.state_space.feature_selection)})\n'
    f"Action: abstract_action ({themodel.action_space.flat_size()})")

In [6]:
def _load_pickle(path):
    """Return the object unpickled from the file at `path`.

    pickle.load can execute arbitrary code: only load files you trust.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Previously saved runs stored under Toy_results/
load_m1 = _load_pickle('Toy_results/toy_random_run.pkl')
load_m2 = _load_pickle('Toy_results/toy_tabularq_run.pkl')
load_m3 = _load_pickle('Toy_results/toy_tabularq_exploit_run.pkl')
load_m4 = _load_pickle('Toy_results/toy_dql_run.pkl')
load_m5 = _load_pickle('Toy_results/my_dql_test_run_10.pkl')
load_m6 = _load_pickle('Toy_results/my_dql_test_run_20.pkl')
load_m7 = _load_pickle('Toy_results/my_dql_test_run_30.pkl')
load_m8 = _load_pickle('Toy_results/my_dql_test_run_40.pkl')

In [7]:
# Label the four my_dql_test_run_* runs by their numeric suffix (10, 20, 30, 40)
for saved_run, rounds in ((load_m5, 10), (load_m6, 20), (load_m7, 30), (load_m8, 40)):
    saved_run['title'] = f'NVGIL{rounds}'

In [9]:
# Restore previously saved runs from the working directory.
# pickle.load can execute arbitrary code: only load files you trust.
with open('toy_random_run_yuan.pkl', 'rb') as file:
    random_run_yuan = pickle.load(file)
with open('toy_dql_run_yuan.pkl', 'rb') as file:
    dql_run_yuan = pickle.load(file)
# This file used to be loaded twice in a row into the same variable;
# the redundant second load has been removed.
with open('toy_my_dql_run_after_40.pkl', 'rb') as file:
    toy_my_dql_run_after_40 = pickle.load(file)

In [12]:
# Legend labels used when these runs are plotted together
random_run_yuan['title'] = "Random"
dql_run_yuan['title'] = 'DQL'
# Spelled "NVGIL" to match every other title in this notebook (was "NGVIL").
# NOTE(review): confirm the intended acronym.
toy_my_dql_run_after_40_with_special_epsilon['title'] = 'DQL with NVGIL'

In [30]:
# Runs compared in the final benchmark figures; commented entries are
# other candidates that are currently left out.
contenders = [
    # credlookup_run,
    # tabularq_run,
    # dql_run,
    # dql_exploit_run
    #load_m2,
    # random_run_1,
    #load_m3,
    random_run_yuan,
    dql_run_yuan,
    # toy_my_dql_run_after_yuan,
    # toy_my_dql_run_after_40,
    toy_my_dql_run_after_40_with_special_epsilon


    # load_m4,

    # load_m8,
]
# Use `p`, the plotting module imported at the top of the notebook. The alias
# `pptxy` names the same module but is only imported in the last cell, so a
# top-to-bottom run raised a NameError here.
p.plot_episodes_length(contenders)
p.plot_averaged_cummulative_rewards(
    title=f'Agent Benchmark top contenders\n'
    f'max_nodes:{ep.maximum_node_count}\n',
    all_runs=contenders)

In [10]:
# One figure per contender, showing the cumulative rewards of all its episodes
for contender_run in contenders:
    p.plot_all_episodes(contender_run)

In [None]:

# Restore the runs saved under test/toy_my_test/.
# pickle.load can execute arbitrary code: only load files you trust.
with open('test/toy_my_test/toy_random_run.pkl', 'rb') as fh:
    load1 = pickle.load(fh)
with open('test/toy_my_test/toy_tabularq_run.pkl', 'rb') as fh:
    load2 = pickle.load(fh)
with open('test/toy_my_test/dql_run_10_toytest_on_toyctf.pkl', 'rb') as fh:
    load3 = pickle.load(fh)
with open('test/toy_my_test/toy_dql_run.pkl', 'rb') as fh:
    load4 = pickle.load(fh)
with open('test/toy_my_test/toy_my_dql_run_round_10.pkl', 'rb') as fh:
    load5 = pickle.load(fh)


In [35]:
# Persist the three runs used for the final figures under toy_paper_results/
for out_path, run_to_save in (
    ('toy_paper_results/random_run_yuan.pkl', random_run_yuan),
    ('toy_paper_results/dql_run_yuan.pkl', dql_run_yuan),
    ('toy_paper_results/toy_my_dql_run_after_40_with_special_epsilon.pkl',
     toy_my_dql_run_after_40_with_special_epsilon),
):
    with open(out_path, 'wb') as fh:
        pickle.dump(run_to_save, fh)


In [29]:
import cyberbattle.agents.baseline.plotting as pptxy