In [10]:
import numpy as np
import matplotlib.pyplot as plt

In [11]:
import os
# path = "./data"
# os.chdir(path)

### Network Settings

In [12]:
# First network configuration (the one the environment starts with).
config = {
    # number of hosts in each of the four subnets
    "subnet": np.array([3, 2, 2, 2]),
    # subnet adjacency matrix: entry [i, j] == 1 means subnet i connects to subnet j
    "topology": np.array([
        [0, 1, 1, 0],
        [1, 0, 0, 1],
        [1, 0, 0, 1],
        [0, 1, 1, 0],
    ]),
    # services running on every host of each subnet
    "sn_0_services": ['ssh', 'ftp', 'http'],
    "sn_1_services": ['ssh'],
    "sn_2_services": ['ftp', 'http'],
    "sn_3_services": ['ssh', 'ftp'],
    "num_services": 4,
    # operating system of each subnet
    "sn_0_os": 'linux',
    "sn_1_os": 'windows',
    "sn_2_os": 'linux',
    "sn_3_os": 'linux',
    "num_os": 2,
    # processes of each subnet
    "sn_0_process": ['tomcat'],
    "sn_1_process": [],
    "sn_2_process": ['daclsvc'],
    "sn_3_process": [],
    "num_process": 2,
    # [subnet, host] addresses of the sensitive hosts
    "sensitive_addr": ([3, 0], ),
    # firewall setting: key "src,dst" -> services allowed from subnet src to subnet dst
    # (-1 presumably stands for the outside of the network -- TODO confirm)
    "firewall": {
        "-1,0": ['http'],
        "0,-1": ['ssh', 'ftp', 'http'],
        "0,1": ['ssh'],
        "1,0": ['ssh'],
        "3,2": ['ftp'],
        "2,3": ['ftp'],
        "1,3": ['ssh'],
        "3,1": ['ssh'],
        "0,2": ['http'],
        "2,0": ['http'],
    },
    # [subnet, host] addresses of the honeypots
    "honeypot": ([2, 0], ),
}

# config_2 = config

In [13]:
# Second network configuration: same subnets and topology as `config`, but the
# services / OS of subnets 1 and 2 are swapped and the firewall rules differ.
config_2 = {
    # number of hosts in each of the four subnets
    "subnet": np.array([3, 2, 2, 2]),
    # subnet adjacency matrix: entry [i, j] == 1 means subnet i connects to subnet j
    "topology": np.array([
        [0, 1, 1, 0],
        [1, 0, 0, 1],
        [1, 0, 0, 1],
        [0, 1, 1, 0],
    ]),
    # services running on every host of each subnet
    "sn_0_services": ['ssh', 'ftp', 'http'],
    "sn_1_services": ['ftp', 'http'],
    "sn_2_services": ['ssh'],
    "sn_3_services": ['ssh', 'ftp'],
    "num_services": 4,
    # operating system of each subnet
    "sn_0_os": 'linux',
    "sn_1_os": 'linux',
    "sn_2_os": 'windows',
    "sn_3_os": 'linux',
    "num_os": 2,
    # processes of each subnet
    "sn_0_process": ['tomcat'],
    "sn_1_process": [],
    "sn_2_process": ['daclsvc'],
    "sn_3_process": [],
    "num_process": 2,
    # [subnet, host] addresses of the sensitive hosts
    "sensitive_addr": ([3, 0], ),
    # firewall setting: key "src,dst" -> services allowed from subnet src to subnet dst
    "firewall": {
        "-1,0": ['http'],
        "0,-1": ['ssh', 'ftp', 'http'],
        "0,2": ['ssh'],
        "2,0": ['ssh'],
        "3,1": ['ftp'],
        "1,3": ['ftp'],
        "2,3": ['ssh'],
        "3,2": ['ssh'],
        "0,1": ['http'],
        "1,0": ['http'],
    },
    # [subnet, host] addresses of the honeypots
    "honeypot": ([2, 0], ),
}


In [14]:
# Column index of every feature in a host's feature vector.
idx_map = {
    "subnet_idx": 0,
    "host_idx": 1,
    "value": 2,
    "discovered": 3,
    "reachable": 4,
    "compromised": 5,
    "ssh": 6,
    "ftp": 7,
    "http": 8,
    "scp": 9,
    "os": 10,
    "process": 11,
    "access_level": 12,
    "new_service": 13,
}

# Two-way lookup between service names and their integer codes.
service_map = {
    "ssh": 1,
    "ftp": 2,
    "http": 3,
    "scp": 4,
    1: "ssh",
    2: "ftp",
    3: "http",
    4: "scp",
}

### Transform the network into a feature matrix


In [15]:
class host_map(object):
    """Convert between [subnet, host] addresses and flat host indices."""

    def __init__(self, config):
        # number of hosts per subnet, taken straight from the configuration
        self.subnet = config["subnet"]

    def addr2idx(self, addr):
        """Return the flat index of the host at address [subnet_idx, host_idx]."""
        sn_idx, host_idx = addr[0], addr[1]
        hosts_before = np.sum(self.subnet[:sn_idx])
        return hosts_before + host_idx

    def idx2addr(self, idx):
        """Return the [subnet_idx, host_idx] address of the host with flat index `idx`."""
        # flat index of the first host of every subnet, followed by the total host count
        first_idx = np.concatenate(([0.0], np.cumsum(self.subnet)))
        # the owning subnet is the last one whose first index does not exceed idx
        sn_idx = np.flatnonzero(first_idx <= idx)[-1]
        return [sn_idx, int(idx - first_idx[sn_idx])]

In [16]:
def host_matrix(config, idx_map):
    """
    Build the feature matrix of the network: one row per host.

    Row layout: 2 address columns, 1 value column, 3 state columns
    (discovered, reachable, compromised), `num_services` service columns,
    then os, process and access level.

    :param config: dict, network configuration (subnet sizes, per-subnet services/os, sensitive hosts)
    :param idx_map: dict, maps service names to their column index
    :return: float array of shape (number of hosts, number of features)
    """
    subnet_sizes = config["subnet"]
    sensitive_hosts = config["sensitive_addr"]
    total_hosts = np.sum(subnet_sizes)
    n_features = 2 + 1 + 3 + config["num_services"] + 1 + 1 + 1
    features = np.zeros((total_hosts, n_features))

    # each operating system is encoded by its own value
    os_codes = {"linux": 1, "windows": 2}

    for sn_id in range(len(subnet_sizes)):
        prefix = "sn_" + str(sn_id)
        service_cols = [idx_map[name] for name in config[prefix + "_services"]]
        first_row = np.sum(subnet_sizes[:sn_id])
        for host_id in range(subnet_sizes[sn_id]):
            row = first_row + host_id
            os_code = os_codes.get(config[prefix + "_os"])
            # hosts with an unrecognised os keep an all-zero base row
            if os_code is not None:
                features[row, :] = np.array(
                    [sn_id, host_id, 1, 0, 0, 0, 0, 0, 0, 0, os_code, 1, 1]
                )
            # services state: 0 absent, 1 present, 2 unknown, 3 masked
            features[row, service_cols] = 1

    # sensitive hosts carry a much larger value than ordinary ones
    for address in sensitive_hosts:
        row = np.sum(subnet_sizes[:address[0]]) + address[1]
        features[row, 2] = 500
    return features

In [17]:
# host_matrix(config, idx_map)[93,2]

### Environment Settings

In [18]:
class env(object):
    def __init__(self, config, config_2, idx_map, service_map):
        """
        Create the environment and initialise attacker / defender bookkeeping.

        :param config: dict, network configuration that is active at start
        :param config_2: dict, second network configuration, stored as `self.config_2`
        :param idx_map: dict, records which column of a vectorized host corresponds to which feature
        :param service_map: dict, two-way lookup between service names and service codes
        """
        # --- network description -------------------------------------------
        self.config_1 = config
        self.config_2 = config_2
        self.h_map = host_map(config)
        self.subnet = config["subnet"]
        self.topology = config["topology"]
        self.idx_map = idx_map
        self.service_map = service_map
        self.total_att_reward = 0

        # which configuration is active: 1 at start, set to 2 once the other one is in use
        self.current_config = 1
        self.c_config = config

        # settings of firewall
        self.firewall = config["firewall"]

        # --- feature matrices ----------------------------------------------
        # each matrix is built independently so they never share memory
        self.host_true_state = host_matrix(config, idx_map)
        self.def_state = host_matrix(config, idx_map)
        self.true_state = host_matrix(config, idx_map)
        self.MTD_state = host_matrix(config, idx_map)
        # the defender has no information about the attack status of any host at first
        self.MTD_state[:, idx_map["compromised"]] = 0
        extra_column = np.zeros((self.MTD_state.shape[0], 1))
        self.com_MTD_state = np.hstack((self.MTD_state, extra_column))

        # the attacker's observation starts empty except for the host addresses;
        # the hosts of subnet 0 are reachable with access level 1
        n_entry_hosts = self.subnet[0]
        self.att_state = np.zeros(self.host_true_state.shape)
        self.att_state[:, 0:2] = self.host_true_state[:, 0:2]
        self.att_state[:n_entry_hosts, idx_map["reachable"]] = 1
        self.att_state[:n_entry_hosts, idx_map["access_level"]] = 1
        self.step = 0

        # indices of reachable subnets
        self.reachable_id = [0]
        # indices of subnets on which actions requiring access level 2 can be used
        self.a2_id = [0]
        self.access_list = [1]*np.sum(self.subnet)
        self.access_list[:n_entry_hosts] = [2]*n_entry_hosts
        self.tg_idx = list(range(n_entry_hosts))

        # --- IDS -----------------------------------------------------------
        # first entry: number of detected attacks, second entry: number of resets
        self.IDS_state = np.zeros(2)
        self.remain_restart = 5

        # list of compromised subnets
        self.com_sn_list = []

        self.num_reset = 0
        self.att_reward_list = []
        self.att_step_list = []

        # --- MTD defender --------------------------------------------------
        self.MTD_reward = 0
        self.MTD_step = 0
        # three independent slots tracking temporarily deactivated services
        self.deactive_1, self.deactive_2, self.deactive_3 = (
            {"host": 0, "services": "ssh", "step": 0, "work": 0} for _ in range(3)
        )
        self.MTD_budget_1 = 2500
        self.MTD_budget = 2000

        # --- per-step flags and counters -----------------------------------
        self.com_success = 0
        self.success = 0
        self.att_honeypot = 0

        self.failure = 0

        self.num_success = 0
        self.num_fail = 0

        # per-subnet bonus added to the attacker's exploit success probability
        self.exploit_prob = np.zeros(len(self.subnet))

        self.alarm = 0

        self.episode = 0

    def att_step(self, action, en_point, target, access_level):
        """
        we also use values to represent different actions

        - 0:`Exploit_ssh`: cost 2, prob 0.3
        - 1:`Exploit_ftp`: cost 1, prob 0.6
        - 2:`Exploit_http`: cost 3, prob 0.9
        - 3:`PrivilegeEscalation`: cost 1
        - 4:`ServiceScan`: cost 1
        - 5:`OSScan`: cost 2
        - 6:`SubnetScan`: cost 1
        - 7:`ProcessScan`: cost 1
        """
        self.step += 1
        self.att_honeypot = 0
        exp_prob = self.exploit_prob[target[0]]
        self.fail_learning = 0
        
        
        subnet = self.subnet
        idx_map = self.idx_map
        sn_0 = np.sum(subnet[:target[0]])
        sn_1 = np.sum(subnet[:target[0] + 1])
        host = self.h_map.addr2idx(target)
        # whether target reachable
        reachable = self.att_state[host, idx_map["reachable"]]
        # whether target discovered
        discovered = self.att_state[host, idx_map["discovered"]]
        # whether target compromised
        compromised = self.att_state[host, idx_map["compromised"]]
        # whether have correct os information
        os_info = (self.att_state[host, idx_map["os"]] == self.MTD_state[host, idx_map["os"]])
        # whether have correct process information
        process_info = (self.att_state[host, idx_map["process"]] == self.MTD_state[host, idx_map["process"]])
        
        
        # information for IDS
        self.failure = 0
        # initialize the total times the network can resstart by IDS
#         self.remain_reset = 5
       

        p = np.random.rand()
        com_success = 0
        pe_success = 0
        reward = 0
        

        # `ProcessScan`
        if action == 7:
            reward += -1
            if discovered:
                self.att_state[host, idx_map["process"]] = 1
                
        # `PrivilegeEscalation`
        elif action == 3:
            reward += -1
            if discovered and process_info and compromised:
                self.att_state[host, idx_map["access_level"]] = 2
                self.tg_idx.remove(host)
                pe_success = 1
                
        # `SubnetScan`
        elif action == 6:
            reward += -1
            if reachable:
                self.att_state[sn_0:sn_1, idx_map["discovered"]] = 1

        # `ServiceScan`
        elif action == 4:
            reward += -1
            if discovered and access_level == 2:
                self.att_state[sn_0:sn_1, idx_map["ssh"]] = self.MTD_state[sn_0:sn_1, idx_map["ssh"]]
                self.att_state[sn_0:sn_1, idx_map["ftp"]] = self.MTD_state[sn_0:sn_1, idx_map["ftp"]]
                self.att_state[sn_0:sn_1, idx_map["http"]] = self.MTD_state[sn_0:sn_1, idx_map["http"]]

        # OsScan
        elif action == 5:
            reward = -2
            if discovered and access_level == 2:
                self.att_state[host, idx_map["os"]] = self.MTD_state[host, idx_map["os"]]
                

        # services exploit
        # ssh
        elif action == 0:
            reward += -2
            
            if discovered and os_info and access_level == 2\
            and ("ssh" in self.c_config["firewall"][str(en_point)+","+str(target[0])])\
            and self.att_state[host, idx_map["ssh"]] == self.MTD_state[host, idx_map["ssh"]] == 1:
                if p <= exp_prob + 0.3:
                    com_success = 1
                    self.att_state[host, idx_map["compromised"]] = 1
                    self.att_state[host, idx_map["value"]] = self.MTD_state[host, idx_map["value"]]
                    reward += self.att_state[host, idx_map["value"]]
                else:
                    self.fail_learning = 1
            else:
                self.failure = 1
                
        # ftp
        elif action == 1:
            reward += -1
                        
            if discovered and os_info and access_level == 2\
            and ("ftp" in self.c_config["firewall"][str(en_point)+","+str(target[0])])\
            and self.att_state[host, idx_map["ftp"]] == self.MTD_state[host, idx_map["ftp"]] == 1:
                if p <= exp_prob + 0.6:
                    com_success = 1
                    self.att_state[host, idx_map["compromised"]] = 1
                    self.att_state[host, idx_map["value"]] = self.MTD_state[host, idx_map["value"]]
                    reward += self.att_state[host, idx_map["value"]]
                else:
                    self.fail_learning = 1
            else:
                self.failure = 1
                
        # http
        elif action == 2:
            reward += -3
            
            
            if discovered and os_info and access_level == 2\
            and ("http" in self.c_config["firewall"][str(en_point)+","+str(target[0])])\
            and self.att_state[host, idx_map["http"]] == self.MTD_state[host, idx_map["http"]] == 1:
                if p <= exp_prob + 0.8:
                    com_success = 1
                    self.att_state[host, idx_map["compromised"]] = 1
                    self.att_state[host, idx_map["value"]] = self.MTD_state[host, idx_map["value"]]
                    reward += self.att_state[host, idx_map["value"]]
                else:
                    self.fail_learning = 1
            else:
                self.failure = 1
                
                
        elif action == 8:
            reward += -2
                      
            
            if discovered and os_info and access_level == 2\
            and ("scp" in self.c_config["firewall"][str(en_point)+","+str(target[0])])\
            and self.att_state[host, idx_map["scp"]] == self.MTD_state[host, idx_map["scp"]] == 1:
                if p <= exp_prob + 0.2:
                    com_success = 1
                    self.att_state[host, idx_map["compromised"]] = 1
                    self.att_state[host, idx_map["value"]] = self.MTD_state[host, idx_map["value"]]
                    reward += self.att_state[host, idx_map["value"]]
                else:
                    self.fail_learning = 1
            else:
                self.failure = 1

        if self.fail_learning == 1:
            self.exploit_prob[target[0]] += 0.01

        
        self.total_att_reward += reward 
        
        # when host in a subnet compromised, the connected subnet become reachable
        if com_success:
            if target in self.c_config["sensitive_addr"]:
                self.num_success += 1
#                 print("total reward:", self.total_att_reward)
#                 print("total steps:", self.step)
#                 print("att_success")
                self.num_reset += 1
                if self.num_reset % 10 == 0:
                    self.att_step_list.append(self.step)
                self.episode += 1
                if self.episode % 10 == 0:
                    self.att_reward_list.append(self.total_att_reward)
                self.success = 1
                self.reset()
            elif target in self.c_config["honeypot"]:
                self.att_honey_pot = 1
                self.exploit_prob -= 0.005
                
            #if (self.att_state[sn_0:sn_1, idx_map["compromised"]] == 1).all():
            else:
                if target[0] not in self.com_sn_list:
                    self.com_sn_list.append(target[0])
                
                idx = np.where(self.topology[target[0], :] == 1)[0].tolist()
                new_idx = []
                new_idx = [id for id in idx if id not in self.reachable_id]
#                 print("new_idx:", new_idx)
                self.reachable_id = self.reachable_id+new_idx
            
                # add index of newly reachable host to the list, and mark their feature as "reachable"
                for ids in new_idx:
                    new_sn_0 = np.sum(subnet[:ids])
                    new_sn_1 = np.sum(subnet[:ids + 1])
                    self.tg_idx = self.tg_idx + list(range(new_sn_0, new_sn_1))
                    self.att_state[new_sn_0:new_sn_1, idx_map["reachable"]] = 1
        
                    
        if pe_success:
            al_idx = np.where(self.topology[target[0], :] == 1)[0].tolist()
            new_a2_idx = []
            new_a2_idx = [id for id in al_idx if id not in self.a2_id]
            self.a2_id = self.a2_id+new_a2_idx
            
            for ids in new_a2_idx:
                    new_sn_0 = np.sum(subnet[:ids])
                    new_sn_1 = np.sum(subnet[:ids + 1])
                    self.access_list[new_sn_0:new_sn_1] = [2]*self.subnet[ids]
                    
        if self.step == 400:
            self.num_fail += 1
            self.episode += 1
            if self.episode % 10 == 0:
                self.att_reward_list.append(self.total_att_reward)
#             print("att_fail")
            self.reset()
            
        
        
        self.com_success = com_success   
        
        return reward, self.att_state, self.tg_idx, self.access_list, self.com_sn_list
    
    
    def IDS_def(self, action):
        """
        action:
        0 : reset the network
        1 : nothing
        """
        
        # for SD
        
        # prob of correctly give alarm
        true_pos_SD = 0.9
        false_pos_SD = 0.1
        
        p_1 = np.random.rand()
        if self.failure == 1:
            if p_1 < true_pos_SD:
                self.IDS_state[0] += 1
        if self.failure == 0:
            if p_1 < false_pos_SD:
                self.IDS_state[0] += 1
                
        # for AD
        
        true_pos_AD = 0.9
        false_pos_AD = 0.4
        
        p_2 = np.random.rand()
        
        num_compromised = np.sum(self.att_state[:, idx_map["compromised"]])
        prob_det = 1 - (1-true_pos_AD)**num_compromised - (1-false_pos_AD)**(np.sum(self.subnet)-num_compromised)
        if p_2 < prob_det:
            self.IDS_state[0] += 1
            
        
#         if  self._att == 1:
        if action == 0:
            reward = -40
            self.remain_restart -= 1
            reward += np.sum(self.att_state[:, idx_map["compromised"]])*10
            self.IDS_state[1] += 1
#             print("Restart Systerm")
            self.restart()
        
        elif action == 1:
            reward = 0
            
        
        return reward, self.IDS_state, self.remain_restart
    
    def simple_MTD(self):
        """
        simple MTD just directly change the configuration of the network to a paticular another 
        action:
        0 : change the configuration of network
        1 : nothing
        """
        if self.step % 10 == 0:
            if self.current_config == 1:
                self.host_true_state = host_matrix(self.config_2, idx_map)
                self.MTD_state = host_matrix(self.config_2, idx_map)
                self.true_state = host_matrix(self.config_2, idx_map)
                self.current_config = 2
                self.c_config = self.config_2
            elif self.current_config == 2:
                self.host_true_state = host_matrix(self.config_1, idx_map)
                self.MTD_state = host_matrix(self.config_1, idx_map)
                self.true_state = host_matrix(self.config_1, idx_map)
                self.current_config = 1
                self.c_config = self.config_1
        
        
    
    def MTD_def(self, action, target):
        """
        action contain:
        - 0: 'change_ssh' to other service
        - 1: 'change_ftp' to other service
        - 2: 'change_http' to other service
        - 3: 'deactive_ssh' on a host
        - 4: 'deactive_ftp' on a host
        - 5: 'deactive_http' ob a host
        - 6: 'change_operation system'
        - 7: Nothing
        """
        reward = 0
        subnet = self.subnet
        idx_map = self.idx_map
        host = self.h_map.addr2idx(target)
        self.MTD_step += 1
        
        if self.success == 1:
            reward -= 200
        elif self.att_honeypot == 1:
            reward += 100
            
        if self.com_success == 1:
            reward -=30
        
        
        if self.deactive_1["work"] == 1:
            # if deactive service influence the legal traffic, give negative reward
            if np.sum(self.MTD_state[self.deactive_1["host"], :].take([idx_map["ssh"], idx_map["ftp"], idx_map["http"]])) == 0:
                reward -= 5
                self.MTD_budget_1 -= 5
            if self.deactive_1["step"] == 3:
                self.deactive_1["work"] = 0
                self.MTD_state[self.deactive_1["host"], idx_map[self.deactive_1["services"]]] = 1
            elif self.deactive_1["step"] < 3:
                self.deactive_1["step"] += 1
        elif self.deactive_2["work"] == 1:
            if np.sum(self.MTD_state[self.deactive_2["host"], :].take([idx_map["ssh"], idx_map["ftp"], idx_map["http"]])) == 0:
                reward -= 5
                self.MTD_budget_1 -= 5
            if self.deactive_2["step"] == 3:
                self.deactive_2["work"] = 0
                self.MTD_state[self.deactive_2["host"], idx_map[self.deactive_2["services"]]] = 1
            elif self.deactive_2["step"] < 3:
                self.deactive_2["step"] += 1 
        elif self.deactive_3["work"] == 1:
            if np.sum(self.MTD_state[self.deactive_3["host"], :].take([idx_map["ssh"], idx_map["ftp"], idx_map["http"]])) == 0:
                reward -= 5
                self.MTD_budget_1 -= 5
            if self.deactive_3["step"] == 3:
                self.deactive_3["work"] = 0
                self.MTD_state[self.deactive_3["host"], idx_map[self.deactive_3["services"]]] = 1
            elif self.deactive_3["step"] < 3:
                self.deactive_3["step"] += 1  
        
        
        if self.att_state[host, self.idx_map["compromised"]] == 1:
            self.MTD_state[host, self.idx_map["compromised"]] = 1
            
        else:
            # 'change_ssh', 'change_ftp' or 'change_http' to another service
            if action == 0 or action == 1 or action == 2 or action == 3:
                service = self.service_map[action+1]
                if self.MTD_state[host, idx_map[service]] == 1 and self.MTD_budget_1 >= 10:
                    # find service except the one we want to change
                    array = np.array([idx_map["ssh"], idx_map["ftp"], idx_map["http"], idx_map["scp"]])
                    idx_1 = np.where(self.MTD_state[host, idx_map["ssh"]:idx_map["scp"]] == 0)[0]
                    idx_2 = array[array != idx_map[service]]
                    idx = [x for x in idx_1 if x in idx_2]
                    
                    # then there will be three situations
                    # 1. only one service not exist on the host, then change to that service
                    if len(idx) == 1:
                        self.MTD_state[host, idx] = 1
                        self.MTD_state[host, idx_map[service]] = 0
                        
                        connect_sn_id = np.where(self.topology[target[0], :] == 1)[0].tolist()
                        # change the firewall setting as service change
                        for ids in connect_sn_id:
                            firewall = self.c_config["firewall"][str(ids)+","+str(target[0])]
                            if service in firewall:
                                self.c_config["firewall"][str(ids)+","+str(target[0])].remove(service)
                                self.c_config["firewall"][str(ids)+","+str(target[0])].append(self.service_map[idx])
                        reward -= 10
                        self.MTD_budget_1 -= 10
                    # 2. more than one services not exist
                    elif len(idx) > 1:
                        n_s_id = np.random.choice(idx)
                        self.MTD_state[host, n_s_id] = 1
                        self.MTD_state[host, idx_map[service]] = 0
                        
                        connect_sn_id = np.where(self.topology[target[0], :] == 1)[0].tolist()
                        # change the firewall setting as service change
                        for ids in connect_sn_id:
                            firewall = self.c_config["firewall"][str(ids)+","+str(target[0])]
                            if service in firewall:
                                self.c_config["firewall"][str(ids)+","+str(target[0])].remove(service)
                                self.c_config["firewall"][str(ids)+","+str(target[0])].append(self.service_map[n_s_id])
                        
                        reward -= 10
                        self.MTD_budget_1 -= 10
                    # 3. no services not exist on this host, then no action will be done
        
        
#         if action == 0 or 1 or 2:
#             service = self.service_map[action+1]
#             if MTD_state[host, idx_map[service]] == 1:
#                 MTD_state[host, idx_map[service]] = 0
#                 MTD_state[host, idx_map["new_service"]] = 1
            
        
        # 'deactive_ssh', 'deactive_ftp' or 'deactive_http'
            elif action == 6 or action == 4 or action == 5 or action == 7:
                service = self.service_map[action-3]
                if self.MTD_state[host, idx_map[service]] == 1 and (self.deactive_1["work"] == 0 or self.deactive_2["work"] == 0 or self.deactive_3["work"] == 0)\
                and self.MTD_budget_1 >= 35:
                    self.MTD_state[host, idx_map[service]] = 0
                    if self.deactive_1["work"] == 0:
                        self.deactive_1["host"] = host
                        self.deactive_1["services"] = service
                        self.deactive_1["step"] = 0
                        self.deactive_1["work"] = 1
                    elif self.deactive_2["work"] == 0:
                        self.deactive_2["host"] = host
                        self.deactive_2["services"] = service
                        self.deactive_2["step"] = 0
                        self.deactive_2["work"] = 1
                    elif self.deactive_3["work"] == 0:
                        self.deactive_3["host"] = host
                        self.deactive_3["services"] = service
                        self.deactive_3["step"] = 0
                        self.deactive_3["work"] = 1
                    reward -= 20
                    self.MTD_budget_1 -= 20
        
            elif action == 8 and self.MTD_budget_1 >= 10:
                if self.MTD_state[host, idx_map["os"]] == 1:
                    self.MTD_state[host, idx_map["os"]] = 2
                else:
                    self.MTD_state[host, idx_map["os"]] = 1
                reward  -= 10
                self.MTD_budget_1 -= 10
                
            elif action == 9:
                reward = 0
            

        return reward, self.MTD_state
    
    
    def complex_MTD(self, action, target):
        """
        action contain:
        - 0: 'change_ssh' to other service
        - 1: 'change_ftp' to other service
        - 2: 'change_http' to other service
        - 3: 'deactive_ssh' on a host
        - 4: 'deactive_ftp' on a host
        - 5: 'deactive_http' ob a host
        - 6: 'change_operation system'
        - 7: Nothing
        """
        reward = 0
        subnet = self.subnet
        idx_map = self.idx_map
        host = self.h_map.addr2idx(target)
        self.MTD_step += 1
        
        if self.success == 1:
            reward -= 200
        elif self.att_honeypot == 1:
            reward += 100
            
        if self.com_success == 1:
            reward -=30
            
        self.alarm = 0
        
        
        true_pos_SD = 0.9
        false_pos_SD = 0.1
        
        p_1 = np.random.rand()
        if self.failure == 1:
            if p_1 < true_pos_SD:
                self.alarm += 1
        if self.failure == 0:
            if p_1 < false_pos_SD:
                self.alarm += 1
                
        # for AD
        
        true_pos_AD = 0.9
        false_pos_AD = 0.4
        
        p_2 = np.random.rand()
        
        num_compromised = np.sum(self.att_state[:, idx_map["compromised"]])
        prob_det = 1 - (1-true_pos_AD)**num_compromised - (1-false_pos_AD)**(np.sum(self.subnet)-num_compromised)
        if p_2 < prob_det:
            self.alarm += 1
            
        
        if self.deactive_1["work"] == 1:
            # if deactive service influence the legal traffic, give negative reward
            if np.sum(self.MTD_state[self.deactive_1["host"], :].take([idx_map["ssh"], idx_map["ftp"], idx_map["http"]])) == 0:
                reward -= 5
                self.MTD_budget -= 5
            if self.deactive_1["step"] == 3:
                self.deactive_1["work"] = 0
                self.MTD_state[self.deactive_1["host"], idx_map[self.deactive_1["services"]]] = 1
            elif self.deactive_1["step"] < 3:
                self.deactive_1["step"] += 1
        elif self.deactive_2["work"] == 1:
            if np.sum(self.MTD_state[self.deactive_2["host"], :].take([idx_map["ssh"], idx_map["ftp"], idx_map["http"]])) == 0:
                reward -= 5
                self.MTD_budget -= 5
            if self.deactive_2["step"] == 3:
                self.deactive_2["work"] = 0
                self.MTD_state[self.deactive_2["host"], idx_map[self.deactive_2["services"]]] = 1
            elif self.deactive_2["step"] < 3:
                self.deactive_2["step"] += 1 
        elif self.deactive_3["work"] == 1:
            if np.sum(self.MTD_state[self.deactive_3["host"], :].take([idx_map["ssh"], idx_map["ftp"], idx_map["http"]])) == 0:
                reward -= 5
                self.MTD_budget -= 5
            if self.deactive_3["step"] == 3:
                self.deactive_3["work"] = 0
                self.MTD_state[self.deactive_3["host"], idx_map[self.deactive_3["services"]]] = 1
            elif self.deactive_3["step"] < 3:
                self.deactive_3["step"] += 1  
        
        
        if self.att_state[host, self.idx_map["compromised"]] == 1:
            self.MTD_state[host, self.idx_map["compromised"]] = 1
            
        else:
            # 'change_ssh', 'change_ftp' or 'change_http' to another service
            if action == 0 or action == 1 or action == 2 or action == 3:
                service = self.service_map[action+1]
                if self.MTD_state[host, idx_map[service]] == 1 and self.MTD_budget >= 10:
                    # find service except the one we want to change
                    array = np.array([idx_map["ssh"], idx_map["ftp"], idx_map["http"], idx_map["scp"]])
                    idx_1 = np.where(self.MTD_state[host, idx_map["ssh"]:idx_map["scp"]] == 0)[0]
                    idx_2 = array[array != idx_map[service]]
                    idx = [x for x in idx_1 if x in idx_2]
                    
                    # then there will be three situations
                    # 1. only one service not exist on the host, then change to that service
                    if len(idx) == 1:
                        self.MTD_state[host, idx] = 1
                        self.MTD_state[host, idx_map[service]] = 0
                        
                        connect_sn_id = np.where(self.topology[target[0], :] == 1)[0].tolist()
                        # change the firewall setting as service change
                        for ids in connect_sn_id:
                            firewall = self.c_config["firewall"][str(ids)+","+str(target[0])]
                            if service in firewall:
                                self.c_config["firewall"][str(ids)+","+str(target[0])].remove(service)
                                self.c_config["firewall"][str(ids)+","+str(target[0])].append(self.service_map[idx])
                        reward -= 10
                        self.MTD_budget -= 10
                    # 2. more than one services not exist
                    elif len(idx) > 1:
                        n_s_id = np.random.choice(idx)
                        self.MTD_state[host, n_s_id] = 1
                        self.MTD_state[host, idx_map[service]] = 0
                        
                        connect_sn_id = np.where(self.topology[target[0], :] == 1)[0].tolist()
                        # change the firewall setting as service change
                        for ids in connect_sn_id:
                            firewall = self.c_config["firewall"][str(ids)+","+str(target[0])]
                            if service in firewall:
                                self.c_config["firewall"][str(ids)+","+str(target[0])].remove(service)
                                self.c_config["firewall"][str(ids)+","+str(target[0])].append(self.service_map[n_s_id])
                        
                        reward -= 10
                        self.MTD_budget -= 10
                    # 3. no services not exist on this host, then no action will be done
        
        
#         if action == 0 or 1 or 2:
#             service = self.service_map[action+1]
#             if MTD_state[host, idx_map[service]] == 1:
#                 MTD_state[host, idx_map[service]] = 0
#                 MTD_state[host, idx_map["new_service"]] = 1
            
        
        # 'deactive_ssh', 'deactive_ftp' or 'deactive_http'
            elif action == 6 or action == 4 or action == 5 or action == 7:
                service = self.service_map[action-3]
                if self.MTD_state[host, idx_map[service]] == 1 and (self.deactive_1["work"] == 0 or self.deactive_2["work"] == 0 or self.deactive_3["work"] == 0)\
                and self.MTD_budget >= 35:
                    self.MTD_state[host, idx_map[service]] = 0
                    if self.deactive_1["work"] == 0:
                        self.deactive_1["host"] = host
                        self.deactive_1["services"] = service
                        self.deactive_1["step"] = 0
                        self.deactive_1["work"] = 1
                    elif self.deactive_2["work"] == 0:
                        self.deactive_2["host"] = host
                        self.deactive_2["services"] = service
                        self.deactive_2["step"] = 0
                        self.deactive_2["work"] = 1
                    elif self.deactive_3["work"] == 0:
                        self.deactive_3["host"] = host
                        self.deactive_3["services"] = service
                        self.deactive_3["step"] = 0
                        self.deactive_3["work"] = 1
                    reward -= 20
                    self.MTD_budget -= 20
        
            elif action == 8 and self.MTD_budget >= 10:
                if self.MTD_state[host, idx_map["os"]] == 1:
                    self.MTD_state[host, idx_map["os"]] = 2
                else:
                    self.MTD_state[host, idx_map["os"]] = 1
                reward  -= 10
                self.MTD_budget -= 10
                
            elif action == 9:
                reward = 0
            
        self.com_MTD_state[:, :-1] = self.MTD_state
        self.com_MTD_state[0, -1] = self.alarm
        return reward, self.com_MTD_state
    
    def complex_MTD_1(self, action, target):
        """
        Apply one defender (MTD) action to the host at address `target`.

        action contain:
        - 0..3: change service `self.service_map[action+1]` on the host to
                another service
        - 4..7: deactivate service `self.service_map[action-3]` on the host
                for a few steps
        - 8: change the operating system of the host
        - 9: nothing (the reward of this step is reset to 0)

        `target` is a host address; `target[0]` is used to index
        `self.topology` and to build the firewall keys.

        Returns `(reward, self.com_MTD_state)`, where `com_MTD_state` is
        `MTD_state` plus one extra column whose first row holds the alarm
        counter.
        """
        reward = 0
        idx_map = self.idx_map
        host = self.h_map.addr2idx(target)
        self.MTD_step += 1

        # feedback from the attacker flags currently set on the environment
        if self.success == 1:
            reward -= 200
        elif self.att_honeypot == 1:
            reward += 100

        if self.com_success == 1:
            reward -= 30

        # for SD: the alarm is raised with probability true_pos_SD when
        # self.failure == 1 and with probability false_pos_SD when it is 0
        true_pos_SD = 0.9
        false_pos_SD = 0.1

        p_1 = np.random.rand()
        if self.failure == 1:
            if p_1 < true_pos_SD:
                self.alarm += 1
        if self.failure == 0:
            if p_1 < false_pos_SD:
                self.alarm += 1

        # for AD
        true_pos_AD = 0.9
        false_pos_AD = 0.4

        p_2 = np.random.rand()

        num_compromised = np.sum(self.att_state[:, idx_map["compromised"]])
        # NOTE(review): this value can be negative (e.g. when num_compromised
        # is 0), in which case the alarm never fires here. Confirm that
        # subtracting both terms, rather than their product, is intended.
        prob_det = 1 - (1-true_pos_AD)**num_compromised - (1-false_pos_AD)**(np.sum(self.subnet)-num_compromised)
        if p_2 < prob_det:
            self.alarm += 1

        # Advance the deactivation timers. As in the original elif chain, only
        # the FIRST active slot is processed per call; the others wait.
        for slot in (self.deactive_1, self.deactive_2, self.deactive_3):
            if slot["work"] != 1:
                continue
            # if deactive service influence the legal traffic, give negative reward
            if np.sum(self.MTD_state[slot["host"], :].take([idx_map["ssh"], idx_map["ftp"], idx_map["http"]])) == 0:
                reward -= 5
                self.MTD_budget -= 5
            if slot["step"] == 3:
                # timer expired: switch the service back on and free the slot
                slot["work"] = 0
                self.MTD_state[slot["host"], idx_map[slot["services"]]] = 1
            elif slot["step"] < 3:
                slot["step"] += 1
            break

        if self.att_state[host, idx_map["compromised"]] == 1:
            # a compromised host is only marked; the requested action is ignored
            self.MTD_state[host, idx_map["compromised"]] = 1

        else:
            # change one service on the host to another service
            if action in (0, 1, 2, 3):
                service = self.service_map[action+1]
                if self.MTD_state[host, idx_map[service]] == 1 and self.MTD_budget >= 10:
                    # find service except the one we want to change
                    # NOTE(review): idx_1 holds positions relative to the slice
                    # start (and the slice stops before "scp"), while idx_2
                    # holds absolute column indices. They only line up if
                    # idx_map["ssh"] == 0 -- verify against idx_map.
                    array = np.array([idx_map["ssh"], idx_map["ftp"], idx_map["http"], idx_map["scp"]])
                    idx_1 = np.where(self.MTD_state[host, idx_map["ssh"]:idx_map["scp"]] == 0)[0]
                    idx_2 = array[array != idx_map[service]]
                    idx = [x for x in idx_1 if x in idx_2]

                    # then there will be three situations
                    if len(idx) == 1:
                        # 1. only one service not exist on the host, then change to that service
                        n_s_id = idx[0]
                    elif len(idx) > 1:
                        # 2. more than one services not exist
                        n_s_id = np.random.choice(idx)
                    else:
                        # 3. no services not exist on this host, then no action will be done
                        n_s_id = None

                    if n_s_id is not None:
                        self.MTD_state[host, n_s_id] = 1
                        self.MTD_state[host, idx_map[service]] = 0

                        connect_sn_id = np.where(self.topology[target[0], :] == 1)[0].tolist()
                        # change the firewall setting as service change
                        for ids in connect_sn_id:
                            firewall = self.c_config["firewall"][str(ids)+","+str(target[0])]
                            if service in firewall:
                                firewall.remove(service)
                                # look the name up with the scalar index; the
                                # previous code passed a list in the
                                # single-candidate case
                                firewall.append(self.service_map[n_s_id])

                        reward -= 10
                        self.MTD_budget -= 10

            # deactivate one service on the host
            elif action in (4, 5, 6, 7):
                service = self.service_map[action-3]
                slots = (self.deactive_1, self.deactive_2, self.deactive_3)
                free_slot = next((slot for slot in slots if slot["work"] == 0), None)
                # NOTE(review): the budget threshold is 35 although only 20 is
                # charged below -- confirm this margin is intended.
                if self.MTD_state[host, idx_map[service]] == 1 and free_slot is not None \
                        and self.MTD_budget >= 35:
                    self.MTD_state[host, idx_map[service]] = 0
                    free_slot["host"] = host
                    free_slot["services"] = service
                    free_slot["step"] = 0
                    free_slot["work"] = 1
                    reward -= 20
                    self.MTD_budget -= 20

            # toggle the operating system code between 1 and 2
            elif action == 8 and self.MTD_budget >= 10:
                if self.MTD_state[host, idx_map["os"]] == 1:
                    self.MTD_state[host, idx_map["os"]] = 2
                else:
                    self.MTD_state[host, idx_map["os"]] = 1
                reward -= 10
                self.MTD_budget -= 10

            elif action == 9:
                # NOTE(review): this also discards the rewards and penalties
                # accumulated earlier in this call -- confirm intended.
                reward = 0

        self.com_MTD_state[:, :-1] = self.MTD_state
        self.com_MTD_state[0, -1] = self.alarm
        return reward, self.com_MTD_state
    
    
    def reset(self):
        """
        back to initial state for both attacker and defender

        Re-initialises the defender state, the attacker's view, the IDS
        state, the three deactivation slots, the budgets and the per-step
        flags. Returns nothing.
        """
        # use the map stored on the instance rather than a notebook-level
        # global, so the method does not depend on hidden kernel state
        idx_map = self.idx_map

        # MTD
        # def_state and MTD_state are bound to the same array object as
        # true_state (no copy), so the in-place write below also changes
        # true_state. NOTE(review): confirm this sharing is intended.
        self.def_state = self.true_state
        self.MTD_state = self.true_state
        self.MTD_state[:, idx_map["compromised"]] = 0

        # attacker: starts with only the first two columns of the true state
        self.att_state = np.zeros(self.true_state.shape)
        self.att_state[:, 0:2] = self.true_state[:, 0:2]
        # the hosts of subnet 0 are reachable from the start
        self.att_state[:self.subnet[0], idx_map["reachable"]] = 1
        self.att_state[:self.subnet[0], idx_map["access_level"]] = 1
        self.step = 0

        self.reachable_id = [0]
        self.com_sn_list = []
        self.a2_id = [0]
        self.access_list = [1]*np.sum(self.subnet)
        self.access_list[:self.subnet[0]] = [2]*self.subnet[0]
        self.tg_idx = list(range(self.subnet[0]))

        self.total_att_reward = 0

        # IDS
        self.IDS_state = np.zeros(2)
        self.remain_restart = 5

        self.MTD_reward = 0
        self.MTD_step = 0
        # Three independent deactivation slots. deactive_3 is created here as
        # well; previously it was written to without ever being created.
        self.deactive_1 = {"host": 0, "services": "ssh", "step": 0, "work": 0}
        self.deactive_2 = {"host": 0, "services": "ssh", "step": 0, "work": 0}
        self.deactive_3 = {"host": 0, "services": "ssh", "step": 0, "work": 0}
        self.MTD_budget_1 = 2500
        self.MTD_budget = 2000

        self.failure = 0
        self.com_success = 0
        self.success = 0

        self.alarm = 0

        
    def restart(self):
        """
        back to initial state for the attacker only

        Clears columns 3..5 of the attacker's state, makes the hosts of
        subnet 0 reachable again and rebuilds the attacker's bookkeeping
        lists. Columns 0..2 of att_state, com_sn_list and every defender
        attribute other than def_state are left untouched.
        """
        # use the map stored on the instance rather than a notebook-level global
        idx_map = self.idx_map

        self.def_state = self.true_state

        # presumably columns 3..5 hold the attacker's per-host progress
        # (reachable / access level / compromised) -- TODO confirm against idx_map
        self.att_state[:, 3:6] = 0
        self.att_state[:self.subnet[0], idx_map["reachable"]] = 1
        self.att_state[:self.subnet[0], idx_map["access_level"]] = 1

        self.reachable_id = [0]
        self.a2_id = [0]
        self.access_list = [1]*np.sum(self.subnet)
        self.access_list[:self.subnet[0]] = [2]*self.subnet[0]
        self.tg_idx = list(range(self.subnet[0]))
        
        
    def att_reward_plot(self):
        """Show the recorded attack rewards and attack steps in two side-by-side panels."""
        plt.figure()
        # (subplot position, series to draw, y-axis label) for each panel
        panels = (
            (121, self.att_reward_list, "attack reward per ten success"),
            (122, self.att_step_list, "attack step per ten success"),
        )
        for position, series, label in panels:
            plt.subplot(position)
            plt.plot(series)
            plt.ylabel(label)
        plt.show()

    @property
    def success_rate(self):
        """
        Fraction of finished attacks that succeeded:
        num_success / (num_success + num_fail).

        Returns 0.0 when nothing has been recorded yet, instead of raising
        ZeroDivisionError.
        """
        total = self.num_success + self.num_fail
        if total == 0:
            return 0.0
        return self.num_success/total
        

In [19]:
def epsilon_greedy(epsilon, q_values):
    """
    Draw an action index with an epsilon-greedy rule.

    One of the maximal entries of `q_values` is picked at random as the
    greedy action. Every action then gets probability epsilon/len(q_values)
    and the greedy one an extra 1-epsilon.
    """
    top_value = np.max(q_values)
    greedy = np.random.choice(np.where(q_values == top_value)[0])
    n_choices = len(q_values)
    weights = epsilon/n_choices + np.zeros(n_choices)
    weights[greedy] += 1-epsilon
    return np.random.choice(n_choices, p=weights)

### Attack Agents

In [20]:
class Att_random(object):
    """
    Attacker that picks target, entry point and action uniformly at random.

    The constructor takes the same arguments as the learning agents, but
    only `config` and `num_action` are used.
    """

    def __init__(self, config, host_true_state, lr, discount, num_action, epsilon_decay, initial_epsilon):
        self.h_map = host_map(config)
        self.topology = config["topology"]
        self.subnet = config["subnet"]
        self.num_sn = len(self.subnet)
        self.num_action = num_action

        # first move: a random action against a random host of subnet 0,
        # entered through entry point -1
        self.initial_action = np.random.choice(num_action)
        self.initial_target_id = np.random.choice(range(self.subnet[0]))
        self.initial_target = self.h_map.idx2addr(self.initial_target_id)
        self._action = self.initial_action
        self.target_id = self.initial_target_id
        self.en_point = -1

    def step(self, reward, next_att_state, tg_idx, access_list, com_sn_list):
        """
        Choose the next move at random; `reward` and `next_att_state` are ignored.

        Returns (action, entry point, target address, access_list[target id]).
        """
        target_id = np.random.choice(tg_idx)
        target_sn = self.h_map.idx2addr(target_id)[0]
        neighbours = np.where(self.topology[target_sn, :] == 1)[0].tolist()
        # subnet 0 can always be entered through -1; other subnets only
        # through an adjacent subnet listed in com_sn_list
        en_list = [-1] if target_sn == 0 else []
        en_list.extend(sn for sn in neighbours if sn in com_sn_list)
        en_point = np.random.choice(en_list)
        action = np.random.choice(self.num_action)

        return action, en_point, self.h_map.idx2addr(target_id), access_list[target_id]

In [21]:
class Att_qlearning(object):
    """
    Tabular Q-learning attacker.

    `q_func` maps state string -> target host id -> array of shape
    (entry points, actions). Tables of targets in subnet 0 have one extra
    last row, which is the row addressed by the entry point -1. Rows of
    entry points that cannot be used are filled with -inf.
    """

    def __init__(self, config, host_true_state, lr, discount, num_action, epsilon_decay, initial_epsilon):
        self.h_map = host_map(config)
        self.topology = config["topology"]
        self.subnet = config["subnet"]
        self.num_sn = len(self.subnet)
        self.lr = lr
        self.discount = discount
        self.num_action = num_action
        self.epsilon_decay = epsilon_decay
        self.epsilon = initial_epsilon

        state = np.zeros(host_true_state.shape)
        state[:, 0:2] = host_true_state[:, 0:2]

        # initialize require variables
        # (`np.int` was removed in NumPy 1.24; it was an alias of builtin int)
        self.initial_state = str(state.astype(int))
        self.initial_action = np.random.choice(num_action)
        self.initial_target_id = np.random.choice(range(self.subnet[0]))
        self.initial_target = self.h_map.idx2addr(self.initial_target_id)
        self._state = str(state.astype(int))
        self._action = self.initial_action
        self.target_id = self.initial_target_id
        self.en_point = -1
        self.q_func = dict()
        self.q_func[self._state] = dict()
        # the first target is in subnet 0 and only entry point -1 (last row) is usable
        self.q_func[self._state][self.target_id] = np.zeros((self.num_sn+1, self.num_action), dtype=np.float32)
        self.q_func[self._state][self.target_id][:-1, :] = -np.inf

    def _entry_points(self, target_sn, com_sn_list):
        """Entry points usable against subnet `target_sn`: -1 for subnet 0, plus adjacent subnets in `com_sn_list`."""
        en_list = [-1] if target_sn == 0 else []
        neighbours = np.where(self.topology[target_sn, :] == 1)[0].tolist()
        return en_list + [sn for sn in neighbours if sn in com_sn_list]

    def _new_q_table(self, target_sn, com_sn_list):
        """Zero Q table for a target in subnet `target_sn`, with rows of unusable entry points set to -inf."""
        if target_sn == 0:
            table = np.zeros((self.num_sn+1, self.num_action), dtype=np.float32)
            first_epoint = -1
        else:
            table = np.zeros((self.num_sn, self.num_action), dtype=np.float32)
            first_epoint = 0
        en_list = self._entry_points(target_sn, com_sn_list)
        # make the q_value of those impossible entry points to be negative infinity
        for epoint in range(first_epoint, self.num_sn):
            if epoint not in en_list:
                table[epoint, :] = -np.inf
        return table

    def step(self, reward, next_att_state, tg_idx, access_list, com_sn_list):
        """
        update q values

        Updates the Q value of the previous (state, target, entry point,
        action) with `reward`, then chooses the next move.

        Returns (action, entry point, target address, access_list[target id]).
        """
        next_state = str(next_att_state.astype(int))
        if next_state not in self.q_func:
            # unseen state: create one table per candidate target
            self.q_func[next_state] = dict()
            for tg_id in tg_idx:
                target_sn = self.h_map.idx2addr(tg_id)[0]
                self.q_func[next_state][tg_id] = self._new_q_table(target_sn, com_sn_list)

            # no bootstrap term is added for a state seen for the first time
            q_prev = self.q_func[self._state][self.target_id]
            q_prev[self.en_point, self._action] += \
                self.lr*(reward - q_prev[self.en_point, self._action])

            # nothing is known about the new state: pick target and entry point at random
            best_target = np.random.choice(tg_idx)
            target_sn = self.h_map.idx2addr(best_target)[0]
            best_epoint = np.random.choice(self._entry_points(target_sn, com_sn_list))

        else:
            # known state: take the target / entry point with the largest Q
            # value, breaking exact ties between targets with a coin flip
            max_q = -np.inf
            for tg_id, q_values in self.q_func[next_state].items():
                max_q_col = np.max(q_values, axis=1)
                if np.max(max_q_col) > max_q:
                    max_q = np.max(max_q_col)
                    best_target = tg_id
                    best_epoint = np.argmax(max_q_col)
                elif np.max(max_q_col) == max_q and np.random.rand() >= 0.5:
                    best_target = tg_id
                    best_epoint = np.argmax(max_q_col)

            q_prev = self.q_func[self._state][self.target_id]
            q_prev[self.en_point, self._action] += \
                self.lr*(reward + self.discount*max_q - q_prev[self.en_point, self._action])

        self._state = next_state
        self.target_id = best_target
        self._target = self.h_map.idx2addr(best_target)
        # the extra last row of a subnet-0 table stands for entry point -1
        if self._target[0] == 0 and best_epoint == self.num_sn:
            self.en_point = -1
        else:
            self.en_point = best_epoint
        self._action = epsilon_greedy(self.epsilon, self.q_func[self._state][best_target][best_epoint, :])
        self.epsilon *= self.epsilon_decay
        return self._action, self.en_point, self._target, access_list[best_target]

    @property
    def q_values(self):
        """The whole nested Q table (state -> target id -> array)."""
        return self.q_func

In [22]:
class Att_ucb(object):
    """
    Q-learning attacker that explores with a count-based bonus.

    `q_func` has the same layout as in the epsilon-greedy agent: state
    string -> target host id -> array of shape (entry points, actions),
    with one extra last row (entry point -1) for targets in subnet 0.
    `N` counts calls to `step`. `U` counts, per (host, entry point, action),
    how often that move was taken, and starts at 0.5. Moves are ranked by
    q + 0.5*log(N/U).
    """

    def __init__(self, config, host_true_state, lr, discount, num_action, epsilon_decay, initial_epsilon):
        self.h_map = host_map(config)
        self.topology = config["topology"]
        self.subnet = config["subnet"]
        self.num_sn = len(self.subnet)
        self.lr = lr
        self.discount = discount
        self.num_action = num_action
        self.epsilon_decay = epsilon_decay
        self.epsilon = initial_epsilon

        state = np.zeros(host_true_state.shape)
        state[:, 0:2] = host_true_state[:, 0:2]

        # initialize require variables
        # (`np.int` was removed in NumPy 1.24; it was an alias of builtin int)
        self.initial_state = str(state.astype(int))
        self.initial_action = np.random.choice(num_action)
        self.initial_target_id = np.random.choice(range(self.subnet[0]))
        self.initial_target = self.h_map.idx2addr(self.initial_target_id)
        self._state = str(state.astype(int))
        self._action = self.initial_action
        self.target_id = self.initial_target_id
        self.en_point = -1
        self.q_func = dict()
        self.q_func[self._state] = dict()
        # the first target is in subnet 0 and only entry point -1 (last row) is usable
        self.q_func[self._state][self.target_id] = np.zeros((self.num_sn+1, self.num_action), dtype=np.float32)
        self.q_func[self._state][self.target_id][:-1, :] = -np.inf
        self.N = 0
        self.U = 0.5*np.ones((np.sum(self.subnet), self.num_sn+1, self.num_action), dtype=np.float32)

    def _entry_points(self, target_sn, com_sn_list):
        """Entry points usable against subnet `target_sn`: -1 for subnet 0, plus adjacent subnets in `com_sn_list`."""
        en_list = [-1] if target_sn == 0 else []
        neighbours = np.where(self.topology[target_sn, :] == 1)[0].tolist()
        return en_list + [sn for sn in neighbours if sn in com_sn_list]

    def _new_q_table(self, target_sn, com_sn_list):
        """Zero Q table for a target in subnet `target_sn`, with rows of unusable entry points set to -inf."""
        if target_sn == 0:
            table = np.zeros((self.num_sn+1, self.num_action), dtype=np.float32)
            first_epoint = -1
        else:
            table = np.zeros((self.num_sn, self.num_action), dtype=np.float32)
            first_epoint = 0
        en_list = self._entry_points(target_sn, com_sn_list)
        # make the q_value of those impossible entry points to be negative infinity
        for epoint in range(first_epoint, self.num_sn):
            if epoint not in en_list:
                table[epoint, :] = -np.inf
        return table

    def step(self, reward, next_att_state, tg_idx, access_list, com_sn_list):
        """
        Update the counters and the Q value of the previous move with
        `reward`, then choose the next move.

        Returns (action, entry point, target address, access_list[target id]).
        """
        self.N += 1
        self.U[self.target_id, self.en_point, self._action] += 1
        next_state = str(next_att_state.astype(int))
        if next_state not in self.q_func:
            # unseen state: create one table per candidate target
            self.q_func[next_state] = dict()
            for tg_id in tg_idx:
                target_sn = self.h_map.idx2addr(tg_id)[0]
                self.q_func[next_state][tg_id] = self._new_q_table(target_sn, com_sn_list)

            # no bootstrap term is added for a state seen for the first time
            q_prev = self.q_func[self._state][self.target_id]
            q_prev[self.en_point, self._action] += \
                self.lr*(reward - q_prev[self.en_point, self._action])

            # nothing is known about the new state: pick target and entry point at random
            best_target = np.random.choice(tg_idx)
            target_sn = self.h_map.idx2addr(best_target)[0]
            best_epoint = np.random.choice(self._entry_points(target_sn, com_sn_list))

        else:
            max_ucb = -np.inf
            max_q = -np.inf
            for tg_id, q_values in self.q_func[next_state].items():
                # tables of hosts outside subnet 0 have no extra last row,
                # so that row of U is dropped to match their shape
                if tg_id >= self.subnet[0]:
                    ucb = q_values + 0.5*np.log(self.N/self.U[tg_id, :-1, :])
                else:
                    ucb = q_values + 0.5*np.log(self.N/self.U[tg_id, :, :])
                max_ucb_col = np.max(ucb, axis=1)
                if np.max(max_ucb_col) > max_ucb:
                    max_ucb = np.max(max_ucb_col)
                    best_target = tg_id
                    best_epoint = np.argmax(max_ucb_col)
                elif np.max(max_ucb_col) == max_ucb and np.random.rand() >= 0.5:
                    best_target = tg_id
                    best_epoint = np.argmax(max_ucb_col)

                # the bootstrap value uses the plain Q values, not the bonus
                max_q_col = np.max(q_values, axis=1)
                if np.max(max_q_col) > max_q:
                    max_q = np.max(max_q_col)

            q_prev = self.q_func[self._state][self.target_id]
            q_prev[self.en_point, self._action] += \
                self.lr*(reward + self.discount*max_q - q_prev[self.en_point, self._action])

        self._state = next_state
        self.target_id = best_target
        self._target = self.h_map.idx2addr(best_target)
        # the extra last row of a subnet-0 table stands for entry point -1
        if self._target[0] == 0 and best_epoint == self.num_sn:
            self.en_point = -1
        else:
            self.en_point = best_epoint
        self._action = np.argmax(self.q_func[self._state][self.target_id][self.en_point, :] + 0.5*np.log(self.N/self.U[self.target_id, self.en_point, :]))
        return self._action, self.en_point, self._target, access_list[best_target]

    @property
    def q_values(self):
        """The whole nested Q table (state -> target id -> array)."""
        return self.q_func
            

### IDS Agents

In [23]:
class IDS_qlearning(object):
    """
    Tabular Q-learning agent for the IDS.

    `q_IDS` maps a state string to an array with one Q value per action.
    """

    def __init__(self, config, lr, discount, num_action, epsilon_decay, initial_epsilon):
        self.h_map = host_map(config)
        self.subnet = config["subnet"]
        self.lr = lr
        self.discount = discount
        self.num_action = num_action
        # stored for parity with the attack agents; `step` does not decay epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon = initial_epsilon

        state = np.zeros(2)

        # initialize require variables
        # (`np.int` was removed in NumPy 1.24; it was an alias of builtin int)
        self.initial_state = str(state.astype(int))
        self.initial_action = 0
        self._state = str(state.astype(int))
        self._action = self.initial_action
        self.q_IDS = dict()
        self.q_IDS[self._state] = np.zeros(num_action, dtype=np.float32)

    def step(self, reward, next_IDS_state, remain_restart):
        """
        Update the Q value of the previous (state, action) with `reward`
        and return the next action.

        While `remain_restart` is positive the action is drawn
        epsilon-greedily; otherwise action 1 is always returned.
        """
        next_state = str(next_IDS_state.astype(int))
        if next_state not in self.q_IDS:
            self.q_IDS[next_state] = np.zeros(self.num_action, dtype=np.float32)

        # A state seen for the first time has all-zero Q values, so its
        # bootstrap term is 0 and one update rule covers both cases.
        q_prev = self.q_IDS[self._state]
        q_prev[self._action] += \
            self.lr*(reward + self.discount*np.max(self.q_IDS[next_state]) - q_prev[self._action])

        self._state = next_state
        if remain_restart > 0:
            self._action = epsilon_greedy(self.epsilon, self.q_IDS[self._state])
        else:
            self._action = 1

        return self._action

### Running the attack without a defender

In [24]:
def run_experiment(env, agent, step_limit):
    """
    Let `agent` attack `env` for `step_limit` steps, with no defender acting.

    The first move is the agent's initial target and action, made through
    entry point -1 with access level 2. After that, each env.att_step
    result is handed to agent.step, which returns the following move.
    Nothing is returned; results are recorded on `env` and `agent`.
    """
    target, action = agent.initial_target, agent.initial_action
    entry, level = -1, 2

    for _ in range(step_limit):
        outcome = env.att_step(action, entry, target, level)
        reward, state, candidates, access, compromised = outcome
        action, entry, target, level = agent.step(reward, state, candidates, access, compromised)

In [25]:
def non_zero_mean(array):
    """
    Column-wise mean over the non-zero entries of a 2-D array.

    Columns that contain no non-zero entry are dropped from the result.
    The same mask (taken from the non-zero count) is applied to the sums
    and the counts, so a column whose non-zero entries happen to sum to
    zero yields a mean of 0 instead of misaligning the division.
    """
    array = np.asarray(array)
    count = (array != 0).sum(axis=0)
    total = array.sum(axis=0)
    keep = count != 0
    return total[keep]/count[keep]

In [26]:
def multi_experiment_1(num_experiment, att_agent, config, config_2, idx_map, service_map, step_limit):
    """
    Run `num_experiment` independent attack-only experiments and average them.

    Each run builds a fresh environment and a fresh `att_agent` (lr 0.15,
    discount 0.99, 9 actions, epsilon 0.5 decaying by 0.99) and plays
    `step_limit` steps.

    Returns (mean_reward, mean_step, mean_success_rate). The first two are
    the column-wise non-zero means of the environments' reward and step
    lists; the last is the average of their success rates.
    """
    num_slots = int(step_limit/10)
    reward_log = np.zeros((num_experiment, num_slots))
    step_log = np.zeros((num_experiment, num_slots))
    success_rates = []

    for run in range(num_experiment):
        game = env(config, config_2, idx_map, service_map)
        attacker = att_agent(config, game.host_true_state, lr=0.15, discount=0.99,
                             num_action=9, epsilon_decay=0.99, initial_epsilon=0.5)
        run_experiment(game, attacker, step_limit)

        rewards = np.asarray(game.att_reward_list)
        steps = np.asarray(game.att_step_list)
        success_rates.append(game.success_rate)
        # runs record a different number of entries; the rest of the row stays 0
        filled = len(rewards)
        reward_log[run, :filled] = rewards
        step_log[run, :filled] = steps

    return non_zero_mean(reward_log), non_zero_mean(step_log), sum(success_rates)/num_experiment

In [27]:
# mean_reward, mean_step, mean_s_r = multi_experiment_1(20, Att_qlearning, config, config_2, idx_map, service_map, 1500000)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_reward_sn4.npy', mean_reward)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_step_sn4.npy', mean_step)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_s_r_sn4.npy', mean_s_r)

In [28]:
# mean_reward_random, mean_step_random, mean_s_r_random = multi_experiment_1(20, Att_random, config, config_2, idx_map, service_map, 2000000)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_reward_random_sn4.npy', mean_reward_random)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_step_random_sn4.npy', mean_step_random)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_s_r_random_sn4.npy', mean_s_r_random)

In [29]:
# mean_reward_ucb, mean_step_ucb, mean_s_r_ucb = multi_experiment_1(20, Att_ucb, config, config_2, idx_map, service_map, 1500000)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_reward_ucb_sn4.npy', mean_reward_ucb)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_step_ucb_sn4.npy', mean_step_ucb)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_s_r_ucb_sn4.npy', mean_s_r_ucb)

### Running the game

In [30]:
def run_game_IDS(env, att_agent, def_agent, step_limit):
    """Play ``step_limit`` rounds of attacker versus IDS defender on ``env``.

    In every round the attacker's pending move is applied with ``env.att_step``
    and the attacker chooses its next move from the outcome; afterwards the
    defender's pending action is applied with ``env.IDS_def`` and the defender
    chooses its next action from that outcome.

    The attacker opens with its ``initial_target`` / ``initial_action``, entry
    point -1 and access level 2; the defender opens with action 0.

    Note that the ``env`` parameter is an environment instance (it shadows any
    module-level name ``env`` inside this function).  Nothing is returned; all
    results are accumulated inside ``env`` and the two agents.
    """
    target, action = att_agent.initial_target, att_agent.initial_action
    entry, level = -1, 2
    ids_move = 0

    for _ in range(step_limit):
        # Attacker half-round: apply the pending move, then pick the next one.
        a_reward, a_state, a_tg_idx, a_access, a_subnets = env.att_step(action, entry, target, level)
        action, entry, target, level = att_agent.step(a_reward, a_state, a_tg_idx, a_access, a_subnets)

        # Defender half-round: apply the pending IDS action, then pick the next one.
        d_reward, d_state, restart_left = env.IDS_def(ids_move)
        ids_move = def_agent.step(d_reward, d_state, restart_left)
        

In [31]:
def multi_game_IDS(num_experiment, att_agent, IDS_agent, config, config_2, idx_map, service_map, step_limit):
    """Run ``num_experiment`` independent attacker-vs-IDS games and aggregate them.

    Every experiment builds a fresh ``env`` and fresh attacker / IDS agents,
    plays ``run_game_IDS`` for ``step_limit`` steps and copies the environment's
    ``att_reward_list`` and ``att_step_list`` into one row of a zero-padded
    table with ``int(step_limit/10)`` columns.

    Parameters
    ----------
    num_experiment : int
        Number of independent games to play.
    att_agent, IDS_agent : callable
        Agent factories (classes).  The attacker is built from ``config`` and
        the environment's ``host_true_state``; the IDS agent from ``config``.
    config, config_2, idx_map, service_map
        Forwarded unchanged to ``env``.
    step_limit : int
        Number of rounds per game.

    Returns
    -------
    tuple
        ``(non_zero_mean(reward table), non_zero_mean(step table),
        average of Env.success_rate over the experiments)``.
    """
    # A list longer than this column count cannot be stored and makes the
    # row assignment below raise a ValueError.
    num_cols = int(step_limit/10)
    tt_reward = np.zeros((num_experiment, num_cols))
    tt_step = np.zeros((num_experiment, num_cols))
    s_r = 0

    for i in range(num_experiment):
        Env = env(config, config_2, idx_map, service_map)
        a_agent = att_agent(config, Env.host_true_state, lr = 0.15, discount = 0.99, num_action = 9, epsilon_decay = 0.99, initial_epsilon = 0.5)
        d_agent = IDS_agent(config, lr = 0.1, discount = 0.99, num_action = 2, epsilon_decay = 0.99, initial_epsilon = 0.5)
        run_game_IDS(Env, a_agent, d_agent, step_limit)
        reward = np.asarray(Env.att_reward_list)
        step = np.asarray(Env.att_step_list)
        s_r += Env.success_rate
        # Each list is sliced with its own length: the two lists are not
        # guaranteed to be equally long, and reusing len(reward) for the step
        # row fails with a broadcast error when they differ.
        length_1 = len(reward)
        length_2 = len(step)
        tt_reward[i, :length_1] = reward
        tt_step[i, :length_2] = step

    mean_reward = non_zero_mean(tt_reward)
    mean_step = non_zero_mean(tt_step)
    mean_success_rate = s_r/num_experiment

    return mean_reward, mean_step, mean_success_rate

In [32]:
# mean_IDS_game_reward, mean_IDS_game_step, mean_s_r_IDS = multi_game_IDS(20, Att_ucb, IDS_qlearning, config, config_2, idx_map, service_map, 1500000)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_reward_IDS_sn4_ucb_greedy.npy', mean_IDS_game_reward)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_step_IDS_sn4_ucb_greedy.npy', mean_IDS_game_step)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_s_r_IDS_sn4_ucb_greedy.npy', mean_s_r_IDS)

In [33]:
# mean_IDS_game_reward, mean_IDS_game_step, mean_s_r_IDS = multi_game_IDS(20, Att_qlearning, IDS_qlearning, config, config_2, idx_map, service_map, 1500000)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_reward_IDS_sn4_greedy_greedy.npy', mean_IDS_game_reward)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_step_IDS_sn4_greedy_greedy.npy', mean_IDS_game_step)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_s_r_IDS_sn4_greedy_greedy.npy', mean_s_r_IDS)

In [34]:
# mean_IDS_game_reward, mean_IDS_game_step, mean_s_r_IDS = multi_game_IDS(50, config, config_2, idx_map, service_map, 500000)
# np.save('mean_reward_IDS_1.npy', mean_reward_IDS)
# np.save('mean_step_IDS_1.npy', mean_step_IDS)
# np.save('mean_s_r_IDS_1.npy', mean_s_r_IDS)

### MTD Agent

In [35]:
class MTD_qlearning(object):
    """Tabular Q-learning defender that chooses a (target host, action) pair.

    Q-values are stored as ``self.q_func[state][target_id]`` -> array of length
    ``num_action``, where ``state`` is the string form of the integer-cast
    observation.  Actions are drawn through ``epsilon_greedy`` and ``epsilon``
    is multiplied by ``epsilon_decay`` after every step.

    Parameters
    ----------
    config : dict
        Must contain ``"topology"`` and ``"subnet"``; also passed to ``host_map``.
    host_true_state : array
        Only its ``shape`` is used, to build the all-zero initial state.
    lr : float
        Learning rate of the Q-update.
    discount : float
        Discount factor of the Q-update.
    num_action : int
        Number of actions per target.  Must be greater than 7 because the
        opening action index is hard-coded to 7.
    epsilon_decay, initial_epsilon : float
        Exploration schedule handed to ``epsilon_greedy``.
    """

    def __init__(self, config, host_true_state, lr, discount, num_action, epsilon_decay, initial_epsilon):
        self.h_map = host_map(config)
        self.topology = config["topology"]
        self.subnet = config["subnet"]
        self.num_sn = len(self.subnet)
        self.lr = lr
        self.discount = discount
        self.num_action = num_action
        self.epsilon_decay = epsilon_decay
        self.epsilon = initial_epsilon

        state = np.zeros(host_true_state.shape)

        # Initialise the required variables.
        # The builtin ``int`` is used instead of the ``np.int`` alias, which was
        # removed in NumPy 1.24; both name the same type, so the keys are unchanged.
        self.initial_state = str(state.astype(int))
        # Opening action index; its meaning is defined by the environment.
        self.initial_action = 7
        self.initial_target_id = np.random.choice(range(np.sum(self.subnet)))
        self.initial_target = self.h_map.idx2addr(self.initial_target_id)
        self._state = str(state.astype(int))
        self._action = self.initial_action
        self.target_id = self.initial_target_id

        # Only the randomly drawn opening target gets an entry for the initial
        # state; states met later get an entry for every host (see ``step``).
        self.q_func = dict()
        self.q_func[self._state] = dict()
        self.q_func[self._state][self.target_id] = np.zeros(self.num_action)

    def step(self, reward, MTD_next_state):
        """Learn from the last move and choose the next one.

        Parameters
        ----------
        reward : number
            Reward obtained for the previously returned (target, action).
        MTD_next_state : array
            New observation; cast to ``int`` and stringified to form the key.

        Returns
        -------
        tuple
            ``(action, target)`` where ``target`` is
            ``self.h_map.idx2addr(best_target)``.
        """
        next_state = str(MTD_next_state.astype(int))
        if next_state not in self.q_func:
            # Unseen state: create zero Q-values for every host.
            self.q_func[next_state] = dict()
            for tgs in range(np.sum(self.subnet)):
                self.q_func[next_state][tgs] = np.zeros(self.num_action)

            # All Q-values of the new state are zero, so the update target is
            # the reward alone.
            self.q_func[self._state][self.target_id][self._action] += \
                self.lr*(reward - self.q_func[self._state][self.target_id][self._action])

            # Nothing is known about the new state yet: pick a random target.
            best_target = np.random.choice(range(np.sum(self.subnet)))

        else:
            # Known state: choose the target whose best Q-value is highest;
            # an exact tie is handed to the later target with probability 0.5.
            max_q = -np.inf
            for tg_id, q_values in self.q_func[next_state].items():
                if np.max(q_values) > max_q:
                    max_q = np.max(q_values)
                    best_target = tg_id
                elif np.max(q_values) == max_q and np.random.rand() >= 0.5:
                    best_target = tg_id

            self.q_func[self._state][self.target_id][self._action] += \
                self.lr*(reward + self.discount*max_q - self.q_func[self._state][self.target_id][self._action])

        self._state = next_state
        self.target_id = best_target
        self._target = self.h_map.idx2addr(best_target)
        self._action = epsilon_greedy(self.epsilon, self.q_func[self._state][best_target])
        self.epsilon *= self.epsilon_decay

        return self._action, self._target


In [36]:
class MTD_ucb(object):
    """Tabular Q-learning defender that explores with a UCB-style bonus.

    Q-values are stored as ``self.q_func[state][target_id]`` -> array of length
    ``num_action``, where ``state`` is the string form of the integer-cast
    observation.  Selection adds ``0.5*log(N/U[target, action])`` to each
    Q-value, with ``N`` the number of steps taken and ``U`` a per
    (target, action) usage counter shared by all states.

    Parameters
    ----------
    config : dict
        Must contain ``"topology"`` and ``"subnet"``; also passed to ``host_map``.
    host_true_state : array
        Only its ``shape`` is used, to build the all-zero initial state.
    lr : float
        Learning rate of the Q-update.
    discount : float
        Discount factor of the Q-update.
    num_action : int
        Number of actions per target.  Must be greater than 7 because the
        opening action index is hard-coded to 7.
    epsilon_decay, initial_epsilon
        Accepted so the constructor matches ``MTD_qlearning``; not used here.
    """

    def __init__(self, config, host_true_state, lr, discount, num_action, epsilon_decay, initial_epsilon):
        self.h_map = host_map(config)
        self.topology = config["topology"]
        self.subnet = config["subnet"]
        self.num_sn = len(self.subnet)
        self.lr = lr
        self.discount = discount
        self.num_action = num_action

        state = np.zeros(host_true_state.shape)

        # Initialise the required variables.
        # The builtin ``int`` is used instead of the ``np.int`` alias, which was
        # removed in NumPy 1.24; both name the same type, so the keys are unchanged.
        self.initial_state = str(state.astype(int))
        # Opening action index; its meaning is defined by the environment.
        self.initial_action = 7
        self.initial_target_id = np.random.choice(range(np.sum(self.subnet)))
        self.initial_target = self.h_map.idx2addr(self.initial_target_id)
        self._state = str(state.astype(int))
        self._action = self.initial_action
        self.target_id = self.initial_target_id

        # Only the randomly drawn opening target gets an entry for the initial
        # state; states met later get an entry for every host (see ``step``).
        self.q_func = dict()
        self.q_func[self._state] = dict()
        self.q_func[self._state][self.target_id] = np.zeros(self.num_action)

        # N counts steps; U counts how often each (target, action) was played.
        # U starts at 0.5 rather than 0 so that N/U is always finite.
        self.N = 0
        self.U = 0.5*np.ones((np.sum(self.subnet), self.num_action), dtype=np.float32)

    def step(self, reward, MTD_next_state):
        """Learn from the last move and choose the next one.

        Parameters
        ----------
        reward : number
            Reward obtained for the previously returned (target, action).
        MTD_next_state : array
            New observation; cast to ``int`` and stringified to form the key.

        Returns
        -------
        tuple
            ``(action, target)`` where ``target`` is
            ``self.h_map.idx2addr(best_target)``.
        """
        # Book-keeping for the move that has just been played.
        self.N += 1
        self.U[self.target_id, self._action] += 1
        next_state = str(MTD_next_state.astype(int))
        if next_state not in self.q_func:
            # Unseen state: create zero Q-values for every host.
            self.q_func[next_state] = dict()
            for tgs in range(np.sum(self.subnet)):
                self.q_func[next_state][tgs] = np.zeros(self.num_action)

            # All Q-values of the new state are zero, so the update target is
            # the reward alone.
            self.q_func[self._state][self.target_id][self._action] += \
                self.lr*(reward - self.q_func[self._state][self.target_id][self._action])

            # Nothing is known about the new state yet: pick a random target.
            best_target = np.random.choice(range(np.sum(self.subnet)))

        else:
            # Known state: the target is chosen by the highest bonus-augmented
            # value (exact ties go to the later target with probability 0.5),
            # while the Q-update bootstraps from the plain maximum Q-value.
            max_ucb = -np.inf
            max_q = -np.inf
            for tg_id, q_values in self.q_func[next_state].items():
                ucb = q_values + 0.5*np.log(self.N/self.U[tg_id, :])
                if np.max(ucb) > max_ucb:
                    max_ucb = np.max(ucb)
                    best_target = tg_id
                elif np.max(ucb) == max_ucb and np.random.rand() >= 0.5:
                    best_target = tg_id

                if np.max(q_values) > max_q:
                    max_q = np.max(q_values)

            self.q_func[self._state][self.target_id][self._action] += \
                self.lr*(reward + self.discount*max_q - self.q_func[self._state][self.target_id][self._action])

        self._state = next_state
        self.target_id = best_target
        self._target = self.h_map.idx2addr(best_target)
        # Greedy with respect to the bonus-augmented values of the chosen target.
        self._action = np.argmax(self.q_func[self._state][self.target_id] + 0.5*np.log(self.N/self.U[self.target_id, :]))

        return self._action, self._target


In [37]:
def run_game_MTD(env, att_agent, def_agent, step_limit, config):
    """Play ``step_limit`` rounds of MTD defender versus attacker on ``env``.

    The defender moves first in every round: its pending (action, target) is
    applied with ``env.MTD_def``, the honeypot bonus earned in the previous
    round is added to the reward, and the defender picks its next move.  Then
    the attacker's pending move is applied with ``env.att_step`` and the
    attacker picks its next move.  If the attacker's newly chosen target is
    listed in ``config["honeypot"]`` the defender gets a bonus of 100 in the
    following round, otherwise 0.

    The ``env`` parameter is an environment instance (it shadows any
    module-level name ``env`` inside this function).  Nothing is returned.
    """
    target, action = att_agent.initial_target, att_agent.initial_action
    entry, level = -1, 2

    mtd_action, mtd_target = def_agent.initial_action, def_agent.initial_target
    honeypot_bonus = 0

    # The value is never used; the lookup is kept so that a config without
    # this key still fails here, before any round is played.
    _unused_sensitive = config["sensitive_addr"]

    for _ in range(step_limit):
        # Defender half-round.
        mtd_reward, mtd_state = env.MTD_def(mtd_action, mtd_target)
        mtd_reward += honeypot_bonus
        mtd_action, mtd_target = def_agent.step(mtd_reward, mtd_state)

        # Attacker half-round.
        a_reward, a_state, a_tg_idx, a_access, a_subnets = env.att_step(action, entry, target, level)
        action, entry, target, level = att_agent.step(a_reward, a_state, a_tg_idx, a_access, a_subnets)

        honeypot_bonus = 100 if target in config["honeypot"] else 0
#     MTD_reward, MTD_next_state = env.MTD_def(MTD_action, MTD_target)
#     MTD_reward += MTD_ex_reward
#     MTD_action, MTD_target, MTD_q = def_agent.step(MTD_reward, MTD_next_state)
#     return MTD_q

In [38]:
def multi_game_MTD(num_experiment, att_agent, MTD_agent, config, config_2, idx_map, service_map, step_limit):
    """Repeat the attacker-vs-MTD game ``num_experiment`` times and aggregate.

    Each repetition creates a new ``env`` and new attacker / MTD agents (both
    built from ``config`` and the environment's ``host_true_state``), plays
    ``run_game_MTD`` for ``step_limit`` steps and stores the environment's
    ``att_reward_list`` and ``att_step_list`` in one row of a zero-padded table
    with ``int(step_limit/10)`` columns.

    Returns ``(non_zero_mean(reward table), non_zero_mean(step table),
    average of the environments' success_rate)``.
    """
    table_shape = (num_experiment, int(step_limit/10))
    reward_table = np.zeros(table_shape)
    step_table = np.zeros(table_shape)
    success_total = 0

    for run in range(num_experiment):
        game_env = env(config, config_2, idx_map, service_map)
        attacker = att_agent(
            config, game_env.host_true_state,
            lr=0.15, discount=0.99, num_action=9, epsilon_decay=0.99, initial_epsilon=0.5,
        )
        defender = MTD_agent(
            config, game_env.host_true_state,
            lr=0.15, discount=0.99, num_action=10, epsilon_decay=0.99, initial_epsilon=0.5,
        )
        run_game_MTD(game_env, attacker, defender, step_limit, config)

        run_rewards = np.asarray(game_env.att_reward_list)
        run_steps = np.asarray(game_env.att_step_list)
        success_total += game_env.success_rate

        # The two lists may differ in length, so each row is sliced separately.
        reward_table[run, :len(run_rewards)] = run_rewards
        step_table[run, :len(run_steps)] = run_steps

    return non_zero_mean(reward_table), non_zero_mean(step_table), success_total/num_experiment

In [39]:
# mean_MTD_game_reward, mean_MTD_game_step, mean_s_r_MTD = multi_game_MTD(20, Att_ucb, MTD_qlearning, config, config_2, idx_map, service_map, 1500000)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_reward_MTD_sn4_ucb_greedy.npy', mean_MTD_game_reward)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_step_MTD_sn4_ucb_greedy.npy', mean_MTD_game_step)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_s_r_MTD_sn4_ucb_greedy.npy', mean_s_r_MTD)

In [None]:
# mean_MTD_game_reward_ucb, mean_MTD_game_step_ucb, mean_s_r_MTD_ucb = multi_game_MTD(20, Att_ucb, MTD_ucb, config, config_2, idx_map, service_map, 1500000)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_reward_MTD_sn4_ucb_ucb2.npy', mean_MTD_game_reward_ucb)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_step_MTD_sn4_ucb_ucb2.npy', mean_MTD_game_step_ucb)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_s_r_MTD_sn4_ucb_ucb2.npy', mean_s_r_MTD_ucb)

In [None]:
def run_game_MTD_simple(env, att_agent, step_limit, config):
    """Play ``step_limit`` rounds of the attacker against the built-in simple MTD.

    Every round applies the attacker's pending move with ``env.att_step``, lets
    the attacker choose its next move, and then calls ``env.simple_MTD()``.
    The attacker opens with its ``initial_target`` / ``initial_action``, entry
    point -1 and access level 2.  ``config`` is accepted but not used.

    The ``env`` parameter is an environment instance (it shadows any
    module-level name ``env`` inside this function).  Nothing is returned.
    """
    target, action = att_agent.initial_target, att_agent.initial_action
    entry, level = -1, 2

    for _ in range(step_limit):
        a_reward, a_state, a_tg_idx, a_access, a_subnets = env.att_step(action, entry, target, level)
        action, entry, target, level = att_agent.step(a_reward, a_state, a_tg_idx, a_access, a_subnets)
        # The environment applies its own defence after every attacker move.
        env.simple_MTD()

In [None]:
def multi_game_MTD_simple(num_experiment, att_agent, config, config_2, idx_map, service_map, step_limit):
    """Repeat the attacker-vs-simple-MTD game ``num_experiment`` times and aggregate.

    Each repetition creates a new ``env`` and a new attacker (built from
    ``config`` and the environment's ``host_true_state``), plays
    ``run_game_MTD_simple`` for ``step_limit`` steps and stores the
    environment's ``att_reward_list`` and ``att_step_list`` in one row of a
    zero-padded table with ``int(step_limit/10)`` columns.

    Returns ``(non_zero_mean(reward table), non_zero_mean(step table),
    average of the environments' success_rate)``.
    """
    table_shape = (num_experiment, int(step_limit/10))
    reward_table = np.zeros(table_shape)
    step_table = np.zeros(table_shape)
    success_total = 0

    for run in range(num_experiment):
        game_env = env(config, config_2, idx_map, service_map)
        attacker = att_agent(
            config, game_env.host_true_state,
            lr=0.15, discount=0.99, num_action=9, epsilon_decay=0.99, initial_epsilon=0.5,
        )
        run_game_MTD_simple(game_env, attacker, step_limit, config)

        run_rewards = np.asarray(game_env.att_reward_list)
        run_steps = np.asarray(game_env.att_step_list)
        success_total += game_env.success_rate

        # The two lists may differ in length, so each row is sliced separately.
        reward_table[run, :len(run_rewards)] = run_rewards
        step_table[run, :len(run_steps)] = run_steps

    return non_zero_mean(reward_table), non_zero_mean(step_table), success_total/num_experiment

In [None]:
# mean_MTD_game_reward_simple, mean_MTD_game_step_simple, mean_s_r_MTD_simple = multi_game_MTD_simple(20, Att_ucb, config, config_2, idx_map, service_map, 2500000)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_reward_MTD_sn4_ucb_simple_11.npy', mean_MTD_game_reward_simple)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_step_MTD_sn4_ucb_simple_11.npy', mean_MTD_game_step_simple)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_s_r_MTD_sn4_ucb_simple_11.npy', mean_s_r_MTD_simple)

In [None]:
def run_game_MTD_cp(env, att_agent, def_agent, step_limit, config):
    """Play the attacker against an MTD defender that acts through ``env.complex_MTD``.

    The attacker makes one opening move before the loop.  Each of the
    ``step_limit`` rounds then runs the defender first (``env.complex_MTD``
    with its pending action/target, reward increased by the honeypot bonus of
    the previous round) and the attacker second (``env.att_step``).  If the
    attacker's newly chosen target is listed in ``config["honeypot"]`` the
    defender gets a bonus of 100 in the following round, otherwise 0.  The
    opening attacker move is not checked against the honeypot list.

    The ``env`` parameter is an environment instance (it shadows any
    module-level name ``env`` inside this function).  Nothing is returned.
    """
    target, action = att_agent.initial_target, att_agent.initial_action
    entry, level = -1, 2

    mtd_action, mtd_target = def_agent.initial_action, def_agent.initial_target
    honeypot_bonus = 0

    # The value is never used; the lookup is kept so that a config without
    # this key still fails here, before any move is played.
    _unused_sensitive = config["sensitive_addr"]

    # Opening attacker move, played before the defender acts for the first time.
    a_reward, a_state, a_tg_idx, a_access, a_subnets = env.att_step(action, entry, target, level)
    action, entry, target, level = att_agent.step(a_reward, a_state, a_tg_idx, a_access, a_subnets)

    for _ in range(step_limit):
        # Defender half-round.
        mtd_reward, mtd_state = env.complex_MTD(mtd_action, mtd_target)
        mtd_reward += honeypot_bonus
        mtd_action, mtd_target = def_agent.step(mtd_reward, mtd_state)

        # Attacker half-round.
        a_reward, a_state, a_tg_idx, a_access, a_subnets = env.att_step(action, entry, target, level)
        action, entry, target, level = att_agent.step(a_reward, a_state, a_tg_idx, a_access, a_subnets)

        honeypot_bonus = 100 if target in config["honeypot"] else 0

In [None]:
def multi_game_MTD_cp(num_experiment, att_agent, MTD_agent, config, config_2, idx_map, service_map, step_limit):
    """Repeat the ``run_game_MTD_cp`` game ``num_experiment`` times and aggregate.

    Each repetition creates a new ``env`` and new attacker / MTD agents (both
    built from ``config`` and the environment's ``host_true_state``), plays
    ``run_game_MTD_cp`` for ``step_limit`` steps and stores the environment's
    ``att_reward_list`` and ``att_step_list`` in one row of a zero-padded table
    with ``int(step_limit/10)`` columns.

    Returns ``(non_zero_mean(reward table), non_zero_mean(step table),
    average of the environments' success_rate)``.
    """
    table_shape = (num_experiment, int(step_limit/10))
    reward_table = np.zeros(table_shape)
    step_table = np.zeros(table_shape)
    success_total = 0

    for run in range(num_experiment):
        game_env = env(config, config_2, idx_map, service_map)
        attacker = att_agent(
            config, game_env.host_true_state,
            lr=0.15, discount=0.99, num_action=9, epsilon_decay=0.99, initial_epsilon=0.5,
        )
        defender = MTD_agent(
            config, game_env.host_true_state,
            lr=0.15, discount=0.99, num_action=10, epsilon_decay=0.99, initial_epsilon=0.5,
        )
        run_game_MTD_cp(game_env, attacker, defender, step_limit, config)

        run_rewards = np.asarray(game_env.att_reward_list)
        run_steps = np.asarray(game_env.att_step_list)
        success_total += game_env.success_rate

        # The two lists may differ in length, so each row is sliced separately.
        reward_table[run, :len(run_rewards)] = run_rewards
        step_table[run, :len(run_steps)] = run_steps

    return non_zero_mean(reward_table), non_zero_mean(step_table), success_total/num_experiment

In [None]:
# mean_MTD_game_reward_cp, mean_MTD_game_step_cp, mean_s_r_MTD_cp = multi_game_MTD_cp(20, Att_ucb, MTD_ucb, config, config_2, idx_map, service_map, 1500000)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_reward_MTD_sn4_ucb_cp2.npy', mean_MTD_game_reward_cp)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_step_MTD_sn4_ucb_cp2.npy', mean_MTD_game_step_cp)
# np.save('/Users/zhouqinghong/OneDrive/project/data/mean_s_r_MTD_sn4_ucb_cp2.npy', mean_s_r_MTD_cp)


In [None]:
def run_game_MTD_cp_1(env, att_agent, def_agent, step_limit, config):
    """Play the attacker against an MTD defender that acts through ``env.complex_MTD_1``.

    The attacker makes one opening move before the loop.  Each of the
    ``step_limit`` rounds then runs the defender first (``env.complex_MTD_1``
    with its pending action/target, reward increased by the honeypot bonus of
    the previous round) and the attacker second (``env.att_step``).  If the
    attacker's newly chosen target is listed in ``config["honeypot"]`` the
    defender gets a bonus of 100 in the following round, otherwise 0.  The
    opening attacker move is not checked against the honeypot list.

    The ``env`` parameter is an environment instance (it shadows any
    module-level name ``env`` inside this function).  Nothing is returned.
    """
    target, action = att_agent.initial_target, att_agent.initial_action
    entry, level = -1, 2

    mtd_action, mtd_target = def_agent.initial_action, def_agent.initial_target
    honeypot_bonus = 0

    # The value is never used; the lookup is kept so that a config without
    # this key still fails here, before any move is played.
    _unused_sensitive = config["sensitive_addr"]

    # Opening attacker move, played before the defender acts for the first time.
    a_reward, a_state, a_tg_idx, a_access, a_subnets = env.att_step(action, entry, target, level)
    action, entry, target, level = att_agent.step(a_reward, a_state, a_tg_idx, a_access, a_subnets)

    for _ in range(step_limit):
        # Defender half-round.
        mtd_reward, mtd_state = env.complex_MTD_1(mtd_action, mtd_target)
        mtd_reward += honeypot_bonus
        mtd_action, mtd_target = def_agent.step(mtd_reward, mtd_state)

        # Attacker half-round.
        a_reward, a_state, a_tg_idx, a_access, a_subnets = env.att_step(action, entry, target, level)
        action, entry, target, level = att_agent.step(a_reward, a_state, a_tg_idx, a_access, a_subnets)

        honeypot_bonus = 100 if target in config["honeypot"] else 0

In [None]:
def multi_game_MTD_cp_1(num_experiment, att_agent, MTD_agent, config, config_2, idx_map, service_map, step_limit):
    """Repeat the ``run_game_MTD_cp_1`` game ``num_experiment`` times and aggregate.

    Each repetition creates a new ``env`` and new attacker / MTD agents (both
    built from ``config`` and the environment's ``host_true_state``), plays
    ``run_game_MTD_cp_1`` for ``step_limit`` steps and stores the environment's
    ``att_reward_list`` and ``att_step_list`` in one row of a zero-padded table
    with ``int(step_limit/10)`` columns.

    Returns ``(non_zero_mean(reward table), non_zero_mean(step table),
    average of the environments' success_rate)``.
    """
    table_shape = (num_experiment, int(step_limit/10))
    reward_table = np.zeros(table_shape)
    step_table = np.zeros(table_shape)
    success_total = 0

    for run in range(num_experiment):
        game_env = env(config, config_2, idx_map, service_map)
        attacker = att_agent(
            config, game_env.host_true_state,
            lr=0.15, discount=0.99, num_action=9, epsilon_decay=0.99, initial_epsilon=0.5,
        )
        defender = MTD_agent(
            config, game_env.host_true_state,
            lr=0.15, discount=0.99, num_action=10, epsilon_decay=0.99, initial_epsilon=0.5,
        )
        run_game_MTD_cp_1(game_env, attacker, defender, step_limit, config)

        run_rewards = np.asarray(game_env.att_reward_list)
        run_steps = np.asarray(game_env.att_step_list)
        success_total += game_env.success_rate

        # The two lists may differ in length, so each row is sliced separately.
        reward_table[run, :len(run_rewards)] = run_rewards
        step_table[run, :len(run_steps)] = run_steps

    return non_zero_mean(reward_table), non_zero_mean(step_table), success_total/num_experiment

In [None]:
# Run 20 attacker-vs-MTD games (UCB attacker, UCB defender) with the
# env.complex_MTD_1 defence and save the aggregated curves.
# The results are named *_cp_1 and saved as *_cp_13, so they have to come from
# multi_game_MTD_cp_1; calling multi_game_MTD_cp here would run the
# env.complex_MTD variant instead and store its results under the wrong name.
mean_MTD_game_reward_cp_1, mean_MTD_game_step_cp_1, mean_s_r_MTD_cp_1 = multi_game_MTD_cp_1(20, Att_ucb, MTD_ucb, config, config_2, idx_map, service_map, 1500000)
# NOTE(review): the output paths are absolute and machine-specific; adjust them before running elsewhere.
np.save('/Users/zhouqinghong/OneDrive/project/data/mean_reward_MTD_sn4_ucb_cp_13.npy', mean_MTD_game_reward_cp_1)
np.save('/Users/zhouqinghong/OneDrive/project/data/mean_step_MTD_sn4_ucb_cp_13.npy', mean_MTD_game_step_cp_1)
np.save('/Users/zhouqinghong/OneDrive/project/data/mean_s_r_MTD_sn4_ucb_cp_13.npy', mean_s_r_MTD_cp_1)
