diff --git a/pydial/Simulate.py b/pydial/Simulate.py index 7977564..408d57b 100644 --- a/pydial/Simulate.py +++ b/pydial/Simulate.py @@ -130,11 +130,11 @@ def run_dialogs(self, numDialogs): ''' for i in range(numDialogs): logger.info('Dialogue %d' % (i+1)) - self.run(session_id='simulate_dialog'+str(i), sim_level=self.sim_level) + self.run(session_id='simulate_dialog'+str(i), sim_level=self.sim_level, roi=roi) self.agent_factory.power_down_factory() # Important! -uses FORCE_SAVE on policy- which will finalise learning and save policy. - def run(self, session_id, agent_id='Smith', sim_level='dial_act'): + def run(self, session_id, agent_id='Smith', sim_level='dial_act', roi=False): ''' Runs one episode through the simulator @@ -146,8 +146,12 @@ def run(self, session_id, agent_id='Smith', sim_level='dial_act'): ''' # GENERATE A USER PREFERENCE: a * Length + (1-a) * Success + + preference = torch.randn(2) preference = (torch.abs(preference) / torch.norm(preference, p=1)).type(FloatTensor) + if roi: + pass logger.dial('User\'s preference: [{}, {}]'.format(preference[0], preference[1])) # RESET THE USER SIMULATOR: diff --git a/pydial/policy/PolicyManager.py b/pydial/policy/PolicyManager.py index 549af2c..4ba6e1c 100644 --- a/pydial/policy/PolicyManager.py +++ b/pydial/policy/PolicyManager.py @@ -300,6 +300,9 @@ def _load_domains_policy(self, domainString=None): elif policy_type == 'morl': from policy import MORLPolicy self.domainPolicies[domainString] = MORLPolicy.MORLPolicy(in_policy_file, out_policy_file, domainString, learning) + elif policy_type == 'roi-morl': + from policy import RoiMORLPolicy + self.domainPolicies[domainString] = RoiMORLPolicy.RoiMORLPolicy(in_policy_file, out_policy_file, domainString, learning) else: try: # try to view the config string as a complete module path to the class to be instantiated diff --git a/pydial/policy/RoiMORLPolicy.py b/pydial/policy/RoiMORLPolicy.py new file mode 100644 index 0000000..79f3726 --- /dev/null +++ b/pydial/policy/RoiMORLPolicy.py @@ -0,0 +1,852 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### + +''' +MORL-Policy.py - multi-objective reinforcement learning policy +''' + +import copy +import os +import sys +import json +import random +import utils +from collections import namedtuple +from collections import deque +from utils.Settings import config as cfg +from utils import ContextLogger, DiaAct + +import random +import torch +import copy +import numpy as np +import torch.optim as optim +from torch.autograd import Variable +import torch.nn.functional as F + +import ontology.FlatOntologyManager as FlatOnt +import DRL.utils as drlutils +import DMORL.naive as naive +import DMORL.envelope as envelope +import Policy +import SummaryAction +from Policy import TerminalAction, TerminalState +from policy.feudalRL.DIP_parametrisation import DIP_state +from utils.monitor import Monitor + +import queue +from collections import namedtuple +from termcolor import colored + +logger = utils.ContextLogger.getLogger('') + +use_cuda = torch.cuda.is_available() +FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor +LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor +ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor +Tensor = FloatTensor + +# --- for flattening the belief --- # +def flatten_belief(belief, domainUtil, 
merge=False): + belief = belief.getDomainState(domainUtil.domainString) + if isinstance(belief, TerminalState): + if domainUtil.domainString == 'CamRestaurants': + return [0] * 268 + elif domainUtil.domainString == 'CamHotels': + return [0] * 111 + elif domainUtil.domainString == 'SFRestaurants': + return [0] * 633 + elif domainUtil.domainString == 'SFHotels': + return [0] * 438 + elif domainUtil.domainString == 'Laptops11': + return [0] * 257 + elif domainUtil.domainString == 'TV': + return [0] * 188 + + policyfeatures = ['full', 'method', 'discourseAct', 'requested', \ + 'lastActionInformNone', 'offerHappened', 'inform_info'] + + flat_belief = [] + for feat in policyfeatures: + add_feature = [] + if feat == 'full': + # for slot in self.sorted_slots: + for slot in domainUtil.ontology['informable']: + for value in domainUtil.ontology['informable'][slot]: # + ['**NONE**']: + add_feature.append(belief['beliefs'][slot][value]) + + # pfb30 11.03.2017 + try: + add_feature.append(belief['beliefs'][slot]['**NONE**']) + except: + add_feature.append(0.) # for NONE + try: + add_feature.append(belief['beliefs'][slot]['dontcare']) + except: + add_feature.append(0.) # for dontcare + + elif feat == 'method': + add_feature = [belief['beliefs']['method'][method] for method in domainUtil.ontology['method']] + elif feat == 'discourseAct': + add_feature = [belief['beliefs']['discourseAct'][discourseAct] + for discourseAct in domainUtil.ontology['discourseAct']] + elif feat == 'requested': + add_feature = [belief['beliefs']['requested'][slot] \ + for slot in domainUtil.ontology['requestable']] + elif feat == 'lastActionInformNone': + add_feature.append(float(belief['features']['lastActionInformNone'])) + elif feat == 'offerHappened': + add_feature.append(float(belief['features']['offerHappened'])) + elif feat == 'inform_info': + add_feature += belief['features']['inform_info'] + else: + logger.error('Invalid feature name in config: ' + feat) + + flat_belief += add_feature + + return flat_belief + + +def is_corner(corner_w, S): + eps = 1e-6 + for s in S: + if s.l < corner_w[0]-eps and\ + s.u > corner_w[0]+eps: + print(colored("skip {} ...".format(corner_w), "green")) + return False + return True + +def intersect(v, s): + d = (v.v1 - v.v0) - (s.v1 - s.v0) + if d == 0: return None + w = (v.v1 - s.v1) / d + if w < v.l or w > v.u or w < s.l or w > s.u: + return None + return w + +def update_ccs(S, corWs, new_value): + if len(S) == 0: + nv = optvalue(new_value[0], new_value[1], 0.0, 1.0) + S.add(nv) + print(colored("add {} to set.".format(nv), "green")) + else: + discard = True + useless = [] + updates = [] + nv = optvalue(new_value[0], new_value[1], 0.0, 1.0) + for s in S: + dnv = nv.v0 - nv.v1 + ds = s.v0 - s.v1 + if (nv.v1+dnv*s.l > s.v1+ds*s.l or nv.v1+dnv*s.l == s.v1+ds*s.l) and\ + (nv.v1+dnv*s.u > s.v1+ds*s.u or nv.v1+dnv*s.u == s.v1+ds*s.u): + if nv.v1+dnv*s.l == s.v1+ds*s.l: nv = nv._replace(l = s.l) + if nv.v1+dnv*s.u == s.v1+ds*s.u: nv = nv._replace(u = s.u) + if nv.v1+dnv*s.l == s.v1+ds*s.l and\ + nv.v1+dnv*s.u == s.v1+ds*s.u: + print("repeat! 
compare to ", s) + discard = True + else: + useless.append(s) + discard = False + # elif (nv.v1+dnv*s.l < s.v1+ds*s.l or nv.v1+dnv*s.l == s.v1+ds*s.l) and\ + # (nv.v1+dnv*s.u < s.v1+ds*s.u or nv.v1+dnv*s.u == s.v1+ds*s.u): + + else: + # None if the intersection is out of range + w = intersect(nv, s) + if w and nv.v1 > s.v1: + if w < nv.u: nv = nv._replace(u = w) + if w > s.l: + useless.append(s) + s = s._replace(l = w) + updates.append(s) + corWs.put_nowait(FloatTensor([w, 1.0-w])) + print(colored("add perference {} to set.".format(w), "green")) + discard = False + elif w and nv.v0 > s.v0: + if w > nv.l: nv = nv._replace(l = w) + if w < s.u: + useless.append(s) + s = s._replace(u = w) + updates.append(s) + corWs.put_nowait(FloatTensor([w, 1.0-w])) + print(colored("add perference {} to set.".format(w), "green")) + discard = False + + for s in useless: + print(colored("remove {} from set.".format(s), "green")) + S.remove(s) + + for s in updates: + print(colored("update {} in set.".format(s), "green")) + S.add(s) + + if not discard: + S.add(nv) + print(colored("add {} to set.".format(nv), "green")) + else: + print(colored("give up to add {} to set.".format(nv), "green")) + + return S, corWs + + +class RoiMORLPolicy(Policy.Policy): + '''Derived from :class:`Policy` + ''' + + def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, + action_names=None): + super(RoiMORLPolicy, self).__init__(domainString, is_training) + + self.domainString = domainString + self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) + self.in_policy_file = in_policy_file + self.out_policy_file = out_policy_file + self.is_training = is_training + self.accum_belief = [] + + self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) + self.prev_state_check = None + + # parameter settings + if 0: # cfg.has_option('morlpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper + self.n_in = cfg.getint('morlpolicy', 'n_in') + else: + self.n_in = self.get_n_in(domainString) + + self.n_rew = 1 + if cfg.has_option('morlpolicy', 'n_rew'): + self.n_rew = cfg.getint('morlpolicy', 'n_rew') + + self.lr = 0.001 + if cfg.has_option('morlpolicy', 'learning_rate'): + self.lr = cfg.getfloat('morlpolicy', 'learning_rate') + + self.epsilon = 0.5 + if cfg.has_option('morlpolicy', 'epsilon'): + self.epsilon = cfg.getfloat('morlpolicy', 'epsilon') + + self.epsilon_decay = True + if cfg.has_option('morlpolicy', 'epsilon_decay'): + self.epsilon_decay = cfg.getboolean('morlpolicy', 'epsilon_decay') + + self.randomseed = 1234 + if cfg.has_option('GENERAL', 'seed'): + self.randomseed = cfg.getint('GENERAL', 'seed') + + self.gamma = 1.0 + if cfg.has_option('morlpolicy', 'gamma'): + self.gamma = cfg.getfloat('morlpolicy', 'gamma') + + self.weight_num = 32 + if cfg.has_option('morlpolicy', 'weight_num'): + self.weight_num = cfg.getint('morlpolicy', 'weight_num') + + self.episode_num = 1000 + if cfg.has_option('morlpolicy', 'episode_num'): + self.episode_num = cfg.getfloat('morlpolicy', 'episode_num') + + self.optimizer = "Adam" + if cfg.has_option('morlpolicy', 'optimizer'): + self.optimizer = cfg.get('morlpolicy', 'optimizer') + + self.save_step = 100 + if cfg.has_option('policy', 'save_step'): + self.save_step = cfg.getint('policy', 'save_step') + + self.update_freq = 50 + if cfg.has_option('morlpolicy', 'update_freq'): + self.update_freq = cfg.getint('morlpolicy', 'update_freq') + + self.policyfeatures = [] + if cfg.has_option('morlpolicy', 
'features'): + logger.info('Features: ' + str(cfg.get('morlpolicy', 'features'))) + self.policyfeatures = json.loads(cfg.get('morlpolicy', 'features')) + + self.algorithm = 'naive' + if cfg.has_option('morlpolicy', 'algorithm'): + self.algorithm = cfg.get('morlpolicy', 'algorithm') + logger.info('Learning algorithm: ' + self.algorithm) + + self.batch_size = 32 + if cfg.has_option('morlpolicy', 'batch_size'): + self.batch_size = cfg.getint('morlpolicy', 'batch_size') + + self.mem_size = 1000 + if cfg.has_option('morlpolicy', 'mem_size'): + self.mem_size = cfg.getint('morlpolicy', 'mem_size') + + self.training_freq = 1 + if cfg.has_option('morlpolicy', 'training_freq'): + self.training_freq = cfg.getint('morlpolicy', 'training_freq') + + # set beta for envelope algorithm + self.beta = 0.1 + if cfg.has_option('morlpolicy', 'beta'): + self.beta = cfg.getfloat('morlpolicy', 'beta') + self.beta_init = self.beta + self.beta_uplim = 1.00 + self.tau = 1000. + self.beta_expbase = float(np.power(self.tau * (self.beta_uplim - self.beta), 1. / (self.episode_num+1))) + self.beta_delta = self.beta_expbase / self.tau + self.beta -= self.beta_delta + + # using homotopy method for optimization + self.homotopy = False + if cfg.has_option('morlpolicy', 'homotopy'): + self.homotopy = cfg.getboolean('morlpolicy', 'homotopy') + + self.epsilon_delta = (self.epsilon - 0.05) / self.episode_num + + self.episodecount = 0 + + # construct the models + self.state_dim = self.n_in + self.summaryaction = SummaryAction.SummaryAction(domainString) + if action_names is None: + self.action_names = self.summaryaction.action_names + else: + self.action_names = action_names + self.action_dim = len(self.action_names) + self.stats = [0 for _ in range(self.action_dim)] + self.reward_dim = self.n_rew + + model = None + if self.algorithm == 'naive': + model = naive.NaiveLinearCQN(self.state_dim, self.action_dim, self.reward_dim) + elif self.algorithm == 'envelope': + model = envelope.EnvelopeLinearCQN(self.state_dim, self.action_dim, self.reward_dim) + + self.model_ = model + self.model = copy.deepcopy(model) + + # initialize memory + self.trans_mem = deque() + self.trans = namedtuple('trans', ['s', 'a', 's_', 'r', 'd', 'ms', 'ms_']) + self.priority_mem = deque() + self.mem_last_state = None + self.mem_last_action = None + self.mem_last_mask = None + self.mem_cur_state = None + self.mem_cur_action = None + self.mem_cur_mask = None + + if self.optimizer == 'Adam': + self.optimizer = optim.Adam(self.model_.parameters(), lr=self.lr) + elif self.optimizer == 'RMSprop': + self.optimizer = optim.RMSprop(self.model_.parameters(), lr=self.lr) + + try: + self.loadPolicy(self.in_policy_file) + except: + logger.info("No previous model found...") + + self.w_kept = None + self.update_count = 0 + if self.is_training: + self.model_.train() + if use_cuda: + self.model.cuda() + self.model_.cuda() + + self.monitor = None + + def get_n_in(self, domain_string): + if domain_string == 'CamRestaurants': + return 268 + elif domain_string == 'CamHotels': + return 111 + elif domain_string == 'SFRestaurants': + return 636 + elif domain_string == 'SFHotels': + return 438 + elif domain_string == 'Laptops6': + return 268 # ic340: this is wrong + elif domain_string == 'Laptops11': + return 257 + elif domain_string == 'TV': + return 188 + else: + print('DOMAIN {} SIZE NOT SPECIFIED, PLEASE DEFINE n_in'.format(domain_string)) + + def act_on(self, state, preference=None): + if self.lastSystemAction is None and self.startwithhello: + systemAct, nextaIdex = 'hello()', -1 
+ else: + systemAct, nextaIdex = self.nextAction(state, preference) + self.lastSystemAction = systemAct + self.summaryAct = nextaIdex + self.prevbelief = state + + systemAct = DiaAct.DiaAct(systemAct) + return systemAct + + def record(self, reward, domainInControl=None, weight=None, state=None, action=None): + if domainInControl is None: + domainInControl = self.domainString + if self.actToBeRecorded is None: + self.actToBeRecorded = self.summaryAct + + if state is None: + state = self.prevbelief + if action is None: + action = self.actToBeRecorded + cState, cAction = self.convertStateAction(state, action) + + execMask = self.summaryaction.getExecutableMask(state, cAction) + execMask = torch.Tensor(execMask).type(FloatTensor) + + # # normalising total return to -1~1 + # reward /= 20.0 + + self.mem_last_state = self.mem_cur_state + self.mem_last_action = self.mem_cur_action + self.mem_last_mask = self.mem_cur_mask + self.mem_cur_state = np.vstack([np.expand_dims(x, 0) for x in [cState]]) + # self.mem_cur_action = np.eye(self.action_dim, self.action_dim)[[cAction]] + self.mem_cur_action = cAction + self.mem_cur_mask = execMask + + state = self.mem_last_state + action = self.mem_last_action + next_state = self.mem_cur_state + terminal = False + + if state is not None and action is not None: + self.trans_mem.append(self.trans( + torch.from_numpy(state).type(FloatTensor), # state + action, # action + torch.from_numpy(next_state).type(FloatTensor), # next state + torch.from_numpy(reward).type(FloatTensor), # reward + terminal, # terminal + self.mem_last_mask, # action mask + self.mem_cur_mask)) # next action mask + + # randomly produce a preference for calculating priority + preference = self.w_kept + # preference = torch.randn(self.model_.reward_size) + # preference = (torch.abs(preference) / torch.norm(preference, p=1)).type(FloatTensor) + + state = torch.from_numpy(state).type(FloatTensor) + + _, q = self.model_(Variable(state, requires_grad=False), + Variable(preference.unsqueeze(0), requires_grad=False), + execmask=Variable(self.mem_last_mask.unsqueeze(0), requires_grad=False)) + + q = q[0, action].data + + if self.algorithm == 'naive': + wr = preference.dot(torch.from_numpy(reward).type(FloatTensor)) + if not terminal: + next_state = torch.from_numpy(next_state).type(FloatTensor) + hq, _ = self.model_(Variable(next_state, requires_grad=False), + Variable(preference.unsqueeze(0), requires_grad=False), + execmask=Variable(self.mem_cur_mask.unsqueeze(0), requires_grad=False)) + hq = hq.data[0] + p = abs(wr + self.gamma * hq - q) + else: + self.w_kept = None + # if self.epsilon_decay: + # self.epsilon -= self.epsilon_delta + p = abs(wr - q) + elif self.algorithm == 'envelope': + wq = preference.dot(q) + wr = preference.dot(torch.from_numpy(reward).type(FloatTensor)) + if not terminal: + next_state = torch.from_numpy(next_state).type(FloatTensor) + hq, _ = self.model_(Variable(next_state, requires_grad=False), + Variable(preference.unsqueeze(0), requires_grad=False), + execmask=Variable(self.mem_cur_mask.unsqueeze(0), requires_grad=False)) + hq = hq.data[0] + whq = preference.dot(hq) + p = abs(wr + self.gamma * whq - wq) + else: + self.w_kept = None + # if self.epsilon_decay: + # self.epsilon -= self.epsilon_delta + # if self.homotopy: + # self.beta += self.beta_delta + # self.beta_delta = (self.beta - self.beta_init) * self.beta_expbase + self.beta_init - self.beta + p = abs(wr - wq) + p += 1e-5 + + self.priority_mem.append( + p + ) + if len(self.trans_mem) > self.mem_size: + 
self.trans_mem.popleft() + self.priority_mem.popleft() + + self.actToBeRecorded = None + + def finalizeRecord(self, reward, domainInControl=None): + if domainInControl is None: + domainInControl = self.domainString + if self.episodes[domainInControl] is None: + logger.warning("record attempted to be finalized for domain where nothing has been recorded before") + return + + # # normalising total return to -1~1 + # reward /= 20.0 + + terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction()) + + # # normalising total return to -1~1 + # reward /= 20.0 + + self.mem_last_state = self.mem_cur_state + self.mem_last_action = self.mem_cur_action + self.mem_last_mask = self.mem_cur_mask + self.mem_cur_state = np.vstack([np.expand_dims(x, 0) for x in [terminal_state]]) + self.mem_cur_action = None + self.mem_cur_mask = torch.zeros(self.action_dim).type(FloatTensor) + + state = self.mem_last_state + action = self.mem_last_action + next_state = self.mem_cur_state + terminal = True + + if state is not None: + self.trans_mem.append(self.trans( + torch.from_numpy(state).type(FloatTensor), # state + action, # action + torch.from_numpy(next_state).type(FloatTensor), # next state + torch.from_numpy(reward).type(FloatTensor), # reward + terminal, # terminal + self.mem_last_mask, # action mask + self.mem_cur_mask)) # next action mask + + # randomly produce a preference for calculating priority + preference = self.w_kept + # preference = torch.randn(self.model_.reward_size) + # preference = (torch.abs(preference) / torch.norm(preference, p=1)).type(FloatTensor) + + state = torch.from_numpy(state).type(FloatTensor) + + _, q = self.model_(Variable(state, requires_grad=False), + Variable(preference.unsqueeze(0), requires_grad=False)) + + q = q.data[0, action] + + if self.algorithm == 'naive': + wr = preference.dot(torch.from_numpy(reward).type(FloatTensor)) + if not terminal: + next_state = torch.from_numpy(next_state).type(FloatTensor) + hq, _ = self.model_(Variable(next_state, requires_grad=False), + Variable(preference.unsqueeze(0), requires_grad=False)) + hq = hq.data[0] + p = abs(wr + self.gamma * hq - q) + else: + self.w_kept = None + # if self.epsilon_decay: + # self.epsilon -= self.epsilon_delta + p = abs(wr - q) + elif self.algorithm == 'envelope': + wq = preference.dot(q) + wr = preference.dot(torch.from_numpy(reward).type(FloatTensor)) + if not terminal: + next_state = torch.from_numpy(next_state).type(FloatTensor) + hq, _ = self.model_(Variable(next_state, requires_grad=False), + Variable(preference.unsqueeze(0), requires_grad=False)) + hq = hq.data[0] + whq = preference.dot(hq) + p = abs(wr + self.gamma * whq - wq) + else: + self.w_kept = None + # if self.epsilon_decay: + # self.epsilon -= self.epsilon_delta + # if self.homotopy: + # self.beta += self.beta_delta + # self.beta_delta = (self.beta - self.beta_init) * self.beta_expbase + self.beta_init - self.beta + p = abs(wr - wq) + + p += 1e-5 + + self.priority_mem.append( + p + ) + if len(self.trans_mem) > self.mem_size: + self.trans_mem.popleft() + self.priority_mem.popleft() + + def convertStateAction(self, state, action): + ''' + nnType = 'dnn' + #nnType = 'rnn' + # expand one dimension to match the batch size of 1 at axis 0 + if nnType == 'rnn': + belief = np.expand_dims(belief,axis=0) + ''' + if isinstance(state, TerminalState): + if self.domainUtil.domainString == 'CamRestaurants': + return [0] * 268, action + elif self.domainUtil.domainString == 'CamHotels': + return [0] * 111, action + elif 
self.domainUtil.domainString == 'SFRestaurants': + return [0] * 633, action + elif self.domainUtil.domainString == 'SFHotels': + return [0] * 438, action + elif self.domainUtil.domainString == 'Laptops11': + return [0] * 257, action + elif self.domainUtil.domainString == 'TV': + return [0] * 188, action + else: + flat_belief = flatten_belief(state, self.domainUtil) + self.prev_state_check = flat_belief + + return flat_belief, action + + def convertDIPStateAction(self, state, action): + ''' + + ''' + if isinstance(state, TerminalState): + return [0] * 89, action + + else: + dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString) + action_name = self.actions.action_names[action] + act_slot = 'general' + for slot in dip_state.slots: + if slot in action_name: + act_slot = slot + flat_belief = dip_state.get_beliefStateVec(act_slot) + self.prev_state_check = flat_belief + + return flat_belief, action + + def nextAction(self, beliefstate, preference=None): + ''' + select next action + + :param beliefstate: + :param preference: + :returns: (int) next summary action + ''' + beliefVec = flatten_belief(beliefstate, self.domainUtil) + execMask = self.summaryaction.getExecutableMask(beliefstate, self.lastSystemAction) + execMask = torch.Tensor(execMask).type(FloatTensor) + + if preference is None: + if self.w_kept is None: + self.w_kept = torch.randn(self.model_.reward_size) + self.w_kept = (torch.abs(self.w_kept) / torch.norm(self.w_kept, p=1)).type(FloatTensor) + preference = self.w_kept + + if self.is_training and (len(self.trans_mem) < self.batch_size*10 or torch.rand(1)[0] < self.epsilon): + admissible = [i for i, x in enumerate(execMask) if x == 0.0] + random.shuffle(admissible) + nextaIdex = admissible[0] + else: + state = np.reshape(beliefVec, (1, len(beliefVec))) + state = torch.from_numpy(state).type(FloatTensor) + if self.algorithm == 'naive': + _, Q = self.model_( + Variable(state, requires_grad=False), + Variable(preference.unsqueeze(0), requires_grad=False), + Variable(execMask.unsqueeze(0), requires_grad=False)) + nextaIdex = np.argmax(Q.detach().cpu().numpy()) + elif self.algorithm == 'envelope': + _, Q = self.model_( + Variable(state, requires_grad=False), + Variable(preference.unsqueeze(0), requires_grad=False), + execmask=Variable(execMask.unsqueeze(0), requires_grad=False)) + Q = Q.view(-1, self.model_.reward_size) + Q = torch.mv(Q.data, preference) + action = Q.max(0)[1].cpu().numpy() + nextaIdex = int(action) + + self.stats[nextaIdex] += 1 + summaryAct = self.action_names[nextaIdex] + beliefstate = beliefstate.getDomainState(self.domainUtil.domainString) + masterAct = self.summaryaction.Convert(beliefstate, summaryAct, self.lastSystemAction) + + return masterAct, nextaIdex + + def sample(self, pop, pri, k): + pri = np.array(pri).astype(np.float) + inds = np.random.choice( + range(len(pop)), k, + replace=False, + p=pri / pri.sum() + ) + return [pop[i] for i in inds] + + def actmsk(self, num_dim, index): + mask = ByteTensor(num_dim).zero_() + mask[index] = 1 + return mask.unsqueeze(0) + + def nontmlinds(self, terminal_batch): + mask = ByteTensor(terminal_batch) + inds = torch.arange(0, len(terminal_batch)).type(LongTensor) + inds = inds[mask.eq(0)] + return inds + + def train(self): + ''' + call this function when the episode ends + ''' + self.episodecount +=1 + if self.monitor is None: + self.monitor = Monitor("-" + self.algorithm) + + if not self.is_training: + logger.info("Not in training mode") + return + else: + logger.info("Update naive morl policy 
parameters.") + + logger.info("Episode Num so far: %s" % (self.episodecount)) + + if len(self.trans_mem) > self.batch_size*10: + + self.update_count += 1 + + minibatch = self.sample(self.trans_mem, self.priority_mem, self.batch_size) + batchify = lambda x: list(x) * self.weight_num + state_batch = batchify(map(lambda x: x.s, minibatch)) + action_batch = batchify(map(lambda x: LongTensor([x.a]), minibatch)) + reward_batch = batchify(map(lambda x: x.r.unsqueeze(0), minibatch)) + next_state_batch = batchify(map(lambda x: x.s_, minibatch)) + terminal_batch = batchify(map(lambda x: x.d, minibatch)) + mask_batch = batchify(map(lambda x: x.ms.unsqueeze(0), minibatch)) + next_mask_batch = batchify(map(lambda x: x.ms_.unsqueeze(0), minibatch)) + + w_batch = np.random.randn(self.weight_num, self.model_.reward_size) + w_batch = np.abs(w_batch) / \ + np.linalg.norm(w_batch, ord=1, axis=1, keepdims=True) + w_batch = torch.from_numpy(w_batch.repeat(self.batch_size, axis=0)).type(FloatTensor) + + if self.algorithm == 'naive': + __, Q = self.model_(Variable(torch.cat(state_batch, dim=0)), + Variable(w_batch), + Variable(torch.cat(mask_batch, dim=0))) + # detach since we don't want gradients to propagate + # HQ, _ = self.model_(Variable(torch.cat(next_state_batch, dim=0), volatile=True), + # Variable(w_batch, volatile=True)) + _, DQ = self.model(Variable(torch.cat(next_state_batch, dim=0), requires_grad=False), + Variable(w_batch, requires_grad=False), + Variable(torch.cat(next_mask_batch, dim=0), requires_grad=False)) + _, act = self.model_(Variable(torch.cat(next_state_batch, dim=0), requires_grad=False), + Variable(w_batch, requires_grad=False), + Variable(torch.cat(next_mask_batch, dim=0), requires_grad=False))[1].max(1) + HQ = DQ.gather(1, act.unsqueeze(dim=1)).squeeze() + + w_reward_batch = torch.bmm(w_batch.unsqueeze(1), + torch.cat(reward_batch, dim=0).unsqueeze(2) + ).squeeze() + + nontmlmask = self.nontmlinds(terminal_batch) + with torch.no_grad(): + Tau_Q = Variable(torch.zeros(self.batch_size * self.weight_num).type(FloatTensor)) + Tau_Q[nontmlmask] = self.gamma * HQ[nontmlmask] + Tau_Q += Variable(w_reward_batch) + + actions = Variable(torch.cat(action_batch, dim=0)) + + # Compute Huber loss + loss = F.smooth_l1_loss(Q.gather(1, actions.unsqueeze(dim=1)), Tau_Q.unsqueeze(dim=1)) + + elif self.algorithm == 'envelope': + action_size = self.model_.action_size + reward_size = self.model_.reward_size + __, Q = self.model_(Variable(torch.cat(state_batch, dim=0)), + Variable(w_batch), + w_num=self.weight_num, + execmask=Variable(torch.cat(mask_batch, dim=0))) + + # detach since we don't want gradients to propagate + # HQ, _ = self.model_(Variable(torch.cat(next_state_batch, dim=0), volatile=True), + # Variable(w_batch, volatile=True), w_num=self.weight_num) + _, DQ = self.model(Variable(torch.cat(next_state_batch, dim=0), requires_grad=False), + Variable(w_batch, requires_grad=False), + execmask=Variable(torch.cat(next_mask_batch, dim=0), requires_grad=False)) + w_ext = w_batch.unsqueeze(2).repeat(1, action_size, 1) + w_ext = w_ext.view(-1, self.model.reward_size) + _, tmpQ = self.model_(Variable(torch.cat(next_state_batch, dim=0), requires_grad=False), + Variable(w_batch, requires_grad=False), + execmask=Variable(torch.cat(next_mask_batch, dim=0), requires_grad=False)) + + tmpQ = tmpQ.view(-1, reward_size) + # print(torch.bmm(w_ext.unsqueeze(1), + # tmpQ.data.unsqueeze(2)).view(-1, action_size)) + act = torch.bmm(Variable(w_ext.unsqueeze(1), requires_grad=False), + tmpQ.unsqueeze(2)).view(-1, 
action_size).max(1)[1] + + HQ = DQ.gather(1, act.view(-1, 1, 1).expand(DQ.size(0), 1, DQ.size(2))).squeeze() + + nontmlmask = self.nontmlinds(terminal_batch) + with torch.no_grad(): + Tau_Q = Variable(torch.zeros(self.batch_size * self.weight_num, + reward_size).type(FloatTensor)) + Tau_Q[nontmlmask] = self.gamma * HQ[nontmlmask] + # Tau_Q.volatile = False + Tau_Q += Variable(torch.cat(reward_batch, dim=0)) + + actions = Variable(torch.cat(action_batch, dim=0)) + + Q = Q.gather(1, actions.view(-1, 1, 1).expand(Q.size(0), 1, Q.size(2)) + ).view(-1, reward_size) + Tau_Q = Tau_Q.view(-1, reward_size) + + wQ = torch.bmm(Variable(w_batch.unsqueeze(1)), + Q.unsqueeze(2)).squeeze() + + wTQ = torch.bmm(Variable(w_batch.unsqueeze(1)), + Tau_Q.unsqueeze(2)).squeeze() + + # loss = F.mse_loss(Q.view(-1), Tau_Q.view(-1)) + # print self.beta + loss = self.beta * F.mse_loss(wQ.view(-1), wTQ.view(-1)) + loss += (1 - self.beta) * F.mse_loss(Q.view(-1), Tau_Q.view(-1)) + + self.optimizer.zero_grad() + loss.backward() + for param in self.model_.parameters(): + param.grad.data.clamp_(-1, 1) + self.optimizer.step() + + if self.update_count % self.update_freq == 0: + self.model.load_state_dict(self.model_.state_dict()) + + self.monitor.update(self.episodecount, loss=loss.data) + + self.savePolicyInc() # self.out_policy_file) + + def savePolicy(self, FORCE_SAVE=False): + """ + Does not use this, cause it will be called from agent after every episode. + we want to save the policy only periodically. + """ + pass + + def savePolicyInc(self, FORCE_SAVE=False): + """ + save model and replay buffer + """ + if self.episodecount % self.save_step == 0: + torch.save(self.model, "{}.{}.pkl".format(self.out_policy_file, self.algorithm)) + + def loadPolicy(self, filename): + """ + load model and replay buffer + """ + # load models + self.model_ = torch.load("{}.{}.pkl".format(filename, self.algorithm)) + self.model = copy.deepcopy(self.model_) + + def restart(self): + self.summaryAct = None + self.lastSystemAction = None + self.prevbelief = None + self.actToBeRecorded = None + self.w_kept = None + if self.epsilon_decay: + self.epsilon -= self.epsilon_delta + if self.homotopy: + self.beta += self.beta_delta + self.beta_delta = (self.beta - self.beta_init) * self.beta_expbase + self.beta_init - self.beta + +# END OF FILE diff --git a/pydial/pydial.py b/pydial/pydial.py index 492d172..2c89ba1 100755 --- a/pydial/pydial.py +++ b/pydial/pydial.py @@ -445,8 +445,11 @@ def trainBatch(domain, configId, trainerr, ndialogs, source_iteration,seed=None) Settings.config.write(cf) error = float(trainerr) / 100.0 # run the system + roi = False + if policytype == "roi-morl": + roi = True simulator = Simulate.SimulationSystem(error_rate=error) - simulator.run_dialogs(ndialogs) + simulator.run_dialogs(ndialogs, roi) if gdeleteprevpolicy: if isSingleDomain: if inpolicy[-1] != '0': diff --git a/synthetic/crl/naive/meta.py b/synthetic/crl/naive/meta.py index 6410642..9ad366f 100644 --- a/synthetic/crl/naive/meta.py +++ b/synthetic/crl/naive/meta.py @@ -84,7 +84,7 @@ def act(self, state, preference=None): return action - def memorize(self, state, action, next_state, reward, terminal): + def memorize(self, state, action, next_state, reward, terminal, roi=False): self.trans_mem.append(self.trans( torch.from_numpy(state).type(FloatTensor), # state action, # action @@ -93,10 +93,12 @@ def memorize(self, state, action, next_state, reward, terminal): terminal)) # terminal # randomly produce a preference for calculating priority - # preference = 
self.w_kept - preference = torch.randn(self.model_.reward_size) - preference = (torch.abs(preference) / \ - torch.norm(preference, p=1)).type(FloatTensor) + if roi: + preference = self.w_kept + else: + preference = torch.randn(self.model_.reward_size) + preference = (torch.abs(preference) / \ + torch.norm(preference, p=1)).type(FloatTensor) state = torch.from_numpy(state).type(FloatTensor) diff --git a/synthetic/roijers_train.py b/synthetic/roijers_train.py index 0d27ad9..4398d9e 100644 --- a/synthetic/roijers_train.py +++ b/synthetic/roijers_train.py @@ -202,7 +202,7 @@ def train(env, agent, args): next_state, reward, terminal = env.step(action) if args.log: monitor.add_log(state, action, reward, terminal, agent.w_kept) - agent.memorize(state, action, next_state, reward, terminal) + agent.memorize(state, action, next_state, reward, terminal, roi=True) loss += agent.learn(corner_w) if cnt > 100: terminal = True
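Note on the Simulate.py hunk: run_dialogs now forwards roi=roi to self.run, and pydial.py calls simulator.run_dialogs(ndialogs, roi), but the hunk context still shows def run_dialogs(self, numDialogs). A minimal sketch of the signature change this wiring presumably relies on (an assumption; that change is not part of the diff shown):

    def run_dialogs(self, numDialogs, roi=False):
        '''Run numDialogs simulated episodes, forwarding the roi flag to each one.'''
        for i in range(numDialogs):
            logger.info('Dialogue %d' % (i + 1))
            self.run(session_id='simulate_dialog' + str(i),
                     sim_level=self.sim_level, roi=roi)
        self.agent_factory.power_down_factory()  # finalises learning and saves the policy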
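The new update_ccs/intersect helpers maintain a convex coverage set (CCS) of linear value pieces and push newly found corner preferences onto corWs; the optvalue record they manipulate (fields v0, v1, l, u) is not defined in this file. A standalone sketch of the underlying idea, assuming value(w) = v1 + (v0 - v1)*w on the weight interval [l, u] (names and numbers below are illustrative, not from the diff):

    from collections import namedtuple

    # Assumed shape of the CCS entries used by update_ccs (not shown in the diff).
    optvalue = namedtuple('optvalue', ['v0', 'v1', 'l', 'u'])

    def corner_weight(v, s):
        # Preference at which two linear value pieces cross; mirrors intersect().
        d = (v.v1 - v.v0) - (s.v1 - s.v0)
        if d == 0:
            return None  # parallel pieces never cross
        return (v.v1 - s.v1) / d

    # Two hypothetical policy values, each best under a different objective.
    a = optvalue(10.0, 2.0, 0.0, 1.0)
    b = optvalue(3.0, 9.0, 0.0, 1.0)
    print(corner_weight(b, a))  # 0.5 -> [0.5, 0.5] would be queued as the next corner preference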
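In the envelope branch of nextAction(), the model returns one reward-vector per summary action and the chosen action maximises the preference-scalarised value. A small numeric sketch of that selection step (shapes and numbers are illustrative only):

    import torch

    preference = torch.tensor([0.3, 0.7])   # weights over the two objectives, e.g. length vs. success
    Q = torch.tensor([[12.0, 1.0],          # action 0: strong on objective 0
                      [4.0, 9.0]])          # action 1: strong on objective 1
    scalar_q = torch.mv(Q, preference)      # tensor([4.3000, 7.5000])
    nextaIdex = int(scalar_q.max(0)[1])     # 1 under this preference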
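The meta.py hunk changes which preference memorize() uses when computing a transition's priority: with roi=True (as passed from roijers_train.py) it keeps the corner weight currently being trained, self.w_kept, instead of drawing a fresh random preference. A compact sketch of the two modes (helper name is illustrative):

    import torch

    def priority_preference(w_kept, reward_size, roi=False):
        if roi:
            # roi mode: score the transition under the corner weight being optimised
            return w_kept
        # original behaviour: random preference, L1-normalised
        w = torch.randn(reward_size)
        return torch.abs(w) / torch.norm(w, p=1)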