In [1]:
"""
solving pendulum using actor-critic model
"""

import gym
import numpy as np
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, Softmax
from tensorflow.keras.layers import Add, Concatenate
from tensorflow.keras.optimizers import Adam
import tensorflow.compat.v1.keras.backend as K
from Blackjack import BlackjackEnv
import itertools

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

import random
from collections import deque

def stack_samples(samples):
	array = np.array(samples)
	
	current_states = np.stack(array[:,0]).reshape((array.shape[0],-1))
	actions = np.stack(array[:,1]).reshape((array.shape[0],-1))
	rewards = np.stack(array[:,2]).reshape((array.shape[0],-1))
	new_states = np.stack(array[:,3]).reshape((array.shape[0],-1))
	dones = np.stack(array[:,4]).reshape((array.shape[0],-1))
	
	return current_states, actions, rewards, new_states, dones
	

# determines how to assign values to each state, i.e. takes the state
# and action (two-input model) and determines the corresponding value
class ActorCritic:
	def __init__(self, env, sess):
		self.env  = env
		self.action_dim=2
		self.state_dim=104
		self.sess = sess
        

		self.learning_rate = 0.0002
		self.epsilon = .99
		self.epsilon_decay = .99995
		self.gamma = .99
		self.tau   = .01

		self.alpha = tf.Variable([[1.]], trainable=True)
		self.target_entropy = -2.0
		# ===================================================================== #
		#                               Actor Model                             #
		# Chain rule: find the gradient of chaging the actor network params in  #
		# getting closest to the final value network predictions, i.e. de/dA    #
		# Calculate de/dA as = de/dC * dC/dA, where e is error, C critic, A act #
		# ===================================================================== #
		self.ema = tf.train.ExponentialMovingAverage(decay=1-self.tau)
		self.memory = deque(maxlen=40000)
		self.actor_state_input, self.actor_model = self.create_actor_model() 
		self.logits = self.actor_model.output
		##action distribution
		self.action_dist = tf.nn.softmax(self.actor_model.output)
		self.action_dist = tf.expand_dims(self.action_dist, 1)
		##log distribution
		self.log_dist = tf.math.log(self.action_dist)
		self.log_dist = tf.reshape(self.log_dist,[-1,self.action_dim,1])
		print(self.action_dist)  
		print(self.log_dist)

		# ===================================================================== #
		#                              Critic Model                             #
		# ===================================================================== #

		self.action_input = tf.compat.v1.placeholder(tf.int32,shape = (None,1))
		self.critic_state_input, \
			self.critic_model = self.create_critic_model()
		self.target_state_input, self.target_critic_model = self.create_critic_model()

        #selected q value
		self.Q_value = tf.reshape(self.critic_model.output,[-1,2,1])
		self.one_hot = tf.expand_dims(tf.one_hot(self.action_input, self.action_dim),1)
		print(self.Q_value)
		print(self.one_hot)
		self.Q_value_sel = tf.squeeze(tf.linalg.matmul(self.one_hot,self.Q_value))
		print(self.Q_value_sel)
		self.target_Q_value = tf.reshape(self.target_critic_model.output,[-1,2,1])
		self.value = tf.squeeze(tf.linalg.matmul(self.action_dist,self.Q_value-self.alpha*self.log_dist))
		self.target_value = tf.squeeze(tf.linalg.matmul(self.action_dist,self.target_Q_value-self.alpha*self.log_dist))
		print(self.value)
        
		# ===================================================================== #
		#                              loss funtion                             #
		# ===================================================================== #
        
		##critic loss
		self.mask_target_value = tf.compat.v1.placeholder(tf.float32,shape=(None,1))
		self.critic_loss = tf.add_n([tf.compat.v1.losses.mean_squared_error(labels=self.mask_target_value, predictions=self.Q_value_sel, weights=0.5)])
		self.critic_opt = tf.compat.v1.train.AdamOptimizer(self.learning_rate).minimize(self.critic_loss)
        
		##actor loss
		self.actor_weights = self.actor_model.trainable_weights
		self.actor_loss = tf.linalg.matmul(self.action_dist,self.alpha*self.log_dist-self.Q_value)
		self.actor_grad = tf.gradients(self.actor_loss,self.actor_weights)
		grads = zip(self.actor_grad, self.actor_weights)
		self.actor_opt = tf.compat.v1.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)
        
		##entropy loss
		self.entropy_loss = tf.linalg.matmul(self.action_dist,-self.alpha*(self.log_dist+self.target_entropy))
		print(self.entropy_loss)
		self.entropy_grad = tf.gradients(self.entropy_loss,self.alpha)
		ent_grads = zip(self.entropy_grad,itertools.repeat(self.alpha))
		self.entropy_opt = tf.compat.v1.train.AdamOptimizer(self.learning_rate).apply_gradients(ent_grads)

                
        
		# Initialize for later gradient calculations
		self.sess.run(tf.compat.v1.global_variables_initializer())
        
	# ========================================================================= #
	#                              Model Definitions                            #
	# ========================================================================= #

	def create_actor_model(self):
		state_input = Input(shape=(self.state_dim,))
		h1 = Dense(256, activation='relu')(state_input)
		h2 = Dense(256, activation='relu')(h1)
		logits = Dense(self.action_dim,activation='linear')(h2)

		model = Model([state_input], logits)
		adam  = Adam(lr=0.0001)
		model.compile(loss="mse", optimizer=adam)
		return state_input, model

	def create_critic_model(self):
		state_input = Input(shape=(self.state_dim,))
		state_h1 = Dense(256, activation='relu')(state_input)
		state_h2 = Dense(256, activation='relu')(state_h1)

		output = Dense(self.action_dim, activation='linear')(state_h2)
		model  = Model([state_input],output)

		adam  = Adam(lr=0.0001)
		model.compile(loss="mse", optimizer=adam)
		return state_input, model

	# ========================================================================= #
	#                               Model Training                              #
	# ========================================================================= #

	def remember(self, cur_state, action, reward, new_state, done):
		self.memory.append([cur_state, action, reward, new_state, done])

	def _train_actor(self, samples):
		cur_states, actions, rewards, new_states, _ =  stack_samples(samples)

		self.sess.run(self.actor_opt, feed_dict={
            self.actor_state_input: cur_states,
            self.critic_state_input:cur_states
		})

	def _train_critic(self, samples):
   
		cur_states, actions, rewards, new_states, dones = stack_samples(samples)
		future_rewards = self.sess.run(self.target_value,feed_dict={
            self.actor_state_input:new_states,
            self.target_state_input:new_states
        })
		rewards = rewards.reshape(future_rewards.shape) + self.gamma *np.multiply(future_rewards,(1 - dones.reshape(future_rewards.shape)))  
		rewards = rewards.reshape((rewards.shape[0],1))
		self.sess.run(self.critic_opt, feed_dict={
            self.mask_target_value:rewards,
            self.action_input: actions,
            self.critic_state_input:cur_states
        })
		#print(evaluation.history)
        
	def _train_alpha(self, samples):
		cur_states, actions, rewards, new_states, _ =  stack_samples(samples)   
		self.sess.run(self.entropy_opt, feed_dict={
			self.actor_state_input: cur_states
		})
        
        
	def train(self):
		batch_size = 256
		if len(self.memory) < batch_size:
			return

		rewards = []
		samples = random.sample(self.memory, batch_size)
		self.samples = samples
		self._train_critic(samples)
		self._train_alpha(samples)
		self._train_actor(samples)

	# ========================================================================= #
	#                         Target Model Updating                             #
	# ========================================================================= #

        
	def _update_critic_target(self):
		critic_model_weights  = self.critic_model.get_weights()
		critic_target_weights = self.target_critic_model.get_weights()
		
		for i in range(len(critic_target_weights)):
			critic_target_weights[i] = critic_model_weights[i]*self.tau + critic_target_weights[i]*(1-self.tau)
		self.target_critic_model.set_weights(critic_target_weights)
        
	def update_target(self):
		self._update_critic_target()

	# ========================================================================= #
	#                              Model Predictions                            #
	# ========================================================================= #

	def act(self, cur_state):
		logits = self.actor_model.predict(cur_state)
		samples = tf.random.categorical(logits, cur_state.shape[0])
		return self.sess.run(samples,feed_dict={self.actor_state_input: cur_state,})





  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def state_preprosser(cur_state):
	player_state = cur_state[0]
	dealer_state = np.zeros(52)
	dealer_state[cur_state[1]] = 1
	return np.concatenate([player_state,dealer_state])

In [3]:
def main():
	sess = tf.compat.v1.Session()
	K.set_session(sess)
	env = BlackjackEnv({"one_card_dealer": True,"card_values":None})
	actor_critic = ActorCritic(env, sess)

	num_trials = 100000
	trial_len  = 10

	for i in range(num_trials):
		#print("trial:" + str(i))
		cur_state = env.reset()
		action = env.action_space.sample()
		reward_sum = 0
		for j in range(trial_len):
			#env.render()
			cur_state = state_preprosser(cur_state)
			cur_state = cur_state.reshape((1, cur_state.shape[0]))
			action = actor_critic.act(cur_state)
			action = action[0][0]
			#print(action)
			new_state, reward, done, _ = env.step(action)
			if j == (trial_len - 1):
				done = True
				#print(reward)

			if (j % 5 == 0):
				actor_critic.train()
				actor_critic.update_target()
                
			tmp_state = cur_state                
			cur_state = new_state
			new_state = state_preprosser(new_state)
			#new_state = new_state.reshape((1,new_state.shape[0]))

			actor_critic.remember(tmp_state, action, reward, new_state, done)
			if done:
				break



		if (i % 50 == 0):

			trj_rewards = []
			for traj in range(10): 
				cur_state = env.reset()
				rewards = 0
				for j in range(10):
					cur_state = state_preprosser(cur_state)
					env.render()
					cur_state = cur_state.reshape((1, cur_state.shape[0]))
					action = actor_critic.act(cur_state)
					action = action[0][0]

					new_state, reward, done, _ = env.step(action)
				#reward += reward
				#if j == (trial_len - 1):
					#done = True
					#print(reward)

				#if (j % 5 == 0):
				#    actor_critic.train()
				#    actor_critic.update_target()   
					rewards+=reward
					if done:
 						break


				#actor_critic.remember(cur_state, action, reward, new_state, done)
					cur_state = new_state
				trj_rewards.append(rewards)
			print(np.mean(np.asarray(trj_rewards)))

if __name__ == "__main__":
	main()

Tensor("ExpandDims:0", shape=(None, 1, 2), dtype=float32)
Tensor("Reshape:0", shape=(None, 2, 1), dtype=float32)
Tensor("Reshape_1:0", shape=(None, 2, 1), dtype=float32)
Tensor("ExpandDims_1:0", shape=(None, 1, 1, 2), dtype=float32)
Tensor("Squeeze:0", dtype=float32)
Tensor("Squeeze_1:0", dtype=float32)
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Tensor("MatMul_4:0", shape=(None, 1, 1), dtype=float32)
0.6
0.5
0.4
0.4
0.6
0.8
0.4
0.8
0.7
0.7
0.8
0.5
0.9
0.5
0.5
0.8
0.6
0.8
0.8
0.6
0.6
0.5
0.7
0.7
0.9
0.6
0.8
0.9
0.6
0.6
0.8
0.9
1.0
0.7
0.6
0.7
0.8
1.0
0.9
0.7
1.0
0.7
0.6
0.8
0.8
0.7
0.6
0.7
0.8
1.0
0.6
1.0
1.0
0.8
0.8
1.0
0.8
1.0
0.7
1.0
0.9
0.9
1.0
0.9
1.0
0.9
1.0
1.0
0.9
0.9
1.0
0.9
1.0
0.9
0.8
1.0
1.0
0.9
0.9
0.8
1.0
0.9
1.0
1.0
1.0
1.0
0.9
0.9
0.9
0.9
1.0
0.8
0.9
1.0
1.0
0.9
0.9
0.8
1.0
1.0
0.9
0.9
0.9
0.9
1.0
0.8
0.9
1.0
1.0
1.0
1.0
0.9
0.9
0.9
0.8
1.0
0.9
0.9
1.0
0.9
1.0
0.7
1.0
0.7
1.0
1.0
0.9
1.0
1.0
0.8
0.8
1.0
1.0
0.8
0.9
0.9
1

KeyboardInterrupt: 

In [8]:
sess = tf.compat.v1.Session()
K.set_session(sess)
env = BlackjackEnv()
actor_critic = ActorCritic(env, sess)

num_trials = 100000
trial_len  = 10

for i in range(num_trials):
    #print("trial:" + str(i))
    cur_state = env.reset()
    action = env.action_space.sample()
    reward_sum = 0
    for j in range(trial_len):
        #env.render()
        cur_state = state_preprosser(cur_state)
        cur_state = cur_state.reshape((1, cur_state.shape[0]))
        action = actor_critic.act(cur_state)
        action = action[0][0]
        #print(action)
        new_state, reward, done, _ = env.step(action)
        if j == (trial_len - 1):
            done = True
            #print(reward)

        if (j % 5 == 0):
            actor_critic.train()
            actor_critic.update_target()

        tmp_state = cur_state                
        cur_state = new_state
        new_state = state_preprosser(new_state)
        #new_state = new_state.reshape((1,new_state.shape[0]))

        actor_critic.remember(tmp_state, action, reward, new_state, done)
        if done:
            break



    if (i % 50 == 0):

        trj_rewards = []
        for traj in range(10): 
            cur_state = env.reset()
            rewards = 0
            for j in range(10):
                cur_state = state_preprosser(cur_state)
                env.render()
                cur_state = cur_state.reshape((1, cur_state.shape[0]))
                action = actor_critic.act(cur_state)
                action = action[0][0]

                new_state, reward, done, _ = env.step(action)
            #reward += reward
            #if j == (trial_len - 1):
                #done = True
                #print(reward)

            #if (j % 5 == 0):
            #    actor_critic.train()
            #    actor_critic.update_target()   
                rewards+=reward
                if done:
                    break


            #actor_critic.remember(cur_state, action, reward, new_state, done)
                cur_state = new_state
            trj_rewards.append(rewards)
        print(np.mean(np.asarray(trj_rewards)))

Tensor("ExpandDims_6:0", shape=(None, 1, 2), dtype=float32)
Tensor("Reshape_9:0", shape=(None, 2, 1), dtype=float32)
Tensor("Reshape_10:0", shape=(None, 2, 1), dtype=float32)
Tensor("ExpandDims_7:0", shape=(None, 1, 1, 2), dtype=float32)
Tensor("Squeeze_9:0", dtype=float32)
Tensor("Squeeze_10:0", dtype=float32)
Tensor("MatMul_19:0", shape=(None, 1, 1), dtype=float32)
0.4
0.6
0.3
0.3
0.4
0.4
0.4
0.2
0.6
0.7
0.3
0.3
0.3
0.3
0.4
0.2
0.4
0.2
0.3
0.5
0.2
0.4
0.4
0.1
0.2
0.2
0.6
0.3
0.2
0.3
0.0
0.3
0.4
0.4
0.4
0.4
0.2
0.2
0.5
0.1
0.4
0.4
0.6
0.5
0.3
0.3
0.1
0.5
0.8
0.4
0.2
0.6
0.3
0.4
0.4
0.4
0.3
0.2
0.4
0.4
0.3
0.4
0.5
0.4
0.4
0.6
0.4
0.3
0.2
0.8
0.5
0.3
0.5
0.2
0.3
0.4
0.5
0.4
0.2
0.2
0.5
0.5
0.4
0.6
0.6
0.7
0.6
0.5
0.6
0.3
0.4
0.6
0.5
0.4
0.2
0.3
0.0
0.7
0.4
0.7
0.5
0.4
0.7
0.2
0.6
0.5
0.4
0.4
0.5
0.5
0.6
0.3
0.5
0.7
0.5
0.2
0.3
0.7
0.7
0.5
0.5
0.4
0.5
0.6
0.6
0.3
0.6
0.5
0.5
0.5
0.7
0.6
0.5
0.3
0.5
0.3
0.6
0.7
0.5
0.6
0.5
0.4
0.4
0.8
0.5
0.2
0.0
0.3
0.4
0.4
0.5
0.7
0.6
0.5
0.2
0.5
0.3
0.

KeyboardInterrupt: 

In [None]:
sess = tf.compat.v1.Session()
K.set_session(sess)
env = BlackjackEnv({"card_values": np.ones(52,)*2})
actor_critic = ActorCritic(env, sess)

num_trials = 100000
trial_len  = 10

for i in range(num_trials):
    #print("trial:" + str(i))
    cur_state = env.reset()
    action = env.action_space.sample()
    reward_sum = 0
    for j in range(trial_len):
        #env.render()
        cur_state = state_preprosser(cur_state)
        cur_state = cur_state.reshape((1, cur_state.shape[0]))
        action = actor_critic.act(cur_state)
        action = action[0][0]
        #print(action)
        new_state, reward, done, _ = env.step(action)
        if j == (trial_len - 1):
            done = True
            #print(reward)

        if (j % 5 == 0):
            actor_critic.train()
            actor_critic.update_target()

        tmp_state = cur_state                
        cur_state = new_state
        new_state = state_preprosser(new_state)
        #new_state = new_state.reshape((1,new_state.shape[0]))

        actor_critic.remember(tmp_state, action, reward, new_state, done)
        if done:
            break



    if (i % 50 == 0):

        trj_rewards = []
        for traj in range(10): 
            cur_state = env.reset()
            rewards = 0
            for j in range(10):
                cur_state = state_preprosser(cur_state)
                env.render()
                cur_state = cur_state.reshape((1, cur_state.shape[0]))
                action = actor_critic.act(cur_state)
                action = action[0][0]

                new_state, reward, done, _ = env.step(action)
            #reward += reward
            #if j == (trial_len - 1):
                #done = True
                #print(reward)

            #if (j % 5 == 0):
            #    actor_critic.train()
            #    actor_critic.update_target()   
                rewards+=reward
                if done:
                    break


            #actor_critic.remember(cur_state, action, reward, new_state, done)
                cur_state = new_state
            trj_rewards.append(rewards)
        print(np.mean(np.asarray(trj_rewards)))

Tensor("ExpandDims_2:0", shape=(None, 1, 2), dtype=float32)
Tensor("Reshape_3:0", shape=(None, 2, 1), dtype=float32)
Tensor("Reshape_4:0", shape=(None, 2, 1), dtype=float32)
Tensor("ExpandDims_3:0", shape=(None, 1, 1, 2), dtype=float32)
Tensor("Squeeze_3:0", dtype=float32)
Tensor("Squeeze_4:0", dtype=float32)
Tensor("MatMul_9:0", shape=(None, 1, 1), dtype=float32)
0.3
0.5
0.6
0.5
0.5
0.6
0.5
0.7
0.6
0.8
0.8
0.4
0.5
0.6
0.6
0.3
0.7
0.6


In [3]:
sess = tf.compat.v1.Session()
K.set_session(sess)
env = BlackjackEnv({"card_values": [3,  1,  3,  9,  6,  0,  7, -2,  2,  6,  8,  1,  3,
                                               4, -1,  4,  3,  9, -1,  4,  0,  4,  7, -2, -1,  5,
                                               2,  6, -3, -1,  2,  2, -1,  7,  1,  0,  7,  8,  4,
                                               5,  3, -1,  0,  3, -1,  3,  0,  6, -2,  4, -3,  4]})
actor_critic = ActorCritic(env, sess)

num_trials = 100000
trial_len  = 10

for i in range(num_trials):
    #print("trial:" + str(i))
    cur_state = env.reset()
    action = env.action_space.sample()
    reward_sum = 0
    for j in range(trial_len):
        #env.render()
        cur_state = state_preprosser(cur_state)
        cur_state = cur_state.reshape((1, cur_state.shape[0]))
        action = actor_critic.act(cur_state)
        action = action[0][0]
        #print(action)
        new_state, reward, done, _ = env.step(action)
        if j == (trial_len - 1):
            done = True
            #print(reward)

        if (j % 5 == 0):
            actor_critic.train()
            actor_critic.update_target()

        tmp_state = cur_state                
        cur_state = new_state
        new_state = state_preprosser(new_state)
        #new_state = new_state.reshape((1,new_state.shape[0]))

        actor_critic.remember(tmp_state, action, reward, new_state, done)
        if done:
            break



    if (i % 50 == 0):

        trj_rewards = []
        for traj in range(10): 
            cur_state = env.reset()
            rewards = 0
            for j in range(10):
                cur_state = state_preprosser(cur_state)
                env.render()
                cur_state = cur_state.reshape((1, cur_state.shape[0]))
                action = actor_critic.act(cur_state)
                action = action[0][0]

                new_state, reward, done, _ = env.step(action)
            #reward += reward
            #if j == (trial_len - 1):
                #done = True
                #print(reward)

            #if (j % 5 == 0):
            #    actor_critic.train()
            #    actor_critic.update_target()   
                rewards+=reward
                if done:
                    break


            #actor_critic.remember(cur_state, action, reward, new_state, done)
                cur_state = new_state
            trj_rewards.append(rewards)
        print(np.mean(np.asarray(trj_rewards)))

Tensor("ExpandDims:0", shape=(None, 1, 2), dtype=float32)
Tensor("Reshape:0", shape=(None, 2, 1), dtype=float32)
Tensor("Reshape_1:0", shape=(None, 2, 1), dtype=float32)
Tensor("ExpandDims_1:0", shape=(None, 1, 1, 2), dtype=float32)
Tensor("Squeeze:0", dtype=float32)
Tensor("Squeeze_1:0", dtype=float32)
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Tensor("MatMul_4:0", shape=(None, 1, 1), dtype=float32)
0.7
0.3
0.4
0.5
0.9
0.6
0.6
0.6
0.6
0.3
0.3
0.4
0.3
0.4
0.5
0.2
0.3
0.5
0.3
0.7
0.7
0.6
0.5
0.6
0.3
0.6
0.2
0.4
0.6
0.5
0.5
0.5
0.4
0.4
0.6
0.4
0.2
0.4
0.5
0.3
0.4
0.4
0.5
0.4
0.4
0.6
0.1
0.3
0.5
0.4
0.5
0.4
0.4
0.3
0.7
0.4
0.5
0.4
0.6
0.6
0.4
0.5
0.5
0.4
0.5
0.4
0.3
0.3
0.4
0.4
0.4
0.5
0.3
0.7
0.5
0.4
0.6
0.7
0.4
0.7
0.8
0.5


KeyboardInterrupt: 