In [3]:
import tensorflow 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

import matplotlib.pyplot as plt

import random
import numpy as np

import gym

In [62]:
gamma = 0.95
alpha = 0.50
learning_rate = 0.01
epsilon = 0.999
epsilon_decay = 0.90

class DQN:

    def __init__(self, observation_space, action_space):
        
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.action_space = action_space
        self.observation_space = observation_space
        
        self.memory = []
        self.batch_size = 8

        self.model = Sequential()
        self.model.add(Dense(32, input_shape=(observation_space,), activation="selu", kernel_initializer='lecun_normal'))
        self.model.add(Dense(64, activation="selu", kernel_initializer='lecun_normal'))
        self.model.add(Dense(16, activation="selu", kernel_initializer='lecun_normal'))
        self.model.add(Dense(8, activation="selu", kernel_initializer='lecun_normal'))
        self.model.add(Dropout(0.1))
        self.model.add(BatchNormalization())
        self.model.add(Dense(self.action_space, activation="linear"))
        self.model.compile(loss="mse", optimizer=Adam(lr=learning_rate))
    
    def get_scores(self):
        return self.scores
    
    def get_rewards(self):
        return self.rewards
        
    def act(self, state):
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_space)
        q = self.model.predict(state)
        return np.argmax(q[0])

    def experience_replay(self):
        print(self.memory,self.batch_size)
        mem_sample = random.sample(self.memory, min(len(self.memory), self.batch_size))
        for state, action, reward, next_state, done in mem_sample:
            update_value = reward
            print(update_value)
            if not done:
                update_value = self.alpha * (reward + self.gamma * np.max(self.model.predict(next_state)[0]))
            q = self.model.predict(state)
            q[0][action] = update_value
            # print(state,type(state))
            # print(q,type(q))
            self.model.fit(np.array(state), q, verbose=0)
        self.epsilon *= epsilon_decay
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

In [63]:
def blackjack():
    env = gym.make("Blackjack-v0")
    observation_space_list = list(env.observation_space)
    observation_space, action_space = (observation_space_list[0].n, observation_space_list[1].n, observation_space_list[2].n), env.action_space.n
    epoch = 0
    dqn = DQN(observation_space[0], action_space)
    done = False

    while not done:
        score = 0
        epoch += 1
        state = env.reset()
        print(state)
        #state = np.reshape(state, [1, observation_space])
        while True:
            score += 1
            action = dqn.act(state)
            next_state, reward, done, info = env.step(action)
            reward = reward if not done else -reward
            dqn.experience_replay()
            state = next_state
            if done:
                print ("Round: " + str(epoch) + " Score: " + str(score))
                break
            dqn.update()

In [64]:
blackjack()

(12, 5, False)
[] 8
Round: 1 Score: 1
