In [4]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from collections import deque
import random

# Show which gym version is installed.
print(gym.__version__)

# CartPole environment, created with on-screen rendering enabled.
env = gym.make('CartPole-v1', render_mode='human')
action_size = env.action_space.n               # number of discrete actions
state_size = env.observation_space.shape[0]    # length of one observation vector

# --- DQN hyperparameters ---
episode = 10             # number of training episodes
batch_size = 64          # transitions sampled per replay() call
learning_rate = 0.001    # learning rate passed to the Adam optimizer
gamma = 0.95             # discount factor applied to the bootstrapped next-state value

# Epsilon-greedy exploration schedule
epsilon = 1.0            # current probability of picking a random action
epsilon_min = 0.01       # epsilon stops decaying once it is no longer above this value
epsilon_decay = 0.995    # multiplicative decay applied at the end of each replay()

# Replay buffer: keeps at most the 2000 most recent transitions.
memory = deque(maxlen=2000)

# Build the neural network that approximates Q-values
def build_model():
    """Create and compile a small fully connected Q-network.

    The network takes a vector of ``state_size`` features, passes it through
    two hidden ReLU layers of 24 units each, and outputs one linear value per
    action (``action_size`` outputs). It is compiled with mean-squared-error
    loss and an Adam optimizer using the module-level ``learning_rate``.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    layers = [
        Dense(24, input_dim=state_size, activation='relu'),
        Dense(24, activation='relu'),
        Dense(action_size, activation='linear'),
    ]
    network = Sequential(layers)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    network.compile(loss='mse', optimizer=optimizer)
    return network

# Online network (trained in replay()) and target network (used for bootstrapped targets).
model, target_model = build_model(), build_model()
# Start both networks from identical weights.
target_model.set_weights(model.get_weights())

# Epsilon-greedy action selection used during training
def act(state):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    With probability given by the module-level ``epsilon`` a uniformly random
    action index is returned; otherwise the action with the highest value
    predicted by ``model`` is returned.

    Args:
        state: Observation passed straight to ``model.predict``; the caller
            reshapes it to ``[1, state_size]`` beforehand.

    Returns:
        The index of the chosen action.
    """
    explore = np.random.rand() <= epsilon
    if not explore:
        q_values = model.predict(state, verbose=0)
        return np.argmax(q_values[0])
    return random.randrange(action_size)

# Train the online network on a minibatch of stored transitions
def replay():
    """Run one experience-replay update and decay ``epsilon``.

    A minibatch of ``batch_size`` transitions is sampled uniformly from
    ``memory``. For each transition the target for the action that was taken
    is ``reward`` when the transition is terminal, otherwise
    ``reward + gamma * max_a Q_target(next_state, a)``; the outputs for the
    other actions are left at the online network's own predictions so they
    contribute no error. The whole minibatch is processed with one
    ``predict`` call per network and a single ``fit`` call, instead of one
    call per transition, because per-call overhead dominated the runtime.

    Does nothing (and does not decay ``epsilon``) when the buffer holds fewer
    than ``batch_size`` transitions, since ``random.sample`` would raise.

    Side effects:
        Updates the weights of ``model`` and multiplies the module-level
        ``epsilon`` by ``epsilon_decay`` while it is above ``epsilon_min``.
    """
    global epsilon
    if len(memory) < batch_size:
        return

    minibatch = random.sample(memory, batch_size)

    # Each stored state has shape (1, state_size); stacking gives (batch_size, state_size).
    states = np.vstack([transition[0] for transition in minibatch])
    actions = np.array([transition[1] for transition in minibatch])
    rewards = np.array([transition[2] for transition in minibatch], dtype=np.float32)
    next_states = np.vstack([transition[3] for transition in minibatch])
    dones = np.array([transition[4] for transition in minibatch], dtype=bool)

    # The target network provides the bootstrapped value, which keeps targets stable.
    next_q = target_model.predict(next_states, verbose=0)
    # Terminal transitions get no bootstrapped term.
    targets = rewards + gamma * np.amax(next_q, axis=1) * (~dones)

    # Overwrite only the Q-value of the action that was actually taken.
    q_values = model.predict(states, verbose=0)
    q_values[np.arange(batch_size), actions] = targets

    model.fit(states, q_values, batch_size=batch_size, epochs=1, verbose=0)

    # Gradually reduce the probability of choosing a random action.
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

# Synchronise the target network with the online network
def update_target_model():
    """Copy the current weights of ``model`` into ``target_model``."""
    online_weights = model.get_weights()
    target_model.set_weights(online_weights)

# Train the DQN agent
for e in range(episode):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    # The counter is named `step`, not `time`: a global called `time` would
    # rebind the `time` module imported by the play cell and break
    # `time.sleep` there if this cell is re-run afterwards.
    for step in range(500):
        print(step, end = ' ')
        action = act(state)
        next_state, reward, done, info = env.step(action)
        # A time-limit cut-off is not a failure: only penalise (and mark as
        # terminal) when the episode ended for another reason. If the key is
        # absent this falls back to treating every `done` as terminal.
        truncated = info.get('TimeLimit.truncated', False)
        terminal = done and not truncated
        reward = reward if not terminal else -10
        next_state = np.reshape(next_state, [1, state_size])
        memory.append((state, action, reward, next_state, terminal))
        state = next_state
        if done:
            print(f"\nEpisode: {e + 1}/{episode}, Score: {step}, Epsilon: {epsilon:.2}")
            break
        # Start learning once the buffer holds more than one minibatch.
        if len(memory) > batch_size:
            replay()
    # Refresh the target network once per episode.
    update_target_model()


0.25.2
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 
Episode: 1/10, Score: 16, Epsilon: 1.0
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 
Episode: 2/10, Score: 21, Epsilon: 1.0
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
Episode: 3/10, Score: 15, Epsilon: 1.0
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 
Episode: 4/10, Score: 18, Epsilon: 0.96
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 
Episode: 5/10, Score: 21, Epsilon: 0.86
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 
Episode: 6/10, Score: 70, Epsilon: 0.61
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 
Episode: 7/10, Score: 43, Epsilon: 0.49
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 
Episode: 8/10, Score: 4

In [5]:
import time

# Epsilon-greedy action selection used while playing the game
def act_play(state, epsilon):
    """Pick an action for ``state`` using an explicitly supplied ``epsilon``.

    Args:
        state: Observation passed straight to ``model.predict``; the caller
            reshapes it to ``[1, state_size]`` beforehand.
        epsilon: Probability of returning a uniformly random action instead
            of the action with the highest predicted value.

    Returns:
        The index of the chosen action.
    """
    explore = np.random.rand() <= epsilon
    if explore:
        return random.randrange(action_size)
    q_values = model.predict(state, verbose=0)
    greedy_action = np.argmax(q_values[0])
    return greedy_action

# Let the trained model play the game while rendering it
def play(epsilon, episodes=10, delay=0.05):
    """Run the agent in ``env`` for several rendered episodes and print each score.

    Args:
        epsilon: Probability of taking a random action at each step
            (forwarded to ``act_play``).
        episodes: Number of episodes to play. Defaults to 10.
        delay: Seconds to sleep after each non-final step so the rendering
            can be followed. Defaults to 0.05.

    Returns:
        None. One line per finished episode is printed; the score is the
        index of the step on which the episode ended.
    """
    for e in range(episodes):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time_t in range(500):
            env.render()
            action = act_play(state, epsilon)
            next_state, _, done, _ = env.step(action)
            state = np.reshape(next_state, [1, state_size])
            if done:
                print(f"Episode: {e + 1}/{episodes}, Score: {time_t}")  # report the result
                break
            time.sleep(delay)
    # The environment is intentionally not closed here (env.close() is left out).

# Run the play loop, passing the current value of the global `epsilon` as the exploration rate
play(epsilon)

Episode: 1/10, Score: 123
Episode: 2/10, Score: 62
Episode: 3/10, Score: 112
Episode: 4/10, Score: 348
Episode: 5/10, Score: 267
Episode: 6/10, Score: 52
Episode: 7/10, Score: 134
Episode: 8/10, Score: 325
Episode: 9/10, Score: 290
Episode: 10/10, Score: 125
