In [2]:
import tensorflow as tf
import keras as ks
import random
import numpy as np
import gymnasium as gym


In [3]:
def create_model(in_dim, out_dim):
    """Build and compile the Q-network: a small MLP mapping a state to one value per action.

    Args:
        in_dim: Number of features in a single observation.
        out_dim: Number of discrete actions (one output unit per action).

    Returns:
        A compiled Keras ``Sequential`` model with input shape ``(1, in_dim)``
        and output shape ``(1, out_dim)`` per sample.
    """
    model = ks.models.Sequential()
    # The extra leading axis of size 1 is kept on purpose: the rest of this
    # notebook indexes predictions as ``q_values[0][0][action]``.
    model.add(ks.layers.Input(shape=(1, in_dim)))
    model.add(ks.layers.Dense(128, activation='relu'))
    model.add(ks.layers.Dense(64, activation='relu'))
    # Q-values are unbounded regression targets (reward + gamma * max Q'), so
    # the head must be linear; a softmax would force the outputs into a
    # probability distribution that can never match those targets.
    model.add(ks.layers.Dense(out_dim, activation='linear'))
    # Mean squared error on the TD target, matching the linear head.
    model.compile(optimizer='adam', loss='mse')
    return model

In [4]:
def eps_greedy(env, model, state, epsilon):
    """Pick an action epsilon-greedily with respect to ``model``'s predictions.

    Args:
        env: Environment exposing ``action_space.sample()`` for random actions.
        model: Object with a Keras-style ``predict(x, verbose=...)`` method.
        state: A single observation (1-D sequence of features).
        epsilon: Probability of taking a random action instead of the greedy one.

    Returns:
        The sampled action when exploring, otherwise the index of the largest
        predicted value as a plain ``int``.
    """
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()

    # Shape the observation as (batch=1, 1, features), the layout the model is fed
    # everywhere else in this notebook.
    batch = np.array([state])[np.newaxis, ...]
    # verbose=0: this runs on every environment step, and the default progress
    # bar would flood the notebook output.
    return int(np.argmax(model.predict(batch, verbose=0)))

In [5]:
def train(q, q_target, buffer, batch_size=10, gamma=0.99):
    """Run one minibatch Q-learning update of ``q`` from replayed transitions.

    Does nothing until ``buffer`` holds at least ``batch_size`` transitions.

    Args:
        q: Online network; must provide Keras-style ``predict`` and ``fit``.
        q_target: Target network used only to evaluate next states.
        buffer: Sequence of ``(state, action, reward, next_state)`` tuples.
        batch_size: Number of transitions sampled (without replacement) per update.
        gamma: Discount factor applied to the bootstrapped next-state value.

    Returns:
        None. ``q`` is updated in place through ``q.fit``.
    """
    if len(buffer) < batch_size:
        return

    batch = random.sample(buffer, batch_size)

    # Stack the batch into arrays of shape (batch, 1, features) so that the
    # networks are called once per update instead of once per transition.
    states = np.stack([t[0] for t in batch])[:, np.newaxis, :]
    actions = np.array([t[1] for t in batch], dtype=int)
    rewards = np.array([t[2] for t in batch], dtype=float)
    next_states = np.stack([t[3] for t in batch])[:, np.newaxis, :]

    # Start from the current predictions so only the taken action's value
    # contributes to the loss; copy so the prediction buffer is safe to edit.
    targets = np.array(q.predict(states, verbose=0))
    next_values = np.asarray(q_target.predict(next_states, verbose=0))
    best_next = next_values.reshape(len(batch), -1).max(axis=1)

    # NOTE: transitions carry no terminal flag, so every sample bootstraps
    # from its next state, including the last step of an episode.
    targets[np.arange(len(batch)), 0, actions] = rewards + gamma * best_next

    q.fit(states, targets, batch_size=len(batch), epochs=1, verbose=0)


In [6]:
def dqn(env, num_episodes=3, epsilon=0.1, buffer_size=50, target_sync_every=20):
    """Train a Q-network on ``env`` with experience replay and a target network.

    Args:
        env: Gymnasium environment with a 1-D observation space and a discrete
            action space.
        num_episodes: Number of episodes to run.
        epsilon: Exploration probability passed to ``eps_greedy``.
        buffer_size: Maximum number of transitions kept for replay; the oldest
            transition is dropped once the limit is exceeded.
        target_sync_every: Number of steps (counted per episode) between copies
            of the online weights into the target network.

    Returns:
        The trained online Q-network.
    """
    buffer = []
    q = create_model(
        in_dim=env.observation_space.shape[0],
        out_dim=env.action_space.n,
    )
    # clone_model only copies the architecture and re-initialises the weights,
    # so the weights have to be copied explicitly for the target to mirror q.
    q_target = ks.models.clone_model(q)
    q_target.set_weights(q.get_weights())

    for episode in range(num_episodes):
        state, _ = env.reset()

        done = False
        count = 0
        total_reward = 0
        while not done:
            # Re-select the action from the *current* state on every step.
            action = eps_greedy(env, q, state, epsilon)
            next_state, reward, terminated, truncated, _ = env.step(action)
            # An episode ends either by reaching a terminal state or by
            # hitting the environment's time limit.
            done = terminated or truncated
            total_reward += reward

            buffer.append((state, action, reward, next_state))
            train(q, q_target, buffer)
            if len(buffer) > buffer_size:
                buffer.pop(0)

            if count % target_sync_every == 0:
                # Refresh the existing target network instead of building a
                # new model each time.
                q_target.set_weights(q.get_weights())
                print("Count = ", count, end='\r')
            count += 1
            state = next_state

        print(f"Episode {episode}: Total Reward = {total_reward}")
        env.render()
    return q



In [None]:
# Build the environment with off-screen rendering so episodes can be recorded.
env = gym.make("Acrobot-v1", render_mode="rgb_array")
# Save a video of every 50th episode (episode 0 included) under ./videos.
env = gym.wrappers.RecordVideo(
    env,
    video_folder="./videos",
    episode_trigger=lambda e: e % 50 == 0,
)
try:
    # Pin training to the CPU.
    with tf.device('CPU:0'):
        q = dqn(env)
finally:
    # Always close the environment, even if training raises, so the video
    # recorder is shut down and any pending recording is written out.
    env.close()
