In [77]:
!pip install tensorflow
!pip install gym
!pip install keras
!pip install keras-rl2
!pip install pygame

Collecting pygame
  Downloading pygame-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pygame
Successfully installed pygame-2.1.3


In [88]:
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random
from matplotlib import pyplot as plt
import matplotlib.animation
from IPython.display import display, clear_output
import random
import pygame

In [89]:
code2action = {
    0:np.array([0, 0, 0, 0]),
    1:np.array([0, 0, 1, 0]),
    2:np.array([0, 0, -1, 0]),
    3:np.array([0, 0, 0, 1]),
    4:np.array([0, 0, 0, -1]),
    5:np.array([0, 0, 1, 1]),
    6:np.array([0, 0, 1, -1]),
    7:np.array([0, 0, -1, 1]),
    8:np.array([0, 0, -1, -1])
}

class ShrinkingCircleEnv(Env):
    width = height = d = 10
    time = 60
    terminated_penalty = 1
    window = None
    window_size = 512
    clock = None
    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 15}
    
    def __init__(self):
        self.action_space = Discrete(9)
        self.bounds = np.array([self.height, self.width, self.height, self.width]) - 1
        self.observation_space = Box(np.zeros(4), self.bounds)
        
        
        self.state = (np.random.random_sample(size=4)*self.bounds).astype(int)
        
        self.circle_radius = 0
        
        self.timestamp = self.time
        
        # for rendering
        self.log_circles = []
        self.log_hero = []

        
        
    def get_reward(self):
        circle, hero = self.state[:2], self.state[2:]
        return - np.linalg.norm(hero - circle)
        
    def step(self, action):
        new_state = self.state + code2action[action]
        terminated = not ((0 <= new_state).all() and (new_state < self.bounds).all())
        
        self.timestamp -= 1 
        self.truncated = self.timestamp<=0
        
        reward = self.get_reward()
        
        # move center of circle with probability 0.2
        if np.random.binomial(1, 0.2):
            self.state = np.hstack((
                new_state[:2] + random.choice(list(code2action.values()))[2:],
                new_state[2:]
            ))
        
        # to stay on the playing field
        self.state = np.clip(self.state, np.zeros(4), self.bounds)
            
        info = {}
        return self.state, reward, self.truncated, info
    
    def render(self, mode):
        self.render_mode = mode
        circle, hero = self.state[:2], self.state[2:]
        self.log_circles.append(circle)
        self.log_hero.append(hero)
        
        return self._render_frame()
    
    # inspired by https://www.gymlibrary.dev/content/environment_creation/
    def _render_frame(self):
        if self.window is None and self.render_mode == "human":
            pygame.init()
            pygame.display.init()
            self.window = pygame.display.set_mode((self.window_size, self.window_size))
        if self.clock is None and self.render_mode == "human":
            self.clock = pygame.time.Clock()
        
        
        circle, hero = self.state[:2], self.state[2:]
        
        canvas = pygame.Surface((self.window_size, self.window_size))
        canvas.fill((255, 255, 255))
        pix_square_size = (
            self.window_size / self.d
        )  # The size of a single grid square in pixels

        # First we draw the hero
        pygame.draw.rect(
            canvas,
            (255, 0, 0),
            pygame.Rect(
                pix_square_size * hero,
                (pix_square_size, pix_square_size),
            ),
        )
        # Now we draw the circle
        pygame.draw.circle(
            canvas,
            (0, 0, 255),
            (circle + 0.5) * pix_square_size,
            pix_square_size / 3,
        )

        # Finally, add some gridlines
        for x in range(self.d + 1):
            pygame.draw.line(
                canvas,
                0,
                (0, pix_square_size * x),
                (self.window_size, pix_square_size * x),
                width=3,
            )
            pygame.draw.line(
                canvas,
                0,
                (pix_square_size * x, 0),
                (pix_square_size * x, self.window_size),
                width=3,
            )
        
        font = pygame.font.Font(None, 25)
        text = font.render(f"Time remaining: {str(self.timestamp)}", True,(0, 0, 128))
        text_rect = text.get_rect(center=(self.window_size/6, self.window_size/15))
        
        if self.render_mode == "human":
            # The following line copies our drawings from `canvas` to the visible window
            self.window.blit(canvas, canvas.get_rect())
            self.window.blit(text, text_rect)
            pygame.event.pump()
            pygame.display.update()

            # We need to ensure that human-rendering occurs at the predefined framerate.
            # The following line will automatically add a delay to keep the framerate stable.
            self.clock.tick(self.metadata["render_fps"])
        else:  # rgb_array
            return np.transpose(
                np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
            )
        
    def close(self):
        if self.window is not None:
            pygame.display.quit()
            pygame.quit()
    
    def reset(self):
        
        self.state = (np.random.random_sample(size=4)*self.bounds).astype(int)
        self.circle_radius = 0 #min(self.height, self.width)
        self.timestamp = self.time
        
        self.log_circles = []
        self.log_hero = []
        self.window = None
        self.clock = None
        return self.state
    

In [90]:
env = ShrinkingCircleEnv()

In [91]:
episodes = 3
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        env.render('human')
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

error: No available video device

### Let's learn the model

In [92]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, CategoryEncoding
from tensorflow.keras.optimizers.legacy import Adam

In [93]:
def get_model(states, n_actions=9):
    model = Sequential() 
    model.add(keras.Input(shape=(1,states[0])))
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Flatten()) 
    model.add(Dense(n_actions, activation='linear'))
    return model

In [94]:
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [95]:
def get_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
    return dqn

In [96]:
n_states = env.observation_space.shape
n_actions = env.action_space.n

model = get_model(n_states, n_actions)
dqn = get_agent(model, n_actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

2023-02-24 20:43:09.763821: W tensorflow/c/c_api.cc:291] Operation '{name:'dense_18/kernel/Assign' id:4321 op device:{requested: '', assigned: ''} def:{{{node dense_18/kernel/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](dense_18/kernel, dense_18/kernel/Initializer/stateless_random_uniform)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


In [56]:
dqn.fit(env, nb_steps=20000, visualize=False, verbose=1)

Training for 20000 steps ...
Interval 1 (0 steps performed)


2023-02-24 20:00:38.557947: W tensorflow/c/c_api.cc:291] Operation '{name:'dense_11/BiasAdd' id:2594 op device:{requested: '', assigned: ''} def:{{{node dense_11/BiasAdd}} = BiasAdd[T=DT_FLOAT, _has_manual_control_dependencies=true, data_format="NHWC"](dense_11/MatMul, dense_11/BiasAdd/ReadVariableOp)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-02-24 20:00:38.637886: W tensorflow/c/c_api.cc:291] Operation '{name:'total_12/Assign' id:2839 op device:{requested: '', assigned: ''} def:{{{node total_12/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](total_12, total_12/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after runni

    1/10000 [..............................] - ETA: 35:38 - reward: -3.6056

2023-02-24 20:00:38.788645: W tensorflow/c/c_api.cc:291] Operation '{name:'dense_11_1/BiasAdd' id:2710 op device:{requested: '', assigned: ''} def:{{{node dense_11_1/BiasAdd}} = BiasAdd[T=DT_FLOAT, _has_manual_control_dependencies=true, data_format="NHWC"](dense_11_1/MatMul, dense_11_1/BiasAdd/ReadVariableOp)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-02-24 20:00:39.104717: W tensorflow/c/c_api.cc:291] Operation '{name:'loss_15/AddN' id:2970 op device:{requested: '', assigned: ''} def:{{{node loss_15/AddN}} = AddN[N=2, T=DT_FLOAT, _has_manual_control_dependencies=true](loss_15/mul, loss_15/mul_1)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-02-

  510/10000 [>.............................] - ETA: 2:01 - reward: -4.2579done, took 6.800 seconds


<keras.callbacks.History at 0x7f85986bafe0>

In [87]:
scores = dqn.test(env, nb_episodes=10, visualize=True)
print(np.mean(scores.history['episode_reward']))

Testing for 10 episodes ...


  updates=self.state_updates,
2023-02-24 20:08:00.256401: W tensorflow/c/c_api.cc:291] Operation '{name:'dense_17/BiasAdd' id:3921 op device:{requested: '', assigned: ''} def:{{{node dense_17/BiasAdd}} = BiasAdd[T=DT_FLOAT, _has_manual_control_dependencies=true, data_format="NHWC"](dense_17/MatMul, dense_17/BiasAdd/ReadVariableOp)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-02-24 20:08:00.397395: W tensorflow/c/c_api.cc:291] Operation '{name:'count_22/Assign' id:4191 op device:{requested: '', assigned: ''} def:{{{node count_22/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](count_22, count_22/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either 

human


error: No available video device

In [12]:
# https://stackoverflow.com/questions/25333732/matplotlib-animation-not-working-in-ipython-notebook-blank-plot