# CartPole mit DQN

## Aufgabe 3
Löse das CartPole-v0 Environment mittels DQN.

In [5]:
%run ../setup.ipynb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
from lib.statistics import plot
import time
import numpy as np
from collections import deque
from contextlib import suppress

def interact_with_environment(env, agent, n_episodes=400, max_steps=200, train=True, verbose=True):      
    statistics = []
    
    with suppress(KeyboardInterrupt):
        for episode in range(n_episodes):
            done = False
            total_reward = 0
            state = env.reset()
            episode_start_time = time.time()

            for t in range(max_steps):
                action = agent.act(state)
                next_state, reward, done, _ = env.step(action)

                if train:
                    agent.train((state, action, next_state, reward, done))

                state = next_state
                total_reward += reward

                if done:
                    break

            if verbose and episode % 10 == 0:
                speed = t / (time.time() - episode_start_time)
                print(f'episode: {episode}/{n_episodes}, score: {total_reward}, steps: {t}, '
                      f'e: {agent.epsilon:.3f}, speed: {speed:.2f} steps/s')

            statistics.append({
                'episode': episode,
                'score': total_reward,
                'steps': t
            })
        
    return statistics

### 3.1
Implementiere in **agent.py** einen Agenten, der in der Lage ist das CartPole Environment zu lösen.

In [7]:
import gym
env = gym.make('CartPole-v0')

In [88]:
from agent import DQN

action_size = env.action_space.n
action_space = env.action_space
state_size = env.observation_space.shape[0]

# Hyperparams
annealing_steps = 1000  # not episodes!
gamma = 0.95
epsilon = 0.5 #1
epsilon_min = 0.01
epsilon_decay = (epsilon - epsilon_min) / annealing_steps
alpha = 0.01
batch_size = 32
memory_size = 10000
start_replay_step = 2000
target_model_update_interval = 1000

agent = DQN(action_size=action_size, action_space=action_space, state_size=state_size, gamma=gamma, 
            epsilon=epsilon, epsilon_decay=epsilon_decay, epsilon_min=epsilon_min, 
            alpha=alpha, batch_size=batch_size, memory_size=memory_size,
            start_replay_step=start_replay_step, 
            target_model_update_interval=target_model_update_interval)
statistics = interact_with_environment(env, agent, verbose=True)
plot(statistics)

episode: 0/400, score: 28.0, steps: 27, e: 0.500, speed: 7.80 steps/s
episode: 10/400, score: 36.0, steps: 35, e: 0.500, speed: 565.37 steps/s
episode: 20/400, score: 28.0, steps: 27, e: 0.500, speed: 486.59 steps/s
episode: 30/400, score: 28.0, steps: 27, e: 0.500, speed: 13.85 steps/s
episode: 40/400, score: 27.0, steps: 26, e: 0.500, speed: 504.30 steps/s
episode: 50/400, score: 29.0, steps: 28, e: 0.500, speed: 440.52 steps/s
[array([-0.07762913, -0.02343718], dtype=float32), array([-0.02148188, -0.01207254], dtype=float32), array([-0.13627025, -0.18034978], dtype=float32), array([-0.10471494, -0.02923844], dtype=float32), array([-0.15162337, -0.00257874], dtype=float32), array([-0.17414017, -0.10144279], dtype=float32), array([-0.06817366, -0.02675323], dtype=float32), array([-0.10323325, -0.15365313], dtype=float32), array([-0.10056753, -0.15734318], dtype=float32), array([-0.05619368, -0.01410444], dtype=float32), array([-0.11153773, -0.02487017], dtype=float32), array([-0.02626

ValueError: When using data tensors as input to a model, you should specify the `steps_per_epoch` argument.

In [None]:
from gym.wrappers import Monitor
# capture every episode and clean 'video' folder before each run
env = Monitor(env, './video', video_callable=lambda episode_id: True, force=True)
statistics = interact_with_environment(env, agent, n_episodes=10, train=False, verbose=False)
plot(statistics, y_limits=(0,200))