# Reinforcement Learning - Deep Q Network

In [6]:
import random
import torch
import numpy as np

In [7]:
%matplotlib inline
import matplotlib.pyplot as plt
import sys

# Fail fast when the interpreter is older than Python 3.6.
assert sys.version_info >= (3, 6), "Make sure you have Python 3.6 installed!"

## 1. Deep Q-Network (DQN)

In [8]:
import gym
from project.environments.customEnvironments import WindyGridWorld
# Environment used by the experiments below.
# Alternatives (swap the assignment to try them):
#   env = gym.make("MountainCar-v0")
#   env = WindyGridWorld()
env = gym.make("CartPole-v1")


In [9]:
from project.networks import SARSANetwork, DeepQNetwork
from project.policies import EpsilonGreedyPolicy

from project.train_network import train_episodes
from project.test_network import test_episodes

### 2.5 Put it all together

In [10]:
# Train a Deep Q-Network on `env`, then evaluate the learned policy.

# --- Run length and batching ---
num_episodes = 1000
batch_size = 64
discount_factor = 0.75

# --- Optimiser settings ---
# lr_step_size / lr_gamma are forwarded to train_episodes; presumably a
# step-decay learning-rate schedule -- confirm in project.train_network.
learn_rate = 1e-3
lr_step_size = 500
lr_gamma = 0.1

# --- Algorithm switches forwarded to train_episodes ---
semi_grad = False
use_replay = True

# Forwarded to DeepQNetwork; presumably the hidden-layer widths -- confirm in
# project.networks.
architecture = [128, 256, 128]

# in_features=4 / out_features=2 are hard-coded for CartPole; adjust them when
# switching `env` to a different environment.
dq_network = DeepQNetwork(
    in_features=4,
    out_features=2,
    discount_factor=discount_factor,
    architecture=architecture,
)

# Build the policy once. The original cell first built a default-argument
# policy and immediately overwrote it with this one, so the first construction
# was dead code.
dq_policy = EpsilonGreedyPolicy(dq_network, end_e=0.01)

episode_durations, losses, episode_rewards, _ = train_episodes(
    env,
    dq_policy,
    num_episodes,
    batch_size,
    learn_rate,
    semi_grad=semi_grad,
    use_replay=use_replay,
    lr_step_size=lr_step_size,
    lr_gamma=lr_gamma,
)

# Evaluate for 10 episodes; left as the last expression so the notebook
# displays the returned value.
test_episodes(env, dq_policy, 10)


[99m Episode 0 finished after 10 steps
[99m Episode 10 finished after 10 steps
[92m Episode 20 finished after 301 steps
[92m Episode 30 finished after 240 steps
[99m Episode 40 finished after 42 steps
[99m Episode 50 finished after 158 steps
[99m Episode 60 finished after 13 steps
[99m Episode 70 finished after 131 steps
[99m Episode 80 finished after 98 steps
[99m Episode 90 finished after 144 steps
[92m Episode 100 finished after 500 steps
[99m Episode 110 finished after 47 steps
[99m Episode 120 finished after 50 steps
[99m Episode 130 finished after 95 steps
[99m Episode 140 finished after 158 steps
[99m Episode 150 finished after 147 steps
[99m Episode 160 finished after 40 steps
[99m Episode 170 finished after 126 steps
[99m Episode 180 finished after 113 steps
[99m Episode 190 finished after 68 steps
[99m Episode 200 finished after 125 steps
[99m Episode 210 finished after 68 steps
[92m Episode 220 finished after 270 steps
[99m Episode 230 finished after 

([146, 166, 140, 144, 137, 136, 137, 151, 140, 130],
 [146.0, 166.0, 140.0, 144.0, 137.0, 136.0, 137.0, 151.0, 140.0, 130.0])

## 2. Deep SARSA-Network (DSN)

In [11]:
def run_deepSARSA():
    """Train a deep SARSA network on the global ``env`` and plot the result.

    Seeds Python's ``random``, torch and the environment, builds a
    ``SARSANetwork`` from the ``DSN`` module, trains it with ``run_episodes``
    and plots a 10-episode moving average of the episode durations.
    Returns nothing; the plot is the only output.
    """
    from DSN import SARSANetwork, run_episodes

    def smooth(x, N):
        """Return the moving average of ``x`` over windows of ``N`` items."""
        running_total = np.cumsum(np.insert(x, 0, 0))
        window_sums = running_total[N:] - running_total[:-N]
        return window_sums / float(N)

    # Fixed seed (deliberately chosen, not random).
    seed = 42

    # Hyperparameters for this run.
    num_hidden = 128
    learn_rate = 1e-3
    discount_factor = 0.8
    batch_size = 64
    num_episodes = 100

    # Seed everything before the network is created so that weight
    # initialisation is reproducible as well.
    random.seed(seed)
    torch.manual_seed(seed)
    env.seed(seed)

    my_DSN = SARSANetwork(
        in_features=4,
        num_hidden=num_hidden,
        out_features=2,
        discount_factor=discount_factor,
    )
    episode_durations = run_episodes(
        env, my_DSN, num_episodes, batch_size, learn_rate, semi_grad=False
    )

    # Show the smoothed learning curve.
    smoothed_durations = smooth(episode_durations, 10)
    plt.plot(smoothed_durations)
    plt.title('Episode durations per episode')

# run_deepSARSA()

If you want to test/submit your solution, **restart the kernel, run all cells, and submit the `dqn_autograde.py` file to CodeGrade.**

In [12]:
from project.networks import SARSANetwork, DeepQNetwork
from project.policies import EpsilonGreedyPolicy

from project.train_network import train_episodes
from project.test_network import test_episodes

In [14]:
# Train a SARSA network on a fresh CartPole environment.
# env = WindyGridWorld()
env = gym.envs.make("CartPole-v1")

num_episodes = 500
batch_size = 1
discount_factor = 1
learn_rate = 1e-3
semi_grad = True
use_replay = False
seed = 42

# Apply the seed. The original cell defined `seed` but never used it, so the
# run was not reproducible. Seeding happens before the network is built so
# that weight initialisation is covered too. `env.seed` is the same legacy
# Gym call that run_deepSARSA uses.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
env.seed(seed)

# Show the initial observation returned by the environment.
print(env.reset())

# NOTE: the names `dq_network` / `dq_policy` are kept even though the network
# is a SARSANetwork, because other cells may refer to them.
dq_network = SARSANetwork(in_features=4, out_features=2, discount_factor=discount_factor)
dq_policy = EpsilonGreedyPolicy(dq_network, steps_e=10000)
episode_durations, losses, episode_rewards, _ = train_episodes(
    env,
    dq_policy,
    num_episodes,
    batch_size,
    learn_rate,
    semi_grad=semi_grad,
    use_replay=use_replay,
)



[-0.01642634 -0.04672059 -0.04208646  0.03716467]
[99m Episode 0 finished after 21 steps
[99m Episode 10 finished after 22 steps
[99m Episode 20 finished after 19 steps
[99m Episode 30 finished after 9 steps
[99m Episode 40 finished after 31 steps
[99m Episode 50 finished after 24 steps
[99m Episode 60 finished after 22 steps
[99m Episode 70 finished after 14 steps
[99m Episode 80 finished after 22 steps
[99m Episode 90 finished after 15 steps
[99m Episode 100 finished after 18 steps
[99m Episode 110 finished after 20 steps
[99m Episode 120 finished after 12 steps
[99m Episode 130 finished after 13 steps
[99m Episode 140 finished after 11 steps
[99m Episode 150 finished after 21 steps
[99m Episode 160 finished after 12 steps
[99m Episode 170 finished after 15 steps
[99m Episode 180 finished after 18 steps
[99m Episode 190 finished after 11 steps
[99m Episode 200 finished after 17 steps
[99m Episode 210 finished after 13 steps
[99m Episode 220 finished after 21 st