# Reinforcement Learning - Deep Q Network

In [1]:
import random
import torch
import numpy as np

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import sys

# Fail fast on interpreters older than 3.6; version_info compares like a tuple.
assert sys.version_info >= (3, 6), "Make sure you have Python 3.6 installed!"

## 1. Deep Q-Network (DQN)

In [3]:
import gym
from project.environments.customEnvironments import WindyGridWorld
# Environment shared by the training/testing cells further down.
# Uncomment one of the alternatives below to switch task.
env = gym.make("CartPole-v1")
# env = gym.make("MountainCar-v0")
# env = WindyGridWorld()


  result = entry_point.load(False)


In [4]:
from project.networks import SARSANetwork, DeepQNetwork
from project.policies import EpsilonGreedyPolicy

from project.train_network import train_episodes
from project.test_network import test_episodes

### 2.5 Put it all together

In [19]:
# --- DQN run: hyperparameters -------------------------------------------------
num_episodes = 1000
batch_size = 64
discount_factor = 0.75

# Optimiser settings; lr_step_size / lr_gamma are forwarded to train_episodes
# (presumably a step-wise learning-rate decay -- confirm in project.train_network).
learn_rate = 1e-3
lr_step_size = 500
lr_gamma = 0.1

# Training-mode switches passed straight through to train_episodes.
semi_grad = False
use_replay = True

# Layer widths handed to DeepQNetwork.
architecture = [128, 256, 128]

# --- Build network + policy, train, then evaluate ----------------------------
# in_features=4 / out_features=2 presumably match CartPole's observation and
# action spaces -- adjust if a different environment is selected above.
dq_network = DeepQNetwork(
    in_features=4,
    out_features=2,
    discount_factor=discount_factor,
    architecture=architecture,
)
dq_policy = EpsilonGreedyPolicy(dq_network, end_e=0.01)
episode_durations, losses, episode_rewards = train_episodes(
    env,
    dq_policy,
    num_episodes,
    batch_size,
    learn_rate,
    semi_grad=semi_grad,
    use_replay=use_replay,
    lr_step_size=lr_step_size,
    lr_gamma=lr_gamma,
)

# Last expression of the cell: its return value is what the notebook displays.
test_episodes(env, dq_policy, 10)


 Episode 0 finished after 12 steps
 Episode 10 finished after 39 steps
 Episode 20 finished after 325 steps
 Episode 30 finished after 92 steps
 Episode 40 finished after 300 steps
 Episode 50 finished after 378 steps
 Episode 60 finished after 20 steps
 Episode 70 finished after 125 steps
 Episode 80 finished after 61 steps
 Episode 90 finished after 80 steps
 Episode 100 finished after 14 steps
 Episode 110 finished after 498 steps
 Episode 120 finished after 240 steps
 Episode 130 finished after 85 steps
 Episode 140 finished after 61 steps
 Episode 150 finished after 181 steps
 Episode 160 finished after 452 steps
 Episode 170 finished after 162 steps
 Episode 180 finished after 159 steps
 Episode 190 finished after 235 steps
 Episode 200 finished after 60 steps
 Episode 210 finished after 170 steps
 Episode 220 finished after 132 steps
 Episode 230 finished after 251 steps
 Episode 240 finished after 151 steps
 Episode 250 finished after 219 steps
 Episode 260 finished after 120 s

([182, 220, 194, 491, 259, 224, 210, 273, 128, 219],
 [182.0, 220.0, 194.0, 491.0, 259.0, 224.0, 210.0, 273.0, 128.0, 219.0])

## 1. Deep SARSA-Network (DSN)

In [6]:
def run_deepSARSA():
    """Train a deep SARSA network on the notebook-level ``env`` and plot the result.

    The function takes no arguments and returns nothing; all hyperparameters are
    fixed locally. It reads the global ``env`` created in an earlier cell, seeds
    the RNGs, trains via ``DSN.run_episodes`` and draws a smoothed curve of the
    episode durations on the current matplotlib figure.
    """
    # Imported locally so the notebook still loads when the DSN module is absent
    # and this function is never called. Note that this SARSANetwork shadows the
    # one imported from project.networks inside this function only.
    from DSN import SARSANetwork, run_episodes

    # Let's run it!
    num_episodes = 100
    batch_size = 64
    discount_factor = 0.8
    learn_rate = 1e-3
    num_hidden = 128
    seed = 42  # This is not randomly chosen

    # We will seed the algorithm (before initializing QNetwork!) for reproducibility.
    # NumPy's global RNG is seeded as well so that any numpy-based sampling in the
    # training code is covered, not only `random` and `torch`.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    env.seed(seed)

    my_DSN = SARSANetwork(in_features=4, num_hidden=num_hidden, out_features=2, discount_factor=discount_factor)
    episode_durations = run_episodes(env, my_DSN, num_episodes, batch_size, learn_rate, semi_grad=False)

    # And see the results
    def smooth(x, N):
        """Return the length-N moving average of x (len(x) - N + 1 values)."""
        # Prefix sums with a leading 0 so each window sum is one subtraction.
        cumsum = np.cumsum(np.insert(x, 0, 0))
        return (cumsum[N:] - cumsum[:-N]) / float(N)

    plt.plot(smooth(episode_durations, 10))
    plt.title('Episode durations per episode')
    # Label the axes so the figure can be read on its own.
    plt.xlabel('Episode (10-episode moving-average window)')
    plt.ylabel('Duration (steps)')

# run_deepSARSA()

To test or submit your solution, **restart the kernel, run all cells, and submit the `dqn_autograde.py` file to CodeGrade.**

In [7]:
from project.networks import SARSANetwork, DeepQNetwork
from project.policies import EpsilonGreedyPolicy

from project.train_network import train_episodes
from project.test_network import test_episodes

In [9]:
# --- Deep SARSA run: environment and hyperparameters --------------------------
# env = WindyGridWorld()
env = gym.envs.make("CartPole-v1")

num_episodes = 500
batch_size = 1
discount_factor = 1
learn_rate = 1e-3
semi_grad = True
use_replay = False
seed = 42

# Apply the seed to every RNG before the environment is reset and the network
# is built; otherwise `seed` is unused and the run is not reproducible.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
env.seed(seed)

# Show the initial observation of the freshly seeded environment.
print(env.reset())

# NOTE: the variables keep their `dq_` prefix for compatibility with other
# cells, but the network trained here is a SARSANetwork.
dq_network = SARSANetwork(in_features=4, out_features=2, discount_factor=discount_factor)
dq_policy = EpsilonGreedyPolicy(dq_network, steps_e=10000)

# The DQN cell in this notebook unpacks three values from train_episodes
# (durations, losses, rewards). Star-unpacking keeps this cell working whether
# the function returns two values or more; anything extra is ignored here.
episode_durations, losses, *_unused_returns = train_episodes(
    env,
    dq_policy,
    num_episodes,
    batch_size,
    learn_rate,
    semi_grad=semi_grad,
    use_replay=use_replay,
)



[ 0.00839396  0.02602053 -0.0090631   0.003036  ]
 Episode 0 finished after 15 steps
 Episode 10 finished after 18 steps
 Episode 20 finished after 9 steps
 Episode 30 finished after 25 steps
 Episode 40 finished after 10 steps
 Episode 50 finished after 12 steps
 Episode 60 finished after 20 steps
 Episode 70 finished after 13 steps
 Episode 80 finished after 28 steps
 Episode 90 finished after 19 steps
 Episode 100 finished after 13 steps
 Episode 110 finished after 18 steps
 Episode 120 finished after 11 steps
 Episode 130 finished after 28 steps
 Episode 140 finished after 17 steps
 Episode 150 finished after 26 steps
 Episode 160 finished after 9 steps
 Episode 170 finished after 42 steps
 Episode 180 finished after 14 steps
 Episode 190 finished after 15 steps
 Episode 200 finished after 13 steps
 Episode 210 finished after 15 steps
 Episode 220 finished after 13 steps
 Episode 230 finished after 13 steps
 Episode 240 finished after 13 steps
 Episode 250 finished after 8 steps
 E