In [1]:
import random
import time
from collections import deque

import gym
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch import optim
from config import *

In [2]:
# Create the CartPole-v1 environment; the agent, evaluation and training cells
# all use this single notebook-level `env` instance.
env = gym.make('CartPole-v1')

In [3]:
# Show the action and observation spaces (one per line), then record their
# sizes: they fix the Q-network's output and input widths respectively.
print(env.action_space, env.observation_space, sep='\n')

no_observations = env.observation_space.shape[0]
no_actions = env.action_space.n

Discrete(2)
Box(4,)


In [5]:
# noinspection PyShadowingNames
class DQN(nn.Module):
    """Epsilon-greedy deep Q-learning agent with an experience-replay buffer.

    The Q-network is a three-layer MLP mapping an observation of size
    ``no_observations`` to one Q-value per action (``no_actions``).

    The class reads several module-level names at call time: ``env``,
    ``no_observations``, ``no_actions`` and the hyper-parameters
    ``EXPERIENCE_SIZE``, ``LEARNING_RATE``, ``EPSILON``, ``BATCH_SIZE`` and
    ``GAMMA`` (the notebook star-imports ``config``, which presumably defines
    them -- confirm there).

    Attributes:
        replay_memory: bounded deque of ``(state, action, reward, next_state, done)``.
        device: ``cuda`` when available, otherwise ``cpu``.
        net: the Q-network, living on ``device``.
        optim: Adam optimizer over ``net``'s parameters.
    """

    def __init__(self):
        super(DQN, self).__init__()
        self.replay_memory = deque(maxlen=EXPERIENCE_SIZE)
        # Fall back to the CPU so the notebook still runs without a GPU.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.net = nn.Sequential(
            nn.Linear(no_observations, 24),
            nn.ReLU(),
            nn.Linear(24, 24),
            nn.ReLU(),
            nn.Linear(24, no_actions)
        ).to(self.device)
        self.optim = optim.Adam(self.net.parameters(), lr=LEARNING_RATE)

    def act(self, state, test=False):
        """Choose an action for ``state``.

        Args:
            state: a single observation (array-like of length ``no_observations``).
            test: when true, always act greedily (no exploration).

        Returns:
            The greedy action index (int) with probability ``1 - EPSILON`` or
            whenever ``test`` is true; otherwise a random sample from
            ``env.action_space``.
        """
        # The random draw is kept first so the RNG stream is consumed the same
        # way in both training and test mode.
        if (np.random.uniform(0, 1) > EPSILON) or test:
            # Pure inference: no autograd graph is needed to pick an action.
            with torch.no_grad():
                state_tensor = torch.as_tensor(
                    np.asarray(state), dtype=torch.float32, device=self.device
                ).unsqueeze(0)
                q_values = self.net(state_tensor)
            action = torch.argmax(q_values.squeeze(0)).item()
        else:
            action = env.action_space.sample()
        return action

    def learn(self, episode):
        """Run one optimisation step on a random mini-batch from replay memory.

        Does nothing until at least ``BATCH_SIZE`` transitions are stored.

        Args:
            episode: current episode index; the loss is printed when it is a
                multiple of 30.
        """
        if len(self.replay_memory) < BATCH_SIZE:
            return
        train_batch = random.sample(self.replay_memory, BATCH_SIZE)

        batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(*train_batch)

        # Stack through NumPy first: building a tensor from a tuple of ndarrays
        # element by element is very slow.
        batch_state = torch.as_tensor(
            np.asarray(batch_state), dtype=torch.float32, device=self.device)
        batch_next_state = torch.as_tensor(
            np.asarray(batch_next_state), dtype=torch.float32, device=self.device)
        # Every per-transition scalar is shaped (B, 1) so it lines up with the
        # output of gather() below instead of broadcasting to (B, B).
        batch_action = torch.as_tensor(
            np.asarray(batch_action, dtype=np.int64), device=self.device).unsqueeze(1)
        # Rewards stay floating point; an integer cast would truncate them.
        batch_reward = torch.as_tensor(
            np.asarray(batch_reward, dtype=np.float32), device=self.device).unsqueeze(1)
        batch_done = torch.as_tensor(
            np.asarray(batch_done, dtype=np.float32), device=self.device).unsqueeze(1)

        current_q_values = self.net(batch_state).gather(1, batch_action)

        # The TD target is a constant w.r.t. the parameters being optimised,
        # and terminal transitions must not bootstrap from the next state.
        with torch.no_grad():
            next_q_max = self.net(batch_next_state).max(1, keepdim=True)[0]
            target_q_values = batch_reward + (1.0 - batch_done) * (GAMMA * next_q_max)

        loss = F.smooth_l1_loss(current_q_values, target_q_values)
        if not episode%30:
            print(loss)

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory (oldest entries drop off)."""
        self.replay_memory.append((state, action, reward, next_state, done))
        
# Instantiate the agent; the evaluation function and the training cells use
# this notebook-level `dqn` object.
dqn = DQN()
        

In [6]:
# noinspection PyUnusedLocal,PyShadowingNames
def test():
    """Play one rendered episode with the greedy policy and print its score.

    Uses the notebook-level ``env`` and ``dqn``. The agent is queried with
    ``test=True`` so no exploratory actions are taken. The environment is
    closed even if stepping/rendering raises or the cell is interrupted, so
    the render window is not left dangling.
    """
    state = env.reset()
    done = False
    total_reward = 0

    try:
        while not done:
            action = dqn.act(state, True)
            next_state, reward, done, _ = env.step(action)
            env.render()
            # To slow playback down for viewing, add: time.sleep(0.07)
            total_reward += reward
            state = next_state
    finally:
        env.close()
    print(f"Evaluation Score : {total_reward}")
    print()

In [7]:
# Per-episode returns: the full history, plus a sliding window used for the
# running mean that decides when to stop.
rewards = []
rewards_dq = deque(maxlen=100)

for episode in range(MAX_EPISODES):
    # The exploration rate depends only on the episode index, so set it once
    # before acting. `dqn.act` reads this module-level EPSILON, which means
    # every step of the episode (including the first) uses the same value.
    EPSILON = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-1 * DECAY_RATE * episode)

    # noinspection PyRedeclaration
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        action = dqn.act(state)
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        dqn.remember(state, action, reward, next_state, done)
        dqn.learn(episode)
        state = next_state

    rewards.append(total_reward)
    rewards_dq.append(total_reward)

    # Report (and check the stopping criterion) every 30th episode.
    if not episode % 30:
        mean_reward = np.mean(rewards_dq)
        print(f'Episode : {episode}')
        print(f'Best Reward : {max(rewards)}')
        # Label the window with its real size instead of a hard-coded number.
        print(f'Mean over last {rewards_dq.maxlen} : {mean_reward}')
        print(f'Epsilon : {EPSILON}')
        print()
        if mean_reward > 195:
            break
        test()
    

Episode : 0
Best Reward : 33.0
Mean over last 50 : 33.0
Epsilon : 1.0

Evaluation Score : 9.0





tensor(0.4907, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4752, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4902, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4801, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4848, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4851, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4726, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4869, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4653, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4779, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4711, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4856, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4996, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4852, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4753, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4850, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4831, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4859, device='cuda:0',

Evaluation Score : 10.0

tensor(0.5174, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4590, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.5016, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4899, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4779, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.5079, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4845, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4752, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4828, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.5209, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4760, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4914, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4755, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4975, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4633, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4644, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.4840, device='cuda:0', grad_fn=<MeanBackward0>)
tensor

KeyboardInterrupt: 

In [None]:
# Per-episode returns: the full history, plus a sliding window used for the
# running mean that decides when to stop.
rewards = []
rewards_dq = deque(maxlen=100)

try:
    for episode in range(MAX_EPISODES):
        # The exploration rate depends only on the episode index, so set it
        # once before acting. `dqn.act` reads this module-level EPSILON, which
        # means every step of the episode uses the same value.
        EPSILON = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-1 * DECAY_RATE * episode)

        # noinspection PyRedeclaration
        state = env.reset()
        done = False
        total_reward = 0

        while not done:
            action = dqn.act(state)
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            dqn.remember(state, action, reward, next_state, done)
            dqn.learn(episode)
            state = next_state

        rewards.append(total_reward)
        rewards_dq.append(total_reward)

        # Report (and check the stopping criterion) every 30th episode.
        if not episode % 30:
            mean_reward = np.mean(rewards_dq)
            print(f'Episode : {episode}')
            print(f'Best Reward : {max(rewards)}')
            # Label the window with its real size instead of a hard-coded number.
            print(f'Mean over last {rewards_dq.maxlen} : {mean_reward}')
            print(f'Epsilon : {EPSILON}')
            print()
            if mean_reward > 195:
                break
            test()
finally:
    # Release the environment even when training is interrupted by hand.
    env.close()