In [24]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import sys
from collections import deque,namedtuple
from itertools import count
import random
import math
from IPython.core.debugger import set_trace
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from tensorboardX import SummaryWriter

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [26]:
env = gym.make('CartPole-v0').unwrapped
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
print("action_size",n_actions)
print("state_size",n_states)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
action_size 2
state_size 4


In [38]:
class ReplayBuffer:
    def __init__(self, buffer_size, batch_size, seed):
        
        self.memory = deque(maxlen=buffer_size)  
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
    
    def add(self, state, action, reward, next_state, done):
        
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)
        
    def sample(self):
        
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.hstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.hstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
        
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)

In [39]:
# DQN network

class DQN(nn.Module):
    
    def __init__(self, n_states, n_actions):
        
        super(DQN, self).__init__()
        
        self._fc1 = nn.Linear(in_features=n_states, out_features=24)
        self._relu1 = nn.ReLU()
        self._fc2 = nn.Linear(in_features=24, out_features=24)
        self._relu2 = nn.ReLU()
        self._fc3 = nn.Linear(in_features=24, out_features=n_actions)
        
        
    def forward(self, x):
        
        
        x = self._fc1(x)
        x = self._relu1(x)
        x = self._fc2(x)
        x = self._relu2(x)        
        x = self._fc3(x)
        
        return x

In [29]:
def epsilon_greedy_policy(policy_net, elapsed_time, state):

    epsilon_val = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * elapsed_time/ EPS_DECAY)
  
    
    if random.random() <= epsilon_val:
        return env.action_space.sample()
        
    return policy_net(state).max(1)[1].view(1, 1).item()


In [40]:
def fit(episode):       

    states, actions, rewards, next_states, done = experience_replay.sample()
    Q_s_a = policy_net(states).squeeze().gather(1, actions)
    Q_next_s_a = target_net(next_states).squeeze().detach().max(1)[0]    
    Q_target = rewards + GAMMA * Q_next_s_a * (1.0 - done)
    TD_error = criterion(Q_s_a, Q_target.unsqueeze(1))
    optimizer.zero_grad()
    TD_error.backward()
    optimizer.step()
    
    return TD_error.item()
    
        

In [41]:
# Network Parameter initialization
policy_net=DQN(n_states,n_actions).to(device)
target_net=DQN(n_states,n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
criterion=nn.MSELoss().to(device)
optimizer=optim.Adam(policy_net.parameters(),lr=0.001)

In [46]:
# Hyperparameter update 
buffer_size=10000
BATCH_SIZE = 32
random_seed=121
GAMMA     = 0.95
EPS_START = 1.0
EPS_END = 0.01
EPS_DECAY = 1000 # After 1k steps epsilon is fixed to 0.01
num_episodes = 100
TARGET_UPDATE = 100 # 1st priority in tuning
prev_state = None
episodes = []
scores_per_episode = []
steps_done = 0
experience_replay = ReplayBuffer(buffer_size,BATCH_SIZE,random_seed) 

In [53]:
for episode in range(num_episodes):
    
    state = env.reset()
    state = np.reshape(state, (1, n_states))
    done = False
    reward = 0
    score = 0
    
    for n in count():
        a = torch.FloatTensor(state)        
        action = epsilon_greedy_policy(policy_net, steps_done, a.to(device))
        steps_done +=1
        next_state, reward, done, _ = env.step(action)

        if done:
            next_state = np.zeros(np.shape(state))
        else:
            next_state = np.reshape(next_state, (1, n_states))
        
        score += reward       

        experience_replay.add(state, action, reward, next_state, done)
        
        state = next_state

            
        if done:
            
            scores_per_episode.append(score)
            print(f'Episode {episode} with reward {score} in {n} iterations')
            
            break
            
        if len(experience_replay.memory) > BATCH_SIZE:

            loss = fit(steps_done)
           
            
        if steps_done % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
    
    if score>2000:
        break
    

    

Episode 0 with reward 194.0 in 193 iterations
Episode 1 with reward 927.0 in 926 iterations
Episode 2 with reward 100.0 in 99 iterations
Episode 3 with reward 138.0 in 137 iterations
Episode 4 with reward 171.0 in 170 iterations
Episode 5 with reward 2693.0 in 2692 iterations


In [55]:
# play using trained network
state = env.reset()
state = np.reshape(state, (1, n_states))
returns = 0    
for n in count():
        a = torch.FloatTensor(state)        
        action = epsilon_greedy_policy(policy_net, steps_done, a.to(device))
        steps_done +=1
        next_state, reward, done, _ = env.step(action)
        if done:
            next_state = np.zeros(np.shape(state))
        else:
            next_state = np.reshape(next_state, (1, n_states))
        score += reward            
        if done:
            print(f'total reward obtained is {score}')            
            break
        

total reward obtained is 2702.0
