In [1]:
import torch
import torch.autograd as autograd         # computation graph
from torch import Tensor                  # tensor node in the computation graph
import torch.nn as nn                     # neural networks
import torch.nn.functional as F           # layers, activations and more
import torch.optim as optim               # optimizers e.g. gradient descent, ADAM, etc.
import gym
from collections import deque
import collections, itertools
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import pickle                             #for saving objects

In [2]:
# Choose where checkpoints and TensorBoard runs are stored.
# On Google Colab, mount Drive and save there; anywhere else the
# google.colab package is unavailable, so fall back to a folder relative to
# the working directory instead of failing the whole notebook on import.
try:
    from google.colab import drive
except ImportError:
    drive_folder = "DQN_save_files/"
else:
    drive.mount('/content/gdrive')
    drive_folder = "/content/gdrive/My Drive/DQN_save_files/"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
class DQN(nn.Module):
    """Fully connected Q-network: two hidden ReLU layers and a linear output.

    Notes from earlier experiments: it took 3000+ episodes to solve with 120
    neurons per layer and just 300 episodes with 500 neurons per layer. The
    width is therefore exposed as ``hidden_size`` (default 500) instead of
    keeping a second, commented-out copy of the class.

    Args:
        obs_array_len: number of features in a single observation.
        num_actions: number of discrete actions (size of the output layer).
        hidden_size: width of both hidden layers.
    """

    def __init__(self, obs_array_len, num_actions, hidden_size=500):
        super(DQN, self).__init__()
        # Attribute names are unchanged so existing state_dict checkpoints load.
        self.fc = nn.Linear(obs_array_len, hidden_size)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, num_actions)

    def forward(self, obs):
        """Return one Q-value per action; the last dim of ``obs`` is obs_array_len."""
        x = F.relu(self.fc(obs))
        x = F.relu(self.fc1(x))
        return self.output(x)

In [4]:
# A namedtuple acts as a lightweight record type for one transition:
# (state, action, reward, done, next state). Instances are stored in the
# replay memory's deque.
Experience = collections.namedtuple('Experience',['state', 'action', 'reward', 'done', 'state1']) # `done` matters when the loss is computed: terminal transitions are handled separately

class Experience_replay():
    """Bounded replay memory with uniform random minibatch sampling."""

    def __init__(self, REPLAY_SIZE):
        # The deque is capped at REPLAY_SIZE entries.
        self.memory = collections.deque(maxlen=REPLAY_SIZE)

    def size(self):
        """Return how many transitions are currently stored."""
        return len(self.memory)

    def insert(self, experience):
        """Append a single transition to the memory."""
        self.memory.append(experience)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        Returns a 5-tuple of NumPy arrays in the order
        (states, actions, rewards, dones, state1s); rewards are float32 and
        dones are uint8.
        """
        # Sampling without replacement, so no transition repeats in a batch.
        picks = np.random.choice(self.size(), batch_size, replace=False)
        batch = [self.memory[pick] for pick in picks]

        # Transpose the list of transitions into one tuple per field.
        states, actions, rewards, dones, state1s = zip(*batch)

        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(state1s),
        )
            
def to_torch(i):
    """Convert the NumPy array ``i`` to a torch tensor via ``torch.from_numpy``."""
    as_tensor = torch.from_numpy(i)
    return as_tensor


In [5]:
# TensorBoard writer shared by train(); its log directory lives under drive_folder.
tb=SummaryWriter(drive_folder+"runs/Cartpole/Train")

In [6]:
def _td_loss(net, target_net, batch, gamma, loss_fn, device):
    """Compute the one-step TD loss for a sampled minibatch in a single pass.

    ``batch`` is the 5-tuple returned by ``Experience_replay.sample``.
    """
    obs_batch, action_batch, reward_batch, done_batch, obs1_batch = batch

    obs_batch = to_torch(obs_batch).to(device).float()
    obs1_batch = to_torch(obs1_batch).to(device).float()
    reward_batch = to_torch(reward_batch).to(device)
    action_batch = to_torch(action_batch).to(device).long()
    not_done = 1.0 - to_torch(done_batch).to(device).float()

    # Q(s, a) for the action that was actually taken in each transition.
    q_taken = net(obs_batch).gather(1, action_batch.unsqueeze(1)).squeeze(1)

    # Bootstrap from the frozen target network; terminal transitions keep
    # only the immediate reward because not_done is 0 for them.
    with torch.no_grad():
        next_q = target_net(obs1_batch).max(dim=1)[0]
    target = reward_batch + gamma * next_q * not_done

    return loss_fn(q_taken, target)


def _save_checkpoint(path, step, net, optimizer, loss):
    """Write the network/optimizer state in the format train(resume=True) reads."""
    torch.save({
        'epoch': step,
        'model_state_dict': net.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss
    },
    path)


def train(resume=False):
    """Train a DQN agent on CartPole-v1 until the 100-episode mean reward exceeds 195.

    Uses an epsilon-greedy policy, a replay memory that is pre-filled with
    random play, and a target network that is re-synced every 10 episodes.
    Metrics are written to the module-level TensorBoard writer ``tb`` and the
    final checkpoint is saved under ``drive_folder``.

    Args:
        resume: when True, reload the network, optimizer and step counter from
            ``drive_folder + "CartPole-v1.pth"`` and, if present, the pickled
            replay memory from ``drive_folder + "mem_buffer.deque"``.
    """
    # Fall back to the CPU so the function also runs without CUDA.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # hyperparameters
    GAMMA = 0.99                    # discount factor for the Bellman target
    BATCH_SIZE = 64                 # transitions sampled per optimisation step
    REPLAY_SIZE = 10000             # capacity of the replay memory
    LR = 0.000146                   # learning rate (0.001 was tried before)
    EPSILON_START = 1.0             # initial exploration rate
    EPSILON_FINAL = 0.05            # floor of the exploration rate
    EPSILON_DECAY_LENGTH = 2000     # was 200
    TARGET_SYNC_EPISODES = 10       # copy net -> target_net every N episodes
    LOG_EVERY_STEPS = 100           # how often the mean reward is reported
    # NOTE(review): 195 looks like the CartPole-v0 threshold; confirm it is the
    # intended stopping criterion for CartPole-v1.
    SOLVED_MEAN_REWARD = 195

    env = gym.make('CartPole-v1')
    try:
        obs = env.reset()

        avg_reward = collections.deque(maxlen=100)   # rewards of the last 100 episodes
        count = 0              # environment steps taken in the training loop
        episode_reward = 0
        done_count = 0         # finished episodes
        mean_reward = 0
        loss = None

        # Networks: target_net is a frozen copy that is only ever overwritten.
        net = DQN(len(obs), env.action_space.n).to(device)
        target_net = DQN(len(obs), env.action_space.n).to(device)
        target_net.load_state_dict(net.state_dict())
        target_net.eval()

        loss_fn = nn.MSELoss()
        optimizer = optim.Adam(net.parameters(), lr=LR)
        mem_buffer = Experience_replay(REPLAY_SIZE)

        if resume:
            # map_location lets a checkpoint written on a GPU be resumed on a CPU.
            checkpoint = torch.load(drive_folder + "CartPole-v1.pth", map_location=device)
            net.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            count = checkpoint['epoch']
            loss = checkpoint['loss']
            target_net.load_state_dict(net.state_dict())

            # SECURITY: pickle.load can execute arbitrary code; only load a
            # replay file that this notebook wrote itself.
            try:
                with open(drive_folder + 'mem_buffer.deque', 'rb') as mem_buffer_file:
                    mem_buffer = pickle.load(mem_buffer_file)
            except FileNotFoundError:
                # No saved memory: keep the empty buffer, it is refilled below.
                pass

        # Play random actions until the replay memory is full.
        while mem_buffer.size() < REPLAY_SIZE:
            action = env.action_space.sample()
            obs1, reward, done, info = env.step(action)
            mem_buffer.insert(Experience(obs, action, reward, done, obs1))
            # Resetting here guarantees the training loop never steps an
            # environment whose episode has already ended.
            obs = env.reset() if done else obs1

        while True:
            if count == EPSILON_DECAY_LENGTH:
                print("we have reached final decay")

            count += 1

            # Linear decay, clamped at EPSILON_FINAL.
            epsilon = max(EPSILON_FINAL, EPSILON_START - count / EPSILON_DECAY_LENGTH)

            # np.random.random() is uniform on [0, 1): explore with probability epsilon.
            if np.random.random() < epsilon:
                action = env.action_space.sample()
            else:
                # No gradient is needed for acting, only for the loss below.
                with torch.no_grad():
                    action = torch.argmax(net(to_torch(obs).to(device).float())).item()

            obs1, reward, done, info = env.step(action)

            mem_buffer.insert(Experience(obs, action, reward, done, obs1))
            obs = obs1
            episode_reward += reward

            episode_finished = done
            if done:
                obs = env.reset()
                done_count += 1
                avg_reward.append(episode_reward)
                print('Episode Reward: ', episode_reward)
                episode_reward = 0

            # Report the mean over the last 100 episodes once 100 have finished.
            if count % LOG_EVERY_STEPS == 0 and len(avg_reward) == avg_reward.maxlen:
                mean_reward = np.mean(avg_reward)
                tb.add_scalar('Mean Reward', mean_reward, done_count)
                print('Mean Reward: ',mean_reward, 'Steps',count,'number of episodes:',done_count,'e:', epsilon)

                if mean_reward > SOLVED_MEAN_REWARD:
                    _save_checkpoint(drive_folder + "CartPole-v1.pth", count, net, optimizer, loss)
                    print("We Are DONE Training!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                    break

            # One optimisation step on a uniformly sampled minibatch.
            batch = mem_buffer.sample(BATCH_SIZE)
            loss = _td_loss(net, target_net, batch, GAMMA, loss_fn, device)
            optimizer.zero_grad()
            loss.backward()
            tb.add_scalar('Loss Per Step', loss.item(), count)
            nn.utils.clip_grad_value_(net.parameters(), 1.0)   # clip gradients to [-1, 1]
            optimizer.step()

            # Sync the target network once per TARGET_SYNC_EPISODES finished
            # episodes, not on every step taken while the counter is a multiple.
            if episode_finished and done_count % TARGET_SYNC_EPISODES == 0:
                target_net.load_state_dict(net.state_dict())

            # Periodic checkpointing is disabled, as it was before. To enable
            # it, every 10000 steps call _save_checkpoint(...) and pickle
            # mem_buffer to drive_folder+'mem_buffer.deque' so that
            # train(resume=True) can reload the replay memory.
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()

In [7]:
# Start training from scratch; resume=True would first reload the saved checkpoint and replay memory.
train(resume = False)

Episode Reward:  10.0
Episode Reward:  16.0
Episode Reward:  19.0
Episode Reward:  15.0
Episode Reward:  11.0
Episode Reward:  31.0
Episode Reward:  13.0
Episode Reward:  16.0
Episode Reward:  54.0
Episode Reward:  12.0
Episode Reward:  49.0
Episode Reward:  19.0
Episode Reward:  23.0
Episode Reward:  19.0
Episode Reward:  13.0
Episode Reward:  11.0
Episode Reward:  10.0
Episode Reward:  19.0
Episode Reward:  37.0
Episode Reward:  27.0
Episode Reward:  20.0
Episode Reward:  30.0
Episode Reward:  16.0
Episode Reward:  42.0
Episode Reward:  11.0
Episode Reward:  38.0
Episode Reward:  10.0
Episode Reward:  11.0
Episode Reward:  15.0
Episode Reward:  18.0
Episode Reward:  14.0
Episode Reward:  16.0
Episode Reward:  12.0
Episode Reward:  11.0
Episode Reward:  15.0
Episode Reward:  13.0
Episode Reward:  12.0
Episode Reward:  13.0
Episode Reward:  19.0
Episode Reward:  27.0
Episode Reward:  14.0
Episode Reward:  19.0
Episode Reward:  12.0
Episode Reward:  12.0
Episode Reward:  14.0
Episode Re

In [8]:
 import gym
env = gym.make("CartPole-v1")
observation = env.reset()
print(env.action_space)
for _ in range(100000):
  #env.render()
  action = env.action_space.sample() # your agent here (this takes random actions)
  observation, reward, done, info = env.step(action)

  if done:
    observation = env.reset()
env.close()

Discrete(2)
