In [1]:
import torch
import torch.autograd as autograd         # computation graph
from torch import Tensor                  # tensor node in the computation graph
import torch.nn as nn                     # neural networks
import torch.nn.functional as F           # layers, activations and more
import torch.optim as optim               # optimizers e.g. gradient descent, ADAM, etc.
import gym
from collections import deque
import collections, itertools
import numpy as np
from torch.utils.tensorboard import SummaryWriter

In [None]:
# If training on Google Colab: remove the surrounding triple quotes to enable this cell.
# As written, everything below sits inside a bare string literal, so none of it executes.
'''
from google.colab import drive
drive.mount('/content/gdrive')
!ln -s /content/gdrive/My\ Drive/ /mydrive
!ls /mydrive
'''

In [2]:
class DQN(nn.Module):
    """Fully connected Q-network mapping an observation vector to one Q-value per action.

    Architecture: ``Linear(obs_array_len, 256) -> ReLU -> Linear(256, 256) -> ReLU
    -> Linear(256, num_actions)``.

    Args:
        obs_array_len: Number of features in a single observation.
        num_actions: Number of discrete actions, i.e. the size of the output layer.
    """

    def __init__(self, obs_array_len, num_actions):
        super(DQN, self).__init__()
        self.fc = nn.Linear(obs_array_len, 256)
        self.fc1 = nn.Linear(256, 256)
        self.output = nn.Linear(256, num_actions)

    def forward(self, obs):
        """Return the raw (un-activated) Q-values for ``obs``.

        Args:
            obs: Float tensor whose last dimension has size ``obs_array_len``.

        Returns:
            Tensor with the same leading dimensions as ``obs`` and a last
            dimension of size ``num_actions``.
        """
        x = F.relu(self.fc(obs))
        # Each hidden layer is applied exactly once; running fc1 a second time
        # would push the activations through the same weights twice.
        x = F.relu(self.fc1(x))
        return self.output(x)

In [3]:
# A namedtuple gives a lightweight record type with one field per element of a
# transition; instances can then be stored in a list or queue.
# The `done` flag is important when the losses are calculated.
Experience = collections.namedtuple(
    'Experience',
    ['state', 'action', 'reward', 'done', 'state1'],
)

class Experience_replay():
    """Bounded FIFO store of transitions used for experience replay.

    Once ``REPLAY_SIZE`` items are held, inserting a new one evicts the oldest.
    """

    def __init__(self, REPLAY_SIZE):
        # A deque with `maxlen` drops items from the opposite end when full.
        self.memory = collections.deque(maxlen=REPLAY_SIZE)

    def insert(self, experience):
        """Append one transition to the buffer."""
        self.memory.append(experience)

    def size(self):
        """Return how many transitions are currently stored."""
        return len(self.memory)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A 5-tuple of NumPy arrays ``(states, actions, rewards, dones, state1s)``;
            ``rewards`` is float32 and ``dones`` is uint8.
        """
        # Pick distinct positions in the buffer (sampling without replacement).
        picked_positions = np.random.choice(len(self.memory), batch_size, replace=False)
        picked = [self.memory[position] for position in picked_positions]

        # Transpose the list of transitions into one sequence per field.
        state_col, action_col, reward_col, done_col, next_state_col = zip(*picked)

        return (
            np.array(state_col),
            np.array(action_col),
            np.array(reward_col, dtype=np.float32),
            np.array(done_col, dtype=np.uint8),
            np.array(next_state_col),
        )
            
def to_torch(i):
    """Convert the NumPy array ``i`` to a torch tensor using ``torch.from_numpy``."""
    as_tensor = torch.from_numpy(i)
    return as_tensor


In [4]:
# TensorBoard writer used by train(); event files are written under runs/Train.
tb = SummaryWriter(log_dir="runs/Train")

In [5]:
def train(max_steps=None):
    """Train a DQN agent on ``Breakout-ram-v0`` with experience replay and a target network.

    The replay buffer is first filled with transitions from uniformly random
    actions. After that, every environment step is followed by one gradient
    step on a minibatch sampled from the buffer. The regression target is the
    one-step TD target ``r + GAMMA * max_a' Q_target(s', a')`` (just ``r`` for
    terminal transitions), and the prediction is ``Q(s, a)`` for the action
    that was actually taken.

    Every 5000 steps the mean return of the most recent (up to 100) finished
    episodes is written to the module-level ``tb`` writer and printed. Every
    10000 steps a checkpoint is saved to ``breakout-model.pth`` and the target
    network is synchronised with the online network.

    The environment is used through the classic gym API: ``env.reset()``
    returns the observation and ``env.step()`` returns a 4-tuple.

    Args:
        max_steps: Optional number of training steps after which to stop.
            ``None`` (the default) trains until interrupted.
    """
    # Fall back to the CPU so that `device` is always defined.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # hyperparameters
    GAMMA = 0.99                    # discount factor of the Bellman target
    BATCH_SIZE = 32                 # minibatch size sampled from replay memory
    REPLAY_SIZE = 10000             # capacity of the replay memory
    LR = 1e-4                       # learning rate
    EPSILON_START = 1.0             # initial exploration rate
    EPSILON_FINAL = 0.1             # exploration floor
    EPSILON_DECAY_LENGTH = 150000   # steps over which epsilon is annealed
    LOG_EVERY = 5000                # steps between TensorBoard/console logs
    SYNC_EVERY = 10000              # steps between checkpoints / target syncs
    PATH = "breakout-model.pth"

    count = 0
    episode_reward = 0.0
    avg_reward = collections.deque(maxlen=100)   # returns of the last 100 episodes

    env = gym.make('Breakout-ram-v0')
    try:
        obs = env.reset()
        num_actions = env.action_space.n

        # Online network and a frozen copy that provides the bootstrap targets.
        net = DQN(len(obs), num_actions).to(device)
        target_net = DQN(len(obs), num_actions).to(device)
        target_net.load_state_dict(net.state_dict())
        target_net.eval()

        loss_fn = nn.MSELoss()
        optimizer = optim.RMSprop(net.parameters(), lr=LR)
        mem_buffer = Experience_replay(REPLAY_SIZE)

        # Fill the replay memory with transitions from random play.
        while mem_buffer.size() < REPLAY_SIZE:
            action = env.action_space.sample()
            obs1, reward, done, info = env.step(action)
            mem_buffer.insert(Experience(obs, action, reward, done, obs1))
            # Resetting right here guarantees the main loop never steps a finished episode.
            obs = env.reset() if done else obs1

        while max_steps is None or count < max_steps:
            count += 1

            # Linear annealing from EPSILON_START to EPSILON_FINAL over EPSILON_DECAY_LENGTH steps.
            epsilon = max(
                EPSILON_FINAL,
                EPSILON_START - (EPSILON_START - EPSILON_FINAL) * count / EPSILON_DECAY_LENGTH,
            )
            if count == EPSILON_DECAY_LENGTH:
                print("we have reached final decay")

            # Epsilon-greedy action selection: np.random.random() is uniform on [0, 1).
            if np.random.random() < epsilon:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    q_values = net(to_torch(obs).to(device).float())
                # Use a plain int so env.step and the replay buffer never see a tensor.
                action = int(torch.argmax(q_values).item())

            obs1, reward, done, info = env.step(action)
            mem_buffer.insert(Experience(obs, action, reward, done, obs1))

            # Episode bookkeeping must happen before `done` is consumed by the reset.
            episode_reward += reward
            if done:
                avg_reward.append(episode_reward)
                episode_reward = 0.0
                obs = env.reset()
            else:
                obs = obs1

            # env.render()

            # Mean return over the most recent finished episodes (at most 100).
            if count % LOG_EVERY == 0 and avg_reward:
                mean_reward = float(np.mean(list(avg_reward)))
                tb.add_scalar('Mean Reward', mean_reward, count)
                print(mean_reward, count)

            # Sample a minibatch and move it to the training device in one go.
            obs_batch, action_batch, reward_batch, done_batch, obs1_batch = mem_buffer.sample(BATCH_SIZE)
            states = to_torch(obs_batch).to(device).float()
            next_states = to_torch(obs1_batch).to(device).float()
            actions = to_torch(action_batch).to(device).long()
            rewards = to_torch(reward_batch).to(device)
            not_done = 1.0 - to_torch(done_batch).to(device).float()

            # Prediction: Q(s, a) of the online network for the actions actually taken.
            q_pred = net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

            # Target: r + GAMMA * max_a' Q_target(s', a'); no bootstrap on terminal steps.
            with torch.no_grad():
                q_next = target_net(next_states).max(dim=1)[0]
                q_target = rewards + GAMMA * not_done * q_next

            optimizer.zero_grad()
            loss = loss_fn(q_pred, q_target)
            loss.backward()
            optimizer.step()

            # Periodically checkpoint and refresh the target network.
            if count % SYNC_EVERY == 0:
                torch.save({
                    'epoch': count,
                    'model_state_dict': net.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss.detach()
                },
                PATH)

                target_net.load_state_dict(net.state_dict())
                target_net.eval()
    finally:
        # Runs on normal exit and on KeyboardInterrupt alike.
        env.close()

In [6]:
# Start training. Called with no arguments, train() loops until the kernel is interrupted.
train()

KeyboardInterrupt: 

In [None]:
# Scratch cell: checks the tensor operations used to build batches inside train().
x = torch.empty(0)      # tensor with zero elements, used as the starting point for concatenation
y= torch.randn(1)       # two random one-element tensors
z = torch.randn(1)
GAMMA=3                 # module-level value used only by this cell (train() defines its own GAMMA)
a= torch.tensor(2)      # zero-dimensional tensor
a= torch.unsqueeze(a,-1)    # insert a size-one dimension so `a` becomes one-dimensional

# Concatenating onto the empty tensor along dim 0 gives a two-element tensor holding y and z.
x=torch.cat((x,y,z),0)
print (x)
print(a*GAMMA)