In [4]:
from torch import nn
import torch
import gym
from collections import deque
import itertools
import numpy as np
import random
import pygame


pygame 2.1.2 (SDL 2.0.16, Python 3.7.13)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:
pip install pygame

Collecting pygame
  Downloading pygame-2.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.8 MB)
[K     |████████████████████████████████| 21.8 MB 1.7 MB/s 
[?25hInstalling collected packages: pygame
Successfully installed pygame-2.1.2


In [5]:
# Hyperparameters, inspired by the OpenAI baseline example.
gamma = 0.99  # discount factor applied to the bootstrapped target Q-value
batch_size = 32  # number of transitions sampled from the replay buffer per gradient step
buffer_size = 50000  # maximum number of transitions the replay buffer keeps
min_replay_size = 1000  # transitions collected with random actions before training starts
epsilon_start = 1.0  # exploration probability at step 0
epsilon_end = 0.02  # exploration probability once the decay has finished
epsilon_decay = 10000 # number of steps over which epsilon is interpolated from epsilon_start to epsilon_end
target_update_freq= 1000  # the target network is re-synced with the online network every this many steps

In [6]:
class Network(nn.Module):
    """Small fully connected Q-network with one hidden Tanh layer of 64 units.

    Args:
        env: object exposing ``observation_space.shape`` (its product sizes
            the input layer) and ``action_space.n`` (sizes the output layer,
            one Q-value per action).
    """

    def __init__(self, env):
        super().__init__()
        # Total number of scalar entries in a single observation.
        in_features = int(np.prod(env.observation_space.shape))

        self.net = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.Tanh(),
            nn.Linear(64, env.action_space.n))

    def forward(self, x):
        """Return the output of ``self.net`` (the Q-values) for input ``x``."""
        return self.net(x)

    def act(self, obs):
        """Pick the greedy action for a single observation.

        Args:
            obs: one observation without a batch dimension; anything
                ``torch.as_tensor`` can convert to a float32 tensor.

        Returns:
            int: index of the action with the largest Q-value.
        """
        obs_t = torch.as_tensor(obs, dtype=torch.float32)
        # Action selection is pure inference: disable autograd so no graph is
        # built and thrown away on every environment step.
        with torch.no_grad():
            # unsqueeze(0) adds the batch dimension the network call is given.
            q_values = self(obs_t.unsqueeze(0))
        max_q_index = torch.argmax(q_values, dim=1)[0]
        return max_q_index.item()

In [None]:
# Create the CartPole environment and the objects used by the training code.

env = gym.make('CartPole-v1')
# Bounded FIFO of transitions; once buffer_size is reached the oldest entries are dropped.
replay_buffer = deque(maxlen = buffer_size) 
# Total reward of each finished episode (only the most recent 100 are kept); seeded with 0.0.
reward_buffer = deque([0.0], maxlen=100)
                        
# Running reward total of the episode currently in progress.
episode_reward = 0.0

# Two networks with the same architecture: the online network is the one being
# optimised, the target network is a second instance of the same class.
online_net = Network(env)
target_net = Network(env)

# Start the target network as an exact copy of the online network's weights.
target_net.load_state_dict(online_net.state_dict())

# Only the online network's parameters are handed to the optimizer.
optimizer = torch.optim.Adam(online_net.parameters(), lr = 5e-4)

# Pre-fill the replay buffer with min_replay_size transitions gathered from
# randomly sampled actions, so batches can be drawn from it straight away.
obs = env.reset()
for _ in range(min_replay_size):
    action = env.action_space.sample()
    new_obs, rew, done, info = env.step(action)

    transition = (obs, action, rew, done, new_obs)
    replay_buffer.append(transition)

    # Carry on from the next observation, or from a fresh episode once this one has ended.
    obs = env.reset() if done else new_obs
        
# Main training loop.
# itertools.count() never terminates, so this cell runs until it is interrupted.
obs = env.reset()

for step in itertools.count():
    # Linearly interpolate epsilon from epsilon_start to epsilon_end over the
    # first epsilon_decay steps (np.interp holds the end value afterwards).
    epsilon = np.interp(step, [0, epsilon_decay], [epsilon_start, epsilon_end])

    # Epsilon-greedy action selection: explore with probability epsilon,
    # otherwise take the online network's greedy action.
    random_sample = random.random()
    if random_sample <= epsilon:
        action = env.action_space.sample()
    else:
        action = online_net.act(obs)

    # Step the environment and store the transition.
    new_obs, rew, done, info = env.step(action)
    transition = (obs, action, rew, done, new_obs)
    replay_buffer.append(transition)
    obs = new_obs
    episode_reward += rew

    if done:
        # Episode finished: start a new one and record the total it earned.
        obs = env.reset()
        reward_buffer.append(episode_reward)
        episode_reward = 0.0

    # Visualisation (disabled).
    # NOTE: if re-enabled as written, the `while True` below never exits and
    # the observation returned by env.reset() is discarded.
    # if len(reward_buffer) >= 100:
    #     if np.mean(reward_buffer) >= 195:
    #         while True:
    #             action = online_net.act(obs)
    #             obs, _, done, _ = env.step(action)
    #             env.render()
    #             if done:
    #                 env.reset()

    # ---- Gradient step ----
    # Sample a random minibatch of transitions from the replay buffer.
    transitions = random.sample(replay_buffer, batch_size)

    # Split the transitions into one numpy array per field first; building
    # tensors from numpy arrays is much faster than from Python lists.
    obses = np.asarray([t[0] for t in transitions])
    actions = np.asarray([t[1] for t in transitions])
    rews = np.asarray([t[2] for t in transitions])
    dones = np.asarray([t[3] for t in transitions])
    new_obses = np.asarray([t[4] for t in transitions])

    # Convert to tensors. unsqueeze(-1) turns the per-sample scalars into
    # column vectors so they line up with the keepdim=True max below.
    obses_t = torch.as_tensor(obses, dtype=torch.float32)
    actions_t = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(-1)  # int64: used as a gather index
    rews_t = torch.as_tensor(rews, dtype=torch.float32).unsqueeze(-1)
    dones_t = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(-1)
    new_obses_t = torch.as_tensor(new_obses, dtype=torch.float32)

    # Compute the TD targets without autograd. The optimizer only holds
    # online_net's parameters, so gradients through target_net are never used;
    # tracking them costs an extra backward pass per step and leaves .grad
    # tensors accumulating on target_net that nothing ever zeroes.
    with torch.no_grad():
        target_q_values = target_net(new_obses_t)  # Q-values for every next observation
        max_target_q_values = target_q_values.max(dim=1, keepdim=True)[0]  # best Q-value per row
        # (1 - dones_t) removes the bootstrap term for transitions flagged done.
        # NOTE(review): if the environment also sets `done` on time-limit
        # cut-offs, those are treated as terminal here too - confirm intended.
        targets = rews_t + gamma * (1 - dones_t) * max_target_q_values

    # Loss between the Q-value of the action actually taken and its target.
    q_values = online_net(obses_t)
    action_q_values = torch.gather(input=q_values, dim=1, index=actions_t)

    loss = nn.functional.smooth_l1_loss(action_q_values, targets)

    # Gradient descent on the online network.
    optimizer.zero_grad()
    loss.backward()   # compute gradients
    optimizer.step()  # apply gradients

    # Periodically copy the online weights into the target network.
    if step % target_update_freq == 0:
        target_net.load_state_dict(online_net.state_dict())

    # Progress log every 1000 steps.
    if step % 1000 == 0:
        print()
        print('Step', step)
        print('Avg reward', np.mean(reward_buffer))
        

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Step 266000
Avg reward 500.0

Step 267000
Avg reward 500.0

Step 268000
Avg reward 500.0

Step 269000
Avg reward 500.0

Step 270000
Avg reward 500.0

Step 271000
Avg reward 500.0

Step 272000
Avg reward 500.0

Step 273000
Avg reward 500.0

Step 274000
Avg reward 500.0

Step 275000
Avg reward 500.0

Step 276000
Avg reward 500.0

Step 277000
Avg reward 500.0

Step 278000
Avg reward 500.0

Step 279000
Avg reward 500.0

Step 280000
Avg reward 500.0

Step 281000
Avg reward 500.0

Step 282000
Avg reward 500.0

Step 283000
Avg reward 500.0

Step 284000
Avg reward 500.0

Step 285000
Avg reward 500.0

Step 286000
Avg reward 500.0

Step 287000
Avg reward 500.0

Step 288000
Avg reward 500.0

Step 289000
Avg reward 500.0

Step 290000
Avg reward 500.0

Step 291000
Avg reward 500.0

Step 292000
Avg reward 500.0

Step 293000
Avg reward 500.0

Step 294000
Avg reward 500.0

Step 295000
Avg reward 500.0

Step 296000
Avg reward 500.0

Step 