# CartPole-v0

A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center.

In [None]:
# Baseline: a purely random agent (no learning), rendered for 800 steps
import gym
env = gym.make('CartPole-v0')
env.reset()
try:
    for _ in range(800):
        env.render()
        # index 2 of the step result is the episode-finished flag
        done = env.step(env.action_space.sample())[2]
        if done:
            # stepping a finished episode is undefined, so start a new one
            env.reset()
finally:
    # always release the render window, even if a step or render call fails
    env.close()

## Importing Libraries

In [None]:
import math
import random
import numpy as np
import gym

from collections import namedtuple
from itertools import count
from PIL import Image

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 7

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

## Set-Up Display

In [None]:
# True when matplotlib draws inline (notebook backend); only then is
# IPython's display helper imported, for clearing cell output while plotting.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

## Deep Q-Network

In [None]:
# nn.Module is the base class
class DQN(nn.Module):
    """Fully connected Q-network that maps a flattened RGB image to Q-values.

    Args:
        img_height: height of the input image in pixels.
        img_width: width of the input image in pixels.
        num_actions: number of output Q-values, one per action. Defaults to 2
            (left/right), which is what CartPole needs.
    """

    def __init__(self, img_height, img_width, num_actions=2):
        super().__init__()

        # the input is the flattened image; *3 accounts for the RGB channels
        self.fc1 = nn.Linear(in_features=img_height*img_width*3, out_features=24)
        self.fc2 = nn.Linear(in_features=24, out_features=32)
        self.out = nn.Linear(in_features=32, out_features=num_actions)

    def forward(self, t):
        """Return one row of Q-values per batch element of ``t``.

        Every dimension after the first (batch) dimension is flattened before
        the tensor goes through the linear layers.
        """
        t = t.flatten(start_dim=1)
        t = F.relu(self.fc1(t))
        t = F.relu(self.fc2(t))
        return self.out(t)

## Experience Class

In [None]:
# One transition stored in replay memory; fields in positional order.
Experience = namedtuple('Experience', ['state', 'action', 'next_state', 'reward'])

## Replay Memory

In [None]:
class ReplayMemory():
    """Fixed-capacity buffer of experiences that overwrites the oldest slots when full."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []    # stored experiences, at most `capacity` of them
        self.push_count = 0    # total number of pushes so far, including overwrites

    def push(self, experience):
        """Store an experience, overwriting in round-robin order once full."""
        if len(self.memory) >= self.capacity:
            slot = self.push_count % self.capacity
            self.memory[slot] = experience
        else:
            self.memory.append(experience)
        self.push_count += 1

    def sample(self, batch_size):
        """Return `batch_size` distinct experiences chosen at random."""
        return random.sample(self.memory, batch_size)

    def can_provide_sample(self, batch_size):
        """Tell whether at least `batch_size` experiences are stored."""
        return not len(self.memory) < batch_size

## Epsilon Greedy Strategy

In [None]:
class EpsilonGreedyStrategy():
    """Exponentially decaying exploration rate running from `start` towards `end`."""

    def __init__(self, start, end, decay):
        self.start, self.end, self.decay = start, end, decay

    def get_exploration_rate(self, current_step):
        """Return the exploration rate for `current_step`.

        The rate is `end + (start - end) * exp(-current_step * decay)`.
        """
        span = self.start - self.end
        falloff = math.exp(-1 * current_step * self.decay)
        return self.end + span * falloff

## Reinforcement Learning Agent

In [None]:
class Agent():
    """Epsilon-greedy agent that picks actions for the environment.

    Args:
        strategy: object exposing `get_exploration_rate(current_step)`
            (an EpsilonGreedyStrategy in this notebook).
        num_actions: number of available actions; 2 (left/right) for CartPole.
        device: torch device the returned action tensors are moved to.
    """

    def __init__(self, strategy, num_actions, device):
        self.current_step = 0    # number of actions selected so far
        self.strategy = strategy
        self.num_actions = num_actions
        self.device = device

    def select_action(self, state, policy_net):
        """Return a one-element action tensor for `state`.

        With probability equal to the current exploration rate a random action
        is chosen (explore); otherwise the action with the highest Q-value
        from `policy_net` is chosen (exploit).
        """
        # use the agent's own strategy and device, not same-named globals
        rate = self.strategy.get_exploration_rate(self.current_step)
        self.current_step += 1

        if rate > random.random():
            action = random.randrange(self.num_actions)    # explore
            return torch.tensor([action]).to(self.device)

        # exploit: inference only, so gradient tracking is switched off
        with torch.no_grad():
            return policy_net(state).argmax(dim=1).to(self.device)

In [None]:
class CartPoleEnvManager():
    """Wrap the CartPole-v0 gym environment and expose image-based states.

    A state is the difference between two consecutive processed screens; an
    all-zero (black) screen marks the start and the end of an episode.

    Args:
        device: torch device on which screens and rewards are placed.
    """

    def __init__(self, device):
        self.device = device
        # unwrapped gives access to the underlying environment
        self.env = gym.make('CartPole-v0').unwrapped
        self.env.reset()    # reset to starting state
        self.current_screen = None    # last processed screen; None before the first get_state()
        self.done = False    # whether the current episode has ended

    def reset(self):
        """Start a new episode and clear all per-episode bookkeeping."""
        self.env.reset()
        self.current_screen = None
        # clear the flag left over from the previous episode so `done`
        # always describes the episode that is currently running
        self.done = False

    def close(self):
        """Close the underlying environment."""
        self.env.close()

    def render(self, mode='human'):
        """Render the current environment state using `mode` and return the result."""
        return self.env.render(mode)

    def num_actions_available(self):
        """Return the number of actions in the action space (2: left/right)."""
        return self.env.action_space.n

    def take_action(self, action):
        """Apply `action` (a one-element tensor) and return the reward as a tensor.

        Updates `self.done` with the episode-finished flag from the step.
        """
        # item() extracts the plain Python value the environment expects
        _, reward, self.done, _ = self.env.step(action.item())
        return torch.tensor([reward], device=self.device)

    def crop_screen(self, screen):
        """Keep only the band between 40% and 80% of the height of a (C, H, W) screen."""
        screen_height = screen.shape[1]

        # strip off the top and bottom of the image
        top = int(screen_height * 0.4)
        bottom = int(screen_height * 0.8)
        screen = screen[:, top:bottom, :]
        return screen

    def transform_screen_data(self, screen):
        """Rescale to [0, 1], resize to 40x90 and return a batched tensor on `device`."""
        # convert to float32, rescale, then wrap as a tensor
        screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
        screen = torch.from_numpy(screen)

        # torchvision pipeline: to PIL image, resize, back to tensor
        resize = T.Compose([T.ToPILImage(), T.Resize((40, 90)), T.ToTensor()])
        # unsqueeze(0) adds the leading batch dimension
        return resize(screen).unsqueeze(0).to(self.device)

    def get_processed_screen(self):
        """Render the environment and return the cropped, resized screen tensor."""
        # render as an RGB array, then reorder to (channels, height, width)
        # as PyTorch expects
        screen = self.render('rgb_array').transpose((2, 0, 1))
        screen = self.crop_screen(screen)
        return self.transform_screen_data(screen)

    def just_starting(self):
        """Return True if no screen has been captured since the last reset."""
        return self.current_screen is None

    def get_state(self):
        """Return the current state as a processed-screen difference.

        At the start of an episode, or once it has ended, a black (all-zero)
        screen is returned; otherwise the difference between the newly
        rendered screen and the previously stored one.
        """
        if self.just_starting() or self.done:
            self.current_screen = self.get_processed_screen()
            black_screen = torch.zeros_like(self.current_screen)
            return black_screen
        else:
            s1 = self.current_screen
            s2 = self.get_processed_screen()
            self.current_screen = s2
            return s2 - s1

    def get_screen_height(self):
        """Return the height of the processed screen (dimension 2 of the batched tensor)."""
        screen = self.get_processed_screen()
        return screen.shape[2]

    def get_screen_width(self):
        """Return the width of the processed screen (dimension 3 of the batched tensor)."""
        screen = self.get_processed_screen()
        return screen.shape[3]

## Example of Non-processed screen

In [None]:
# Use the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
em = CartPoleEnvManager(device)
em.reset()
# Raw frame exactly as the environment renders it (no cropping or resizing)
screen = em.render('rgb_array')

plt.figure()
plt.imshow(screen)
plt.title('Non-processed screen example')
plt.show()

## Tensor Processing

In [None]:
def extract_tensors(experiences):
    """Split a batch of Experience tuples into four concatenated tensors.

    Returns:
        A tuple `(states, actions, rewards, next_states)`; note that rewards
        come before next_states, unlike the field order of Experience.
    """
    # transpose the list of experiences into one Experience of field-tuples
    columns = Experience(*zip(*experiences))

    states = torch.cat(columns.state)
    actions = torch.cat(columns.action)
    rewards = torch.cat(columns.reward)
    next_states = torch.cat(columns.next_state)

    return (states, actions, rewards, next_states)

## Q-Value Calculator

In [None]:
class QValues():
    """Static helpers that compute the Q-values used in the DQN loss."""

    # kept for backward compatibility; get_next() now follows the device of
    # its input instead of relying on this attribute
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    @staticmethod
    def get_current(policy_net, states, actions):
        """Return the predicted Q-value of each taken action, shape (batch, 1)."""
        return policy_net(states).gather(dim=1, index=actions.unsqueeze(-1))

    @staticmethod
    def get_next(target_net, next_states):
        """Return the maximum target-network Q-value for each next state.

        A next state whose values are all zero (the black screen emitted at
        the end of an episode) is treated as final and keeps a value of 0.
        The result has shape (batch,) and carries no gradient.
        """
        # a state is final when its largest element is exactly 0
        final_state_locations = next_states.flatten(start_dim=1).max(dim=1)[0].eq(0)
        non_final_state_locations = ~final_state_locations

        # allocate on the input's device so the masked assignment below can
        # never mix devices
        values = torch.zeros(next_states.shape[0], device=next_states.device)

        # skip the network entirely when every state in the batch is final
        if non_final_state_locations.any():
            non_final_states = next_states[non_final_state_locations]
            with torch.no_grad():    # targets must not receive gradients
                values[non_final_state_locations] = target_net(non_final_states).max(dim=1)[0]
        return values

## Utility Functions - Plotting

In [None]:
def get_moving_average(period, values):
    """Return the `period`-sized moving average of `values` as a numpy array.

    The result has the same length as `values`. The first `period - 1`
    entries are zero because no full window exists yet; when there are fewer
    than `period` values, every entry is zero.
    """
    series = torch.tensor(values, dtype=torch.float)
    count_values = len(series)

    # not enough data for a single full window
    if count_values < period:
        return torch.zeros(count_values).numpy()

    windows = series.unfold(dimension=0, size=period, step=1)
    averages = windows.mean(dim=1).flatten(start_dim=0)
    leading_zeros = torch.zeros(period-1)
    return torch.cat((leading_zeros, averages)).numpy()
    
def plot(values, moving_avg_period):
    """Redraw the training chart: raw `values` plus their moving average.

    Also prints the number of values and the latest moving-average entry, and
    clears the cell output afterwards when running on an inline backend.
    """
    plt.figure(2)
    plt.clf()
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(values)

    smoothed = get_moving_average(moving_avg_period, values)
    plt.plot(smoothed)
    plt.pause(0.0001)
    print("Episode", len(values), "\n", moving_avg_period, "episode moving average: ", smoothed[-1])
    if not is_ipython:
        return
    # wait=True delays the clearing until new output is available
    display.clear_output(wait=True)

In [None]:
# Try the plotting helper on 300 random values with a 100-value moving average
plot(np.random.rand(300), 100)

## Main Program

In [None]:
# Hyperparameters
batch_size = 256    # number of experiences sampled from replay memory per training step
gamma = 0.999    # discount factor
eps_start = 1    # initial exploration rate
eps_end = 0.01    # exploration rate the decay approaches
eps_decay = 0.001    # decay constant of the exploration rate
target_update = 10    # number of episodes between copies of the policy network weights into the target network
memory_size = 100000    # replay memory capacity
lr = 0.001    # learning rate
num_episodes = 1000    # number of training episodes

# Environment, agent and replay memory
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
em = CartPoleEnvManager(device)
strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay)
agent = Agent(strategy, em.num_actions_available(), device)
memory = ReplayMemory(memory_size)

# Both networks are sized from the processed screen and start with identical weights
policy_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device)
target_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device)
target_net.load_state_dict(policy_net.state_dict())    # copy the policy network's weights
target_net.eval()    # evaluation mode: the optimizer below only updates policy_net
optimizer = optim.Adam(params=policy_net.parameters(), lr=lr)

In [None]:
# Number of timesteps each finished episode lasted
episode_durations = []
# for each episode
for episode in range(num_episodes):
    em.reset()
    # initialize the starting state
    state = em.get_state()

    # for each timestep
    for timestep in count():
        # select an action (explore or exploit)
        action = agent.select_action(state, policy_net)
        # execute the selected action in the emulator & get the reward and next state
        reward = em.take_action(action)
        next_state = em.get_state()
        # store the experience in replay memory
        memory.push(Experience(state, action, next_state, reward))
        # advance to the new state; without this every action and every stored
        # transition would be based on the episode's initial black screen
        state = next_state

        # train only once the memory holds at least one full batch
        if memory.can_provide_sample(batch_size):
            experiences = memory.sample(batch_size)
            states, actions, rewards, next_states = extract_tensors(experiences)

            # Q-values of the taken actions vs. discounted target-network Q-values
            current_q_values = QValues.get_current(policy_net, states, actions)
            next_q_values = QValues.get_next(target_net, next_states)
            target_q_values = rewards + (gamma * next_q_values)

            # calculate loss between output Q-values & target Q-values
            loss = F.mse_loss(current_q_values, target_q_values.unsqueeze(1))
            optimizer.zero_grad()    # reset the gradients of all policy network weights & biases
            loss.backward()    # compute the gradients of the loss w.r.t. the policy network parameters
            optimizer.step()    # update the weights & biases using those gradients

        if em.done:
            # timestep is 0-based, so the episode lasted timestep + 1 steps
            episode_durations.append(timestep + 1)
            plot(episode_durations, 100)
            break

    # periodically copy the policy network weights into the target network
    if episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())

em.close()