# Welcome!
Below, we will learn to implement and train a policy to play atari-pong, using only the pixels as input. We will use convolutional neural nets, multiprocessing, and pytorch to implement and train our policy. Let's get started!

(I strongly recommend trying this notebook in the Udacity workspace first, before running it locally on your desktop/laptop, as performance may suffer in other environments.)

In [None]:
# install package for displaying animation
!pip install JSAnimation
# PyPI `dataclasses` package (provides the @dataclass decorator used by ConvLayerParams below)
!pip install dataclasses

# custom utilities for displaying animation, collecting rollouts and more
import pong_utils

%matplotlib inline

# check which device is being used.
# I recommend disabling gpu until you've made sure that the code runs
# `device` is reused by later cells to place policies and tensors
device = pong_utils.device
print("using device: ",device)

In [None]:
# render ai gym environment
import gym
import time

# PongDeterministic does not contain random frameskip
# so is faster to train than the vanilla Pong-v4 environment
env = gym.make('PongDeterministic-v4')

print("List of available actions: ", env.unwrapped.get_action_meanings())

# we will only use the actions 'RIGHTFIRE' = 4 and 'LEFTFIRE' = 5
# the 'FIRE' part ensures that the game starts again after losing a life
# the actions are hard-coded in pong_utils.py

# Preprocessing
To speed up training, we can simplify the input by cropping the images and using only every other pixel.



In [None]:
import matplotlib
import matplotlib.pyplot as plt

# show what a preprocessed image looks like
env.reset()
# take one step with action 0 and discard everything it returns
_, _, _, _ = env.step(0)
# get a frame after 20 steps
# (action 1 is sent on every step; only the last returned frame is kept)
for _ in range(20):
    frame, _, _, _ = env.step(1)

# left panel: the raw frame as returned by the environment
plt.subplot(1,2,1)
plt.imshow(frame)
plt.title('original image')

# right panel: the same frame after pong_utils.preprocess_single
plt.subplot(1,2,2)
plt.title('preprocessed image')

# 80 x 80 black and white image
plt.imshow(pong_utils.preprocess_single(frame), cmap='Greys')
plt.show()



# Policy

## Exercise 1: Implement your policy
 
Here, we define our policy. The input is a stack of two different frames (which captures the movement), and the output is a number $P_{\rm right}$, the probability of moving right. Note that $P_{\rm left}= 1-P_{\rm right}$.

In [None]:
class Crap:
    """Scratch class used to experiment with an instance's ``__dict__``."""

    def __init__(self):
        # two plain instance attributes, created in the order a, b
        self.a, self.b = 0, 1
        print("Nothing to do")
        
# instantiate the scratch class and list the names of its instance attributes
crap = Crap()
keys = crap.__dict__.keys()
print(keys)

# read one attribute through __dict__; as the last expression of the cell
# its value is echoed by the notebook
crap.__dict__['a']



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init
from dataclasses import dataclass
import math
from enum import Enum


# set up a convolutional neural net
# the output is the probability of moving right
# P(left) = 1-P(right)
@dataclass
class ConvLayerParams:
    """Hyper-parameters of one ``nn.Conv2d`` layer of the configurable policy."""
    input_ch: int   # number of input channels of the layer
    output_ch: int  # number of output channels of the layer
    kernel: int     # kernel size (the same value is used for both dimensions)
    stride: int     # stride (the same value is used for both dimensions)


class LayerInitialization(Enum):
    """Initialisation schemes understood by ``Policy.normalize_conv_layer``."""
    ZERO = 0
    NORMAL = 1
    XAVIER = 2


class Policy(nn.Module):
    """Convolutional policy returning one sigmoid output per input sample.

    Two architectures are available:

    * the *simple* network (used when ``conv_layers_params`` is empty or
      ``None``): two fixed conv layers sized for 2-channel 80x80 input;
    * the *complex* network: one ``nn.Conv2d`` per ``ConvLayerParams`` entry.

    Both end in a single fully connected unit followed by a sigmoid.
    """

    def __init__(self, input_size: tuple, conv_layers_params: list = None,
                 normalization="zero", debug: bool = False):
        """Build the network.

        Args:
            input_size: ``(height, width)`` of the input images. Only used by
                the complex network; the simple one is hard-wired to 80x80.
            conv_layers_params: list of ``ConvLayerParams``, one per conv
                layer. Empty/``None`` selects the simple network.
            normalization: a ``LayerInitialization`` member selecting how the
                conv layers of the complex network are initialised. Any other
                value (including the legacy default string ``"zero"``, which
                never matched an enum member) keeps PyTorch's default
                initialisation.
            debug: when true, ``forward_complex_NN`` prints shapes and
                activations on every call.
        """
        super(Policy, self).__init__()

        self.normalization = normalization
        self.debug = debug
        if conv_layers_params:
            self.use_complex_model = True
            self.create_complex_NN(input_size, conv_layers_params, normalization)
        else:
            self.use_complex_model = False
            self.create_simple_NN()

    def create_complex_NN(self, input_size, conv_layers_params, normalization):
        """Create one conv layer per entry of ``conv_layers_params`` plus the head.

        The layers are registered as sub-modules named ``conv_layer<i>``; their
        names and spatial output sizes are recorded in ``conv_layer_names`` and
        ``conv_layer_output_size``.
        """
        self.conv_layer_names = []
        self.conv_layer_output_size = []
        layer_input_size = input_size
        out_channels = None
        for i, conv_layer_params in enumerate(conv_layers_params):
            created_conv_layer = self.create_conv_layer(conv_layer_params.input_ch,
                                                        conv_layer_params.output_ch,
                                                        conv_layer_params.kernel,
                                                        conv_layer_params.stride,
                                                        normalization)
            print("created_conv_layer:{}".format(created_conv_layer))
            conv_layer_name = "conv_layer" + str(i)
            self.add_module(conv_layer_name, created_conv_layer)
            self.conv_layer_names.append(conv_layer_name)

            # the output of this layer is the input of the next one
            layer_input_size = self.compute_conv_layer_output_size(
                layer_input_size, conv_layer_params.kernel, conv_layer_params.stride)
            self.conv_layer_output_size.append(layer_input_size)
            out_channels = created_conv_layer.out_channels

        final_conv_layer_output_size = self.conv_layer_output_size[-1]

        # flattened size = height * width * channels of the last conv layer
        self.fc = nn.Linear(
            final_conv_layer_output_size[0] * final_conv_layer_output_size[1] * out_channels, 1)
        self.sig = nn.Sigmoid()

        print("Summary: the following conv layers were created: {}\nFinal output size: {}".format(self.conv_layer_names,final_conv_layer_output_size))

    def create_simple_NN(self):
        """Create the fixed two-conv-layer network for 2-channel 80x80 input."""
        self.conv1 = nn.Conv2d(2, 4, kernel_size=4, stride=4)
        # 80x80 to outputsize x outputsize
        # outputsize = floor((inputsize - kernel_size) / stride) + 1
        self.size1 = 4*20*20

        self.conv2 = nn.Conv2d(4, 16, kernel_size=2, stride=2)
        self.size2 = 16*10*10

        # 1 fully connected layer
        self.fc = nn.Linear(self.size2, 1)
        self.sig = nn.Sigmoid()

    def create_conv_layer(self, input_ch, output_ch, kernel_size, stride, normalization):
        """Return a new ``nn.Conv2d`` initialised according to ``normalization``."""
        conv_layer = nn.Conv2d(input_ch, output_ch, kernel_size=kernel_size, stride=stride)
        self.normalize_conv_layer(conv_layer, normalization)
        return conv_layer

    def compute_conv_layer_output_size(self, input_size, kernel_size, stride):
        """Return ``(height, width)`` produced by an unpadded conv layer.

        Uses ``floor((size - kernel) / stride) + 1``, which is what
        ``nn.Conv2d`` yields with ``padding=0`` and ``dilation=1``.

        Raises:
            ValueError: if the kernel does not fit into the input.
        """
        output_size = tuple((size - kernel_size) // stride + 1 for size in input_size[:2])
        if min(output_size) < 1:
            raise ValueError("kernel_size {} does not fit into input of size {}".format(
                kernel_size, tuple(input_size)))
        return output_size

    def normalize_conv_layer(self, conv_layer, normalization):
        """Initialise ``conv_layer`` in place according to ``normalization``.

        Raises:
            NotImplementedError: for a ``LayerInitialization`` member that has
                no initialisation rule here.
        """
        # torch.nn.init functions run without autograd tracking, so they may be
        # applied to the parameters directly (a plain in-place fill_ on a leaf
        # parameter that requires grad raises a RuntimeError).
        if normalization == LayerInitialization.ZERO:
            torch.nn.init.constant_(conv_layer.weight, 0.0)
            torch.nn.init.constant_(conv_layer.bias, 0.0)
        elif normalization == LayerInitialization.NORMAL:
            torch.nn.init.normal_(conv_layer.weight)
            torch.nn.init.normal_(conv_layer.bias)
        elif normalization == LayerInitialization.XAVIER:
            torch.nn.init.xavier_uniform_(conv_layer.weight)
            # Xavier needs fan-in/fan-out and is undefined for the 1-D bias
            torch.nn.init.constant_(conv_layer.bias, 0.0)
        elif isinstance(normalization, LayerInitialization):
            raise NotImplementedError(
                "no initialisation implemented for {}".format(normalization))
        # Anything else -- notably the constructor's default string "zero" --
        # keeps PyTorch's default initialisation, as it effectively always did.

    def forward(self, x):
        """Run the selected network on ``x`` and return its sigmoid output."""
        if self.use_complex_model:
            return self.forward_complex_NN(x)
        return self.forward_simple_NN(x)

    def forward_complex_NN(self, x):
        """Apply every registered conv layer (with ReLU), then the sigmoid head."""
        # getattr keeps instances pickled before `debug` existed working
        debug = getattr(self, "debug", False)
        if debug:
            print(f"x has the following shape: {x.shape}")
        conv_layer = None
        for conv_layer_name in self.conv_layer_names:
            conv_layer = getattr(self, conv_layer_name)
            if debug:
                print(conv_layer)
            x = F.relu(conv_layer(x))
        if debug:
            print(f"done with conv layers with output x={x} with properties x.shape={x.shape}")
            print(f"conv_layer.out_channels={conv_layer.out_channels}")

        # flatten to (batch, features) as expected by the fully connected layer
        x = x.view(-1, self.fc.in_features)
        return self.sig(self.fc(x))

    def forward_simple_NN(self, x):
        """Apply the two fixed conv layers (with ReLU), then the sigmoid head."""
        x = F.relu(self.conv1(x))
        # TODO: use batch normalization
        x = F.relu(self.conv2(x))
        # TODO: use batch normalization
        # flatten the tensor
        x = x.view(-1, self.size2)
        return self.sig(self.fc(x))

    def create_policy_file_name_prefix(self):
        """Return a file-name prefix describing the architecture.

        The simple network yields ``"reinforce_simple_policy"``. The complex
        network yields one ``c<in>x<out>_k<h>x<w>_s<h>x<w>_<init>`` token per
        conv layer, joined with ``-``.
        """
        if not self.use_complex_model:
            return "reinforce_simple_policy"

        # use the enum member's name when available, the raw value otherwise
        init_label = getattr(self.normalization, "name", self.normalization)
        tokens = []
        for conv_layer_name in self.conv_layer_names:
            c = getattr(self, conv_layer_name)
            tokens.append("c{}x{}_k{}x{}_s{}x{}_{}".format(c.in_channels,
                                                          c.out_channels,
                                                          c.kernel_size[0],
                                                          c.kernel_size[1],
                                                          c.stride[0],
                                                          c.stride[1],
                                                          init_label))
        return "-".join(tokens)

# use your own policy!
#policy=Policy().to(device)
#print(policy.conv1.weight.data)

#policy=pong_utils.Policy().to(device)


### Choose Policy


In [None]:
def create_checkpoint_name_suffix(t_max,episode):
    """Return the checkpoint suffix ``tMax<t_max>_episode<episode>``."""
    return f"tMax{t_max}_episode{episode}"

def create_policy_filename(policy_fname_prefix,checkpoint_fname_suffix):
    """Return the checkpoint file name ``<prefix>-<suffix>.pth``.

    Args:
        policy_fname_prefix: string describing the policy architecture.
        checkpoint_fname_suffix: string describing the training checkpoint.
    """
    # the original built this string but never returned it
    return policy_fname_prefix + "-" + checkpoint_fname_suffix + ".pth"

In [None]:
import ipywidgets as widgets
from enum import Enum, auto

    
class PolicyFromModel(Enum):
    """Where the policy used in this notebook comes from."""
    # values are numbered consecutively from 1, in declaration order
    MINE_SIMPLE_UNTRAINED = 1
    MINE_SIMPLE_TRAINED = 2
    MINE_CONV_LAYER_PARAMS = 3
    MINE_MODULE_LIST = 4
    SOLUTION_UNTRAINED = 5
    SOLUTION_TRAINED = 6
    


In [None]:
# Select which policy the rest of the notebook trains / plays with.
policy_from_model = PolicyFromModel.MINE_CONV_LAYER_PARAMS

if policy_from_model == PolicyFromModel.SOLUTION_TRAINED:
    policy = torch.load("REINFORCE_solution.policy")
elif policy_from_model == PolicyFromModel.SOLUTION_UNTRAINED:
    policy = pong_utils.Policy().to(device)
elif policy_from_model == PolicyFromModel.MINE_SIMPLE_UNTRAINED:
    policy = Policy((80,80)).to(device)
elif policy_from_model == PolicyFromModel.MINE_SIMPLE_TRAINED:
    # Build the checkpoint file name first, load it, and only then move the
    # loaded policy to the device (the file name itself has no .to()).
    policy = torch.load(
        create_policy_filename(Policy((80,80)).create_policy_file_name_prefix(),
                               create_checkpoint_name_suffix(5,320))
    ).to(device)
elif policy_from_model == PolicyFromModel.MINE_CONV_LAYER_PARAMS:
    l1 = ConvLayerParams(2,4,4,4)
    l2 = ConvLayerParams(4,16,4,2)
    # move to the device like every other freshly built policy
    policy = Policy(input_size=(80,80),conv_layers_params=[l1,l2]).to(device)
elif policy_from_model == PolicyFromModel.MINE_MODULE_LIST:
    # TODO: build the policy from an nn.ModuleList of Conv2d / ReLU / Linear
    raise NotImplementedError("MINE_MODULE_LIST policy is not implemented yet")
else:
    raise ValueError("unknown policy selection: {}".format(policy_from_model))


# we use the adam optimizer with learning rate 1e-4
# optim.SGD is also possible
print(policy)
import torch.optim as optim
optimizer = optim.Adam(policy.parameters(), lr=1e-4)

In [None]:
# Drop-down listing every PolicyFromModel member, in declaration order.
widget = widgets.Dropdown(
    options=list(PolicyFromModel),
    value=PolicyFromModel.MINE_SIMPLE_UNTRAINED,
    description="Policy",
    disabled=False,
    # sizing is a Layout property, not a Dropdown constructor argument
    layout=widgets.Layout(min_width='2000px'),
)
a = 1
def on_change(change, a=None):
    """Report a change of the drop-down value.

    ``observe`` calls the handler with the change dictionary only, so ``a``
    must be optional. It is unused: rebinding it here could never affect the
    module-level ``a``.
    """
    if change['type'] == 'change' and change['name'] == 'value':
        print("changed to %s" % change['new'])
        return change['new']

widget.observe(on_change)
display(f"a={a}")
display(widget)

In [None]:
# show the layers of the currently selected policy
print(policy)


# Game visualization
`pong_utils` contains a `play` function that takes the environment and a policy. An optional preprocess function can be supplied. Here we use it to play a game and show the learning progress.

In [None]:
# play one game with the current policy (time=200 is passed to pong_utils.play)
pong_utils.play(env, policy, time=200) 
# try to add the option "preprocess=pong_utils.preprocess_single"
# to see what the agent sees

# Rollout
Before we start the training, we need to collect samples. To make things efficient we use parallelized environments to collect multiple examples at once

In [None]:
# create 8 parallel Pong environments and collect trajectories with tmax=100
envs = pong_utils.parallelEnv('PongDeterministic-v4', n=8, seed=12345)
# the four results are reused by the cells below and by the surrogate test call
prob, state, action, reward = pong_utils.collect_trajectories(envs, policy, tmax=100)


In [None]:
# echo the rewards returned by collect_trajectories
reward

In [None]:
# inspect the collected rollout: per-step state shape, stacked shape,
# number of recorded probability entries, and the first of them
print(state[0].shape)
print(torch.stack(state).shape)
print(len(prob))
print(prob[0])

# Function Definitions
Here you will define key functions for training. 

## Exercise 2: write your own function for training
(this is the same as policy_loss except the negative sign)

### REINFORCE
you have two choices (usually it's useful to divide by the time since we've normalized our rewards and the time of each trajectory is fixed)

1. $\frac{1}{T}\sum^T_t R_{t}^{\rm future}\log(\pi_{\theta'}(a_t|s_t))$
2. $\frac{1}{T}\sum^T_t R_{t}^{\rm future}\frac{\pi_{\theta'}(a_t|s_t)}{\pi_{\theta}(a_t|s_t)}$ where $\theta'=\theta$ and make sure that the no_grad is enabled when performing the division

In [None]:
import numpy as np
def surrogate(policy, old_probs, states, actions, rewards,
              discount = 0.995, beta=0.01, debug = False):
    """Return the REINFORCE surrogate (to be maximised) for one batch of rollouts.

    With M time steps and N parallel environments:

    Args:
        policy: the policy network, evaluated through pong_utils.states_to_prob.
        old_probs: list of length M of per-environment action probabilities
            recorded while collecting the trajectories.
        states: list of length M of tensors of size [N, 2, 80, 80].
        actions: list of length M of the actions taken in each environment.
        rewards: list of length M of N-sized arrays.
        discount: per-step discount factor applied to the rewards.
        beta: weight of the regularisation term.
        debug: print intermediate arrays when true.

    Returns:
        A scalar tensor: the mean over all steps and environments of
        ``log(pi(a|s)) * normalised future reward + beta * regularisation``.
    """
    num_steps = len(rewards)

    # discount**t for every time step t; honours the `discount` argument
    # instead of overwriting it with a hard-coded value
    discount_factors = discount**np.arange(num_steps)
    discounted_rewards = np.asarray(rewards)*discount_factors[:,np.newaxis]
    # future reward at step t = sum of the discounted rewards from t onwards
    rewards_future = discounted_rewards[::-1].cumsum(axis=0)[::-1]

    # normalise across the parallel environments at every time step
    mean = np.mean(rewards_future, axis=1)
    std = np.std(rewards_future, axis=1) + 1.0e-10
    rewards_normalized = (rewards_future - mean[:,np.newaxis])/std[:,np.newaxis]

    if debug:
        print(f"discounted_rewards: \n {discounted_rewards}")
        print(f"rewards_future: \n {rewards_future}")
        print(f"rewards_normalized: \n {rewards_normalized}")

    actions = torch.tensor(actions, dtype=torch.int8, device=device)
    # cast to float32 so these tensors match the dtype of the policy output
    rewards_tensor = torch.from_numpy(rewards_normalized).float().to(device)
    old_probs_tensor = torch.from_numpy(np.asarray(old_probs)).float().to(device)

    # convert states to policy (or probability)
    new_probs = pong_utils.states_to_prob(policy, states)
    # probability of the action that was actually taken
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs, 1.0-new_probs)
    # add in 1.e-10 to avoid log(0) which gives -inf / nan
    new_probs_log = torch.log(new_probs+1.e-10)

    # include a regularization term
    # this steers new_policy towards 0.5
    # which prevents policy to become exactly 0 or 1
    # this helps with exploration
    # add in 1.e-10 to avoid log(0) which gives nan
    entropy = -(new_probs*torch.log(old_probs_tensor+1.e-10)+ \
        (1.0-new_probs)*torch.log(1.0-old_probs_tensor+1.e-10))

    if debug:
        print(f"old_probs={old_probs}")
        print(f"new_probs={new_probs}")
        print(f"old_probs_tensor={old_probs_tensor}")

    # the mean runs over both the time steps and the environments
    return torch.mean(new_probs_log*rewards_tensor + beta*entropy)

# smoke test: evaluate the surrogate on the rollout collected earlier
# (prob, state, action, reward), with debug output enabled
Lsur= surrogate(policy, prob, state, action, reward, debug=True)

print(Lsur)

In [None]:
## First some understanding
arr1 = np.array([1,2])
arr2 = np.array([3,4])
arr3 = np.array([5,6,7])
arrList = [arr1,arr2]
print(arrList)
print(np.asarray(arrList))
arrList2 = [arr1,arr2,arr3]
print(arrList2)
arrList2AsArray = np.asarray(arrList2)
print(arrList2AsArray)
print(type(arrList2AsArray))
print(arrList2AsArray.shape)

In [None]:
# compare the matrix product with element-wise multiplication, with and
# without an added trailing axis on m
a = np.array([[1,2],[3,4]])
m = np.array([10,20])
print(f"np.dot(a,m)={np.dot(a,m)}")
print(f"a*m={a*m}")
# m[:,np.newaxis] has shape (2, 1), so m[i] multiplies row i of a
print(f"a*m[:,np.newaxis]={a*m[:,np.newaxis]}")


In [None]:
# same broadcasting experiment for subtraction
a = np.array([[1,2],[3,4]])
# define `m` here so the cell does not depend on a leftover from another cell
m = np.array([10,20])
print(f"a-m={a-m}")

# m[:,np.newaxis] has shape (2, 1), so m[i] is subtracted from row i of a
print(f"a-m[:,np.newaxis]={a-m[:,np.newaxis]}")



In [None]:

# two random float32 tensors of shape (8, 2, 80, 80), fed to states_to_prob
# NOTE(review): the tensors are not moved to `device` here -- confirm that
# states_to_prob handles this when the policy lives on the GPU
test_data = [torch.from_numpy(np.random.rand(8,2,80,80)).float(),
             torch.from_numpy(np.random.rand(8,2,80,80)).float()]
pong_utils.states_to_prob(policy,test_data)


# Training
We are now ready to train our policy!
WARNING: make sure to turn on GPU, which also enables multicore processing. It may take up to 45 minutes even with GPU enabled, otherwise it will take much longer!

In [None]:
debug = False
use_my_surrogate = False
import time
from parallelEnv import parallelEnv
import numpy as np
# WARNING: running through all 800 episodes will take 30-45 minutes

# training loop max iterations
episode = 100
# episode = 800

# widget bar to display progress
!pip install progressbar
import progressbar as pb
widget = ['training loop: ', pb.Percentage(), ' ', 
          pb.Bar(), ' ', pb.ETA() ]
timer = pb.ProgressBar(widgets=widget, maxval=episode).start()

# initialize environment
envs = parallelEnv('PongDeterministic-v4', n=8, seed=1234)

discount_rate = .99
beta = .01
tmax = 320

# keep track of progress
mean_rewards = []

for e in range(episode):

    # collect trajectories
    old_probs, states, actions, rewards = \
        pong_utils.collect_trajectories(envs, policy, tmax=tmax)
    if debug:
        print(f"len(rewards)={len(rewards)}")
        print(f"rewards[0].shape={rewards[0].shape}")
        print(f"type(rewards[0])={type(rewards[0])}")
        print(f"len(states)={len(states)}")
        print(f"states[0].shape={states[0].shape}")
        print(f"len(actions)={len(actions)}")
        print(f"actions[0].shape={actions[0].shape}")
        time.sleep(10)
        
    total_rewards = np.sum(rewards, axis=0)

    # this is the SOLUTION!
    # use your own surrogate function
    # L = -surrogate(policy, old_probs, states, actions, rewards, beta=beta)
    
    if use_my_surrogate:
        L = -surrogate(policy, old_probs, states, actions, rewards, beta=beta)
    else:
        L = -pong_utils.surrogate(policy, old_probs, states, actions, rewards, beta=beta)
    optimizer.zero_grad()
    L.backward()
    optimizer.step()
    del L
        
    # the regulation term also reduces
    # this reduces exploration in later runs
    beta*=.995
    
    # get the average reward of the parallel environments
    mean_rewards.append(np.mean(total_rewards))
    
    # display some progress every 20 iterations
    if (e+1)%20 ==0 :
        print("Episode: {0:d}, score: {1:f}".format(e+1,np.mean(total_rewards)))
        print(total_rewards)
        
    # update progress widget bar
    timer.update(e+1)
    
timer.finish()

torch.save(policy,create_checkpoint_name(episode,tmax))



In [None]:
import os
# list the working directory, e.g. to check which checkpoint files exist
os.listdir(".")

In [None]:
# play game after training!
# same call as the earlier visualisation cell, with time=2000 instead of 200
pong_utils.play(env, policy, time=2000) 

In [None]:
# learning curve of the run that just finished
plt.plot(np.asarray(mean_rewards),label="mean_rewards")
# `mean_rewards_prior_run` is not assigned in the cells visible here; plot it
# only if an earlier session left it in the kernel, instead of a NameError
if "mean_rewards_prior_run" in globals():
    plt.plot(np.asarray(mean_rewards_prior_run),label="mean_rewards_prior_run")
plt.legend()

In [None]:
# save your policy!
# saves the whole policy object (not just its state_dict)
torch.save(policy, 'REINFORCE.policy')

# load your policy if needed
# policy = torch.load('REINFORCE.policy')

# try and test out the solution!
# NOTE(review): other cells load 'REINFORCE_solution.policy'; the PPO file
# name below looks like a leftover from another notebook -- confirm
# policy = torch.load('PPO_solution.policy')

In [None]:
# load the provided solution policy for inspection (the file must be in the working directory)
original_policy=torch.load('REINFORCE_solution.policy')

In [None]:
# echo the loaded solution policy
original_policy