<a href="https://colab.research.google.com/github/Thelmaandherls/AI-ML/blob/Udemy/A3C_for_Kung_Fu_Partial_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A3C for Kung Fu

## Part 0 - Installing the required packages and importing the libraries

### Installing Gymnasium

In [None]:
# Core Gymnasium package.
!pip install gymnasium
# Atari extras; the accept-rom-license extra pulls in AutoROM (see the install log below).
!pip install "gymnasium[atari, accept-rom-license]"
# NOTE(review): swig is presumably a build prerequisite for the box2d extra below - confirm.
!apt-get install -y swig
# NOTE(review): no Box2D environment is created in the visible notebook; this install may be unnecessary.
!pip install gymnasium[box2d]

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1
Collecting autorom~=0.4.2 (from autorom[accept-rom-license]~=0.4.2; extra == "accept-rom-license"->gymnasium[accept-rom-license,atari])
  Downloading AutoROM-0.4.2-py3-none-any.whl.metadata (2.8 kB)
Collecting shimmy<1.0,>=0.1.0 (from shimmy[atari]<1.0,>=0.1.0; extra == "atari"->gymnasium[accept-rom-license,atari])
  Downloading Shimmy-0.2.1-py3-none-any.whl.metadata (2.3 kB)
Collecting AutoROM.accep

### Importing the libraries

In [None]:
import cv2
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.multiprocessing as mp
import torch.distributions as distributions
from torch.distributions import Categorical
import gymnasium as gym
from gymnasium import ObservationWrapper
from gymnasium.spaces import Box

## Part 1 - Building the AI

### Creating the architecture of the Neural Network

In [None]:
class Brain(nn.Module):
  """Actor-critic network: a shared convolutional trunk feeding two linear heads.

  The actor head (`fcl2a`) emits one unnormalised score (logit) per action; the
  agent turns these into a policy with a softmax. The critic head (`fcl2s`)
  emits a single scalar per input state, used as the state-value estimate.

  Args:
    action_size: number of discrete actions, i.e. the width of the actor head.
  """

  def __init__(self, action_size):
    super(Brain, self).__init__()
    # Input: a stack of 4 frames (in_channels=4). Three 3x3, stride-2 convolutions;
    # for 42x42 inputs the spatial size shrinks 42 -> 20 -> 9 -> 4.
    self.conv1 = torch.nn.Conv2d(in_channels=4, out_channels=32, kernel_size=(3,3), stride=2)
    self.conv2 = torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3,3), stride=2)
    self.conv3 = torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3,3), stride=2)
    self.flatten = torch.nn.Flatten()
    # 32 channels * 4 * 4 = 512 flattened features; this size is only valid for 42x42 inputs.
    self.fcl1 = torch.nn.Linear(512, 128)
    # Actor head: one logit per action.
    self.fcl2a = torch.nn.Linear(128, action_size)
    # Critic head: one state-value estimate per input state.
    self.fcl2s = torch.nn.Linear(128, 1)

  def forward(self, state):
    """Run a batch of states through the network.

    Args:
      state: float tensor of shape (batch, 4, 42, 42).

    Returns:
      (action_values, state_value): action logits of shape (batch, action_size)
      and state values of shape (batch,).
    """
    x = F.relu(self.conv1(state))
    x = F.relu(self.conv2(x))
    x = F.relu(self.conv3(x))
    x = self.flatten(x)
    x = F.relu(self.fcl1(x))
    action_values = self.fcl2a(x)
    # Drop the trailing size-1 feature dimension so every sample keeps its own
    # value. Indexing with [0] here would select the FIRST SAMPLE of the batch
    # and silently broadcast its value to all other samples downstream.
    state_value = self.fcl2s(x).squeeze(-1)
    return action_values, state_value

## Part 2 - Training the AI

### Setting up the environment

In [None]:
class PreprocessAtari(ObservationWrapper):
  """Observation wrapper that resizes, optionally grayscales, rescales to [0, 1]
  and stacks the most recent `n_frames` frames into one observation.

  Args:
    env: environment to wrap.
    height, width: size each frame is resized to.
    crop: callable applied to the raw frame before resizing (identity by default).
    dim_order: 'pytorch' for (channels, height, width) observations or
      'tensorflow' for (height, width, channels). Any other value raises KeyError.
    color: keep 3 colour channels per frame when True, otherwise convert to grayscale.
    n_frames: number of frames kept in the rolling buffer.
  """

  def __init__(self, env, height = 42, width = 42, crop = lambda img: img, dim_order = 'pytorch', color = False, n_frames = 4):
    super(PreprocessAtari, self).__init__(env)
    self.img_size = (height, width)
    self.crop = crop
    self.dim_order = dim_order
    self.color = color
    self.frame_stack = n_frames
    n_channels = 3 * n_frames if color else n_frames
    obs_shape = {'tensorflow': (height, width, n_channels), 'pytorch': (n_channels, height, width)}[dim_order]
    self.observation_space = Box(0.0, 1.0, obs_shape)
    # Rolling buffer of stacked frames; the newest frame occupies the last channel slot(s).
    self.frames = np.zeros(obs_shape, dtype = np.float32)

  def reset(self, *, seed = None, options = None):
    """Clear the frame buffer, reset the wrapped env and return (stacked_frames, info).

    `seed` and `options` are forwarded to the wrapped environment so the wrapper
    accepts the same keyword arguments as a Gymnasium `reset`.
    """
    self.frames = np.zeros_like(self.frames)
    obs, info = self.env.reset(seed = seed, options = options)
    self.update_buffer(obs)
    return self.frames, info

  def observation(self, img):
    """Preprocess one raw frame, push it into the buffer and return the buffer."""
    img = self.crop(img)
    height, width = self.img_size
    # cv2.resize expects dsize as (width, height), the reverse of img_size.
    img = cv2.resize(img, (width, height))
    if not self.color:
      if len(img.shape) == 3 and img.shape[2] == 3:
        # NOTE(review): frames are assumed to arrive in RGB channel order (the env
        # is created with render_mode='rgb_array') - confirm for other sources.
        img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    img = img.astype('float32') / 255.
    channels_per_frame = 3 if self.color else 1
    if self.dim_order == 'pytorch':
      # Channels-first: shift the oldest frame out along axis 0.
      self.frames = np.roll(self.frames, shift = -channels_per_frame, axis = 0)
      if self.color:
        # Resized colour frames are (H, W, 3); the buffer slot is (3, H, W).
        self.frames[-3:] = img if img.ndim == 2 else np.transpose(img, (2, 0, 1))
      else:
        self.frames[-1] = img
    else:
      # Channels-last: the frame axis is the last one.
      self.frames = np.roll(self.frames, shift = -channels_per_frame, axis = -1)
      if self.color:
        self.frames[..., -3:] = img[..., None] if img.ndim == 2 else img
      else:
        self.frames[..., -1] = img
    return self.frames

  def update_buffer(self, obs):
    """Push `obs` into the frame buffer (used by reset for the first frame)."""
    self.frames = self.observation(obs)

def make_env(env_id = "KungFuMasterDeterministic-v0", render_mode = 'rgb_array'):
  """Create a Gymnasium environment wrapped with PreprocessAtari.

  Args:
    env_id: Gymnasium environment id; defaults to the Kung Fu Master id used
      throughout this notebook.
    render_mode: render mode passed to gym.make.

  Returns:
    The wrapped environment, producing stacks of 4 grayscale 42x42 frames in
    channels-first ('pytorch') order.
  """
  env = gym.make(env_id, render_mode = render_mode)
  env = PreprocessAtari(env, height = 42, width = 42, crop = lambda img: img, dim_order = 'pytorch', color = False, n_frames = 4)
  return env

# Single environment used for inspection, evaluation and video recording.
env = make_env()

state_shape = env.observation_space.shape
number_actions = env.action_space.n
print("Observation shape:", state_shape) # a stack of 4 frames, each 42x42
print("Number actions:", number_actions)
# Use `unwrapped` to reach the base environment regardless of how many wrappers
# gym.make and PreprocessAtari put around it (chaining .env.env depends on wrapper depth).
print("Action names:", env.unwrapped.get_action_meanings())

Observation shape: (4, 42, 42)
Number actions: 14
Action names: ['NOOP', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'DOWNRIGHT', 'DOWNLEFT', 'RIGHTFIRE', 'LEFTFIRE', 'DOWNFIRE', 'UPRIGHTFIRE', 'UPLEFTFIRE', 'DOWNRIGHTFIRE', 'DOWNLEFTFIRE']


  logger.deprecation(
  logger.warn(


### Initializing the hyperparameters

In [None]:
# Learning rate of the Adam optimizer that updates the network weights during backpropagation.
learning_rate = 1e-4
# Discount factor used when computing the target state value.
discount_factor = 0.99
# Number of environment copies that are stepped together to form one training batch.
number_environments = 10

### Implementing the A3C class

In [None]:
# The learning update is integrated directly into the step method (there is no separate learn method).
class Agent():
  """Actor-critic agent that owns the network (`Brain`) and its Adam optimizer.

  Args:
    action_size: number of discrete actions available in the environment.
    entropy_coefficient: weight of the entropy bonus in the actor loss; a larger
      value encourages more exploration.
  """

  def __init__(self, action_size, entropy_coefficient = 0.001):
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.action_size = action_size
    self.entropy_coefficient = entropy_coefficient
    self.Brain = Brain(action_size).to(self.device)
    # Adam updates the network weights using the module-level learning_rate.
    self.optimizer = torch.optim.Adam(self.Brain.parameters(), lr = learning_rate)

  def act(self, state):
    """Sample one action per state from the softmax policy.

    Args:
      state: a single state with 3 dimensions, or a batch of states with 4.

    Returns:
      NumPy array with one sampled action index per state in the batch.
    """
    # Convert once from a contiguous array; wrapping the state in a Python list
    # first forces torch onto a slow list-of-ndarrays conversion path.
    state = torch.as_tensor(np.asarray(state), dtype = torch.float32, device = self.device)
    if state.ndim == 3:
      # A single state: add the batch dimension the network expects.
      state = state.unsqueeze(0)
    # Acting needs no gradients, so skip building the autograd graph.
    with torch.no_grad():
      action_values, _ = self.Brain(state)
      policy = F.softmax(action_values, dim = -1).cpu().numpy()
    # Draw one action per row of the policy according to its probabilities.
    return np.array([np.random.choice(len(p), p = p) for p in policy])

  def step(self, state, action, reward, next_state, done):
    """Perform one actor-critic update from a batch of transitions.

    All arguments are batches (NumPy arrays) whose first dimension indexes the
    transition. The network weights are updated in place; nothing is returned.
    """
    state = torch.as_tensor(state, dtype = torch.float32, device = self.device)
    next_state = torch.as_tensor(next_state, dtype = torch.float32, device = self.device)
    reward = torch.as_tensor(reward, dtype = torch.float32, device = self.device)
    # done arrives as booleans; convert to 0/1 floats so it can mask the bootstrap term.
    done = torch.as_tensor(done, dtype = torch.bool, device = self.device).to(dtype = torch.float32)
    # Actions are used as indices, so keep them as a long tensor on the same device.
    action = torch.as_tensor(action, dtype = torch.long, device = self.device)

    action_values, state_value = self.Brain(state)
    # The bootstrap target is treated as a constant, so no gradients are needed
    # for the next-state forward pass.
    with torch.no_grad():
      _, next_state_value = self.Brain(next_state)
    # One-step target: reward plus the discounted next-state value, zeroed when the episode ended.
    target_state_value = reward + discount_factor * next_state_value * (1 - done)
    advantage = target_state_value - state_value

    probs = F.softmax(action_values, dim = -1)
    log_probs = F.log_softmax(action_values, dim = -1)
    # Policy entropy per sample: -sum(p * log p).
    entropy = -torch.sum(probs * log_probs, dim = -1)
    # Log-probability of the action actually taken in each transition.
    logp_actions = log_probs.gather(1, action.unsqueeze(1)).squeeze(1)

    # Actor loss: policy-gradient term plus entropy bonus. The advantage is
    # detached so this term does not push gradients into the critic head.
    actor_loss = -(logp_actions * advantage.detach()).mean() - self.entropy_coefficient * entropy.mean()
    # Critic loss: mean squared error between predicted and target state values
    # (mse_loss takes the prediction first, the target second).
    critic_loss = F.mse_loss(state_value, target_state_value.detach())
    total_loss = actor_loss + critic_loss

    # Clear stale gradients, backpropagate, then apply the weight update.
    self.optimizer.zero_grad()
    total_loss.backward()
    self.optimizer.step()




### Initializing the A3C agent

In [None]:
# Build the agent; number_actions comes from env.action_space.n (14 in the setup cell's output).
agent = Agent(number_actions)

### Evaluating our A3C agent on a single episode

In [None]:
# Evaluate the agent over a given number of episodes (n_eps).
# The evaluate function returns a list with the total accumulated reward of each episode played.
def evaluate(agent, env, n_eps = 1):
  """Play `n_eps` full episodes and return the total reward of each one.

  Args:
    agent: object with an `act(state)` method returning an array of actions;
      only the first action is used because evaluation is not batched.
    env: environment whose `reset()` returns (state, info) and whose `step()`
      returns (state, reward, terminated, truncated, info).
    n_eps: number of episodes to play.

  Returns:
    List with one accumulated reward per episode.
  """
  episodes_rewards = []
  for _ in range(n_eps):
    state, _ = env.reset()
    total_reward = 0
    done = False
    while not done:
      action = agent.act(state)
      state, reward, terminated, truncated, _ = env.step(action[0])
      total_reward += reward
      # An episode ends on termination OR truncation (e.g. a time limit);
      # checking only `terminated` would never leave the loop after a truncation.
      done = terminated or truncated
    episodes_rewards.append(total_reward)
  return episodes_rewards




### Testing multiple agents on multiple environments at the same time

In [None]:
# A class that manages several environments at once, as needed for batched (parallelised) reinforcement learning.
# It steps and resets all environments together so the agent can be trained on a batch of transitions per iteration.
class EnvBatch:
  """A fixed-size collection of environments that are reset and stepped together.

  Args:
    n_envs: number of environments to create with make_env().
  """

  def __init__(self, n_envs = 10):
    self.envs = [make_env() for _ in range (n_envs)]

  def reset(self):
    """Reset every environment and return their initial states as one array."""
    # env.reset() returns (state, info); only the state is kept.
    return np.array([env.reset()[0] for env in self.envs])

  def step(self, actions):
    """Apply one action per environment.

    Args:
      actions: sequence with one action per environment, in the order of self.envs.

    Returns:
      (next_states, rewards, dones, infos) as NumPy arrays with one entry per
      environment. `dones` is True where the episode terminated OR was
      truncated; for those environments the returned next state is the first
      state of a freshly reset episode.
    """
    results = [env.step(a) for env, a in zip(self.envs, actions)]
    # Each env.step returns (state, reward, terminated, truncated, info);
    # zip(*...) regroups the per-env tuples into per-field tuples.
    next_states, rewards, terminated, truncated, infos = zip(*results)
    next_states = np.array(next_states)
    rewards = np.array(rewards)
    # Truncation also ends an episode, so it has to trigger a reset as well.
    dones = np.logical_or(terminated, truncated)
    infos = np.array(infos, dtype = object)
    for i in range(len(self.envs)):
      if dones[i]:
        # Start a new episode for the finished environment right away.
        next_states[i] = self.envs[i].reset()[0]
    return next_states, rewards, dones, infos




### Training the A3C agent

In [None]:
import tqdm # displays a progress bar for the training loop

# Create the batch of environments; number_environments is passed as the n_envs argument.
env_batch = EnvBatch(number_environments)
# Reset every environment to obtain the initial batch of states.
batch_states = env_batch.reset()

# 3001 iterations (0..3000 inclusive) so that the evaluation below also runs at iteration 3000.
with tqdm.trange(0, 3001) as progress_bar:
  for i in progress_bar:
    # One action per environment, sampled from the current policy.
    batch_actions = agent.act(batch_states)
    # Step every environment with its own action.
    batch_next_states, batch_rewards, batch_dones, _ = env_batch.step(batch_actions)
    # Scale the rewards down to reduce their magnitude, which helps stabilise training.
    batch_rewards *= 0.01
    # One training update: computes the total loss and backpropagates it with the Adam optimizer.
    agent.step(batch_states, batch_actions, batch_rewards, batch_next_states, batch_dones)
    batch_states = batch_next_states
    # Every 1000 iterations (i = 0, 1000, 2000, 3000) report the evaluation score.
    if i % 1000 == 0:
      # Mean total reward over 10 evaluation episodes played on the separate single env.
      print("Avergae agent reward: ", np.mean(evaluate(agent, env, n_eps= 10)))



  logger.deprecation(
  critic_loss = F.mse_loss(target_state_value.detach(), state_value)
  state = torch.tensor(state, dtype= torch.float32, device = self.device)
  0%|          | 5/3001 [00:44<5:31:04,  6.63s/it] 

Avergae agent reward:  500.0


 33%|███▎      | 1005/3001 [02:01<1:41:02,  3.04s/it]

Avergae agent reward:  870.0


 67%|██████▋   | 2005/3001 [03:31<54:03,  3.26s/it]  

Avergae agent reward:  1750.0


100%|██████████| 3001/3001 [04:50<00:00, 10.34it/s]

Avergae agent reward:  710.0





## Part 3 - Visualizing the results

In [None]:
import glob
import io
import base64
import imageio
from IPython.display import HTML, display
from gymnasium.wrappers.monitoring.video_recorder import VideoRecorder

def show_video_of_model(agent, env, output_path = 'video.mp4', fps = 30):
  """Play one episode with the agent and save the rendered frames as a video.

  Args:
    agent: object with an `act(state)` method returning an array of actions.
    env: environment to play in; it is closed once the episode ends.
    output_path: file the video is written to.
    fps: frame rate of the saved video.
  """
  state, _ = env.reset()
  done = False
  frames = []
  while not done:
    frames.append(env.render())
    action = agent.act(state)
    state, _, terminated, truncated, _ = env.step(action[0])
    # Stop on truncation as well as termination; otherwise a truncated episode
    # would keep being stepped and recorded indefinitely.
    done = terminated or truncated
  env.close()
  imageio.mimsave(output_path, frames, fps=fps)

show_video_of_model(agent, env)

def show_video():
    """Embed the first .mp4 file found in the working directory in the notebook output.

    Prints a message instead when no .mp4 file exists.
    """
    mp4list = glob.glob('*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        # Read-only binary mode is sufficient, and the context manager closes
        # the file handle once the bytes have been read.
        with open(mp4, 'rb') as video_file:
            video = video_file.read()
        encoded = base64.b64encode(video)
        display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else:
        print("Could not find video")

show_video()