AICU Lane following with stable baselines3



Check the received GPU (at least 16 GB of GPU RAM is required)

In [None]:
#Print the NVIDIA GPU status of this runtime so the available GPU memory can be checked
!nvidia-smi

Installation section:
 - Stable baselines 3
 - Optuna
 - swig cmake
 - GYM (OpenAI)
 - pip (latest version)
 - GYM Duckietown
 - Virtual display libraries:
  - pyvirtualdisplay
  - piglet
  - xvfb
  - python-opengl





In [None]:
#Install stable baselines 3
!pip install stable-baselines3[extra] box2d box2d-kengz

#Install Optuna (hyperparam. optimizer tool)
!pip install optuna

#Install swig, cmake
!apt install swig cmake

#Clone and install OPENAI - GYM
!git clone https://github.com/openai/gym.git
%cd gym/
!pip3 install -e .

#Install and upgrade pip
!python3 -m pip install --upgrade pip
%cd ..

#Clone and install Duckietown-GYM
!git clone https://github.com/duckietown/gym-duckietown.git
%cd gym-duckietown/
!git checkout dcf8dd3
!pip install -e .

#Create virtual monitor
%cd /
!pip install pyvirtualdisplay
!pip install piglet
!apt install xvfb -y
!apt-get install python-opengl -y
!apt-get install ffmpeg freeglut3-dev xvfb 

#Start display
print("\n Starting virtual display... \n")
from pyvirtualdisplay import Display
display = Display(visible=0, size=(640, 480))
display.start()

#Install duckietown world for visualization
#!git clone https://github.com/duckietown/duckietown-world.git
#!git chekcout e069378
#%cd duckietown-world/
#!pip install -r requirements.txt





In [None]:
#Choosing the right tensorflow version for compatibility reasons
#NOTE(review): the models in this notebook are built with stable_baselines3 -
#confirm that selecting TensorFlow 1.x is still required
%tensorflow_version 1.x

Import the necessary libraries:
- gym
- gym envs
- duckietown envs
- display
- os
- numpy
- plt
- PPO
- CnnPolicy
- BaseCallback
- evaluate policy
- OPTUNA
- joblib


In [None]:
#Go one directory up; the import cell below enters "content" relative to the
#resulting working directory
%cd .. 

In [None]:
import os

import numpy as np
import matplotlib.pyplot as plt

#Enter the gym clone before importing gym
%cd content
%cd gym
from gym import envs
import gym

#Enter the gym-duckietown clone and import its environment module
#(presumably this import is what registers the Duckietown-* ids used below -
#TODO confirm)
%cd ..
%cd gym-duckietown/
import gym_duckietown.envs.duckietown_env
#Environment ids (plain strings, not environment objects)
env1 = 'Duckietown-small_loop-v0'   #Small loop map
env2 = 'Duckietown-udem1-v0'        #More complex urban environment
env3 = 'Duckietown-straight_road-v0'#Straight road map

from stable_baselines3 import PPO, A2C #Algorithm imports
#CnnPolicy is imported twice under the same name; the a2c import comes last,
#so that is the binding the rest of the notebook uses
from stable_baselines3.ppo.policies import CnnPolicy
from stable_baselines3.a2c.policies import CnnPolicy
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy

#Start a 640x480 virtual display (the install cell starts one as well)
from pyvirtualdisplay import Display
display = Display(visible=0, size=(640, 480))
display.start()

#Hyperparameter tuning and study persistence
import optuna
import joblib as joblib
from optuna.samplers import TPESampler

#For data visualization
from google.colab import files

#In case you want to save best models automatically
#!pip install pickle-mixin
#import pickle

Start the basic training with Stable baselines 3 algorithm



In [None]:
#Create model
#Hyperparameters of the baseline PPO agent, collected in one mapping
baseline_ppo_kwargs = dict(
    n_steps=512,
    batch_size=128,
    gamma=0.99,
    gae_lambda=0.9,
    n_epochs=20,
    ent_coef=0.0,
    sde_sample_freq=4,
    max_grad_norm=0.5,
    vf_coef=0.5,
    learning_rate=0.00003,
    use_sde=True,
    clip_range=0.4,
)
#env1 is an environment id string; it is handed to PPO as-is
model = PPO(CnnPolicy, env1, verbose=0, **baseline_ppo_kwargs)

In [None]:
#OPTIONAL
#Callback function for debugging and supervising the training


#class SaveOnBestTrainingRewardCallback(BaseCallback):
    
    
# #   Callback for saving a model (the check is done every ``check_freq`` steps)
# #   based on the training reward (in practice, we recommend using ``EvalCallback``).

# #   :param check_freq: (int)
# #   :param log_dir: (str) Path to the folder where the model will be saved.
# #     It must contain the file created by the ``Monitor`` wrapper.
# #   :param verbose: (int)
    
#    def __init__(self, check_freq: int, log_dir: str, verbose=1):
#        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
#        self.check_freq = check_freq
#        self.log_dir = log_dir
#        self.save_path = os.path.join(log_dir, 'best_model')
#        self.best_mean_reward = -np.inf

#    def _init_callback(self) -> None:
#        # Create folder if needed
#        if self.save_path is not None:
#            os.makedirs(self.save_path, exist_ok=True)

#    def _on_step(self) -> bool:
#        if self.n_calls % self.check_freq == 0:

#          # Retrieve training reward
#          x, y = ts2xy(load_results(self.log_dir), 'timesteps')
#          if len(x) > 0:
#              # Mean training reward over the last 100 episodes
#              mean_reward = np.mean(y[-100:])
#              if self.verbose > 0:
#                print(f"Num timesteps: {self.num_timesteps}")
#                print(f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}")

#              # New best model, you could save the agent here
#              if mean_reward > self.best_mean_reward:
#                  self.best_mean_reward = mean_reward
#                  # Example for saving best model
#                  if self.verbose > 0:
#                    print(f"Saving new best model to {self.save_path}.zip")
#                  self.model.save(self.save_path)

#        return True


In [None]:
#OPTIONAL
##Create callback and check every 10 steps
#callback = SaveOnBestTrainingRewardCallback(check_freq=10, log_dir='/log_dir')

In [None]:
#Evaluate the model before any training, as a baseline.
#The model's own environment is used: no module-level variable named `env`
#exists at this point, so referring to it fails on a fresh top-to-bottom run.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
#Train the baseline agent for 100 timesteps.
#Variant with the optional callback:
#model.learn(total_timesteps=100, callback=callback)
model.learn(total_timesteps=100)

In [None]:
#Evaluate the trained model on the environment it was trained with
environment = model.get_env()
evaluation = evaluate_policy(model, environment, n_eval_episodes=10)
mean_reward, std_reward = evaluation

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
#Step out of the gym-duckietown/ directory that the import cell entered
%cd ..


Hyperparameter tuning with Optuna

In [None]:
#PPO optimizer function

def optimize_ppo(trial):
    """Sample one set of PPO hyperparameters from an Optuna trial.

    :param trial: Optuna trial, or any object that offers ``suggest_int``
        and ``suggest_float``.
    :return: dict of keyword arguments for the ``PPO`` constructor with the
        keys ``n_steps``, ``gamma``, ``learning_rate``, ``ent_coef`` and
        ``n_epochs``.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform;
    # parameter names, bounds and sampling order are unchanged.
    return {
        'n_steps': trial.suggest_int('n_steps', 32, 512),
        'gamma': trial.suggest_float('gamma', 0.9, 0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True),
        'ent_coef': trial.suggest_float('ent_coef', 1e-8, 1e-1, log=True),
        'n_epochs': trial.suggest_int('n_epochs', 3, 5),
    }

#A2C optimizer function

def optimize_a2c(trial):
    """Sample one set of A2C hyperparameters from an Optuna trial.

    :param trial: Optuna trial, or any object that offers ``suggest_int``
        and ``suggest_float``.
    :return: dict of keyword arguments for the ``A2C`` constructor with the
        keys ``n_steps``, ``gamma``, ``learning_rate`` and ``ent_coef``.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform;
    # parameter names, bounds and sampling order are unchanged.
    return {
        'n_steps': trial.suggest_int('n_steps', 32, 512),
        'gamma': trial.suggest_float('gamma', 0.9, 0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True),
        'ent_coef': trial.suggest_float('ent_coef', 1e-8, 1e-1, log=True),
    }


In [None]:
#Agent optimizer functions (Optuna objectives) for PPO and A2C.
#Each algorithm has its own name: defining `optimize_agent` twice would make
#the later definition silently replace the earlier one.

#Settings shared by both objectives
TUNING_ENV_ID = 'Duckietown-straight_road-v0'
#TUNING_ENV_ID = 'Duckietown-small_loop-v0'
TUNING_TRAIN_TIMESTEPS = 100
#NOTE(review): 100000 evaluation episodes per trial is a very long run. The
#episode count is also the step number reported to the study's pruner, so
#revisit the pruner's n_warmup_steps whenever this value changes.
TUNING_EVAL_EPISODES = 100000


def _run_tuning_trial(trial, algorithm, sample_params):
    """Train one model with sampled hyperparameters and score it.

    :param trial: Optuna trial the hyperparameters are sampled from.
    :param algorithm: model class to build (``PPO`` or ``A2C``).
    :param sample_params: function mapping the trial to constructor keyword
        arguments (``optimize_ppo`` or ``optimize_a2c``).
    :return: negated mean episode reward over the evaluation episodes.
    :raises optuna.TrialPruned: when ``trial.should_prune()`` is true after
        the value has been reported.
    """
    model_params = sample_params(trial)
    env = gym.make(TUNING_ENV_ID)
    #For multiprocessing
    #env = SubprocVecEnv([lambda: gym.make('Duckietown-small_loop-v0') for i in range(n_cpu)])
    try:
        #Train on the same environment that is evaluated below
        model = algorithm(CnnPolicy, env, verbose=0, **model_params)
        model.learn(TUNING_TRAIN_TIMESTEPS)

        rewards = []
        reward_sum = 0.0

        obs = env.reset()
        while len(rewards) < TUNING_EVAL_EPISODES:
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            reward_sum += reward

            if done:
                #Episode finished: store its return and start a new one
                rewards.append(reward_sum)
                reward_sum = 0.0
                obs = env.reset()
    finally:
        #Close the environment even when training or evaluation fails
        env.close()

    objective_value = -1 * np.mean(rewards)
    trial.report(objective_value, len(rewards))

    #Handle pruning based on the intermediate value.
    if trial.should_prune():
        raise optuna.TrialPruned()

    return objective_value


def optimize_agent(trial):
    """Optuna objective for PPO.

    The value is the negated mean episode reward, so a study that minimises
    its objective ends up maximising the reward.

    :param trial: Optuna trial.
    :return: negated mean episode reward of the trained PPO model.
    """
    return _run_tuning_trial(trial, PPO, optimize_ppo)


def optimize_agent_a2c(trial):
    """Optuna objective for A2C.

    The value is the negated mean episode reward, so a study that minimises
    its objective ends up maximising the reward.

    :param trial: Optuna trial.
    :return: negated mean episode reward of the trained A2C model.
    """
    return _run_tuning_trial(trial, A2C, optimize_a2c)


In [None]:
#Setting up the hyperparameter tuner study with a seeded sampler and a pruner
sampler = TPESampler(seed=3)
pruner = optuna.pruners.PercentilePruner(
    percentile=50,
    n_startup_trials=20,
    n_warmup_steps=10000,
    interval_steps=1,
)
study = optuna.create_study(pruner=pruner, sampler=sampler)

#Start optimization.
#`catch` has to hold exception classes; a float such as float('nan') is not
#one. AssertionError/ValueError are assumed to be what a diverged (NaN) model
#raises while training - TODO confirm against a failing trial.
study.optimize(optimize_agent, n_trials=50, gc_after_trial=True, catch=(AssertionError, ValueError))
#study.optimize(optimize_agent, n_trials=1000, n_jobs=4) #
study.best_params

In [None]:
#Extracting the params
BestParams = study.best_params
print(BestParams)
b_n_steps = BestParams["n_steps"]
b_gamma = BestParams["gamma"]
b_learning_rate = BestParams["learning_rate"]
b_ent_coef = BestParams["ent_coef"]
#n_epochs is sampled by optimize_ppo only, so a study tuned with the A2C
#sampler has no such key; None then means "not tuned"
b_n_epochs = BestParams.get("n_epochs")

#Creating a model with the calculated hyperparams; every argument that was
#not tuned keeps its PPO default
model = PPO(CnnPolicy, env1, verbose=0, **BestParams)
#modela2c = A2C(CnnPolicy, env1, verbose=0, ent_coef=b_ent_coef, gamma=b_gamma, learning_rate=b_learning_rate, n_steps=b_n_steps)


In [None]:
#Train the agent built from the tuned hyperparameters for 100 timesteps.
#Variant with the optional callback:
#model.learn(total_timesteps=100, callback=callback)
model.learn(total_timesteps=100)

In [None]:
#Evaluate trained model with hyperparameters.
#env1 is only an environment id string, so the evaluation runs on the
#environment object held by the model instead.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=100)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

Resume hyperparameter tuning. Better optimization needs more computing time, so the study is saved to disk and reloaded at regular intervals; if the runtime crashes, the trials finished up to the last save are preserved.





In [None]:
#Continue the study in 10 rounds of 20 trials, saving it after every round
for i in range(10):
  #`catch` has to hold exception classes; a float such as float('NaN') is not
  #one. AssertionError/ValueError are assumed to be what a diverged (NaN)
  #model raises while training - TODO confirm against a failing trial.
  #study.optimize(optimize_agent, n_trials=10, gc_after_trial=True, catch=(AssertionError, ValueError))
  study.optimize(optimize_agent, n_trials=20, gc_after_trial=True, catch=(AssertionError, ValueError))
  #Save the study, then continue from the saved copy
  joblib.dump(study, 'PPOstraightV1.pkl')
  study = joblib.load('PPOstraightV1.pkl')
  print(i)

In [None]:
#Load the saved study and print its best trial: objective value first, then
#one line per hyperparameter
study = joblib.load('/content/PPOstraightV1.pkl')
best_trial = study.best_trial
print('Best trial until now:')
print(' Value: ', best_trial.value)
print(' Params: ')
for key, value in best_trial.params.items():
    print(f'    {key}: {value}')

In [None]:
#Load the saved tuning studies into variables
ppo_study_path = '/content/PPOstraightV1.pkl'
#NOTE(review): no cell shown in this notebook writes the second file -
#presumably it has to be uploaded beforehand; confirm
a2c_study_path = '/content/studyStraightv2seed3a2c.pkl'
study1 = joblib.load(ppo_study_path)
study2 = joblib.load(a2c_study_path)

Visualize the tuning results

In [None]:
#Compute the hyperparameter importances of study1 and show them
param_importances = optuna.importance.get_param_importances(study1)
param_importances

In [None]:
#Empirical distribution function (cumulative probability) of the objective
#values: shows how likely each range of objective values is.
#Study1 - PPO, Study2 - A2C
studies_to_compare = [study1, study2]
edf_figure = optuna.visualization.plot_edf(studies_to_compare)
edf_figure

In [None]:
#Parallel-coordinate plot: joins the hyperparameter values of every trial and
#shows the objective value each trial reached
parallel_figure = optuna.visualization.plot_parallel_coordinate(study1)
parallel_figure

In [None]:
#Slice plot: shows how the trials cluster for each hyperparameter
slice_figure = optuna.visualization.plot_slice(study1)
slice_figure