In [None]:
# Must run this, the one installed is missing mpi
!pip install stable-baselines[mpi]==2.10.0

In [None]:
import numpy as np

from stable_baselines import PPO2
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.policies import MlpPolicy

from stable_baselines.common.vec_env import SubprocVecEnv, VecNormalize

from differential_drive_env_v2_wrappers import DifferentialDriveEnvV2Unscaled, RLAgentUnscalingWrapper
import baseline_integration as bi

from stable_baselines.gail import generate_expert_traj
from stable_baselines.gail import ExpertDataset
from stable_baselines.common.evaluation import evaluate_policy 


In [None]:
# Roll out a previously trained PPO2 agent and record its episodes with
# generate_expert_traj(); the pretraining cell reads the result back as
# 'expert.npz'.

# Initial pose handed to the environment constructor (reused by later cells).
init_pose = [0.3, 0.3, np.pi]

# Name of the model to use to create the expert dataset,
# maybe one of these:
#    "ppo2_gaussian_012"
#    "ppo2_gaussian_016"
#    "ppo2_gaussian_017"
#    "test_corrected_env"
model_name = ""

# Fail fast with an actionable message instead of handing the empty
# placeholder to PPO2.load().
if not model_name:
    raise ValueError(
        "Set `model_name` to the saved PPO2 model that should act as the expert "
        "before running this cell."
    )

# Define the environment.
# NOTE(review): r=0.17 here, while the training and visualization cells use
# r=0.16 — confirm whether the difference is intentional.
env = make_vec_env(
    lambda: DifferentialDriveEnvV2Unscaled(init_position=init_pose, goal_threshold=0.1,
                                           L=0.5, r=0.17, max_duration=500),
    n_envs=1,
)
env = VecNormalize(env)

# Load the model and attach the environment it will be rolled out in.
model = PPO2.load(model_name)
model.set_env(env)

# generate_expert_traj(model, file_name, n_episodes)
generate_expert_traj(model, 'expert', n_episodes=50)

# The expert and its environment are not needed any more: close the
# vectorized environment before dropping the references.
env.close()
del model
del env

In [None]:
def _make_training_env():
    """Build one fresh, independent environment instance for a vec-env worker."""
    return DifferentialDriveEnvV2Unscaled(init_position=init_pose, goal_threshold=0.1,
                                          L=0.5, r=0.16, max_duration=500)


# make_vec_env calls the factory once per worker. Handing it a factory that
# returns one pre-built instance would make all four workers wrap (and step)
# the very same object, so each call must create its own environment.
env = make_vec_env(_make_training_env, n_envs=4)
env = VecNormalize(env)

In [None]:
# Load the expert rollouts recorded earlier.
# NOTE(review): traj_limitation=1 presumably restricts pretraining to a single
# recorded trajectory even though 50 episodes were generated — confirm.
dataset = ExpertDataset(
    expert_path='expert.npz',
    traj_limitation=1,
    batch_size=128,
)

# Fresh PPO2 agent with an MLP policy, bound to the training environment.
model = PPO2(policy=MlpPolicy, env=env, verbose=1)

# Pretrain the PPO2 agent on the expert dataset.
model.pretrain(dataset=dataset, n_epochs=5000)



In [None]:
# Further train the pretrained model with PPO2 if necessary.
# To continue on a different vectorized environment (e.g. another number of
# parallel workers), uncomment the lines below before training. The lambda
# builds a new environment on every call so that each worker gets its own
# instance.
#
# del env
# env = make_vec_env(lambda: DifferentialDriveEnvV2Unscaled(init_position=init_pose, goal_threshold=0.1, L=0.5, r=0.16, max_duration=500), n_envs=4)
# env = VecNormalize(env)
# model.set_env(env)

# Total number of environment steps for the additional training phase.
n_training_steps = 500000
model.learn(total_timesteps=n_training_steps)

In [None]:
# Choose a file name for the newly trained model and write it to disk.
# NOTE(review): the VecNormalize running statistics held by `env` are
# presumably not part of this file — confirm whether they must be saved
# separately.
pretrained_model_name = "pretraining_attempt"
model.save(save_path=pretrained_model_name)

In [None]:
# Evaluate the model; evaluate_policy returns the mean reward per episode, and the std.
#
# The policy was trained on observations coming out of the VecNormalize wrapper
# in `env`. Evaluating on the bare `env.envs[0]` would bypass that wrapper and
# feed the policy un-normalized observations, so build a single-env evaluation
# wrapper instead that:
#   * reuses the observation statistics gathered during training,
#   * is frozen (training=False) so evaluation does not alter those statistics,
#   * leaves rewards un-normalized so the reported numbers are raw episode rewards.
eval_env = make_vec_env(
    lambda: DifferentialDriveEnvV2Unscaled(init_position=init_pose, goal_threshold=0.1,
                                           L=0.5, r=0.16, max_duration=500),
    n_envs=1,
)
eval_env = VecNormalize(eval_env, training=False, norm_reward=False)
eval_env.obs_rms = env.obs_rms
evaluate_policy(model, eval_env, n_eval_episodes=5)

In [None]:
# Visualize the trajectory obtained by running a saved model.

ppo2_model_name = "pretraining_attempt" # Change this
env_class_name = DifferentialDriveEnvV2Unscaled
rl_agent_wrapper_class = RLAgentUnscalingWrapper
rl_agent_wrapper_params = {"state_scaling_factors": [1.0, 1.0, np.pi], "action_scaling_factors": [3.0, 3.0]}
init_robot_pose = {'x': 0.3, 'y': 0.3, 'theta': np.pi/2}
#init_robot_pose = {'x': 0.3, 'y': 0.3, 'theta': 0}

# Build the pose list in an explicit (x, y, theta) order instead of relying on
# the dict's iteration order, which is not guaranteed on older Python 3 versions.
init_robot_pose_list = [init_robot_pose[key] for key in ('x', 'y', 'theta')]

# NOTE(review): 500, 0.50 and 0.16 presumably mirror the max_duration, L and r
# given to the environment constructor in the training cells — confirm against
# baseline_integration.
obss, actions = bi.load_and_run_model(ppo2_model_name, 500, 0.50, 0.16, env_class_name, init_robot_pose_list, rl_agent_wrapper_class, rl_agent_wrapper_params)
print("Ho {} obss e {} actions".format(len(obss),len(actions)))
print("X \n {}".format(obss))
print("Commands \n {}".format(actions))
bi.show_rl_trajectory(obss,actions,0.50,0.16)