# 1. Installing dependencies via the pip command

In [None]:
!pip install stable-baselines3[extra]

In [None]:
!pip install gym[all]

In [None]:
import gym
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# 2. Loading environment

In [None]:
# Registry id of the cart-pole environment used throughout the notebook.
env_name = "CartPole-v0"
# Instantiate the environment from its registry id.
env = gym.make(env_name)

In [None]:
# Baseline: play a few episodes with randomly sampled actions and report
# the accumulated reward of each one.
episodes = 5
for episode in range(1, episodes + 1):
    # Begin a new episode with a cleared score.
    state, done, score = env.reset(), False, 0

    while not done:
        env.render()
        # Random action drawn from the action space (no model involved).
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
    print("Episode: {} , Score: {}".format(episode, score))
# Release the rendering resources once all episodes are finished.
env.close()

# 2.1 Understanding the Env

https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py

In [None]:
env.action_space
# Describes the set of actions available in the environment.
# Discrete(2) means there are two possible discrete values: 0 and 1.

In [None]:
# Draw one random action: 0 pushes the cart to the left, 1 pushes it to the right.
env.action_space.sample()

In [None]:
env.observation_space
# Describes the space of observations returned by the environment.
# Box is a bounded continuous space: here each observation is a single
# float32 array holding four values (not a list of lists).

In [None]:
# Draw one random observation, laid out as
# [cart position, cart velocity, pole angle, pole angular velocity].
env.observation_space.sample()

# 3. Training RL Model

In [None]:
# Relative directory passed to the models below as their tensorboard_log location.
log_path = os.path.join('Training', 'Logs')

In [None]:
# Build the environment inside the factory so the lambda does not capture the
# global name `env`, which is rebound to the vectorized wrapper on this very line.
env = DummyVecEnv([lambda: gym.make(env_name)])
# PPO agent with the 'MlpPolicy' policy; training metrics are logged under log_path.
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

In [None]:
# Train the PPO agent for 20,000 environment timesteps.
model.learn(total_timesteps=20000)

# 4. Save and Reload Models

In [None]:
# Location (relative to the working directory) where the trained PPO model is stored.
PPO_Path = os.path.join('Training', 'Saved Models', 'PPO_model_cartpole')
# Echo the resulting path as the cell output.
PPO_Path

In [None]:
# Persist the trained model to PPO_Path.
model.save(PPO_Path)

In [None]:
# Drop the in-memory model so the next cell demonstrates reloading it from disk.
del model

In [None]:
# Reload the saved PPO model from PPO_Path and attach the current environment to it.
model = PPO.load(PPO_Path, env=env)

# 5. Evaluation

In [None]:
# Run 10 evaluation episodes with rendering enabled; the cell output is the value
# returned by evaluate_policy (per the stable-baselines3 docs: mean reward and its
# standard deviation).
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
# Close the environment (and the render window opened during evaluation).
env.close()

# 6. Testing Model

In [None]:
# Let the trained PPO model play a few rendered episodes and print each score.
# NOTE(review): `env` is the DummyVecEnv wrapper created earlier, so the values
# returned by step() are presumably per-environment arrays -- confirm if the
# score should be shown as a plain number.
episodes = 5
for episode in range(1, episodes + 1):
    # Begin a new episode with a cleared score.
    obs, done, score = env.reset(), False, 0

    while not done:
        env.render()
        # Ask the model for the action to take given the latest observation.
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
    print("Episode: {} , Score: {}".format(episode, score))
# Release the rendering resources once all episodes are finished.
env.close()

In [None]:
obs = env.reset() # resetting the environment to its initial conditions
# Echo the initial observation as the cell output.
obs

In [None]:
action, _ = model.predict(obs) # predict an action for the fresh observation; the second return value is ignored

In [None]:
env.step(action) # step with the predicted action; a reward of 1 means we succeeded in maintaining the pole in the upright position

# 7. Logs in Tensorboard

In [None]:
# NOTE(review): 'PPO_4' is a hard-coded run folder (the last successfully trained
# model, per the original note) -- adjust it to the run you want to inspect.
training_log_path = os.path.join(log_path, 'PPO_4')
# Echo the resulting path as the cell output.
training_log_path

In [None]:
!tensorboard --logdir = {training_log_path} # ! is a magic command. Issuing it here makes it execute in a command line.

# 8. Callback to stop training once the desired reward is reached (training stage)

In [None]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [None]:
# Directory handed to EvalCallback below as best_model_save_path.
save_path = os.path.join('Training', 'Saved Models')

In [None]:
# Evaluate on a dedicated environment: running evaluation episodes on the
# training env would reset it in the middle of the agent's rollout collection.
eval_env = DummyVecEnv([lambda: gym.make(env_name)])

# Stops training once the evaluated mean reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

# Periodically (every `eval_freq` callback steps) evaluates the model, saves the
# best one so far under `save_path`, and triggers `stop_callback` on a new best.
eval_callback = EvalCallback(eval_env,
                            callback_on_new_best=stop_callback,
                            eval_freq=10000,
                            best_model_save_path=save_path,
                            verbose=1
                            )

In [None]:
# Fresh PPO agent (same settings as before) to be trained with the evaluation callback.
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

In [None]:
# Train for up to 20,000 timesteps; eval_callback may end training earlier
# once the reward threshold is reached.
model.learn(total_timesteps=20000, callback=eval_callback)

## 8.1 Testing the model trained with the reward-threshold callback (threshold = 200)

In [None]:
# Let the callback-trained model play a few rendered episodes and print each score.
episodes = 5
for episode in range(1, episodes + 1):
    # Begin a new episode with a cleared score.
    obs, done, score = env.reset(), False, 0

    while not done:
        env.render()
        # Ask the model for the action to take given the latest observation.
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
    print("Episode: {} , Score: {}".format(episode, score))
# Release the rendering resources once all episodes are finished.
env.close()

# 9. Changing Policies (network architecture of the MLP)

In [None]:
# Custom network layout: four hidden layers of 128 units each for the policy
# ("pi") and, separately, for the value function ("vf").
# NOTE(review): newer stable-baselines3 releases expect a plain dict here rather
# than a list wrapping one -- confirm against the installed version.
net_arch = [
    {
        'pi': [128] * 4,
        'vf': [128] * 4,
    },
]

In [None]:
# PPO agent whose policy network uses the custom layout, passed through policy_kwargs.
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path, policy_kwargs={'net_arch': net_arch})

In [None]:
# Train the custom-architecture agent for up to 20,000 timesteps.
# NOTE(review): eval_callback is the same instance used for the earlier training
# run -- confirm that any state it carried over (e.g. the best reward seen so far)
# is intended, or create a fresh callback for this model.
model.learn(total_timesteps=20000, callback=eval_callback)

## 9.1 Testing with the new MLP architecture

In [None]:
# Let the custom-architecture model play a few rendered episodes and print each score.
episodes = 5
for episode in range(1, episodes + 1):
    # Begin a new episode with a cleared score.
    obs, done, score = env.reset(), False, 0

    while not done:
        env.render()
        # Ask the model for the action to take given the latest observation.
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
    print("Episode: {} , Score: {}".format(episode, score))
# Release the rendering resources once all episodes are finished.
env.close()

# 10. Alternate Algorithms

In [None]:
from stable_baselines3 import DQN

In [None]:
# DQN agent on the same environment, with the same policy type and log directory as the PPO runs.
model_DQN = DQN('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

In [None]:
# Train the DQN agent for 20,000 timesteps.
# log_interval=4 sets how often training statistics are logged
# (presumably counted in episodes -- confirm against the stable-baselines3 docs).
model_DQN.learn(total_timesteps=20000, log_interval=4)

## 10.1 Testing the trained DQN model

In [None]:
# Let the trained DQN model play a few rendered episodes and print each score.
episodes = 5
for episode in range(1, episodes + 1):
    # Begin a new episode with a cleared score.
    obs, done, score = env.reset(), False, 0

    while not done:
        env.render()
        # Deterministic prediction: no sampling noise when choosing the action.
        action, _ = model_DQN.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        score += reward
    print("Episode: {} , Score: {}".format(episode, score))
# Release the rendering resources once all episodes are finished.
env.close()

### As we can see, this model does not perform very well with the above settings.

In [None]:
# Location (relative to the working directory) where the trained DQN model is stored.
DQN_Path = os.path.join('Training', 'Saved Models', 'DQN_model_cartpole')

In [None]:
# Persist the trained DQN model to DQN_Path.
model_DQN.save(DQN_Path)