# Soft Actor Critic (SAC) Notebook

## Install PyBullet and Box2D for physics-based environments

In [1]:
! pip install pybullet
! pip install Box2D

Collecting pybullet
  Downloading pybullet-3.2.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (90.8 MB)
[K     |████████████████████████████████| 90.8 MB 241 bytes/s 
[?25hInstalling collected packages: pybullet
Successfully installed pybullet-3.2.1
Collecting Box2D
  Downloading Box2D-2.3.10-cp37-cp37m-manylinux1_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 5.1 MB/s 
[?25hInstalling collected packages: Box2D
Successfully installed Box2D-2.3.10


In [2]:
! nvidia-smi

Mon Jan 24 20:49:39 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Clone the Repository

In [3]:
## uncomment only if running from google.colab
# clone the git reposetory
!git clone https://github.com/RoyElkabetz/SAC_with_PyTorch
# add path to .py files for import
import sys
sys.path.insert(1, "/content/SAC_with_PyTorch/src")

Cloning into 'SAC_with_PyTorch'...
remote: Enumerating objects: 161, done.[K
remote: Counting objects: 100% (161/161), done.[K
remote: Compressing objects: 100% (135/135), done.[K
remote: Total 161 (delta 70), reused 95 (delta 24), pack-reused 0[K
Receiving objects: 100% (161/161), 3.36 MiB | 29.41 MiB/s, done.
Resolving deltas: 100% (70/70), done.


## Get imports

In [4]:
import os
import gym
import numpy as np
import pybullet_envs

from utils import plot_learning_curve
from agents import Agent

## Set Agent and Env arguments (similar to the command-line API in the repo's README)

In [8]:
# ---- Experiment ----------------------------------------------------------
env_name = 'HalfCheetahBulletEnv-v0'   # environment id handed to gym.make
# NOTE(review): `dir` shadows the Python builtin of the same name; it is kept
# because the run cell reads it to build the checkpoint directory.
dir = 'tmp'
n_games = 1000                         # number of episodes the run cell loops over

# ---- Agent hyper-parameters (all forwarded to Agent as keyword arguments) -
gamma = 0.99               # presumably the discount factor -- confirm against Agent
alpha = 3e-4               # presumably a learning rate -- confirm against Agent
beta = 3e-4                # presumably a learning rate -- confirm against Agent
fc1_dim = 256
fc2_dim = 256
memory_size = 1_000_000
batch_size = 256
tau = 0.005
update_period = 2
reward_scale = 2.0
warmup = 1000
reparam_noise_lim = 1e-6

# ---- Run mode ------------------------------------------------------------
play = False               # True: deterministic actions, no learning / saving / plotting
load_checkpoint = False    # True: agent.load_model(...) is called before the loop
gpu_to_cpu = False         # forwarded to agent.load_model(gpu_to_cpu=...)


## Run SAC

In [9]:
# Train (play=False) or evaluate (play=True) the SAC agent on `env_name`.
#
# Reads the settings from the configuration cell, creates the environment and
# the agent, runs `n_games` episodes, checkpoints the agent whenever the
# running average score improves, and finally plots the learning curve.

env = gym.make(env_name)

# Checkpoints go to <dir>/<env_name>; exist_ok makes re-running the cell safe.
dir_path = os.path.join(dir, env_name)
os.makedirs(dir_path, exist_ok=True)

agent = Agent(gamma=gamma, alpha=alpha, beta=beta, state_dims=env.observation_space.shape,
              action_dims=env.action_space.shape, max_action=env.action_space.high[0],
              fc1_dim=fc1_dim, fc2_dim=fc2_dim, memory_size=memory_size,
              batch_size=batch_size, tau=tau, update_period=update_period,
              reward_scale=reward_scale, warmup=warmup, reparam_noise_lim=reparam_noise_lim,
              name='SAC_'+env_name, ckpt_dir=dir_path)

scores, avg_scores = [], []
best_score = -np.inf

# NOTE: env.render(mode='human') cannot be used in Colab, so nothing is
# rendered even when play=True.

if load_checkpoint:
    agent.load_model(gpu_to_cpu=gpu_to_cpu)

# The try/finally guarantees the environment is closed even when the run is
# interrupted (e.g. KeyboardInterrupt) or an episode raises.
try:
    for game in range(n_games):
        observation = env.reset()
        done = False
        score = 0

        while not done:
            # Deterministic actions when playing, sampled actions when training.
            action = agent.choose_action(observation, deterministic=play, reparameterize=False)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward, observation_, done)
            if not play:
                agent.learn()
            observation = observation_

        scores.append(score)
        # Running average over (at most) the last 100 episodes.
        avg_score = np.mean(scores[-100:])
        avg_scores.append(avg_score)

        # `best_score` is printed before it is updated below, i.e. it shows the
        # best average reached up to the previous episode.
        print(f'| Game: {game:6.0f} | Score: {score:10.2f} | Best score: {best_score:10.2f} | '
              f'Avg score {avg_score:10.2f} | Learning inter: {agent.learn_iter:10.0f} |')

        # Checkpoint only when the running average improves (training mode only).
        if avg_score > best_score:
            best_score = avg_score
            if not play:
                agent.save_model()
finally:
    env.close()

if not play:
    plot_learning_curve(scores, agent.full_path)


| Game:      0 | Score:   -1362.79 | Best score:       -inf | Avg score   -1362.79 | Learning inter:       1000 |
... saving checkpoint ...
... saving checkpoint ...
... saving checkpoint ...
... saving checkpoint ...
... saving checkpoint ...
| Game:      1 | Score:   -1096.40 | Best score:   -1362.79 | Avg score   -1229.60 | Learning inter:       2000 |
... saving checkpoint ...
... saving checkpoint ...
... saving checkpoint ...
... saving checkpoint ...
... saving checkpoint ...
| Game:      2 | Score:   -1037.66 | Best score:   -1229.60 | Avg score   -1165.62 | Learning inter:       3000 |
... saving checkpoint ...
... saving checkpoint ...
... saving checkpoint ...
... saving checkpoint ...
... saving checkpoint ...
| Game:      3 | Score:    -894.48 | Best score:   -1165.62 | Avg score   -1097.83 | Learning inter:       4000 |
... saving checkpoint ...
... saving checkpoint ...
... saving checkpoint ...
... saving checkpoint ...
... saving checkpoint ...
| Game:      4 | Score: 

KeyboardInterrupt: ignored