In [1]:
# Select the matplotlib backend used for figures rendered in this notebook.
# NOTE(review): two backends are requested back-to-back; presumably the later
# call (`notebook`) is the one left active — confirm `inline` is still needed.
%matplotlib inline
%matplotlib notebook
# Automatically reload edited project modules before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
from env.SurrogateModel import SurrogateModel
from env.Decoder import Decoder
from env.VQVAE_environment import VQVAE_Env, RenderCallback
import torch
import os
import torch.nn as nn
import pandas as pd 
import numpy as np
from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env
import wandb
from wandb.integration.sb3 import WandbCallback

In [3]:
# Load Surrogate Model
# The project checkout location can be overridden with the ANALOG_NAS_ROOT
# environment variable so the notebook runs on other machines; the default is
# the original author's local checkout.
project_root = os.environ.get(
    'ANALOG_NAS_ROOT',
    '/Users/tawab/Desktop/columbia/Courses/Spring2024/HPML/Project/Analog_NAS',
)
surrogate_model = SurrogateModel(
    os.path.join(project_root, 'env', 'models', 'surrogate_model.json')
)

In [4]:
# Hyperparameters shared by the Decoder and the VQ-VAE environment.

# Latent space: the saved codebook has `num_embeddings` rows of width `embed_dim`.
embed_dim = 8
num_embeddings = 14

# Decoder network configuration (forwarded to `Decoder(...)` when it is loaded).
x_dim = 22
h_nodes = 512
num_layers = 5
scale = 2
dropout = 0.2

# Intended cap on the number of actions per episode
# (the environment is constructed with `max_allowed_actions` set to 200).
max_allowed_action = 200

In [5]:
# Load Decoder Model
# Resolve the checkpoint under the project checkout. Set the ANALOG_NAS_ROOT
# environment variable to run on another machine; the default is the original
# author's local checkout.
project_root = os.environ.get(
    'ANALOG_NAS_ROOT',
    '/Users/tawab/Desktop/columbia/Courses/Spring2024/HPML/Project/Analog_NAS',
)
decoder_model = Decoder(
    x_dim,
    embed_dim=embed_dim,
    h_nodes=h_nodes,
    dropout=dropout,
    scale=scale,
    num_layers=num_layers,
    load_path=os.path.join(project_root, 'env', 'models', 'decoder_model.pth'),
).to('cpu')  # keep the decoder on the CPU

Decoder model loaded from:  /Users/tawab/Desktop/columbia/Courses/Spring2024/HPML/Project/Analog_NAS/env/models/decoder_model.pth


In [6]:
# Load codebook
# Set the ANALOG_NAS_ROOT environment variable to point at the project checkout
# on another machine; the default is the original author's local checkout.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
project_root = os.environ.get(
    'ANALOG_NAS_ROOT',
    '/Users/tawab/Desktop/columbia/Courses/Spring2024/HPML/Project/Analog_NAS',
)
codebook = torch.load(
    os.path.join(project_root, 'env', 'models', 'codebook.pth'),
    map_location='cpu',
)

In [7]:
# Fail fast if the saved codebook disagrees with the configured latent space
# (one row per embedding, each `embed_dim` wide), then display the shape.
assert tuple(codebook.shape) == (num_embeddings, embed_dim), (
    f"codebook shape {tuple(codebook.shape)} != ({num_embeddings}, {embed_dim})"
)
codebook.shape

torch.Size([14, 8])

In [8]:
# Initialize your environment
env = VQVAE_Env(
    embed_dim=embed_dim,
    num_embeddings=num_embeddings,
    # Use the value from the parameters cell instead of repeating the literal 200.
    max_allowed_actions=max_allowed_action,
    surrogate_model=surrogate_model,
    decoder=decoder_model,
    codebook=codebook,
    num_previous_actions=4,  # the observation's action history shows 4 entries
)

In [9]:
# Validate the custom environment with Stable-Baselines3's `check_env` before
# training on it; `warn=True` also emits SB3's additional compatibility warnings.
check_env(env, warn=True)

### Train RL Agent

In [10]:
# Output locations (relative to the working directory): saved models and
# TensorBoard logs. Create both up front; existing directories are left alone.
model_dir, log_dir = 'models', 'logs'
for output_dir in (model_dir, log_dir):
    os.makedirs(output_dir, exist_ok=True)

In [11]:
# Instantiate the env
# Render assets live under the project checkout. Set the ANALOG_NAS_ROOT
# environment variable to run on another machine; the default is the original
# author's local checkout.
project_root = os.environ.get(
    'ANALOG_NAS_ROOT',
    '/Users/tawab/Desktop/columbia/Courses/Spring2024/HPML/Project/Analog_NAS',
)
render_dir = os.path.join(project_root, 'env', 'render')

vec_env = make_vec_env(
    VQVAE_Env,
    n_envs=1,
    env_kwargs=dict(
        embed_dim=embed_dim,
        num_embeddings=num_embeddings,
        # Use the value from the parameters cell instead of repeating the literal 200.
        max_allowed_actions=max_allowed_action,
        surrogate_model=surrogate_model,
        decoder=decoder_model,
        codebook=codebook,
        num_previous_actions=4,
        render_mode='human',
        render_data=os.path.join(render_dir, 'architectures_trained_on.npy'),
        render_labels=os.path.join(render_dir, 'labels.npy'),
    ),
)

In [12]:
# Reset the vectorized env and display the initial observation
# (a dict with 'action_history' and 'latent_vector', each with a leading batch axis of 1).
vec_env.reset()

OrderedDict([('action_history', array([[-1, -1, -1, -1]], dtype=int32)),
             ('latent_vector',
              array([[ 1.1818045 , -0.6451698 , -1.0985265 , -1.054006  , -0.97384953,
                      -1.4133068 , -0.14092945,  0.12463607]], dtype=float32))])

In [13]:
# Callback that renders the environment while the agent trains.
render_callback = RenderCallback(render_every=1)

# Run settings; also recorded as the W&B run config.
config = dict(policy='MultiInputPolicy', total_timesteps=25000)

# Start the Weights & Biases run that tracks this training session.
run = wandb.init(
    project="RL-Train_NAS",
    config=config,
    save_code=True,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    # monitor_gym=True,     # automatically upload gym environments' videos
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33masaficontact[0m ([33mtrex-ai[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [14]:
# Train the agent
# PPO with the policy named in `config`; TensorBoard logs go to `log_dir`,
# which W&B mirrors because the run was started with sync_tensorboard=True.
model = PPO(config['policy'], vec_env, verbose=1, tensorboard_log=log_dir)
try:
    model.learn(
        total_timesteps=config["total_timesteps"],
        callback=[
            WandbCallback(
                # Save under the configured model directory rather than a
                # second hard-coded "models" literal.
                model_save_path=os.path.join(model_dir, run.id),
                verbose=2,
            ),
            render_callback,
        ],
    )
finally:
    # Always close the W&B run, including when training is stopped early
    # (e.g. KeyboardInterrupt from interrupting the kernel). The exception
    # still propagates after the run is finished.
    run.finish()

Using cpu device
Logging to logs/PPO_35
Rendering the environment...
(1200, 8) (14, 8) (2048, 8)


<IPython.core.display.Javascript object>

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 90.7     |
|    ep_rew_mean     | 0.771    |
| time/              |          |
|    fps             | 134      |
|    iterations      | 1        |
|    time_elapsed    | 15       |
|    total_timesteps | 2048     |
---------------------------------
Rendering the environment...
(1200, 8) (14, 8) (2048, 8)


<IPython.core.display.Javascript object>

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 80.1        |
|    ep_rew_mean          | 0.777       |
| time/                   |             |
|    fps                  | 133         |
|    iterations           | 2           |
|    time_elapsed         | 30          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.020814458 |
|    clip_fraction        | 0.196       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.71       |
|    explained_variance   | -6.87       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0667     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0593     |
|    value_loss           | 0.0693      |
-----------------------------------------


KeyboardInterrupt: 

In [None]:
# NOTE(review): this cell repeats the earlier callback/config/W&B setup cell
# verbatim. It opens another W&B run, and none of the visible cells below call
# `run.finish()` — confirm whether it is still needed.

# Callback that renders the environment on every trigger (render_every=1).
render_callback = RenderCallback(render_every=1)

# Run settings; also recorded as the W&B run config.
config = dict(policy='MultiInputPolicy', total_timesteps=25000)

# Start a Weights & Biases run in the "RL-Train_NAS" project.
run = wandb.init(
    project="RL-Train_NAS",
    config=config,
    save_code=True,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    # monitor_gym=True,     # automatically upload gym environments' videos
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33masaficontact[0m ([33mtrex-ai[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Test the trained agent
# using the vecenv
# Roll the deterministic policy forward for at most `n_steps` steps, printing
# and rendering every transition, and stop at the end of the first episode.
obs = vec_env.reset()
n_steps = 125
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print(f"Step {step + 1}")
    print("Action: ", action)
    obs, reward, done, info = vec_env.step(action)
    print("obs=", obs, "reward=", reward, "done=", done)
    vec_env.render()
    # `done` holds one flag per environment; this notebook runs a single env.
    if done[0]:
        # Note that the VecEnv resets automatically when a done signal is
        # encountered, so `obs` is already the first observation of the NEXT
        # episode. The final observation of the finished episode is stored by
        # the VecEnv in info[0]["terminal_observation"] (unbatched); restore it
        # with a leading batch axis so later cells inspect the final state.
        terminal_obs = info[0].get("terminal_observation")
        if terminal_obs is not None:
            obs = {
                key: np.expand_dims(value, axis=0)
                for key, value in terminal_obs.items()
            }
        # NOTE(review): `done` may also mean the episode hit its action limit
        # rather than a goal — confirm the wording of this message.
        print("Goal reached!", "reward=", reward)
        break

Step 1
Action:  [24]
obs= OrderedDict([('action_history', array([[-1, -1, -1, 24]], dtype=int32)), ('latent_vector', array([[ 0.02487027,  0.5467654 ,  0.44229373,  1.4774871 ,  0.12495244,
        -1.3780882 ,  0.25097743, -1.3842576 ]], dtype=float32))]) reward= [0.9364191] done= [False]
Step 2
Action:  [84]
obs= OrderedDict([('action_history', array([[-1, -1, 24, 84]], dtype=int32)), ('latent_vector', array([[ 0.02487027,  0.5467654 ,  0.44229373,  1.4774871 ,  0.02469034,
        -1.3780882 ,  0.25097743, -1.3842576 ]], dtype=float32))]) reward= [0.0018602] done= [False]
Step 3
Action:  [84]
obs= OrderedDict([('action_history', array([[-1, 24, 84, 84]], dtype=int32)), ('latent_vector', array([[ 0.02487027,  0.5467654 ,  0.44229373,  1.4774871 ,  0.02469034,
        -1.3780882 ,  0.25097743, -1.3842576 ]], dtype=float32))]) reward= [0.] done= [False]
Step 4
Action:  [107]
obs= OrderedDict([('action_history', array([[ 24,  84,  84, 107]], dtype=int32)), ('latent_vector', array([[ 0.0



Step 63
Action:  [21]
obs= OrderedDict([('action_history', array([[89, 11, 92, 21]], dtype=int32)), ('latent_vector', array([[ 0.02487027, -0.03147931,  0.44229373,  0.04938179,  0.04833326,
         0.05207963,  0.0509036 , -1.3842576 ]], dtype=float32))]) reward= [0.] done= [False]
Step 64
Action:  [62]
obs= OrderedDict([('action_history', array([[11, 92, 21, 62]], dtype=int32)), ('latent_vector', array([[ 0.02487027, -0.03147931,  0.44229373,  0.04938179,  0.04833326,
         0.05207963,  0.0509036 , -1.3842576 ]], dtype=float32))]) reward= [0.] done= [False]
Step 65
Action:  [89]
obs= OrderedDict([('action_history', array([[92, 21, 62, 89]], dtype=int32)), ('latent_vector', array([[ 0.02487027, -0.03147931,  0.44229373,  0.04938179,  0.04833326,
         0.05207963,  0.0509036 , -1.3842576 ]], dtype=float32))]) reward= [0.] done= [False]
Step 66
Action:  [11]
obs= OrderedDict([('action_history', array([[21, 62, 89, 11]], dtype=int32)), ('latent_vector', array([[ 0.02487027, -0.031



In [None]:
# Take the latent vector from the last observation kept by the test rollout
# (batched: shape (1, 8) in the recorded output) and display it.
predicted_state = obs['latent_vector']
predicted_state

array([[ 0.02487027, -0.03147931,  0.44229373,  0.04938179,  0.04833326,
         0.05207963,  0.0509036 , -1.3842576 ]], dtype=float32)

In [None]:
# Decode the final latent vector with the trained decoder.
# This is inference only, so skip autograd bookkeeping.
# NOTE(review): the decoder is built with dropout=0.2 — confirm it is in eval
# mode here, otherwise this decode is stochastic.
with torch.no_grad():
    decoded_state = decoder_model(torch.from_numpy(predicted_state))

In [None]:
# Score the decoded state with the surrogate model
# (a one-element float32 array in the recorded output; presumably the predicted accuracy).
pred_accuracy = surrogate_model.evaluate(decoded_state)

In [None]:
# Display the surrogate model's prediction for the decoded final state.
pred_accuracy

array([0.75800395], dtype=float32)