In [1]:
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'       # Restrict OpenBLAS to a single thread; set before numpy is imported below
import numpy as np
import matplotlib.pyplot as plt
import json
import gymnasium as gym
import yaml
from datetime import datetime

import stable_baselines3

from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize, SubprocVecEnv
from stable_baselines3 import PPO, A2C, SAC, TD3
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.callbacks import EvalCallback, CallbackList, CheckpointCallback, StopTrainingOnNoModelImprovement

from sogym.mmc_optim import run_mmc
from sogym.env import sogym
from sogym.expert_generation import generate_expert_dataset, generate_mmc_solutions, generate_dataset
from sogym.utils import profile_and_analyze,ImageDictExtractor, CustomBoxDense
from sogym.callbacks import FigureRecorderCallback, MaxRewardCallback, GradientNormCallback, GradientClippingCallback
from sogym.pretraining import pretrain_agent, ExpertDataSet

import torch
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import random_split, Dataset
from IPython.display import display

%load_ext autoreload
%autoreload 2

print('SB3 version:', stable_baselines3.__version__)
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Using device:', device)

---
### Environment test and visualization:

In [4]:
# Create the training and evaluation environments shared by the cells below.
# Both use the same observation type, volume-constraint mode and resolution;
# they differ in `mode` and in whether connectivity checking is enabled.
observation_type = 'topopt_game'
train_env = sogym(
    mode='train',
    observation_type=observation_type,
    vol_constraint_type='hard',
    resolution=50,
    check_connectivity=True,
)
eval_env = sogym(
    mode='test',
    observation_type=observation_type,
    vol_constraint_type='hard',
    resolution=50,
    check_connectivity=False,
)

In [None]:
import matplotlib.pyplot as plt

# Roll out random-action episodes until one ends with a non-zero reward.
# Every finished episode is plotted; the figure of the last (successful)
# episode is the one written to disk.
reward = 0.0
while reward == 0.0:
    obs, info = train_env.reset()
    while True:
        action = train_env.action_space.sample()
        obs, reward, dones, truncated, info = train_env.step(action)
        if dones:
            break

    fig = train_env.plot()
fig.savefig('env_test.png')

In [35]:
from sogym.utils import visualize_expert_trajectory
# Visualize one stored expert trajectory (a .json file) using the training environment.

# NOTE(review): absolute, machine-specific path -- this cell only runs where the file exists.
file_path = '/home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies/unique_narval/20240414-174038-790212.json'
visualize_expert_trajectory(train_env, file_path)

In [50]:
# Run the SB3 environment checker on a freshly constructed training environment
# (all other constructor arguments are left at their defaults); it outputs
# additional warnings if needed.
check_env(sogym(mode='train',observation_type='topopt_game'))

In [None]:
# Profile the training environment with profile_and_analyze.
# Number of episodes to run
num_episodes = 20
# Call the profile_and_analyze function on the training environment
result_df = profile_and_analyze(num_episodes, train_env)
# Leave the resulting DataFrame as the last expression so the notebook displays it
result_df

In [None]:
# Reset the training environment before reading its domain size (dx, dy) below.
# The gymnasium-style reset returns (observation, info), as unpacked elsewhere
# in this notebook.
obs, info = train_env.reset()

# Smallest side of the design domain; several upper bounds in 'xmax' scale with it.
min_domain_side = min(train_env.dx, train_env.dy)

# MMC optimiser settings.
# NOTE(review): `cfg` is only consumed by the commented-out run_mmc call below;
# generate_dataset is not given it -- confirm whether it should be.
cfg = {
            'optimizer':'mma', #optimiser choice
            'xInt':0.25, #initial interval of components in x
            'yInt':0.25, #initial interval of components in y
            'E':1.0, #Young's modulus
            'nu':0.3, #Poisson ratio
            'h':1, #thickness
            'dgt0':5, #significant digit of sens.
            'scl':1, #scale factor for obj
            'p':6,  #power of super ellipsoid
            'lmd':100, #power of KS aggregation
            'maxiter':500, # maximum number of outer iterations
            'alpha':1e-9, # This is the threshold level in the Heaviside function
            'epsilon':0.2, #This is the regularization term in the Heaviside function
            'maxinnerinit':1, # This is the maximum number of inner iterations for GCMMA
            'switch':-0.000002, # This is the switch criteria for the hybrid optimizer
            'convergence_threshold':2e-4, #This is the threshold for the relative change in the objective function
            'xmin':(0.0, 0.0, 0.0, 0.00, 0.00, -np.pi),
            'xmax':(train_env.dx, train_env.dy, 0.7*min_domain_side, 0.05*min_domain_side, 0.05*min_domain_side, np.pi)
        }

#run_mmc(train_env.conditions,train_env.nelx,train_env.nely,train_env.dx,train_env.dy,plotting='contour',verbose=0,cfg=cfg)
# NOTE(review): absolute, machine-specific output folder.
dataset_folder = "/home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies/mmc"
#generate_mmc_solutions(key=0,dataset_folder="/home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies/mmc")
# Generate 20000 samples into dataset_folder using 32 threads.
generate_dataset(dataset_folder= dataset_folder, num_threads=32, num_samples=20000)

In [None]:
def _new_observation_grid(nrows=5, ncols=5, figsize=(10, 10)):
    """Create a fresh grid of subplots and return (figure, flat array of axes)."""
    grid_fig, grid_axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize)
    return grid_fig, grid_axes.flatten()


# The grid is created lazily, right before the first frame that needs it, so a
# full grid that has already been shown is never drawn over and no empty grid
# is left behind when an episode ends exactly on a full grid.
fig, axes = None, None

# Index of the next free subplot in the current grid
subplot_index = 0

# Run random-action episodes until one ends with a non-zero reward, plotting
# the 'strain_energy' observation after every step.
reward = 0.0
while reward == 0.0:
    # gymnasium-style reset returns (observation, info)
    obs, info = train_env.reset()
    done = False
    while not done:
        action = train_env.action_space.sample()
        obs, reward, done, truncated, info = train_env.step(action)

        if axes is None:
            fig, axes = _new_observation_grid()

        # Plot the current observation image
        current_ax = axes[subplot_index]
        current_ax.imshow(obs['strain_energy'].T, cmap='gray')
        current_ax.axis('off')
        current_ax.set_title(f"Timestep {subplot_index+1}")

        # Increment the subplot index
        subplot_index += 1

        # Grid full: display it and request a new one for the next frame
        if subplot_index == len(axes):
            plt.tight_layout()
            plt.show()
            fig, axes = None, None
            subplot_index = 0

# Print the reward
print("Reward:", reward)

# Plot the final state of the training environment
train_env.plot()

# Display a partially filled grid, hiding the subplots that were not used
if subplot_index > 0:
    for i in range(subplot_index, len(axes)):
        axes[i].axis('off')
    plt.tight_layout()
    plt.show()


In [3]:
# Settings for expert-dataset generation.
num_permutations = None
observation_type = "topopt_game"

# Environment configuration handed to generate_expert_dataset.
env_kwargs = dict(
    mode='train',
    observation_type=observation_type,
    vol_constraint_type='hard',
    seed=42,
    resolution=50,
    check_connectivity=True,
)

# Folder passed to generate_expert_dataset as the source of files to process.
directory_path = "/home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies_narval"
generate_expert_dataset(
    directory_path,
    env_kwargs,
    plot_terminated=False,
    num_permutations=num_permutations,
    file_fraction=1.0,
)

Processing files: 100%|██████████| 38855/38855 [3:42:43<00:00,  2.91file/s]  


7


In [8]:
# Copy every .json file found under the cortex_may13, holodeck_may12 and narval_may13 folders of
# /home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies
# into /home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies/combined
!find /home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies/cortex_may13 -name "*.json" -print0 | xargs -0 -I {} cp {} /home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies/combined/
!find /home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies/holodeck_may12 -name "*.json" -print0 | xargs -0 -I {} cp {} /home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies/combined/
!find /home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies/narval_may13 -name "*.json" -print0 | xargs -0 -I {} cp {} /home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies/combined/


  pid, fd = os.forkpty()


In [9]:
from sogym.expert_generation import check_duplicates, copy_unique_files
# Folder containing the combined .json files to de-duplicate
# NOTE(review): absolute, machine-specific path.
folder_path = '/home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies/combined'

# Check the whole folder for duplicates; lower the percentage (e.g. 50 for 50%) to check less
check_duplicates(folder_path, percentage=100)


# List of unique files, given as a path relative to the working directory
# (the recorded run reports check_duplicates writing 'unique_files.txt')
unique_files_file = 'unique_files.txt'
# Destination folder for the unique files
destination_folder = '/home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies/unique_combined'

# Copy the files listed in unique_files_file to the destination folder
copy_unique_files(unique_files_file, destination_folder)


  self.pid = os.fork()
Processing files: 100%|█████████▉| 37859/37860 [03:57<00:00, 159.13it/s]


Duplicates found. Check 'duplicate.txt' for the list of duplicate files.
Unique files listed in 'unique_files.txt'.
Unique files copied to '/home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies/unique_combined'.


In [1]:
from sogym.expert_generation import generate_expert_dataset
import pickle

# Settings for generating the expert dataset from the de-duplicated topologies.
num_permutations = None
observation_type = "topopt_game"

# Environment configuration handed to generate_expert_dataset.
env_kwargs = dict(
    mode='train',
    observation_type=observation_type,
    vol_constraint_type='hard',
    seed=42,
    resolution=50,
    check_connectivity=True,
)

directory_path = "/home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/topologies/unique_combined"
generate_expert_dataset(
    directory_path,
    env_kwargs,
    observation_type=observation_type,
    plot_terminated=False,
    num_permutations=num_permutations,
    file_fraction=1.0,
)

  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(dpi=100)
  fig = plt.figure(d

In [2]:
from sogym.pretraining import load_expert_dataset
# Directory holding the expert-dataset chunks to load.
# NOTE(review): absolute, machine-specific path.
chunk_dir = '/home/thomas/Documents/scratch_thomas/GitHub/sogym_v2/dataset/expert/unique_combined_topopt_game_20240515_071024'
# Requires `train_env` from the environment-creation cell; on a kernel where that
# cell has not been run this line raises NameError.
expert_dataset = load_expert_dataset(chunk_dir, train_env)
# Dataset length divided by 8.
# NOTE(review): presumably 8 samples per trajectory, making this a trajectory count -- confirm.
print(len(expert_dataset)/8)

NameError: name 'train_env' is not defined

In [7]:

# Draw one random sample from the expert dataset
sample_idx = np.random.randint(len(expert_dataset))
sample = expert_dataset[sample_idx]

# Each sample is an (observation, action) pair
observation, action = sample

# (observation key, panel title, extra imshow arguments) for each panel.
# NOTE(review): 'strain_energy' is the third channel named by the original
# comment; it is only drawn when the sample actually provides it.
panels = [
    ('image', "Image Observation", {'cmap': 'gray'}),
    ('structure_strain_energy', "Structure Strain Energy Observation", {}),
    ('strain_energy', "Strain Energy Observation", {}),
]

# One panel per observation channel; panels without data are hidden so the
# saved figure never contains an empty frame.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for panel_ax, (obs_key, panel_title, imshow_kwargs) in zip(axes, panels):
    if obs_key not in observation:
        panel_ax.axis('off')
        continue
    panel_ax.imshow(observation[obs_key].T, origin='lower', **imshow_kwargs)
    panel_ax.axis('on')
    panel_ax.set_title(panel_title)

print(action)
plt.tight_layout()
plt.savefig('expert_observation.png', dpi=300)
plt.show()

  axes[0].imshow(observation['image'].T, cmap='gray', origin='lower')


tensor([-0.8048,  1.0000,  0.3147, -0.3081,  1.0000,  1.0000],
       dtype=torch.float64)


In [5]:
# Start a fresh episode; the gymnasium-style reset returns (observation, info),
# as unpacked elsewhere in this notebook.
obs, info = train_env.reset()

# Apply the expert `action` (taken from the expert-dataset sample) as one environment step
obs, rewards, dones,truncated, info = train_env.step(np.array(action))

In [6]:
# Show the 'image' channel of the current observation `obs` (transposed, origin at the bottom) and save it.
plt.imshow(obs['image'].T,origin='lower')
plt.savefig('expert_action.png', dpi=300)

In [24]:
# Policy class: the plain MLP policy for 'box_dense' observations, the multi-input policy otherwise.
if observation_type == 'box_dense':
    chosen_policy = "MlpPolicy"
else:
    chosen_policy = "MultiInputPolicy"

# Feature extractor: ImageDictExtractor for 'image' and 'topopt_game', CustomBoxDense otherwise.
if observation_type in ('image', 'topopt_game'):
    feature_extractor = ImageDictExtractor
else:
    feature_extractor = CustomBoxDense

# Use the training environment created earlier in the notebook.
env=train_env

# Algorithm hyper-parameters are read from algorithms.yaml.
with open("algorithms.yaml", "r") as file:
    config = yaml.safe_load(file)

# Select the parameter section of the desired algorithm
algorithm_name = "PPO"  # or "TD3"
algorithm_params = config[algorithm_name]

policy_kwargs = {
    'features_extractor_class': feature_extractor,
    'net_arch': config['common']['net_arch'],
    'share_features_extractor': False,
}

# Arguments that every algorithm constructor below receives.
shared_model_kwargs = {
    'env': env,
    'policy': chosen_policy,
    'policy_kwargs': policy_kwargs,
    'device': device,
}

# Build the model for the selected algorithm
if algorithm_name == "SAC":
    model = SAC(**shared_model_kwargs, **algorithm_params)

elif algorithm_name == "PPO":
    model = PPO(**shared_model_kwargs, **algorithm_params)

elif algorithm_name == "TD3":
    # TD3 additionally gets Gaussian action noise; its settings are removed
    # from algorithm_params before the remaining entries are unpacked.
    n_actions = env.action_space.shape[-1]
    action_noise_params = algorithm_params.pop("action_noise")
    noise_mean = action_noise_params["mean"] * np.ones(n_actions)
    noise_sigma = action_noise_params["sigma"] * np.ones(n_actions)
    action_noise = NormalActionNoise(mean=noise_mean, sigma=noise_sigma)
    model = TD3(action_noise=action_noise, **shared_model_kwargs, **algorithm_params)

# Timestamp used to build a unique log name
current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")

# Log name: "<algorithm>_<timestamp>"
tb_log_name = f"{algorithm_name}_{current_datetime}"

AttributeError: module 'datetime' has no attribute 'now'

In [None]:
from torchinfo import summary

# Count every parameter registered on the policy network.
total_params = sum(p.numel() for p in model.policy.parameters())
print(f"Total number of parameters: {total_params:,}")

# Layer-by-layer breakdown of the policy. No input data is passed, so this cell
# depends only on `model` (the previous version also built an unused dict from
# `observation`, which made the cell fail when no expert sample had been drawn).
summary(model.policy)


Total number of parameters: 18,875,917


Layer (type:depth-idx)                   Param #
MultiInputActorCriticPolicy              6
├─ImageDictExtractor: 1-1                --
│    └─ReLU: 2-1                         --
│    └─ModuleDict: 2-2                   --
│    │    └─Sequential: 3-1              22,784
│    │    └─Sequential: 3-2              93,248
│    │    └─Sequential: 3-3              16,768
│    │    └─Sequential: 3-4              16,768
│    │    └─Sequential: 3-5              93,248
│    │    └─Sequential: 3-6              16,768
├─ImageDictExtractor: 1-2                (recursive)
│    └─ReLU: 2-3                         --
│    └─ModuleDict: 2-4                   (recursive)
│    │    └─Sequential: 3-7              (recursive)
│    │    └─Sequential: 3-8              (recursive)
│    │    └─Sequential: 3-9              (recursive)
│    │    └─Sequential: 3-10             (recursive)
│    │    └─Sequential: 3-11             (recursive)
│    │    └─Sequential: 3-12             (recursive)
├─ImageDictExtractor

In [25]:
todays_date = datetime.now().strftime("%Y%m%d")

# The Comet API key is a credential and must not live in the notebook: read it
# from the COMET_API_KEY environment variable instead. The key that was
# previously committed here should be revoked.
# NOTE(review): confirm how pretrain_agent behaves when the key is None
# (i.e. COMET_API_KEY is not set).
comet_api_key = os.environ.get("COMET_API_KEY")

# Behaviour-cloning pretraining of `model` on the expert dataset; logs,
# checkpoints and the Comet experiment are all tagged with today's date.
pretrain_agent(
    model,
    expert_dataset,
    env,
    test_env = eval_env,
    batch_size=4096,
    epochs=2000,
    scheduler_gamma=1.0,
    learning_rate= 3e-2,
    log_interval=5,
    no_cuda=False,
    seed=1,
    verbose=True,
    test_batch_size=1024,
    early_stopping_patience=100,
    plot_curves=True,
    # The stray closing quote that used to follow this argument was a syntax error.
    tensorboard_log_dir="tb_logs/imitation/PPO_{}".format(todays_date),
    checkpoint_dir="checkpoints/imitation/PPO_{}".format(todays_date),
    load_checkpoint=None,
    comet_ml_api_key=comet_api_key,
    comet_ml_project_name="pretraining_rl",
    comet_ml_experiment_name="PPO_{}".format(todays_date),
    eval_freq = 5,
    l2_reg_strength=0.001,
    max_grad_norm = 10.0
)

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/thomasrb/pretraining-rl/a5ffd4bd210a4d489ca9f6d633d6c3bb
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     grad_norm [20]  : (0.8937201855982214, 1.7579312044124606)
[1;38;5;39mCOMET INFO:[0m     mae [20]        : (0.8019486623404353, 0.9296369073108344)
[1;38;5;39mCOMET INFO:[0m     mean_reward [3] : (0.04329246059060097, 0.0435519840568304)
[1;38;5;39mCOMET INFO:[0m     std_reward [3]  : (0.0011471042911364835, 0.0013665396855924601)
[1;38;5;39mCOMET INFO:[0m     test_loss [20]  : (1.0



  self.pid = os.fork()


Train set: Average loss: 1.9590


  self.pid = os.fork()
  self.pid = os.fork()


Test set: Average loss: 1.4309, Average MAE: 0.9517




Saved best model to checkpoints/imitation_PPO_20240514/model_best.pt
Saved best optimizer to checkpoints/imitation_PPO_20240514/optimizer_best.pt
Train set: Average loss: 1.9026
Test set: Average loss: 1.4074, Average MAE: 0.9434
Saved best model to checkpoints/imitation_PPO_20240514/model_best.pt
Saved best optimizer to checkpoints/imitation_PPO_20240514/optimizer_best.pt
Train set: Average loss: 1.8860
Test set: Average loss: 1.3909, Average MAE: 0.9378
Saved best model to checkpoints/imitation_PPO_20240514/model_best.pt
Saved best optimizer to checkpoints/imitation_PPO_20240514/optimizer_best.pt
Train set: Average loss: 1.8628
Test set: Average loss: 1.3770, Average MAE: 0.9334
Saved best model to checkpoints/imitation_PPO_20240514/model_best.pt
Saved best optimizer to checkpoints/imitation_PPO_20240514/optimizer_best.pt
Train set: Average loss: 1.8451
Test set: Average loss: 1.3569, Average MAE: 0.9255
Saved best model to checkpoints/imitation_PPO_20240514/model_best.pt
Saved best 



Epoch 5: Mean reward = 0.043 +/- 0.001
Train set: Average loss: 1.8244
Test set: Average loss: 1.3326, Average MAE: 0.9183
Saved best model to checkpoints/imitation_PPO_20240514/model_best.pt
Saved best optimizer to checkpoints/imitation_PPO_20240514/optimizer_best.pt
Train set: Average loss: 1.8038
Test set: Average loss: 1.3101, Average MAE: 0.9072
Saved best model to checkpoints/imitation_PPO_20240514/model_best.pt
Saved best optimizer to checkpoints/imitation_PPO_20240514/optimizer_best.pt
Train set: Average loss: 1.7890
Test set: Average loss: 1.2952, Average MAE: 0.9020
Saved best model to checkpoints/imitation_PPO_20240514/model_best.pt
Saved best optimizer to checkpoints/imitation_PPO_20240514/optimizer_best.pt
Train set: Average loss: 1.7679
Test set: Average loss: 1.2740, Average MAE: 0.8980
Saved best model to checkpoints/imitation_PPO_20240514/model_best.pt
Saved best optimizer to checkpoints/imitation_PPO_20240514/optimizer_best.pt
Train set: Average loss: 1.7454
Test set:

KeyboardInterrupt: 

In [26]:
# Save the current model to ./checkpoints/PPO_pretrained.
model.save('./checkpoints/PPO_pretrained')

---
### Multiprocessing

In [61]:
from stable_baselines3.common.vec_env import DummyVecEnv, VecCheckNan
import multiprocessing

# Use one worker environment per available CPU core.
num_cpu = multiprocessing.cpu_count()
print("Using {} cpus!".format(num_cpu))

algorithm_name = "PPO"  # or "TD3"
# Load the YAML file
with open("algorithms.yaml", "r") as file:
    config = yaml.safe_load(file)

# Hyper-parameters of the selected algorithm. This dict is unpacked into the
# algorithm constructor in the model-definition cell, so it must only ever
# contain constructor arguments.
algorithm_params = config[algorithm_name]

# Environment settings for this run.
observation_type = "topopt_game"
vol_constraint_type = "hard"
# NOTE(review): use_std_strain is recorded in config.yaml but is not passed to
# the environments below -- confirm whether sogym should receive it.
use_std_strain = False
check_connectivity = True
resolution = 50
current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")

train_critic_only = False  # if True, we freeze everything except the critic
pretrained_run = None
# Set to the name of an existing run (e.g. "PPO_20240516_093938") to resume it;
# None starts a new run.
restart_run = None

if restart_run:
    log_name=restart_run
else:
    log_name = f"{algorithm_name}_{current_datetime}"
    run_dir = f'./runs/{log_name}'
    # Create ./runs/<log_name> and store the full run configuration next to the logs.
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)
        # Write the run metadata into a COPY of the hyper-parameters: adding these
        # keys to algorithm_params itself would later hand unknown keyword
        # arguments (e.g. 'algorithm_name') to the algorithm constructor.
        run_config = dict(algorithm_params)
        run_config.update(
            algorithm_name=algorithm_name,
            observation_type=observation_type,
            vol_constraint_type=vol_constraint_type,
            use_std_strain=use_std_strain,
            check_connectivity=check_connectivity,
            resolution=resolution,
        )

        with open(f'{run_dir}/config.yaml', 'w') as file:
            yaml.dump(run_config, file)


# Training environments: num_cpu subprocess workers, wrapped so that NaN/inf
# values raise an exception. The settings declared above are used so that
# config.yaml always matches the environments actually built.
train_env = sogym(mode='train',
                  observation_type=observation_type,
                  vol_constraint_type=vol_constraint_type,
                  resolution=resolution,
                  check_connectivity=check_connectivity)
env= make_vec_env(lambda:train_env, n_envs=num_cpu,vec_env_cls=SubprocVecEnv)
env = VecCheckNan(env, raise_exception=True)

# Evaluation environment: a single subprocess worker in 'test' mode.
eval_env = sogym(mode='test',
                 observation_type=observation_type,
                 vol_constraint_type=vol_constraint_type,
                 resolution=resolution,
                 check_connectivity=check_connectivity)
eval_env = make_vec_env(lambda:eval_env, n_envs=1,vec_env_cls=SubprocVecEnv)


Using 64 cpus!


--- 
### Defining the model

In [56]:
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

# Default Gaussian action noise. It is only referenced by the commented-out SAC
# argument below; the TD3 branch builds its own noise from the YAML settings.
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))

# Policy class and feature extractor depend on the observation type.
chosen_policy = "MlpPolicy" if observation_type == 'box_dense' else "MultiInputPolicy"
feature_extractor = ImageDictExtractor if observation_type in ('image', 'topopt_game') else CustomBoxDense


policy_kwargs = dict(
    features_extractor_class=feature_extractor,
    net_arch = config['common']['net_arch'],
    share_features_extractor = False,
)

# Create the model based on the algorithm name and parameters
if algorithm_name == "SAC":
    model = SAC(env=env,
                policy = chosen_policy,
                policy_kwargs=policy_kwargs,
                #action_noise = action_noise,
                ent_coef = 0.0,
                device=device,
                **algorithm_params)

elif algorithm_name == "PPO":
    # NOTE(review): with integer division this evaluates to
    # ((64*386) // num_cpu) // 100, i.e. 3 for 64 CPUs, which is far below
    # batch_size -- confirm that this rollout length is intended.
    model = PPO(env=env,
                policy = chosen_policy,
                policy_kwargs=policy_kwargs,
                n_steps= 64*386 // num_cpu//100,
                batch_size= 16384//4,
                tensorboard_log  ='./runs/{}'.format(log_name),
                device = device,
                **algorithm_params)

elif algorithm_name == "TD3":
    # Build the action noise from the YAML settings; the entry is removed from
    # algorithm_params before the remaining entries are unpacked.
    n_actions = env.action_space.shape[-1]
    action_noise_params = algorithm_params.pop("action_noise")
    action_noise = NormalActionNoise(mean=action_noise_params["mean"] * np.ones(n_actions),
                                     sigma=action_noise_params["sigma"] * np.ones(n_actions))
    model = TD3(env=env,
                policy =chosen_policy,
                policy_kwargs=policy_kwargs,
                action_noise=action_noise,
                device=device,
                **algorithm_params)

else:
    # Fail loudly instead of leaving `model` undefined (or stale from an earlier cell).
    raise ValueError(f"Unsupported algorithm_name: {algorithm_name!r}")

# Optionally warm-start from pretrained weights. The previous condition used
# `load_from_pretrained` / `pretrained_checkpoint`, which are never defined in
# this notebook; `pretrained_run` from the setup cell is the available switch.
# NOTE(review): pretrained_run is assumed to be a path (or parameter dict)
# accepted by model.set_parameters -- confirm.
if pretrained_run is not None:
    model.set_parameters(pretrained_run)

# When resuming, replace the fresh model with the best checkpoint of that run.
if restart_run:
    model = model.load("./runs/{}/checkpoints/best_model.zip".format(log_name),env=env)

In [57]:
# Callback frequencies are divided by num_cpu.
# NOTE(review): presumably because SB3 counts callback calls per vectorised
# step (one call = num_cpu environment steps), so 250_000 and 10_000 are total
# environment steps -- confirm.
checkpoint_dir = "./runs/{}/checkpoints/".format(log_name)
calls_between_evals = 10_000//num_cpu

# Periodic checkpoints of the model
checkpoint_callback = CheckpointCallback(
    save_freq=250_000//num_cpu,
    save_path=checkpoint_dir,
    name_prefix=log_name,
    save_replay_buffer=True,
    save_vecnormalize=True,
)

# Periodic deterministic evaluation; the best model so far is saved to checkpoint_dir
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=checkpoint_dir,
    log_path='./runs/{}/'.format(log_name),
    eval_freq=calls_between_evals,
    n_eval_episodes=10,
    deterministic=True,
    render=False,
    verbose=0,
)

# Project-specific callbacks from sogym.callbacks
max_reward_callback = MaxRewardCallback(verbose=1)
gradient_clipping_callback = GradientClippingCallback(clip_value=1.0, verbose=1)
gradient_norm_callback = GradientNormCallback(verbose=1)
figure_recorder_callback = FigureRecorderCallback(
    eval_env=eval_env,
    check_freq=calls_between_evals,
    figure_size=(8, 6),
)

callback_list = CallbackList([
    eval_callback,
    checkpoint_callback,
    max_reward_callback,
    gradient_clipping_callback,
    gradient_norm_callback,
    figure_recorder_callback,
])

--- 
### Training

In [58]:

import torch.nn as nn

def init_weights(m):
    """Re-initialise a linear layer in place; leave any other module untouched.

    For ``nn.Linear`` modules the weight is filled with Xavier-uniform values
    and the bias, when the layer has one, is set to zero.

    Args:
        m: the module to (possibly) re-initialise.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.xavier_uniform_(m.weight)
    bias = m.bias
    if bias is not None:
        nn.init.zeros_(bias)

if train_critic_only:
    def _unfreeze(module):
        """Mark every parameter of `module` as trainable."""
        for param in module.parameters():
            param.requires_grad = True

    # Freeze every parameter of the policy first ...
    for param in model.policy.parameters():
        param.requires_grad = False

    # ... then re-enable gradients for the critic parts of the chosen algorithm.
    if algorithm_name =='SAC':
        _unfreeze(model.policy.critic)
        _unfreeze(model.policy.critic_target)

    # NOTE(review): for PPO only the value branch of the MLP extractor and the
    # value head are unfrozen; any separate feature extractor stays frozen --
    # confirm this is intended.
    if algorithm_name == 'PPO':
        _unfreeze(model.policy.mlp_extractor.value_net)
        _unfreeze(model.policy.value_net)

In [59]:

# Train for 25M timesteps. A new run starts its timestep counter at zero; when
# resuming an existing run (restart_run set) the counter is kept.
# (`restart`, used here before, is not defined anywhere in the notebook.)
model.learn(25_000_000,
           callback=callback_list,
           tb_log_name=log_name,
           reset_num_timesteps=not restart_run
           )

# Save the final model (and, for algorithms other than PPO, the replay buffer)
# into this run's checkpoint folder. The paths previously missed
# .format(log_name) and were written to a literal "./runs/{}/" directory.
model.save('./runs/{}/checkpoints/final_model'.format(log_name))
if algorithm_name != 'PPO':
    model.save_replay_buffer("./runs/{}/checkpoints/final_buffer".format(log_name))


  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.w

KeyboardInterrupt: 

In [33]:
# Save the current model under a separate name.
# NOTE(review): the file name suggests a critic-only training run (train_critic_only) -- confirm.
model.save('checkpoints/imitation_PPO_critic')

---
### Let's visualize the agent's performance:

In [19]:
# Build a single 'test'-mode environment for inspecting the agent.
# This rebinds `env`, which the training cells used for the vectorised environment.
env=sogym(mode='test',observation_type='topopt_game',vol_constraint_type='hard' ,resolution = 50)
#env= make_vec_env(lambda:env, n_envs=1,vec_env_cls=SubprocVecEnv)
# Last expression: the notebook displays the environment object
env

<sogym.env.sogym at 0x7f98b10dbc40>

In [22]:
obs, info = env.reset()

# Keep the problem definition of this episode (conditions, mesh size, domain
# size) so it can be reused after the rollout.
saved_conditions = env.conditions
saved_nelx, saved_nely = env.nelx, env.nely
saved_dx, saved_dy = env.dx, env.dy

# Let the model act deterministically until the episode is done, printing every action.
dones = False
while not dones:
    action, _states = model.predict(obs, deterministic=True)
    print(action)
    obs, rewards, dones, truncated, info = env.step(action)

print("Desired volume:", saved_conditions['volfrac'], "Obtained volume:", env.volume)
print("Env reward:", rewards)

# Plot the final design and write it to disk
fig = env.plot()
fig.savefig('trained_agent.png', dpi=300)

[-0.75501883 -1.          0.32291675  0.64145327  0.9568924   0.9920023 ]
[-0.903395    0.6623455  -0.42170578 -0.5872051   0.7409822   0.8139564 ]
[-0.94604313 -0.02086864  0.11097331  1.          0.8015071   0.8562474 ]
[-1.          1.          0.08024888 -0.32997712  1.          1.        ]
[ 0.07595539 -0.7214547   0.4731991   0.3694167   0.37574175  0.34123728]
[-0.37390336 -0.01163775  0.40241554 -0.27844983  0.45322505  0.43657506]
[-0.16364224 -0.55042976  0.31846595  1.          0.4676532   0.45424548]
[-0.12401124  0.69860417  0.35660243 -0.06428144  0.7142122   0.7212964 ]
Desired volume: 0.41 Obtained volume: 0.30805051633717917
Env reward: 0.1713260659137914


In [21]:
# Write the current `fig` to trained_agent.png again (same target file as the rollout cell).
fig.savefig('trained_agent.png', dpi=300)

In [34]:
from stable_baselines3.common.evaluation import evaluate_policy
# Evaluate the policy deterministically for 10 episodes on eval_env and print the mean reward.
mean_reward, std_reward = evaluate_policy(model.policy, eval_env, n_eval_episodes=10, deterministic=True)
print(mean_reward)



0.07606540508568287


In [None]:
# Run the MMC optimiser on the problem saved during the trained-agent rollout
# (saved_conditions, saved mesh size and domain size), with contour plotting.
xval, f0val,it, H, Phimax, allPhi, den, N, cfg = run_mmc(saved_conditions,saved_nelx,saved_nely,saved_dx,saved_dy,plotting='contour')