<a href="https://colab.research.google.com/github/TBKHori/Music-Recon13/blob/main/Accessing_and_modifying_model_parameters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install typing
!pip install stable_baselines3
!pip install torch
!pip install numpy
!pip install gymnasium

Collecting stable_baselines3
  Downloading stable_baselines3-2.0.0-py3-none-any.whl (178 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.4/178.4 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: stable_baselines3
Successfully installed stable_baselines3-2.0.0


In [6]:
from typing import Dict


In [7]:
import gymnasium as gym
import numpy as np
import torch as th

In [8]:
from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

In [9]:
def mutate(params: Dict[str, th.Tensor]) -> Dict[str, th.Tensor]:
    """Mutate parameters by adding normal noise to them"""
    return dict((name, param + th.randn_like(param)) for name, param in params.items())


# Create policy with a small network
model = A2C(
    "MlpPolicy",
    "CartPole-v1",
    ent_coef=0.0,
    policy_kwargs={"net_arch": [32]},
    seed=0,
    learning_rate=0.05,
)

In [10]:
# Create policy with a small network
model = A2C(
    "MlpPolicy",
    "CartPole-v1",
    ent_coef=0.0,
    policy_kwargs={"net_arch": [32]},
    seed=0,
    learning_rate=0.05,
)

In [11]:
# Use traditional actor-critic policy gradient updates to
# find good initial parameters
model.learn(total_timesteps=10_000)

<stable_baselines3.a2c.a2c.A2C at 0x7a25d5a53130>

In [12]:
# Include only variables with "policy", "action" (policy) or "shared_net" (shared layers)
# in their name: only these ones affect the action.
# NOTE: you can retrieve those parameters using model.get_parameters() too
mean_params = dict(
    (key, value)
    for key, value in model.policy.state_dict().items()
    if ("policy" in key or "shared_net" in key or "action" in key)
)


In [13]:
# population size of 50 invdiduals
pop_size = 50
# Keep top 10%
n_elite = pop_size // 10
# Retrieve the environment
vec_env = model.get_env()

In [14]:
for iteration in range(10):
    # Create population of candidates and evaluate them
    population = []
    for population_i in range(pop_size):
        candidate = mutate(mean_params)
        # Load new policy parameters to agent.
        # Tell function that it should only update parameters
        # we give it (policy parameters)
        model.policy.load_state_dict(candidate, strict=False)
        # Evaluate the candidate
        fitness, _ = evaluate_policy(model, vec_env)
        population.append((candidate, fitness))
    # Take top 10% and use average over their parameters as next mean parameter
    top_candidates = sorted(population, key=lambda x: x[1], reverse=True)[:n_elite]
    mean_params = dict(
        (
            name,
            th.stack([candidate[0][name] for candidate in top_candidates]).mean(dim=0),
        )
        for name in mean_params.keys()
    )
    mean_fitness = sum(top_candidate[1] for top_candidate in top_candidates) / n_elite
    print(f"Iteration {iteration + 1:<3} Mean top fitness: {mean_fitness:.2f}")
    print(f"Best fitness: {top_candidates[0][1]:.2f}")

Iteration 1   Mean top fitness: 201.24
Best fitness: 411.10
Iteration 2   Mean top fitness: 288.46
Best fitness: 315.70
Iteration 3   Mean top fitness: 497.06
Best fitness: 500.00
Iteration 4   Mean top fitness: 500.00
Best fitness: 500.00
Iteration 5   Mean top fitness: 500.00
Best fitness: 500.00
Iteration 6   Mean top fitness: 500.00
Best fitness: 500.00
Iteration 7   Mean top fitness: 500.00
Best fitness: 500.00
Iteration 8   Mean top fitness: 500.00
Best fitness: 500.00
Iteration 9   Mean top fitness: 500.00
Best fitness: 500.00
Iteration 10  Mean top fitness: 500.00
Best fitness: 500.00
