In [1]:
import gymnasium as gym
import numpy as np
import random
from Qtabularfunctions import*
from Cartpolefamily import*
from metaQlearning import*
import matplotlib.pyplot as plt
import random

In [2]:
# state space limits
low = np.array([-4.8, -3.0, -0.418, -3.5])
high = np.array([4.8, 3.0, 0.418, 3.5])

min_td_error = 1e-3  # Minimum TD error threshold to continue episode
consecutive_small_errors = 5  # Number of consecutive small TD errors to trigger stop

num_actions = 501
lr=0.1
gamma=0.99
epsilon=0.5
force_mag=100

In [3]:
gen = CartPoleCategoryGenerator()

#initial:
agent = TabularQLearningAgent(
    statespace=[low,high],
    num_actions=num_actions,  
    lr=lr,
    gamma=gamma,
    epsilon=epsilon,
    force_mag=force_mag
)

actionspace_dict = {state_tuple: [*range(num_actions)] for state_tuple in agent.disc.get_all_discrete_states()}
actionset_dict = {state_tuple: [] for state_tuple in agent.disc.get_all_discrete_states()}

In [4]:
runner = MetaQLearningRunner(
    gen=gen,  # your environment generator
    low=low,
    high=high, 
    num_actions=num_actions,
    actionset_dict=actionset_dict,
    actionspace_dict=actionspace_dict,
    lr=lr,
    gamma=gamma,
    epsilon=epsilon,
    force_mag=force_mag
)

In [5]:
# Step 1: First build your actionset_dict by training with actionspace_dict
print("=== Phase 1: Building ActionSet Dictionary ===")
# Train with actionspace_dict and build actionset_dict (typical usage)
runner.run_multiple_experiments(
    num_runs=10,
    episodes_per_run=10000,
    use_actionset_as_actionspace=False,
    update_actionset_dict=True,  # Build the actionset_dict
    verbose=True
)

# Remove duplicates
runner.actionset_dict = runner.remove_action_repetitions()

# Step 2: Save the trained actionset_dict for later use
runner.save_actionset_dict('trained_actionset.pkl')

=== Phase 1: Building ActionSet Dictionary ===

Starting 10 experiments with:
  Episodes per run: 10000
  Using actionset as action space: False
  Updating actionset_dict: True

--- Starting Run 1/10 ---
Starting experiment with category: very_hard
Using actionspace_dict as action space (training (updating actionset_dict))


Experiment completed in 24.40 seconds
Mean reward: 26.72 ¬± 3.49
Early stops: 0/10000
Mode: training
Actionset dictionary was updated
Run 1 completed in 24.40s

--- Starting Run 2/10 ---
Starting experiment with category: hard
Using actionspace_dict as action space (training (updating actionset_dict))
Experiment completed in 16.67 seconds
Mean reward: 18.82 ¬± 5.32
Early stops: 0/10000
Mode: training
Actionset dictionary was updated
Run 2 completed in 16.67s

--- Starting Run 3/10 ---
Starting experiment with category: very_hard
Using actionspace_dict as action space (training (updating actionset_dict))
Experiment completed in 18.50 seconds
Mean reward: 21.15 ¬± 7.05
Early stops: 0/10000
Mode: training
Actionset dictionary was updated
Run 3 completed in 18.50s

--- Starting Run 4/10 ---
Starting experiment with category: medium
Using actionspace_dict as action space (training (updating actionset_dict))
Experiment completed in 11.94 seconds
Mean reward: 13.75 ¬± 8.66
Early stops: 0/1000

In [6]:
runner.actionset_dict

{(0, 0, 0, 0): [np.int64(0), np.int64(144), np.int64(159)],
 (0, 0, 0, 1): [np.int64(0),
  np.int64(385),
  np.int64(104),
  np.int64(397),
  np.int64(349)],
 (0, 0, 0, 2): [np.int64(0),
  np.int64(196),
  np.int64(237),
  np.int64(142),
  np.int64(223)],
 (0, 0, 0, 3): [np.int64(0),
  np.int64(102),
  np.int64(71),
  np.int64(278),
  np.int64(281),
  np.int64(122),
  np.int64(220)],
 (0, 0, 0, 4): [np.int64(0),
  np.int64(96),
  np.int64(289),
  np.int64(344),
  np.int64(322),
  np.int64(461),
  np.int64(301),
  np.int64(370),
  np.int64(216)],
 (0, 0, 1, 0): [np.int64(0), np.int64(104)],
 (0, 0, 1, 1): [np.int64(0), np.int64(95)],
 (0, 0, 1, 2): [np.int64(0), np.int64(52), np.int64(301)],
 (0, 0, 1, 3): [np.int64(0),
  np.int64(489),
  np.int64(395),
  np.int64(176),
  np.int64(434),
  np.int64(84),
  np.int64(87),
  np.int64(281)],
 (0, 0, 1, 4): [np.int64(0),
  np.int64(481),
  np.int64(386),
  np.int64(136),
  np.int64(238),
  np.int64(144),
  np.int64(180),
  np.int64(341),
  np.

In [7]:
# Step 3: Evaluate both action sets
print("\n=== Phase 2: Comparing Policy Performance ===")
comparison_results = runner.compare_policy_performance(
    training_episodes=10000,
    evaluation_episodes=1000,
    num_comparisons=5
)


=== Phase 2: Comparing Policy Performance ===

POLICY PERFORMANCE COMPARISON

--- Comparison Run 1/5 ---
Using category: hard
Training with actionspace_dict...
Evaluating actionspace_dict policy...
Training with actionset_dict...
Evaluating actionset_dict policy...
ActionSpace - Train: 19.15s, Eval Reward: 22.00, Success: 100.00%
ActionSet   - Train: 44.46s, Eval Reward: 51.00, Success: 100.00%

--- Comparison Run 2/5 ---
Using category: very_hard
Training with actionspace_dict...


Evaluating actionspace_dict policy...
Training with actionset_dict...
Evaluating actionset_dict policy...
ActionSpace - Train: 19.19s, Eval Reward: 20.00, Success: 100.00%
ActionSet   - Train: 25.70s, Eval Reward: 40.00, Success: 100.00%

--- Comparison Run 3/5 ---
Using category: medium
Training with actionspace_dict...
Evaluating actionspace_dict policy...
Training with actionset_dict...
Evaluating actionset_dict policy...
ActionSpace - Train: 38.73s, Eval Reward: 48.00, Success: 100.00%
ActionSet   - Train: 14.20s, Eval Reward: 33.00, Success: 100.00%

--- Comparison Run 4/5 ---
Using category: hard
Training with actionspace_dict...
Evaluating actionspace_dict policy...
Training with actionset_dict...
Evaluating actionset_dict policy...
ActionSpace - Train: 18.39s, Eval Reward: 21.00, Success: 100.00%
ActionSet   - Train: 42.23s, Eval Reward: 53.00, Success: 100.00%

--- Comparison Run 5/5 ---
Using category: hard
Training with actionspace_dict...
Evaluating actionspace_dict policy.

In [8]:
# Step 4: Individual detailed evaluations
print("\n=== Phase 3: Detailed Individual Evaluations ===")


=== Phase 3: Detailed Individual Evaluations ===
