In [1]:
import gymnasium as gym
import numpy as np
import random
from Qtabularfunctions import*
from Cartpolefamily import*
from metaQlearning import*
import matplotlib.pyplot as plt
import random

In [2]:
# state space limits
low = np.array([-4.8, -3.0, -0.418, -3.5])
high = np.array([4.8, 3.0, 0.418, 3.5])

min_td_error = 1e-3  # Minimum TD error threshold to continue episode
consecutive_small_errors = 5  # Number of consecutive small TD errors to trigger stop

num_actions = 101
lr=0.1
gamma=0.99
epsilon=0.5
force_mag=100

In [3]:
gen = CartPoleCategoryGenerator()

#initial:
agent = TabularQLearningAgent(
    statespace=[low,high],
    num_actions=num_actions,  
    lr=lr,
    gamma=gamma,
    epsilon=epsilon,
    force_mag=force_mag
)

actionspace_dict = {state_tuple: [*range(num_actions)] for state_tuple in agent.disc.get_all_discrete_states()}
actionset_dict = {state_tuple: [] for state_tuple in agent.disc.get_all_discrete_states()}

In [5]:
runner = MetaQLearningRunner(
    gen=gen,  # your environment generator
    low=low,
    high=high, 
    num_actions=num_actions,
    actionset_dict=actionset_dict,
    actionspace_dict=actionspace_dict,
    lr=lr,
    gamma=gamma,
    epsilon=epsilon,
    force_mag=force_mag
)

# Phase 1: Train and build actionset_dict
runner.run_multiple_experiments(
    num_runs=5, 
    episodes_per_run=1000,
    use_actionset_as_actionspace=False,  # Training mode
    verbose=True
)

# Print summary
runner.print_summary()

# Remove repetitions
runner.remove_action_repetitions()

# Phase 2: Compare policies
comparison_results = runner.evaluate_policy_comparison(
    evaluation_episodes=200,
    verbose=True
)

# Print detailed summary
runner.print_policy_comparison_summary(comparison_results)


--- Starting Run 1/5 ---
Starting experiment with category: easy
Using actionspace_dict as action space (training mode)


Experiment completed in 1.44 seconds
Mean reward: 24.75 ± 13.36
Early stops: 0/1000
Mode: training
Run 1 completed in 1.44s

--- Starting Run 2/5 ---
Starting experiment with category: easy
Using actionspace_dict as action space (training mode)
Experiment completed in 1.37 seconds
Mean reward: 23.99 ± 13.42
Early stops: 0/1000
Mode: training
Run 2 completed in 1.37s

--- Starting Run 3/5 ---
Starting experiment with category: hard
Using actionspace_dict as action space (training mode)
Experiment completed in 1.35 seconds
Mean reward: 24.44 ± 16.82
Early stops: 0/1000
Mode: training
Run 3 completed in 1.35s

--- Starting Run 4/5 ---
Starting experiment with category: medium
Using actionspace_dict as action space (training mode)
Experiment completed in 0.92 seconds
Mean reward: 17.06 ± 9.23
Early stops: 0/1000
Mode: training
Run 4 completed in 0.92s

--- Starting Run 5/5 ---
Starting experiment with category: hard
Using actionspace_dict as action space (training mode)
Experiment complete

In [5]:
# After running experiments
runner.remove_action_repetitions()

{(0, 0, 0, 0): [np.int64(48), np.int64(24), np.int64(77), np.int64(0)],
 (0, 0, 0, 1): [np.int64(0),
  np.int64(3),
  np.int64(21),
  np.int64(22),
  np.int64(62)],
 (0, 0, 0, 2): [np.int64(0), np.int64(22)],
 (0, 0, 0, 3): [np.int64(0), np.int64(1)],
 (0, 0, 0, 4): [np.int64(0), np.int64(34), np.int64(88)],
 (0, 0, 1, 0): [np.int64(0), np.int64(17), np.int64(93), np.int64(39)],
 (0, 0, 1, 1): [np.int64(0), np.int64(72), np.int64(15)],
 (0, 0, 1, 2): [np.int64(0), np.int64(10)],
 (0, 0, 1, 3): [np.int64(0),
  np.int64(32),
  np.int64(37),
  np.int64(74),
  np.int64(76)],
 (0, 0, 1, 4): [np.int64(0), np.int64(11), np.int64(13), np.int64(47)],
 (0, 0, 2, 0): [np.int64(0), np.int64(73), np.int64(28)],
 (0, 0, 2, 1): [np.int64(0)],
 (0, 0, 2, 2): [np.int64(0)],
 (0, 0, 2, 3): [np.int64(0), np.int64(29)],
 (0, 0, 2, 4): [np.int64(0),
  np.int64(1),
  np.int64(79),
  np.int64(82),
  np.int64(27)],
 (0, 0, 3, 0): [np.int64(0), np.int64(33), np.int64(37), np.int64(21)],
 (0, 0, 3, 1): [np.int6

In [6]:
# Save
runner.save_actionset_dict('actionset.pkl')

# Load  
runner.load_actionset_dict('actionset.pkl')

actionset_dict saved to actionset.pkl
actionset_dict loaded from actionset.pkl


{(0, 0, 0, 0): [np.int64(48), np.int64(24), np.int64(77), np.int64(0)],
 (0, 0, 0, 1): [np.int64(0),
  np.int64(3),
  np.int64(21),
  np.int64(22),
  np.int64(62)],
 (0, 0, 0, 2): [np.int64(0), np.int64(22)],
 (0, 0, 0, 3): [np.int64(0), np.int64(1)],
 (0, 0, 0, 4): [np.int64(0), np.int64(34), np.int64(88)],
 (0, 0, 1, 0): [np.int64(0), np.int64(17), np.int64(93), np.int64(39)],
 (0, 0, 1, 1): [np.int64(0), np.int64(72), np.int64(15)],
 (0, 0, 1, 2): [np.int64(0), np.int64(10)],
 (0, 0, 1, 3): [np.int64(0),
  np.int64(32),
  np.int64(37),
  np.int64(74),
  np.int64(76)],
 (0, 0, 1, 4): [np.int64(0), np.int64(11), np.int64(13), np.int64(47)],
 (0, 0, 2, 0): [np.int64(0), np.int64(73), np.int64(28)],
 (0, 0, 2, 1): [np.int64(0)],
 (0, 0, 2, 2): [np.int64(0)],
 (0, 0, 2, 3): [np.int64(0), np.int64(29)],
 (0, 0, 2, 4): [np.int64(0),
  np.int64(1),
  np.int64(79),
  np.int64(82),
  np.int64(27)],
 (0, 0, 3, 0): [np.int64(0), np.int64(33), np.int64(37), np.int64(21)],
 (0, 0, 3, 1): [np.int6

In [7]:
runner.actionset_dict

{(0, 0, 0, 0): [np.int64(48), np.int64(24), np.int64(77), np.int64(0)],
 (0, 0, 0, 1): [np.int64(0),
  np.int64(3),
  np.int64(21),
  np.int64(22),
  np.int64(62)],
 (0, 0, 0, 2): [np.int64(0), np.int64(22)],
 (0, 0, 0, 3): [np.int64(0), np.int64(1)],
 (0, 0, 0, 4): [np.int64(0), np.int64(34), np.int64(88)],
 (0, 0, 1, 0): [np.int64(0), np.int64(17), np.int64(93), np.int64(39)],
 (0, 0, 1, 1): [np.int64(0), np.int64(72), np.int64(15)],
 (0, 0, 1, 2): [np.int64(0), np.int64(10)],
 (0, 0, 1, 3): [np.int64(0),
  np.int64(32),
  np.int64(37),
  np.int64(74),
  np.int64(76)],
 (0, 0, 1, 4): [np.int64(0), np.int64(11), np.int64(13), np.int64(47)],
 (0, 0, 2, 0): [np.int64(0), np.int64(73), np.int64(28)],
 (0, 0, 2, 1): [np.int64(0)],
 (0, 0, 2, 2): [np.int64(0)],
 (0, 0, 2, 3): [np.int64(0), np.int64(29)],
 (0, 0, 2, 4): [np.int64(0),
  np.int64(1),
  np.int64(79),
  np.int64(82),
  np.int64(27)],
 (0, 0, 3, 0): [np.int64(0), np.int64(33), np.int64(37), np.int64(21)],
 (0, 0, 3, 1): [np.int6

In [8]:
# Evaluation phase - uses actionset_dict as action space
runner.run_multiple_experiments(
    num_runs=3,
    episodes_per_run=500, 
    use_actionset_as_actionspace=True,
    verbose=True
)


--- Starting Run 1/3 ---
Starting experiment with category: hard
Using actionset_dict as action space (evaluation mode)
Experiment completed in 0.56 seconds
Mean reward: 20.59 ± 4.55
Early stops: 0/500
Mode: evaluation
Run 1 completed in 0.56s

--- Starting Run 2/3 ---
Starting experiment with category: medium
Using actionset_dict as action space (evaluation mode)
Experiment completed in 0.46 seconds
Mean reward: 17.62 ± 4.05
Early stops: 0/500
Mode: evaluation
Run 2 completed in 0.46s

--- Starting Run 3/3 ---
Starting experiment with category: easy
Using actionset_dict as action space (evaluation mode)
Experiment completed in 0.54 seconds
Mean reward: 18.70 ± 4.07
Early stops: 0/500
Mode: evaluation
Run 3 completed in 0.54s


[{'category': 'hard',
  'runtime_seconds': 0.5618312358856201,
  'total_episodes': 500,
  'early_stops': 0,
  'mean_reward': np.float64(20.586),
  'std_reward': np.float64(4.551329915530185),
  'mean_steps': np.float64(20.586),
  'final_epsilon': 0.2952450000000001,
  'mode': 'evaluation',
  'run_id': 0},
 {'category': 'medium',
  'runtime_seconds': 0.45523786544799805,
  'total_episodes': 500,
  'early_stops': 0,
  'mean_reward': np.float64(17.624),
  'std_reward': np.float64(4.051002838804238),
  'mean_steps': np.float64(17.624),
  'final_epsilon': 0.2952450000000001,
  'mode': 'evaluation',
  'run_id': 1},
 {'category': 'easy',
  'runtime_seconds': 0.5443506240844727,
  'total_episodes': 500,
  'early_stops': 0,
  'mean_reward': np.float64(18.7),
  'std_reward': np.float64(4.070626487409524),
  'mean_steps': np.float64(18.7),
  'final_epsilon': 0.2952450000000001,
  'mode': 'evaluation',
  'run_id': 2}]