In [1]:
from cellitaire.environment.agents.PPOAgent import Agent
from cellitaire.environment.cellitaire_env import CellitaireEnv
from cellitaire.environment.rewards.reward import *
from cellitaire.environment.rewards.foundation_rewards import *
import numpy as np

# --- Board geometry -------------------------------------------------------
board_rows, board_cols = 7, 12
num_reserved = 6

# --- PPO hyperparameters and run options ----------------------------------
N = 10000            # environment steps between learning updates / posts
batch_size = 2000
n_epochs = 5
alpha = 4e-6         # passed to Agent as `alpha`
gamma = 0.99
gae_lambda = 0.95
checkpoint_dir = 'tmp/testing_parallel_1'
normalize_reward = False
save_model = True

# --- Reward shaping -------------------------------------------------------
# Only the win bonus and the scaling foundation reward are active.
test_reward = CombinedReward(
    [
        WinReward(weight=100, rows=board_rows, cols=board_cols),
        ScalingPlacedCardInFoundationReward(weight=1, rows=board_rows, cols=board_cols),
        # Alternatives that are currently switched off:
        #PlacedCardInFoundationReward(weight=6),
        #ConstantReward(weight=0.5),
        #PlayedLegalMoveReward(weight=1, rows=board_rows, cols=board_cols, num_reserved = num_reserved),
        #PeriodicPlacedCardInFoundationReward(weight=4, reward_period=3),
        #CreatedMovesReward(weight=1, num_reserved=num_reserved, foundation_count_dropoff=30)
    ]
)

# --- Environment ----------------------------------------------------------
env = CellitaireEnv(
    test_reward,
    rows=board_rows,
    cols=board_cols,
    num_reserved=num_reserved,
    max_moves=1200,
    max_illegal_moves=1200,
)
# Override the action count so there is exactly one action per board cell.
env.action_space.n = board_rows * board_cols
#env.render()

# --- Agent ----------------------------------------------------------------
# NOTE(review): the trailing `+ 6` in input_dims presumably corresponds to
# num_reserved (also 6) -- confirm before changing either value.
agent = Agent(
    n_actions=env.action_space.n,
    input_dims=(board_rows * board_cols * 4 + 6,),
    batch_size=batch_size,
    fc1_actor=2048,
    fc2_actor=2048,
    fc1_critic=4096,
    fc2_critic=4096,
    alpha=alpha,
    n_epochs=n_epochs,
    gamma=gamma,
    gae_lambda=gae_lambda,
    checkpoint_dir=checkpoint_dir,
)

pygame 2.6.1 (SDL 2.28.4, Python 3.13.2)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [None]:
from process import *

# Build the trainer from the agent/env configured in the first cell and block
# in the training loop.  The keywords used here (`checkpoint_dir`,
# `collector_processes`) differ from the notebook-defined AgentTrainer further
# down, which takes `collector_threads` -- presumably this call targets the
# AgentTrainer exported by `process`; verify which one is in scope.
trainer = AgentTrainer(
    agent,
    env,
    checkpoint_dir=checkpoint_dir,
    steps_to_post=N,
    batch_size=batch_size,
    collector_processes=1,
    normalize_reward=normalize_reward,
    performance_games_to_sim=100,
    steps_between_performance_checks=3,
)
trainer.start_training()

init exp coll


In [None]:
import threading
import copy

from cellitaire.environment.agents.PPOMemory import PPOMemory

class ExperienceCollector(threading.Thread):
    """Worker thread that plays episodes and posts batches of experience.

    The collector owns deep copies of the agent and the environment.  It plays
    episodes in a loop, records every transition in a ``PPOMemory`` and, each
    time its step counter reaches a multiple of ``steps_to_post``, puts the
    filled memory on ``memory_queue`` and starts a new one.

    Args:
        agent: Agent to copy; must provide ``load_models()`` and
            ``choose_legal_action_mostly(observation, legal_actions)``.
        steps_to_post: Number of environment steps between posts to the queue.
        batch_size: Batch size passed to every ``PPOMemory`` created here.
        env: Environment to copy; must provide ``reset()``, ``get_state()``,
            ``get_legal_actions_as_int()`` and ``step(action)``.
        memory_queue: Queue-like object with ``put()`` that receives memories.
        normalize_reward: When true, each reward is divided by
            ``env.reward.max_reward`` before it is stored.
    """

    def __init__(self, agent, steps_to_post, batch_size, env, memory_queue, normalize_reward=False):
        super().__init__()

        # Private copies so this thread never shares mutable state with the
        # trainer's agent/env.
        self.agent = copy.deepcopy(agent)
        self.agent.load_models()
        self.steps_to_post = steps_to_post
        self.batch_size = batch_size
        self.env = copy.deepcopy(env)
        self.memory_queue = memory_queue
        self.normalize_reward = normalize_reward
        self.memory = PPOMemory(batch_size)
        # Initialised here (not in run()) so that a kill() issued before the
        # thread body starts executing is not overwritten.
        self.running = True

    def post_experiences(self):
        """Hand the current memory to the queue and start a fresh one."""
        self.memory_queue.put(self.memory)
        self.memory = PPOMemory(self.batch_size)

    def remember(self, observation, action, prob, val, reward, done):
        """Store a single transition in the current memory."""
        self.memory.store_memory(observation, action, prob, val, reward, done)

    def run(self):
        """Play episodes until kill() is called."""
        n_steps = 0
        while self.running:
            # Reload the models at the start of every episode so the collector
            # uses whatever was most recently saved.
            self.agent.load_models()
            self.env.reset()
            observation = self.env.get_state()
            done = False
            truncated = False
            # Checking `running` here lets kill() take effect mid-episode
            # instead of only once the episode has finished.
            while self.running and not done and not truncated:
                n_steps += 1
                action, prob, val = self.agent.choose_legal_action_mostly(observation, self.env.get_legal_actions_as_int())
                observation_, reward, done, truncated, info = self.env.step(action)
                if self.normalize_reward:
                    reward = reward / self.env.reward.max_reward
                self.remember(observation, action, prob, val, reward, done)
                if n_steps % self.steps_to_post == 0:
                    self.post_experiences()
                observation = observation_

    def kill(self):
        """Ask the thread to stop; run() returns after the step in progress."""
        self.running = False
        # Kept for backward compatibility; nothing in this notebook reads it.
        ExperienceCollector.instantiated = False

In [None]:
import threading
import copy

class AgentPerformanceMonitor(threading.Thread):
    """Worker thread that evaluates the saved agent whenever it is signalled.

    The thread blocks on ``signal_queue``.  Each value received is treated as
    the trainer's current learn-step count; the monitor then simulates
    ``episodes_to_sim`` episodes with its own copies of the agent and the
    environment and prints average/best score and cards saved.

    Args:
        agent: Agent to copy; must provide ``load_models()`` and
            ``choose_legal_action_mostly(observation, legal_actions)``.
        env: Environment to copy.
        signal_queue: Queue-like object with ``get()``; each item triggers one
            evaluation round.
        episodes_to_sim: Number of episodes per evaluation round.
        normalize_reward: When true, rewards are divided by
            ``env.reward.max_reward`` before being added to the score.

    A monitor that is blocked in ``signal_queue.get()`` only notices ``kill()``
    once something is put on the queue.
    """

    def __init__(self, agent, env, signal_queue, episodes_to_sim=100, normalize_reward=False):
        super().__init__()

        self.agent = copy.deepcopy(agent)
        # NOTE(review): this sets `max_moves` on the agent copy, not on the
        # environment copy -- confirm which object is meant to carry the limit.
        self.agent.max_moves = 1200
        self.env = copy.deepcopy(env)
        self.signal_queue = signal_queue
        self.episodes_to_sim = episodes_to_sim
        self.normalize_reward = normalize_reward
        # Initialised here (not in run()) so that a kill() issued before the
        # thread body starts executing is not overwritten.
        self.running = True

    def run(self):
        """Wait for signals and run one evaluation round per signal."""
        while self.running:
            learn_steps = self.signal_queue.get()
            if not self.running:
                break
            print(f'Model has had {learn_steps} Learning Steps. Simming {self.episodes_to_sim} games...')
            episodes_simmed = 0
            scores = []
            cards_saved = []
            while self.running and episodes_simmed < self.episodes_to_sim:
                self.agent.load_models()
                self.env.reset()
                observation = self.env.get_state()
                done = False
                truncated = False
                score = 0
                # Stop on truncation as well as on `done` (as the experience
                # collector does); otherwise a truncated episode never ends.
                while self.running and not done and not truncated:
                    action, _, _ = self.agent.choose_legal_action_mostly(observation, self.env.get_legal_actions_as_int())
                    observation_, reward, done, truncated, _ = self.env.step(action)
                    if self.normalize_reward:
                        reward = reward / self.env.reward.max_reward
                    score += reward
                    observation = observation_
                if not self.running:
                    # Killed mid-episode: drop the partial result.
                    break
                scores.append(score)
                cards_saved.append(self.env.game.foundation.total_cards())
                episodes_simmed += 1
            if not self.running:
                break
            if not scores:
                # Nothing was simulated (episodes_to_sim <= 0); np.max would
                # raise on an empty list.
                continue
            avg_score = np.mean(scores)
            max_score = np.max(scores)
            avg_cs = np.mean(cards_saved)
            max_cs = np.max(cards_saved)
            print(f'Simmed {self.episodes_to_sim} games | Average Score {avg_score:>6.1f} | Best Score {max_score:>6.1f} | Average Cards Saved {avg_cs:>2.0f} | Max Cards Saved {max_cs:>2.0f}')

    def kill(self):
        """Ask the thread to stop at the next opportunity."""
        self.running = False
        # Kept for backward compatibility; nothing in this notebook reads it.
        AgentPerformanceMonitor.instantiated = False

In [None]:
import queue
import os

class AgentTrainer:
    """Coordinates experience collectors, learning and performance checks.

    ``start_training`` launches ``collector_threads`` ExperienceCollector
    threads plus one AgentPerformanceMonitor, then repeatedly takes a memory
    from the collectors' queue and calls ``agent.learn`` on it.

    Args:
        agent: Agent providing ``load_models()``, ``save_models()`` and
            ``learn(memory)``.
        env: Environment handed to the collectors and the monitor.
        steps_to_post: Steps each collector plays between posts.
        batch_size: Batch size handed to the collectors.
        collector_threads: Number of collector threads; the models are also
            saved every ``collector_threads`` learn steps.
        normalize_reward: Forwarded to the collectors and the monitor.
        max_learn_steps: Stop after this many learn steps; ``-1`` means never.
        performance_games_to_sim: Episodes per performance check.
        steps_between_performance_checks: Learn steps between checks.
        checkpoint_dir: Directory to create when no saved models can be
            loaded.  When omitted, the module-level ``checkpoint_dir`` variable
            is used, as earlier versions of this class did.
    """

    def __init__(self, agent, env, steps_to_post=10000, batch_size=2000, collector_threads=8, normalize_reward=False, max_learn_steps=-1, performance_games_to_sim=100, steps_between_performance_checks=50, checkpoint_dir=None):
        self.agent = agent
        try:
            self.agent.load_models()
        except Exception:
            # `Exception` rather than a bare except so that Ctrl-C is not
            # mistaken for "no checkpoint yet".
            print('New run, godspeed lad')
            if checkpoint_dir is None:
                checkpoint_dir = globals().get('checkpoint_dir')
            if checkpoint_dir is None:
                raise ValueError('checkpoint_dir must be given when no saved models exist')
            # makedirs: handles nested paths and an already existing directory.
            os.makedirs(checkpoint_dir, exist_ok=True)
            self.agent.save_models()

        self.checkpoint_dir = checkpoint_dir
        self.env = env
        self.steps_to_post = steps_to_post
        self.batch_size = batch_size
        self.collector_threads = collector_threads
        self.normalize_reward = normalize_reward
        self.memory_queue = queue.Queue()
        self.monitor_signal_queue = queue.Queue()
        self.max_learn_steps = max_learn_steps
        self.performance_games_to_sim = performance_games_to_sim
        self.steps_between_performance_checks = steps_between_performance_checks

    def start_training(self):
        """Run the learn loop until ``max_learn_steps`` is reached or an error occurs.

        Worker threads are always asked to stop on the way out, including when
        the loop is interrupted.
        """
        # Imported locally because this cell's import block does not import it.
        import time

        collectors = []
        # Defined before the try so the finally block can never hit an
        # unbound name if start-up fails part-way through.
        performance_monitor = None
        try:
            for _ in range(self.collector_threads):
                c = ExperienceCollector(self.agent, self.steps_to_post, self.batch_size, self.env, self.memory_queue, self.normalize_reward)
                c.start()
                collectors.append(c)

            performance_monitor = AgentPerformanceMonitor(self.agent, self.env, self.monitor_signal_queue, episodes_to_sim=self.performance_games_to_sim, normalize_reward=self.normalize_reward)
            performance_monitor.start()

            learn_steps = 0
            running = True
            last_batch = time.time()
            while running:
                new_memory = self.memory_queue.get()
                print(f'Time since last batch {time.time() - last_batch}')
                last_batch = time.time()
                self.agent.learn(new_memory)
                learn_steps += 1
                if learn_steps % self.collector_threads == 0:
                    # Save the trainer's own agent, not the notebook global.
                    self.agent.save_models()
                if learn_steps % self.steps_between_performance_checks == 0:
                    self.monitor_signal_queue.put(learn_steps)
                if self.max_learn_steps != -1 and learn_steps >= self.max_learn_steps:
                    running = False
        finally:
            for collector in collectors:
                collector.kill()
            if performance_monitor is not None:
                performance_monitor.kill()
                # Wake the monitor in case it is blocked on the signal queue.
                self.monitor_signal_queue.put(0)

In [None]:
# Show the `device` attribute of the agent's actor (presumably the torch
# device the actor network lives on -- confirm against the Agent class).
print(agent.actor.device)

In [None]:
import os

# Single-process training loop for the Cellitaire agent.  It runs until the
# cell is interrupted.

# Resume from saved models when they can be loaded; otherwise make sure the
# checkpoint directory exists.  `Exception` (not a bare except) keeps Ctrl-C
# working, and makedirs copes with nested or already existing directories.
try:
    agent.load_models()
except Exception:
    print('New run, godspeed lad')
    os.makedirs(checkpoint_dir, exist_ok=True)
n_games = 300  # not used by the loop below, which runs until interrupted

best_score = 4.0  # the rolling average must beat this before the first save
score_history = []
cards_saved_history = []
max_score = 0
max_cards_saved = 0
episodes_without_best = 0

learn_iters = 0
avg_score = 0
n_steps = 0
i = 0
while True:
    env.reset()
    observation = env.get_state()
    done = False
    truncated = False
    score = 0
    while not done and not truncated:
        action, prob, val = agent.choose_legal_action_mostly(observation, env.get_legal_actions_as_int())
        observation_, reward, done, truncated, info = env.step(action)
        n_steps += 1
        if normalize_reward:
            # Normalise once so the stored reward and the reported score agree
            # (ExperienceCollector normalises what it stores in the same way).
            reward = reward / env.reward.max_reward
        score += reward
        agent.remember(observation, action, prob, val, reward, done)
        if n_steps % N == 0:
            agent.learn()
            learn_iters += 1
        observation = observation_
    i += 1
    score_history.append(score)
    cards_saved = env.game.foundation.total_cards()
    cards_saved_history.append(cards_saved)

    # Every "recent" statistic uses the same 100-episode window.
    avg_score = np.mean(score_history[-100:])
    recent_max = np.max(score_history[-100:])
    avg_cards_saved = np.mean(cards_saved_history[-100:])
    recent_max_cs = np.max(cards_saved_history[-100:])

    max_score = max(max_score, score)
    max_cards_saved = max(max_cards_saved, cards_saved)

    episode_summary = f'episode {i:>5} | score {score:>6.1f} | avg s {avg_score:>6.1f} | max s R {recent_max:>5.1f} | max s A {max_score:>5.1f} | avg cs {avg_cards_saved:>4.1f} | max cs R {recent_max_cs:>2.0f} | max cs A {max_cards_saved:>2.0f} | ls {learn_iters:>5}'

    # A new best only counts once more than N steps have been played.
    if avg_score > best_score and n_steps > N:
        best_score = avg_score
        if save_model:
            agent.save_models()
        episodes_without_best = 0
        print(episode_summary + ' *')
    else:
        episodes_without_best += 1

    # Heartbeat line every 100 episodes without a new best.
    if episodes_without_best % 100 == 0 and episodes_without_best > 0:
        print(episode_summary)

In [None]:
import gymnasium as gym
import numpy as np

# CartPole run of the same PPO Agent.  Note that this cell rebinds the
# notebook-level `env`, `agent`, `N`, `batch_size`, `n_epochs` and `alpha`.
env = gym.make('CartPole-v1', render_mode='human')

N = 20  # environment steps between learning updates
batch_size = 5
n_epochs = 4
alpha = 0.0003
agent = Agent(
    n_actions=env.action_space.n,
    batch_size=batch_size,
    alpha=alpha,
    n_epochs=n_epochs,
    input_dims=env.observation_space.shape,
)
n_games = 300

figure_file = 'plots/cartpole.png'

# Running statistics.
best_score = -1000
score_history = []
max_score = 0
episodes_without_best = 0

learn_iters = 0
avg_score = 0
n_steps = 0

for i in range(n_games):
    # reset() returns a sequence whose first item is the observation.
    observation = env.reset()[0]
    done = truncated = False
    score = 0

    # Play one episode, learning every N steps across episode boundaries.
    while not (done or truncated):
        action, prob, val = agent.choose_action(observation)
        observation_, reward, done, truncated, info = env.step(action)
        n_steps += 1
        score += reward
        agent.remember(observation, action, prob, val, reward, done)
        if n_steps % N == 0:
            agent.learn()
            learn_iters += 1
        observation = observation_

    score_history.append(score)
    avg_score = np.mean(score_history[-100:])

    if score > max_score:
        max_score = score

    if avg_score > best_score:
        # New best rolling average: save the models and report it.
        best_score = avg_score
        recent_std = np.std(score_history[-100:])
        agent.save_models()
        episodes_without_best = 0
        print(f'episode {i:>5} | score {score:>6.1f} | avg {avg_score:>6.1f} | std {recent_std:>6.2f} | max score {max_score:>5.1f} | learning steps {learn_iters:>5} | done {done} *')
    else:
        episodes_without_best += 1

    # Heartbeat line every 100 episodes without a new best.
    if episodes_without_best > 0 and episodes_without_best % 100 == 0:
        recent_std = np.std(score_history[-100:])
        print(f'episode {i:>5} | score {score:>6.1f} | avg {avg_score:>6.1f} | std {recent_std:>6.2f} | max score {max_score:>5.1f} | learning steps {learn_iters:>5} | done {done}')


In [None]:
# Close whichever environment `env` currently refers to (the CartPole
# environment when the cells are run top to bottom).
env.close()

In [None]:
# Manually save the models of whichever agent `agent` currently refers to.
agent.save_models()

In [None]:
# `demo_env` is not created anywhere in this notebook, so an unguarded close()
# raises NameError on a fresh kernel.  Only close it when it actually exists.
if 'demo_env' in globals():
    demo_env.close()

In [None]:
# Editable install of the project located one directory above this notebook.
# NOTE(review): this cell sits at the end, but the first cell already imports
# `cellitaire`; presumably it has to be run first on a fresh environment.
%pip install -e ../.