In [1]:
import copy
# TODO: bug in score somewhere here
# Could definitely parallelize this as well
class RolloutAgent:
    """Plays one game to completion on a private copy of an environment.

    The copy is a freshly constructed ``CellitaireEnv`` (same ``rows``,
    ``cols`` and ``num_reserved`` as the source env) whose ``game`` and
    ``reward`` are replaced by deep copies taken from the source, so stepping
    it does not touch the source env's game or reward objects.

    NOTE(review): only ``game`` and ``reward`` are carried over; any other
    state the source env holds is whatever ``reset()`` leaves on the new env.
    """

    def __init__(self, agent, starting_move, env):
        """Remember the policy and first move, and build the sandbox env.

        Args:
            agent: object exposing ``choose_legal_action_mostly``.
            starting_move: action forced as the first step of the rollout.
            env: source environment to snapshot.
        """
        self.agent = agent
        self.starting_move = starting_move

        # Snapshot before constructing anything, reward first, then game.
        cloned_reward = copy.deepcopy(env.reward)
        cloned_game = copy.deepcopy(env.game)

        sandbox = CellitaireEnv(
            cloned_reward,
            rows=env.rows,
            cols=env.cols,
            num_reserved=env.num_reserved,
        )
        sandbox.reset()
        # reset() starts a new game; overwrite it with the snapshot.
        sandbox.game = cloned_game
        sandbox.reward = cloned_reward
        self.env = sandbox

    def rollout(self):
        """Play ``starting_move`` and then follow the agent until the env is done.

        Returns:
            tuple: ``(total_reward, actions)`` where ``actions`` begins with
            ``starting_move`` and ``total_reward`` sums every step's reward.
        """
        obs, step_reward, finished, _, _ = self.env.step(self.starting_move)
        history = [self.starting_move]
        cumulative = step_reward

        while not finished:
            chosen, _, _ = self.agent.choose_legal_action_mostly(
                obs, self.env.get_legal_actions_as_int()
            )
            history.append(chosen)
            obs, step_reward, finished, _, _ = self.env.step(chosen)
            cumulative += step_reward

        return cumulative, history

In [2]:
class TreeNode:
    """Node of a search tree over game states.

    Attributes:
        state: the state this node represents, stored as given.
        best_reward: best reward recorded for this node.
        prev: parent node, ``None`` until linked by the caller.
        nexts: list of child nodes, empty until the caller appends to it.
    """

    def __init__(self, state, best_reward=-1e9):
        """Create an unlinked node.

        Args:
            state: state to associate with the node.
            best_reward: initial best reward. Defaults to ``-1e9``, the
                sentinel that was previously always stored regardless of the
                argument.
        """
        self.state = state
        # Honour the caller's value instead of silently discarding it.
        self.best_reward = best_reward
        self.prev = None
        self.nexts = []

In [3]:
import torch
import numpy as np

class TreeSearch:
    """One-ply search: try every legal move and finish each with a rollout.

    Args:
        agent: policy object; ``agent.actor(state)`` must return an object
            with a ``probs`` tensor and ``agent.actor.device`` must name the
            device to run on.
        env: live environment to search from. It is only read here; each
            candidate move is played by a ``RolloutAgent`` built from it.
        n_actions: size of the action space, used to build the legality mask.
    """

    def __init__(self, agent, env, n_actions):
        self.agent = agent
        self.env = env
        self.n_actions = n_actions

    def get_moves(self):
        """Return the best rollout found from the env's current state.

        Every legal action is used once as the first move of a rollout, in
        order of decreasing actor probability. The rollout with the highest
        total reward wins; on ties the earlier (more probable) move is kept.

        Returns:
            tuple: ``(moves, reward)`` where ``moves`` is the winning action
            sequence, starting with the chosen first move as a plain ``int``,
            and ``reward`` is that rollout's total reward. When there are no
            legal actions the result is ``([], -1)``.
        """
        legal_actions = self.env.get_legal_actions_as_int()
        if len(legal_actions) == 0:
            # Nothing to roll out; keep the historical "no result" return value.
            return [], -1

        device = self.agent.actor.device
        state = torch.tensor(np.array([self.env.get_state()]), dtype=torch.float).to(device)
        state = state.squeeze(0)

        # Probabilities are only used for ordering, so skip autograd bookkeeping.
        with torch.no_grad():
            dist = self.agent.actor(state)

        mask = torch.zeros(self.n_actions).to(device)
        mask[legal_actions] = 1
        # Take the indices from the mask, not from probs * mask: a legal move
        # whose probability is exactly 0 must still be tried.
        legal_indices = torch.nonzero(mask, as_tuple=True)[0]
        legal_values = dist.probs[legal_indices]
        _, sorted_order = torch.sort(legal_values, descending=True)
        # Plain ints keep the returned trajectory free of 0-d tensors.
        sorted_moves = legal_indices[sorted_order].tolist()

        # -inf rather than -1 so that rollouts with negative totals can still win.
        best_reward = float('-inf')
        best_moves = []
        for move in sorted_moves:
            rollout_agent = RolloutAgent(self.agent, move, self.env)
            rollout_reward, rollout_moves = rollout_agent.rollout()
            if best_reward < rollout_reward:
                best_reward = rollout_reward
                best_moves = rollout_moves
        return best_moves, best_reward

In [4]:
from cellitaire.environment.agents.PPOAgent import Agent
from cellitaire.environment.cellitaire_env import CellitaireEnv
from cellitaire.environment.rewards.reward import *
from cellitaire.environment.rewards.foundation_rewards import *
import numpy as np

# --- Configuration: board geometry, reward, environment and agent ---------
# These names are notebook globals; the run loop below reads board_rows,
# board_cols, env, agent and N.
board_rows = 7
board_cols = 12
num_reserved = 6
# Reward handed to the environment: a CombinedReward over the active terms.
# Commented-out entries are disabled alternatives kept for experimentation.
test_reward = CombinedReward([
    #PlacedCardInFoundationReward(weight=6),
    WinReward(weight=100, rows=board_rows, cols=board_cols),
    #ConstantReward(weight=0.5),
    ScalingPlacedCardInFoundationReward(weight=1, rows=board_rows, cols=board_cols),
    #PlayedLegalMoveReward(weight=1, rows=board_rows, cols=board_cols, num_reserved = num_reserved),
    #PeriodicPlacedCardInFoundationReward(weight=4, reward_period=3),
    #CreatedMovesReward(weight=1, num_reserved=num_reserved, foundation_count_dropoff=30)
])
# Rendering variant (render_mode='human'); the commented line below is the
# same configuration without the render_mode/frame_rate arguments.
env = CellitaireEnv(test_reward, rows=board_rows, cols=board_cols, num_reserved=num_reserved, max_moves=1200, max_illegal_moves=1200, render_mode='human', frame_rate=0.1)
#env = CellitaireEnv(test_reward, rows=board_rows, cols=board_cols, num_reserved=num_reserved, max_moves=1200, max_illegal_moves=1200)
env.render()

#agent = Agent(n_actions=env.action_space.n, batch_size=batch_size, 
#                alpha=alpha, n_epochs=n_epochs, 
#                input_dims=env.observation_space.shape)

# N is the episode count the run loop must exceed before it reports a new
# best running average.
N = 10000
# Hyperparameters forwarded to Agent below.
# NOTE(review): alpha is presumably the learning rate — confirm against Agent.
batch_size = 2000
n_epochs = 5
alpha = 1e-7
# NOTE(review): input_dims is hard-coded as rows * cols * 4 + 6 instead of
# env.observation_space.shape (see the commented-out constructor above);
# confirm the two agree, and whether the 6 is meant to track num_reserved.
agent = Agent(n_actions=env.action_space.n, 
        input_dims=(board_rows * board_cols * 4 + 6,), batch_size=batch_size, 
                    alpha=alpha, n_epochs=n_epochs)

pygame 2.6.1 (SDL 2.28.4, Python 3.13.2)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [5]:
import time

import traceback

# Evaluation loop: each episode is played in two phases.
#   1. The policy plays alone until all 52 card ids have been seen (board
#      cards plus every stockpile top card observed so far).
#   2. From there, TreeSearch proposes full trajectories; the best one found
#      so far is executed a chunk at a time, replanning between chunks.
# Runs until interrupted; the environment is always closed on the way out.
try:
    best_score = -1000
    score_history = []
    max_score = 0
    episodes_without_best = 0
    
    learn_iters = 0
    avg_score = 0
    n_steps = 0
    i = 0
    # Number of planned moves executed between two tree searches.
    moves_per_replan = 30
    # 1 + 2 + ... + 52 plus the WinReward weight of 100 configured above.
    # NOTE(review): assumes the scaling foundation reward pays k for the k-th
    # card placed — confirm against ScalingPlacedCardInFoundationReward.
    max_reward = 52. * 53. / 2. + 100.
    print(f'Max reward: {max_reward}')
    while True:
        agent.load_models()
        env.reset()
        # The stockpile's top card is added at the start of every phase-1
        # iteration (with a None guard), so it is not added separately here.
        seen_cards = {slot.card.card_id for row in env.game.board.slots for slot in row if slot.has_card()}
        observation = env.get_state()
        done = False
        score = 0

        # Phase 1: follow the policy until every card id has been observed.
        while not done:
            top_card = env.game.stockpile.top_card()
            if top_card is not None:
                seen_cards.add(top_card.card_id)
            if len(seen_cards) == 52:
                break
            action, _, _ = agent.choose_legal_action_mostly(observation, env.get_legal_actions_as_int())
            observation_, reward, done, _, info = env.step(action)
            score += reward
            observation = observation_

        tree_search = TreeSearch(agent, env, board_rows * board_cols)

        # Phase 2: search, execute a chunk of the best plan, search again.
        best_trajectory = -1e9
        remaining_moves_in_trajectory = None
        while not done:
            moves, end_reward = tree_search.get_moves()
            current_trajectory = end_reward + score
            # Adopt the new plan when it is better, or when the stored plan
            # has run out; otherwise nothing would be played and this loop
            # would never terminate.
            if current_trajectory > best_trajectory or not remaining_moves_in_trajectory:
                best_trajectory = current_trajectory
                remaining_moves_in_trajectory = moves
                print(f'Current trajectory: {best_trajectory}')
            if not remaining_moves_in_trajectory:
                # The search produced no moves while the env is not done;
                # there is nothing left to play for this episode.
                break
            if best_trajectory >= max_reward:
                # A maximal trajectory was found: play it out in one go.
                for move in moves:
                    _, reward, done, _, _ = env.step(move)
                    score += reward
                    if done:
                        # Never step an environment that has already finished.
                        break
                break
            for move in remaining_moves_in_trajectory[:moves_per_replan]:
                _, reward, done, _, _ = env.step(move)
                score += reward
                if done:
                    break
            remaining_moves_in_trajectory = remaining_moves_in_trajectory[moves_per_replan:]
        i += 1
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])
        
        max_score = max(max_score, score)
    
        # A new best running average is only reported after more than N episodes.
        if avg_score > best_score and i > N:
            best_score = avg_score
            recent_std = np.std(score_history[-100:])
            #agent.save_models()
            episodes_without_best = 0
            print(f'episode {i:>5} | score {score:>6.1f} | avg {avg_score:>6.1f} | std {recent_std:>6.2f} | max score {max_score:>5.1f} | learning steps {learn_iters:>5} | done {done} *')
        else:
            episodes_without_best += 1
            
        # Periodic progress line every 100 episodes without a new best.
        if episodes_without_best % 100 == 0 and episodes_without_best > 0:
            recent_std = np.std(score_history[-100:])
            print(f'episode {i:>5} | score {score:>6.1f} | avg {avg_score:>6.1f} | std {recent_std:>6.2f} | max score {max_score:>5.1f} | learning steps {learn_iters:>5} | done {done}')
        time.sleep(10)
        print(score)
except Exception:
    # Keep the cell from raising, but show the exception type and location
    # instead of only its message.
    traceback.print_exc()
finally:
    env.close()

Max reward: 1478.0
... loading models ...
Current trajectory: 992.0
Current trajectory: 994.0
Current trajectory: 1085.0
Current trajectory: 1326.0
1326.0
... loading models ...
Current trajectory: 1038.0
Current trajectory: 1039.0
Current trajectory: 1189.0
Current trajectory: 1296.0
Current trajectory: 1315.0
1275.0
... loading models ...
Current trajectory: 1083.0
Current trajectory: 1181.0
Current trajectory: 1264.0


KeyboardInterrupt: 

In [None]:
import torch as torch

# Scratch inputs for the mask-and-sort experiment in the next cell:
# `a` stands in for per-action scores, `b` for a 0/1 legality mask.
a = torch.randn(5)
b = torch.randint(0, 2, (5,))
print(a, b, sep='\n')

In [None]:
# Zero out masked entries, then list the surviving indices ordered by
# descending value. Relies on `a` and `b` from the previous cell.
masked_dist = a * b
legal_indices = masked_dist.nonzero(as_tuple=True)[0]
legal_values = masked_dist[legal_indices]
sorted_order = legal_values.sort(descending=True).indices
sorted_moves = legal_indices[sorted_order]
print(sorted_moves)

In [None]:
# Manual cleanup cell: closes the environment by hand. The run loop already
# calls env.close() in its `finally` block, so this repeats that call.
# NOTE(review): confirm that closing an already-closed env is harmless.
env.close()

In [None]:
# NOTE(review): `filtered_values` is not assigned in any cell visible in this
# notebook, so this looks like a leftover scratch cell that would raise
# NameError on a fresh kernel — confirm and remove, or define the name first.
filtered_values