In [1]:
# train.py
import numpy as np
import os
import time
from collections import deque
from tqdm import tqdm # Import tqdm

import settings as s
from blockdoku_env import BlockdokuEnv
from dqn_agent import DQNAgent # PyTorch version
import json
import datetime




pygame 2.6.1 (SDL 2.28.4, Python 3.10.16)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [5]:
def print_grid_formatted(valid_action_mask, grid_height=9, grid_width=9, box_size=3):
    """
    Print the valid action mask as one 2D grid of 0/1 flags per piece.

    Assumes action encoding:
        piece_idx * (grid_height * grid_width) + row * grid_width + col

    Args:
        valid_action_mask: Flat sequence (list, NumPy array, ...) of truthy/falsy
            flags, one per action. Its length determines how many pieces are
            printed; trailing entries that do not fill a complete
            grid_height * grid_width block are ignored.
        grid_height: Number of rows per piece grid (default 9).
        grid_width: Number of columns per piece grid (default 9).
        box_size: A horizontal separator line is printed every `box_size` rows
            and a "| " separator every `box_size` columns (default 3).

    Returns:
        None. The grids are written to stdout.

    Raises:
        ValueError: If grid_height, grid_width or box_size is not positive.
    """
    if grid_height <= 0 or grid_width <= 0 or box_size <= 0:
        raise ValueError("grid_height, grid_width and box_size must be positive")

    actions_per_piece = grid_height * grid_width
    num_pieces = len(valid_action_mask) // actions_per_piece

    # Keep only whole per-piece blocks, collapse every flag to 0/1 by truthiness,
    # and view the result as (piece, row, col) according to the action encoding.
    flat_mask = np.asarray(valid_action_mask)[: num_pieces * actions_per_piece]
    piece_grids = flat_mask.astype(bool).astype(int).reshape(
        num_pieces, grid_height, grid_width
    )

    # Each cell prints as "<flag> " (2 chars) and each vertical separator as "| "
    # (2 chars); the horizontal rule spans a row minus its trailing space.
    # For the default 9x9 / 3 layout this is 21 characters.
    num_col_separators = (grid_width - 1) // box_size
    separator_width = 2 * grid_width + 2 * num_col_separators - 1

    for piece_idx, piece_grid in enumerate(piece_grids):
        print(f"\nValid positions for piece {piece_idx}:")

        for row_idx, grid_row in enumerate(piece_grid):
            if row_idx > 0 and row_idx % box_size == 0:
                print("-" * separator_width)  # Horizontal separator between boxes

            parts = []
            for col_idx, cell in enumerate(grid_row):
                parts.append(f"{cell} ")
                # Vertical separator after each full box, except at the right edge
                if (col_idx + 1) % box_size == 0 and col_idx < grid_width - 1:
                    parts.append("| ")

            print("".join(parts))

In [None]:

# Training environment, created with rendering switched off (render_mode=None).
env = BlockdokuEnv(render_mode=None)
# No separate visualisation environment is created in this cell.
vis_env = None

# State description handed to the agent, all taken from the settings module.
# The grid shape is given in NumPy (H, W, C) order; per the original note the
# agent handles its internal PyTorch shape itself.
piece_vector_size = s.STATE_PIECE_VECTOR_SIZE
grid_shape_numpy = (
    s.GRID_HEIGHT,
    s.GRID_WIDTH,
    s.STATE_GRID_CHANNELS,
)

# load_model_path=None: no checkpoint path is supplied to the agent.
agent = DQNAgent(
    grid_shape_numpy,
    piece_vector_size,
    env.action_size,
    load_model_path=None,
)


Using device: cpu
Initializing new model.


In [7]:

# Reset the environment and inspect the starting observation before stepping.
state_np, info = env.reset()

# Print the board one row per line, cells separated by a single space.
grid = state_np["grid"]
for row in grid:
    print(" ".join(f"{cell}" for cell in row))

# Pieces offered at the start, followed by the per-piece valid placement grids.
print("Initial info:", info["available_piece_keys"])
print_grid_formatted(info["valid_action_mask"])

# Per-episode counters and the termination flag read by the stepping loop.
episode_score = total_episode_loss = learn_steps = steps = 0
done = False



[0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.]
[0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.]
[0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.]
[0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.]
[0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.]
[0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.]
[0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.]
[0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.]
[0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.] [0.]
Initial info: ['u_shape', 'corner_small', 'big_square']

Valid positions for piece 0:
1 1 1 | 1 1 1 | 1 0 0 
1 1 1 | 1 1 1 | 1 0 0 
1 1 1 | 1 1 1 | 1 0 0 
---------------------
1 1 1 | 1 1 1 | 1 0 0 
1 1 1 | 1 1 1 | 1 0 0 
1 1 1 | 1 1 1 | 1 0 0 
---------------------
1 1 1 | 1 1 1 | 1 0 0 
1 1 1 | 1 1 1 | 1 0 0 
0 0 0 | 0 0 0 | 0 0 0 

Valid positions for piece 1:
1 1 1 | 1 1 1 | 1 1 0 
1 1 1 | 1 1 1 | 1 1 0 
1 1 1 | 1 1 1 | 1 1 0 
---------------------
1 1 1 | 1 1 1 | 1 1 0 
1 1 1 | 1 1 1 | 1 1 0 
1 1 1 | 1 1 1 | 1 1 0 
---------------------
1 1 1 | 1 1 1 | 1 1 0 
1 1 1 | 1 1 1 | 1 1 0 


In [None]:

while not done:
    # 1) Choose an action, passing along the mask from the latest info dict
    #    (None when the dict has no "valid_action_mask" entry).
    valid_mask = info.get("valid_action_mask")
    print(f"Valid action mask: {valid_mask}")
    action = agent.act(
        state_np,
        valid_action_mask=valid_mask,
        use_epsilon=True,
    )
    print(f"Action taken: {action}")

    # 2) Apply it; `done` and `info` are refreshed for the next iteration.
    next_state_np, reward, done, info = env.step(action)
    print(f"Next state: {next_state_np}, Reward: {reward}, Done: {done}")

    # 3) Record the transition, then run one replay call on the agent.
    agent.remember(state_np, action, reward, next_state_np, done)
    loss = agent.replay()

    # 4) Move to the new state and update the episode bookkeeping.
    state_np = next_state_np
    episode_score += reward
    # Only strictly positive losses are accumulated and counted as learn steps.
    if loss > 0:
        total_episode_loss += loss
        learn_steps += 1
    steps += 1



In [None]:

for episode in range(1):  # a single episode; the range is iterated directly
    state_np, info = env.reset()

    # Show the starting board, one row per line with space-separated cells.
    grid = state_np["grid"]
    for row in grid:
        print(" ".join(f"{cell}" for cell in row))

    # Offered pieces, the raw action mask, then the same mask as per-piece grids.
    print("Initial info:", info["available_piece_keys"])
    print(info["valid_action_mask"])
    print_grid_formatted(info["valid_action_mask"])

    # Fresh per-episode counters.
    episode_score = total_episode_loss = learn_steps = steps = 0
    done = False

    while not done:
        # Choose an action using the mask from the latest info dict
        # (None when the dict has no "valid_action_mask" entry).
        valid_mask = info.get("valid_action_mask")
        print(f"Valid action mask: {valid_mask}")
        action = agent.act(
            state_np,
            valid_action_mask=valid_mask,
            use_epsilon=True,
        )
        print(f"Action taken: {action}")

        # Step the environment; `done` and `info` feed the next iteration.
        next_state_np, reward, done, info = env.step(action)
        print(f"Next state: {next_state_np}, Reward: {reward}, Done: {done}")

        # Record the transition, then run one replay call on the agent.
        agent.remember(state_np, action, reward, next_state_np, done)
        loss = agent.replay()

        state_np = next_state_np
        episode_score += reward
        # Only strictly positive losses are accumulated and counted as learn steps.
        if loss > 0:
            total_episode_loss += loss
            learn_steps += 1
        steps += 1
    
