In [1]:
import PIL
import PIL.ImageDraw
import random

In [2]:
class RubkisCube:
    """3x3x3 Rubik's cube stored as six 3x3 grids of one-letter colour codes.

    ``self.cube`` maps a face letter (``U``, ``L``, ``F``, ``R``, ``B``, ``D``)
    to a list of three rows. Rows and columns follow the unfolded net painted
    by ``draw_cube``: ``U`` above ``F``, ``D`` below ``F``, and ``L``, ``F``,
    ``R``, ``B`` side by side from left to right.
    """

    # Index order matters: ``step`` maps an integer action onto this tuple.
    MOVES = ("F", "R", "U", "B", "L", "D", "F'", "R'", "U'", "B'", "L'", "D'")

    # Face -> colour of the solved face, in the order ``__str__`` serialises them.
    _SOLVED_FACES = (("U", "W"), ("L", "O"), ("F", "G"),
                     ("R", "R"), ("B", "B"), ("D", "Y"))

    # Colour code -> RGB triple used when drawing.
    _RGB = {
        "R": (255, 0, 0),
        "G": (0, 255, 0),
        "B": (0, 0, 255),
        "W": (255, 255, 255),
        "Y": (255, 255, 0),
        "O": (255, 128, 0),
    }

    # Face -> (column, row) of its top-left corner in the net, in square units.
    _NET_POSITIONS = {
        "U": (3, 0),
        "L": (0, 3),
        "F": (3, 3),
        "R": (6, 3),
        "B": (9, 3),
        "D": (3, 6),
    }

    # For every face: the four strips of (face, row, col) stickers that border
    # it. A clockwise turn moves the stickers of strip k onto strip k+1
    # (position by position); a counter-clockwise turn moves them onto strip k-1.
    _SIDE_CYCLES = {
        "U": (
            (("F", 0, 0), ("F", 0, 1), ("F", 0, 2)),
            (("L", 0, 0), ("L", 0, 1), ("L", 0, 2)),
            (("B", 0, 0), ("B", 0, 1), ("B", 0, 2)),
            (("R", 0, 0), ("R", 0, 1), ("R", 0, 2)),
        ),
        "D": (
            (("F", 2, 0), ("F", 2, 1), ("F", 2, 2)),
            (("R", 2, 0), ("R", 2, 1), ("R", 2, 2)),
            (("B", 2, 0), ("B", 2, 1), ("B", 2, 2)),
            (("L", 2, 0), ("L", 2, 1), ("L", 2, 2)),
        ),
        "F": (
            (("U", 2, 0), ("U", 2, 1), ("U", 2, 2)),
            (("R", 0, 0), ("R", 1, 0), ("R", 2, 0)),
            (("D", 0, 2), ("D", 0, 1), ("D", 0, 0)),
            (("L", 2, 2), ("L", 1, 2), ("L", 0, 2)),
        ),
        "B": (
            (("U", 0, 0), ("U", 0, 1), ("U", 0, 2)),
            (("L", 2, 0), ("L", 1, 0), ("L", 0, 0)),
            (("D", 2, 2), ("D", 2, 1), ("D", 2, 0)),
            (("R", 0, 2), ("R", 1, 2), ("R", 2, 2)),
        ),
        "L": (
            (("U", 0, 0), ("U", 1, 0), ("U", 2, 0)),
            (("F", 0, 0), ("F", 1, 0), ("F", 2, 0)),
            (("D", 0, 0), ("D", 1, 0), ("D", 2, 0)),
            (("B", 2, 2), ("B", 1, 2), ("B", 0, 2)),
        ),
        "R": (
            (("F", 0, 2), ("F", 1, 2), ("F", 2, 2)),
            (("U", 0, 2), ("U", 1, 2), ("U", 2, 2)),
            (("B", 2, 0), ("B", 1, 0), ("B", 0, 0)),
            (("D", 0, 2), ("D", 1, 2), ("D", 2, 2)),
        ),
    }

    def __init__(self, moves_nr=3):
        """Create a solved cube and scramble it with ``moves_nr`` random moves.

        Args:
            moves_nr: number of random quarter turns applied after building
                the solved state; ``0`` leaves the cube solved.
        """
        self.cube = {
            face: [[color] * 3 for _ in range(3)]
            for face, color in self._SOLVED_FACES
        }
        self.shuffle(moves_nr)

    def __str__(self):
        """Return the 54 colour codes, face by face and row by row."""
        return "".join(
            square
            for face in self.cube.values()
            for row in face
            for square in row
        )

    def get_color(self, color: str):
        """Return the RGB triple for a colour code, or ``None`` if unknown."""
        return self._RGB.get(color)

    def draw_cube(self, square_size=50):
        """Paint the unfolded cube onto a new PIL image and return it.

        Args:
            square_size: edge length of one sticker in pixels. The image is
                12 squares wide and 9 squares tall.
        """
        im = PIL.Image.new(mode="RGB", size=(square_size * 12, square_size * 9))
        draw = PIL.ImageDraw.Draw(im)

        for side, (net_col, net_row) in self._NET_POSITIONS.items():
            base_x, base_y = net_col * square_size, net_row * square_size

            for row in range(3):
                for col in range(3):
                    x1 = base_x + col * square_size
                    y1 = base_y + row * square_size
                    draw.rectangle(
                        xy=(x1, y1, x1 + square_size, y1 + square_size),
                        fill=self.get_color(self.cube[side][row][col]),
                        outline=(0, 0, 0),
                    )
        return im

    def rotate(self, move: str):
        """Apply one quarter turn to the cube in place.

        Args:
            move: a face letter, optionally containing ``'`` for a
                counter-clockwise turn (e.g. ``"R"`` or ``"R'"``).

        Raises:
            ValueError: if ``move`` is empty or does not start with a known
                face letter. The cube is left untouched in that case.
        """
        face = move[:1]
        if face not in self._SIDE_CYCLES:
            raise ValueError(
                "unknown move %r; expected one of %s" % (move, ", ".join(self.MOVES))
            )
        clockwise = "'" not in move

        # Turn the stickers of the face itself.
        old_face = self.cube[face]
        if clockwise:
            self.cube[face] = [list(column) for column in zip(*old_face[::-1])]
        else:
            self.cube[face] = [list(column) for column in zip(*old_face)][::-1]

        # Snapshot all four bordering strips first so writes cannot clobber
        # values that still have to be read, then shift them by one strip.
        strips = self._SIDE_CYCLES[face]
        snapshot = [[self.cube[f][r][c] for f, r, c in strip] for strip in strips]
        shift = 1 if clockwise else -1
        for index, values in enumerate(snapshot):
            target = strips[(index + shift) % len(strips)]
            for (f, r, c), value in zip(target, values):
                self.cube[f][r][c] = value

    def shuffle(self, moves=50):
        """Apply ``moves`` random quarter turns and return ``self``."""
        for _ in range(moves):
            self.rotate(random.choice(self.MOVES))
        return self

    def check_score(self):
        """Count the stickers that match the centre sticker of their face (max 54)."""
        return sum(
            row.count(face[1][1])
            for face in self.cube.values()
            for row in face
        )

    def check_done(self):
        """Return ``True`` when every sticker matches the centre of its face."""
        return self.check_score() == 54

    def step(self, action):
        """Apply the move at index ``action`` of ``MOVES`` and return the new score."""
        self.rotate(self.MOVES[action])
        return self.check_score()

# Custom Environment

# Building the Environment

In [3]:
import gym
from gym import Env
from gym.spaces import Discrete, Box, Dict, MultiBinary, MultiDiscrete, Tuple
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy


In [4]:
class CubeEnv(Env):
    """Gym environment in which an agent tries to solve a scrambled ``RubkisCube``.

    * Actions: ``Discrete(12)``; the index is forwarded to ``RubkisCube.step``.
    * Observations: 54 ``int8`` colour indexes in the order of ``str(cube)``.
    * Reward: ``SOLVED_REWARD`` when the cube is solved, otherwise the score
      change times ``IMPROVEMENT_SCALE`` plus ``MOVE_PENALTY``. When the move
      budget runs out, a bonus proportional to the final score is added.

    Uses the classic gym API: ``reset`` returns the observation and ``step``
    returns ``(observation, reward, done, info)``.
    """

    SOLVED_SCORE = 54            # every sticker matches its face centre
    SOLVED_REWARD = 200          # reward for the step that solves the cube
    IMPROVEMENT_SCALE = 2.0      # multiplier on the per-step score change
    MOVE_PENALTY = -0.1          # flat cost added to every non-solving step
    TIMEOUT_BONUS_SCALE = 20.0   # bonus at timeout, scaled by score / SOLVED_SCORE

    def __init__(self, max_moves=50, scramble_moves=None):
        """Set up the spaces and start the first episode.

        Args:
            max_moves: number of steps allowed per episode.
            scramble_moves: number of random moves used to scramble each new
                cube; ``None`` keeps ``RubkisCube``'s own default.
        """
        self.action_space = Discrete(12)

        self.color_to_idx = {
            'W': 0,  # White
            'O': 1,  # Orange
            'G': 2,  # Green
            'R': 3,  # Red
            'B': 4,  # Blue
            'Y': 5   # Yellow
        }
        self.observation_space = Box(low=0, high=5, shape=(54,), dtype=np.int8)

        self.max_moves = max_moves
        self.scramble_moves = scramble_moves
        self._start_episode()

    def _start_episode(self):
        """Create a freshly scrambled cube and reset the per-episode counters."""
        if self.scramble_moves is None:
            self.state = RubkisCube()
        else:
            self.state = RubkisCube(self.scramble_moves)
        self.moves_left = self.max_moves
        self.previous_score = self.state.check_score()

    def _state_to_observation(self):
        """Encode the cube's 54-character string as an ``int8`` index array."""
        return np.array(
            [self.color_to_idx[c] for c in str(self.state)], dtype=np.int8
        )

    def step(self, action):
        """Apply one move and return ``(observation, reward, done, info)``."""
        current_score = self.state.step(action)
        self.moves_left -= 1

        done = current_score == self.SOLVED_SCORE
        if done:
            reward = self.SOLVED_REWARD
        else:
            improvement = current_score - self.previous_score
            reward = improvement * self.IMPROVEMENT_SCALE
            reward += self.MOVE_PENALTY

        # ``<=`` rather than ``==``: if a caller keeps stepping after the
        # budget is spent, the episode must still be reported as finished.
        if self.moves_left <= 0:
            done = True
            reward += (current_score / self.SOLVED_SCORE) * self.TIMEOUT_BONUS_SCALE

        self.previous_score = current_score

        info = {
            'moves_remaining': self.moves_left,
            'current_score': current_score,
            'completion_percentage': (current_score / self.SOLVED_SCORE) * 100
        }

        return self._state_to_observation(), reward, done, info

    def render(self, mode="human"):
        """Render the cube.

        ``mode="rgb_array"`` returns the image from ``RubkisCube.draw_cube`` as
        a numpy array; any other mode does nothing and returns ``None``.
        Accepting ``mode`` keeps the method callable by wrappers that pass it.
        """
        if mode == "rgb_array":
            return np.asarray(self.state.draw_cube())
        return None

    def reset(self):
        """Start a new episode and return its first observation."""
        self._start_episode()
        return self._state_to_observation()

# Test Environment

In [5]:
# Single environment instance shared by the smoke test, training and evaluation cells below.
env_cub = CubeEnv()

In [6]:
# Smoke test: drive the environment with random actions and print the
# running (cumulative) reward after every step.
episodes = 1
for episode in range(episodes):
    obs, done, score = env_cub.reset(), False, 0

    for _ in range(50):
        if done:
            # The previous step ended the episode; stop stepping.
            break
        action = env_cub.action_space.sample()
        obs, reward, done, info = env_cub.step(action)
        score = score + reward
        print(score)

-16.1
-26.200000000000003
-28.300000000000004
-30.400000000000006
-36.50000000000001
-40.60000000000001
-34.70000000000001
-34.80000000000001
-38.90000000000001
-35.000000000000014
-37.100000000000016
-35.20000000000002
-39.30000000000002
-41.40000000000002
-39.50000000000002
-37.60000000000002
-41.700000000000024
-39.800000000000026
-39.90000000000003
-36.00000000000003
-40.10000000000003
-40.20000000000003
-44.30000000000003
-40.400000000000034
-40.500000000000036
-34.60000000000004
-34.70000000000004
-36.80000000000004
-36.90000000000004
-35.00000000000004
-33.100000000000044
-35.200000000000045
-41.30000000000005
-37.40000000000005
-35.50000000000005
-27.60000000000005
-37.70000000000005
-41.800000000000054
-45.900000000000055
-50.00000000000006
-48.10000000000006
-50.20000000000006
-40.30000000000006
-38.40000000000006
-38.500000000000064
-42.600000000000065
-30.700000000000067
-36.80000000000007
-30.90000000000007
-31.074074074074144


# Train Model

In [7]:
# Build a PPO agent with an MLP policy on the CPU; TensorBoard logs are
# written under Training/Logs.
log_path = os.path.join('Training', 'Logs')
model = PPO(
    'MlpPolicy',
    env_cub,
    verbose=1,
    tensorboard_log=log_path,
    device='cpu',
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [8]:
# Train for one million environment steps. The call is the cell's last
# expression, so the returned model's repr is echoed as the cell output.
model.learn(total_timesteps=1_000_000)

Logging to Training\Logs\PPO_6
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 47.7     |
|    ep_rew_mean     | -21.3    |
| time/              |          |
|    fps             | 1880     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 48.8        |
|    ep_rew_mean          | -27.5       |
| time/                   |             |
|    fps                  | 1281        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007357899 |
|    clip_fraction        | 0.0512      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.48       |
|    explained_variance   | 7.52e-05    |

<stable_baselines3.ppo.ppo.PPO at 0x27ba1b0a040>

In [9]:
# Save the trained model under Training/Saved Models/Cube_Model; the same
# path is reused by the load cell below.
cube_model_path = os.path.join('Training', 'Saved Models', 'Cube_Model')
model.save(cube_model_path)

In [None]:
# Reload the saved model from disk and attach it to the existing environment.
model = PPO.load(cube_model_path, env=env_cub)

In [15]:
evaluate_policy(model, env_cub, n_eval_episodes=10)
# Output: (mean score, standard deviation) over the 10 evaluation episodes.

(197.11333347260953, 58.1109579098235)