In [16]:
import math
import random
import numpy as np
import gymnasium as gym
import copy
import matplotlib.pyplot as plt
from IPython.display import clear_output
from matplotlib import animation

class MCTSNode:
    def __init__(self, state, parent=None, action=None):
        self.state = state
        self.parent = parent
        self.children = []
        self.action = action
        self.visits = 0
        self.wins = 0

    def is_fully_expanded(self, action_space_size):
        return len(self.children) == action_space_size

    def best_child(self, exploration_param=1.414):
        choices_weights = [
            (child.wins / max(1, child.visits)) + exploration_param * math.sqrt(
                math.log(max(1, self.visits)) / max(1, child.visits)
            ) for child in self.children
        ]
        return self.children[np.argmax(choices_weights)]

    def most_visited_child(self):
        return max(self.children, key=lambda child: child.visits)

def discretize_state(state, bins):
    """
    Discretize the continuous state into a discrete bin representation.
    We create bins for each feature (position, velocity, angle, angular velocity).
    """
    binned_state = []
    for i in range(len(state)):
#         print('state[i]', state[i])
#         print('bin[i]', bins[i])
        binned_state.append(np.digitize(state[i], bins[i]))
    return tuple(binned_state)

def create_bins():
    """
    Create bins for each state variable.
    CartPole has 4 continuous state variables:
    [Cart Position, Cart Velocity, Pole Angle, Pole Angular Velocity].
    We define the ranges and bins for each.
    """
    bins = [
        np.linspace(-4.8, 4.8, 10),  # Cart Position
        np.linspace(-5.0, 5.0, 10),  # Cart Velocity
        np.linspace(-0.418, 0.418, 10),  # Pole Angle
        np.linspace(-5.0, 5.0, 10),  # Pole Angular Velocity
    ]
    return bins

def rollout(env, bins):
    """
    Simulate a random rollout from the current state.
    """
    total_reward = 0
    done = False
    while not done:
        # Randomly pick an action (exploration)
        action = env.action_space.sample()
        state, reward, done, _, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward

def backpropagate(node, reward):
    while node is not None:
        node.visits += 1
        node.wins += reward
        node = node.parent

def expand(node, env, action_space_size, bins):
    """
    Expand the node by creating child nodes for each action.
    """
    for action in range(action_space_size):
        # Clone the environment and step with the current action
        env_copy = gym.make('CartPole-v1')
        env_copy.reset()
        env_copy.env.state = env.env.state  # Copy the environment state
        state, reward, done, _, _ = env_copy.step(action)

        # Discretize the state to get the discrete representation
        discrete_state = discretize_state(state, bins)

        child_node = MCTSNode(state=discrete_state, parent=node, action=action)
        node.children.append(child_node)

def select(node, action_space_size):
    """
    Traverse the tree by selecting the best child node based on the UCB1 algorithm.
    """
    current_node = node
    while current_node.is_fully_expanded(action_space_size):
        current_node = current_node.best_child()
    return current_node

def mcts(env, state, current_node, simulations=1000):
    env_copy = copy.deepcopy(env)
    action_space_size = env.action_space.n

    for _ in range(simulations):
        # Step 1: Selection
        selected_node = select(current_node, action_space_size)
#         print('sn',selected_node)
        
        # Step 2: Expansion
        expand(selected_node, env_copy, action_space_size, bins)
        
        # Step 3: Simulation
        reward = rollout(env_copy, bins)
        if reward !=0:
            print('r', reward)
        
        # Step 4: Backpropagation
        backpropagate(selected_node, reward)
        
    best_node = current_node.best_child(exploration_param=0)
    print('children', current_node.children)
    for c in current_node.children:
        print(c.wins)
    print('mv', current_node.most_visited_child().wins)
    print('bst', current_node.best_child().wins)
    print('curr', current_node.wins)
    return best_node

# Initialize the CartPole environment
env = gym.make("CartPole-v1")

# Run MCTS for 1000 simulations


# Apply the best move (select action based on the best child)
state = env.reset()[0]
print(state)
done = False
tot_r = 0

# Discretize the initial state
bins = create_bins()
discrete_state = discretize_state(state, bins)
current_node = MCTSNode(discrete_state) # first will be root node

while not done:
    current_node = mcts(env, state, current_node, simulations=1000)
    print('action', current_node.action)
    state, reward, done, _, _ = env.step(current_node.action)
    tot_r += reward
    #     print(reward)
    env.render()

print('done')
print(tot_r)
env.close()


[-0.02977553  0.02695098 -0.03981876  0.03226086]
r 38.0
children [<__main__.MCTSNode object at 0x7f031fe1fdf0>, <__main__.MCTSNode object at 0x7f031fe1f9a0>]
0.0
0.0
mv 0.0
bst 0.0
curr 38.0
action 0
r 33.0
children [<__main__.MCTSNode object at 0x7f031fe1f5b0>, <__main__.MCTSNode object at 0x7f031fe1f130>]
0.0
33.0
mv 33.0
bst 33.0
curr 33.0
action 1
r 19.0
children [<__main__.MCTSNode object at 0x7f031fe1faf0>, <__main__.MCTSNode object at 0x7f031fe1f160>]
52.0
0.0
mv 52.0
bst 0.0
curr 52.0
action 0
r 26.0
children [<__main__.MCTSNode object at 0x7f03327cc550>, <__main__.MCTSNode object at 0x7f031fe1f9d0>]
26.0
52.0
mv 52.0
bst 26.0
curr 78.0
action 1
r 18.0
r 4.0
children [<__main__.MCTSNode object at 0x7f031fb36b50>, <__main__.MCTSNode object at 0x7f031fb36d00>]
55.0
19.0
mv 55.0
bst 19.0
curr 74.0
action 0
r 21.0
children [<__main__.MCTSNode object at 0x7f031fb36f10>, <__main__.MCTSNode object at 0x7f031fb36ca0>]
0.0
76.0
mv 76.0
bst 0.0
curr 76.0
action 1
r 33.0
r 8.0
children [

In [56]:
import math
import random
import numpy as np
import gymnasium as gym
import copy
import matplotlib.pyplot as plt
from IPython.display import clear_output
from matplotlib import animation
from gym.wrappers import RecordVideo
from IPython.display import Video
import cv2

class MCTSNode:
    def __init__(self, state, parent=None, action=None):
        self.state = state
        self.parent = parent
        self.children = []
        self.action = action
        self.visits = 0
        self.wins = 0

    def is_fully_expanded(self, action_space_size):
        return len(self.children) == action_space_size

    def best_child(self, exploration_param=1.414):
        choices_weights = [
            (child.wins / max(1, child.visits)) + exploration_param * math.sqrt(
                math.log(max(1, self.visits)) / max(1, child.visits)
            ) for child in self.children
        ]
        return self.children[np.argmax(choices_weights)]

    def most_visited_child(self):
        return max(self.children, key=lambda child: child.visits)

def discretize_state(state, bins):
    """
    Discretize the continuous state into a discrete bin representation.
    We create bins for each feature (position, velocity, angle, angular velocity).
    """
    binned_state = []
    for i in range(len(state)):
        binned_state.append(np.digitize(state[i], bins[i]))
    return tuple(binned_state)

def create_bins():
    """
    Create bins for each state variable.
    CartPole has 4 continuous state variables:
    [Cart Position, Cart Velocity, Pole Angle, Pole Angular Velocity].
    We define the ranges and bins for each.
    """
    bins = [
        np.linspace(-4.8, 4.8, 10),  # Cart Position
        np.linspace(-5.0, 5.0, 10),  # Cart Velocity
        np.linspace(-0.418, 0.418, 10),  # Pole Angle
        np.linspace(-5.0, 5.0, 10),  # Pole Angular Velocity
    ]
    return bins

def rollout(env, bins):
    """
    Simulate a random rollout from the current state.
    """
    total_reward = 0
    done = False
    while not done:
        # Randomly pick an action (exploration)
        action = env.action_space.sample()
        state, reward, done, _, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward

def backpropagate(node, reward):
    while node is not None:
        node.visits += 1
        node.wins += reward
        node = node.parent

def expand(node, env, action_space_size, bins):
    """
    Expand the node by creating child nodes for each action.
    """
    for action in range(action_space_size):
        # Clone the environment and step with the current action
        env_copy = gym.make('CartPole-v1')
        env_copy.reset()
        env_copy.env.state = env.env.state  # Copy the environment state
        state, reward, done, _, _ = env_copy.step(action)

        # Discretize the state to get the discrete representation
        discrete_state = discretize_state(state, bins)

        child_node = MCTSNode(state=discrete_state, parent=node, action=action)
        node.children.append(child_node)

def select(node, action_space_size):
    """
    Traverse the tree by selecting the best child node based on the UCB1 algorithm.
    """
    current_node = node
    while current_node.is_fully_expanded(action_space_size):
        current_node = current_node.best_child()
    return current_node

def mcts(env, state, current_node, simulations=1000):
    env_copy = copy.deepcopy(env)
    action_space_size = env.action_space.n

    for _ in range(simulations):
        # Step 1: Selection
        selected_node = select(current_node, action_space_size)
#         print('sn',selected_node)
        
        # Step 2: Expansion
        expand(selected_node, env_copy, action_space_size, bins)
        
        # Step 3: Simulation
        reward = rollout(env_copy, bins)
        if reward !=0:
            print('r', reward)
        
        # Step 4: Backpropagation
        backpropagate(selected_node, reward)
        
    best_node = current_node.best_child(exploration_param=0)
    return best_node

def show_frame(frame):
    plt.imshow(frame)
    plt.axis('off')
    plt.show()


# Initialize the CartPole environment
env = gym.make("CartPole-v1", render_mode='rgb_array')
# Define video recording parameters
video_path = 'cartpole_mcts.mp4'
frame_width = env.render().shape[1]
frame_height = env.render().shape[0]
fps = 30  # Set the frames per second

# Create a video writer object using OpenCV
video_writer = cv2.VideoWriter(video_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))




# Apply the best move (select action based on the best child)
env.reset()
done = False
tot_r = 0

# Discretize the initial state
bins = create_bins()
discrete_state = discretize_state(state, bins)
current_node = MCTSNode(discrete_state) # first will be root node

while not done:
    current_node = mcts(env, state, current_node, simulations=1000)
    state, reward, done, _, _ = env.step(1)
    tot_r += reward



print('done')
print(tot_r)
env.close()


# Release the video writer
video_writer.release()

# Display the recorded video in the notebook
Video(video_path)

TypeError: cannot pickle 'pygame.surface.Surface' object

In [17]:
#100 bins

import math
import random
import numpy as np
import gymnasium as gym
import copy
import matplotlib.pyplot as plt
from IPython.display import clear_output
from matplotlib import animation

class MCTSNode:
    def __init__(self, state, parent=None, action=None):
        self.state = state
        self.parent = parent
        self.children = []
        self.action = action
        self.visits = 0
        self.wins = 0

    def is_fully_expanded(self, action_space_size):
        return len(self.children) == action_space_size

    def best_child(self, exploration_param=1.414):
        choices_weights = [
            (child.wins / max(1, child.visits)) + exploration_param * math.sqrt(
                math.log(max(1, self.visits)) / max(1, child.visits)
            ) for child in self.children
        ]
        return self.children[np.argmax(choices_weights)]

    def most_visited_child(self):
        return max(self.children, key=lambda child: child.visits)

def discretize_state(state, bins):
    """
    Discretize the continuous state into a discrete bin representation.
    We create bins for each feature (position, velocity, angle, angular velocity).
    """
    binned_state = []
    for i in range(len(state)):
#         print('state[i]', state[i])
#         print('bin[i]', bins[i])
        binned_state.append(np.digitize(state[i], bins[i]))
    return tuple(binned_state)

def create_bins():
    """
    Create bins for each state variable.
    CartPole has 4 continuous state variables:
    [Cart Position, Cart Velocity, Pole Angle, Pole Angular Velocity].
    We define the ranges and bins for each.
    """
    bins = [
        np.linspace(-4.8, 4.8, 100),  # Cart Position
        np.linspace(-5.0, 5.0, 100),  # Cart Velocity
        np.linspace(-0.418, 0.418, 100),  # Pole Angle
        np.linspace(-5.0, 5.0, 100),  # Pole Angular Velocity
    ]
    return bins

def rollout(env, bins):
    """
    Simulate a random rollout from the current state.
    """
    total_reward = 0
    done = False
    while not done:
        # Randomly pick an action (exploration)
        action = env.action_space.sample()
        state, reward, done, _, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward

def backpropagate(node, reward):
    while node is not None:
        node.visits += 1
        node.wins += reward
        node = node.parent

def expand(node, env, action_space_size, bins):
    """
    Expand the node by creating child nodes for each action.
    """
    for action in range(action_space_size):
        # Clone the environment and step with the current action
        env_copy = gym.make('CartPole-v1')
        env_copy.reset()
        env_copy.env.state = env.env.state  # Copy the environment state
        state, reward, done, _, _ = env_copy.step(action)

        # Discretize the state to get the discrete representation
        discrete_state = discretize_state(state, bins)

        child_node = MCTSNode(state=discrete_state, parent=node, action=action)
        node.children.append(child_node)

def select(node, action_space_size):
    """
    Traverse the tree by selecting the best child node based on the UCB1 algorithm.
    """
    current_node = node
    while current_node.is_fully_expanded(action_space_size):
        current_node = current_node.best_child()
    return current_node

def mcts(env, state, current_node, simulations=1000):
    env_copy = copy.deepcopy(env)
    action_space_size = env.action_space.n

    for _ in range(simulations):
        # Step 1: Selection
        selected_node = select(current_node, action_space_size)
#         print('sn',selected_node)
        
        # Step 2: Expansion
        expand(selected_node, env_copy, action_space_size, bins)
        
        # Step 3: Simulation
        reward = rollout(env_copy, bins)
        
        # Step 4: Backpropagation
        backpropagate(selected_node, reward)
        
    best_node = current_node.best_child(exploration_param=0)

    return best_node

# Initialize the CartPole environment
env = gym.make("CartPole-v1")

# Run MCTS for 1000 simulations


# Apply the best move (select action based on the best child)
state = env.reset()[0]
done = False
tot_r = 0

# Discretize the initial state
bins = create_bins()
discrete_state = discretize_state(state, bins)
current_node = MCTSNode(discrete_state) # first will be root node

while not done:
    current_node = mcts(env, state, current_node, simulations=1000)
    print('action', current_node.action)
    state, reward, done, _, _ = env.step(current_node.action)
    tot_r += reward
    env.render()

print('done')
print(tot_r)
env.close()


action 0
action 1
action 1
action 0
action 1
action 0
action 1
action 1
action 1
action 1
action 1
action 1
action 1
action 1
action 0
done
15.0


# Use MCTS above to generate data, then train a NN to that data

In [39]:
import math
import random
import numpy as np
import collections
import gymnasium as gym
import copy
import matplotlib.pyplot as plt
from IPython.display import clear_output
from matplotlib import animation

class MCTSNode:
    def __init__(self, state, parent=None, action=None):
        self.state = state
        self.parent = parent
        self.children = []
        self.action = action
        self.visits = 0
        self.wins = 0

    def is_fully_expanded(self, action_space_size):
        return len(self.children) == action_space_size

    def best_child(self, exploration_param=1.414):
        choices_weights = [
            (child.wins / max(1, child.visits)) + exploration_param * math.sqrt(
                math.log(max(1, self.visits)) / max(1, child.visits)
            ) for child in self.children
        ]
        return self.children[np.argmax(choices_weights)]

    def most_visited_child(self):
        return max(self.children, key=lambda child: child.visits)

def discretize_state(state, bins):
    """
    Discretize the continuous state into a discrete bin representation.
    We create bins for each feature (position, velocity, angle, angular velocity).
    """
    binned_state = []
    for i in range(len(state)):
        binned_state.append(np.digitize(state[i], bins[i]))
    return tuple(binned_state)

def create_bins():
    """
    Create bins for each state variable.
    CartPole has 4 continuous state variables:
    [Cart Position, Cart Velocity, Pole Angle, Pole Angular Velocity].
    We define the ranges and bins for each.
    """
    bins = [
        np.linspace(-4.8, 4.8, 100),  # Cart Position
        np.linspace(-5.0, 5.0, 100),  # Cart Velocity
        np.linspace(-0.418, 0.418, 100),  # Pole Angle
        np.linspace(-5.0, 5.0, 100),  # Pole Angular Velocity
    ]
    return bins

def rollout(env, bins):
    """
    Simulate a random rollout from the current state.
    """
    total_reward = 0
    done = False
    while not done:
        # Randomly pick an action (exploration)
        action = env.action_space.sample()
        state, reward, done, _, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward

def backpropagate(node, reward):
    while node is not None:
        node.visits += 1
        node.wins += reward
        node = node.parent

def expand(node, env, action_space_size, bins):
    """
    Expand the node by creating child nodes for each action.
    """
    for action in range(action_space_size):
        # Clone the environment and step with the current action
        env_copy = gym.make('CartPole-v1')
        env_copy.reset()
        env_copy.env.state = env.env.state  # Copy the environment state
        state, reward, done, _, _ = env_copy.step(action)

        # Discretize the state to get the discrete representation
        discrete_state = discretize_state(state, bins)

        child_node = MCTSNode(state=discrete_state, parent=node, action=action)
        node.children.append(child_node)

def select(node, action_space_size):
    """
    Traverse the tree by selecting the best child node based on the UCB1 algorithm.
    """
    current_node = node
    while current_node.is_fully_expanded(action_space_size):
        current_node = current_node.best_child()
    return current_node

def mcts(env, state, current_node, simulations=1000):
    env_copy = copy.deepcopy(env)
    action_space_size = env.action_space.n

    for _ in range(simulations):
        # Step 1: Selection
        selected_node = select(current_node, action_space_size)
#         print('sn',selected_node)
        
        # Step 2: Expansion
        expand(selected_node, env_copy, action_space_size, bins)
        
        # Step 3: Simulation
        reward = rollout(env_copy, bins)
        
        # Step 4: Backpropagation
        backpropagate(selected_node, reward)
        
    best_node = current_node.best_child(exploration_param=0)

    return best_node


class ReplayBuffer:
    def __init__(self, capacity=10000):
        self.buffer = collections.deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        idx = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, next_states, dones = zip(*[self.buffer[i] for i in idx])
        return np.array(states), np.array(actions), np.array(rewards), np.array(next_states), np.array(dones)

    def __len__(self):
        return len(self.buffer)
    
    def extract_data(self):
        states, actions, rewards, next_states, dones = zip(*self.buffer)
#         X = np.hstack((np.array(next_states), np.array(states), np.array(rewards).reshape(-1, 1), np.array(dones).reshape(-1, 1)))
        # we only want state for X for now
        X = np.array(states)
        Y = np.array(actions)
        return X, Y



# Initialize the CartPole environment
env = gym.make("CartPole-v1")

replay_buffer = ReplayBuffer()


for i in range(410):

    state = env.reset()[0]
    done = False
    tot_r = 0

    # Discretize the initial state
    bins = create_bins()
    discrete_state = discretize_state(state, bins)
    current_node = MCTSNode(discrete_state) # first will be root node


    while not done:
        current_node = mcts(env, state, current_node, simulations=1000)
#         print('action', current_node.action)
        next_state, reward, done, _, _ = env.step(current_node.action)
        replay_buffer.add(state, current_node.action, reward, next_state, done)
        state = next_state
        tot_r += reward
        env.render()
    print('end of eps', i)
#     print(len(replay_buffer))

print('done')
print('RB len', len(replay_buffer))
env.close()

import pickle

# Save the replay buffer to a file
with open('replay_buffer.pkl', 'wb') as f:
    pickle.dump(replay_buffer, f)

# Load the replay buffer from the file
with open('replay_buffer.pkl', 'rb') as f:
    rb = pickle.load(f)

  logger.warn(
  logger.warn(
  gym.logger.warn(


end of eps 0
end of eps 1
end of eps 2
end of eps 3
end of eps 4
end of eps 5
end of eps 6
end of eps 7
end of eps 8
end of eps 9
end of eps 10
end of eps 11
end of eps 12
end of eps 13
end of eps 14
end of eps 15
end of eps 16
end of eps 17
end of eps 18
end of eps 19
end of eps 20
end of eps 21
end of eps 22
end of eps 23
end of eps 24
end of eps 25
end of eps 26
end of eps 27
end of eps 28
end of eps 29
end of eps 30
end of eps 31
end of eps 32
end of eps 33
end of eps 34
end of eps 35
end of eps 36
end of eps 37
end of eps 38
end of eps 39
end of eps 40
end of eps 41
end of eps 42
end of eps 43
end of eps 44
end of eps 45
end of eps 46
end of eps 47
end of eps 48
end of eps 49
end of eps 50
end of eps 51
end of eps 52
end of eps 53
end of eps 54
end of eps 55
end of eps 56
end of eps 57
end of eps 58
end of eps 59
end of eps 60
end of eps 61
end of eps 62
end of eps 63
end of eps 64
end of eps 65
end of eps 66
end of eps 67
end of eps 68
end of eps 69
end of eps 70
end of eps 71
en

In [41]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Sample data from replay buffer
X, Y = replay_buffer.extract_data()
print(X[1])
X = torch.tensor(X, dtype=torch.float32)
Y = torch.tensor(Y, dtype=torch.long)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


input_dim = X.shape[1] # lets just take state for now
# input_dim = 4
print(input_dim)
output_dim = len(np.unique(Y))
model = NeuralNetwork(input_dim, output_dim)

# Training setup
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 100

# Training loop
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, Y_train)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# Evaluation
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    _, predicted = torch.max(test_outputs, 1)
    accuracy = (predicted == Y_test).float().mean()
    print(f'Test Accuracy: {accuracy:.4f}')

[-0.02710983 -0.5409558   0.01723253  0.73685557]
4
Epoch [10/100], Loss: 0.6902
Epoch [20/100], Loss: 0.6888
Epoch [30/100], Loss: 0.6875
Epoch [40/100], Loss: 0.6860
Epoch [50/100], Loss: 0.6844
Epoch [60/100], Loss: 0.6825
Epoch [70/100], Loss: 0.6805
Epoch [80/100], Loss: 0.6784
Epoch [90/100], Loss: 0.6763
Epoch [100/100], Loss: 0.6744
Test Accuracy: 0.5495


In [43]:
# Initialize the environment
env = gym.make("CartPole-v1")

# Create the neural network
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n

# Function to select an action using the neural network
def select_action(state, model):
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():
        action_scores = model(state)
    return torch.argmax(action_scores, dim=1).item()

total_reward = 0
state = env.reset()[0]
done = False

while not done:
    action = select_action(state, model)
    next_state, reward, done, _, _ = env.step(action)
    total_reward += reward
    state = next_state
    env.render()

print("Total Reward:", total_reward)
env.close()

Total Reward: 50.0


In [31]:
#random agent
import gymnasium as gym

# Initialize the CartPole environment
env = gym.make("CartPole-v1")

# Reset the environment to get the initial state
state = env.reset()

total_reward = 0
done = False

while not done:
    # Sample a random action from the action space
    action = env.action_space.sample()
    
    # Apply the action to the environment
    next_state, reward, done, _, _ = env.step(action)
    
    total_reward += reward
    state = next_state
    
    # Render the environment
    env.render()

print("Total Reward:", total_reward)
env.close()


Total Reward: 24.0


#### While the MBRL agent is better than the random agent, in true MBRL, we want to learn the system dynamics. So X should be S,A; and Y should be S', R, T. and then you take the output with the highest R then selet that action to take.
Question: will this work? Since CP gives 1 reward for every state where we are alive, but doesn't account for if that state is good or bad. Or is the reward factored into the MCTS backpropagation? 