In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

class GridWorld:
    """A small deterministic grid environment with four movement actions.

    States are ``(row, col)`` tuples. Actions are encoded as
    0 = Up, 1 = Down, 2 = Left, 3 = Right. Moves that would leave the grid
    are clipped at the border, so the agent stays where it is.

    Args:
        grid_size: ``(rows, cols)`` dimensions of the grid.
        start_state: ``(row, col)`` cell where an episode begins.
        goal_state: ``(row, col)`` cell that yields the reward.

    Raises:
        ValueError: If the grid has a non-positive dimension, or if
            ``start_state`` / ``goal_state`` lie outside the grid.
    """

    def __init__(self, grid_size=(3, 3), start_state=(0, 0), goal_state=(2, 2)):
        n_rows, n_cols = grid_size
        if n_rows < 1 or n_cols < 1:
            raise ValueError(f"grid_size must be positive, got {grid_size!r}")
        for label, cell in (("start_state", start_state),
                            ("goal_state", goal_state)):
            row, col = cell
            if not (0 <= row < n_rows and 0 <= col < n_cols):
                raise ValueError(
                    f"{label} {cell!r} lies outside a {n_rows}x{n_cols} grid"
                )
        # Stored as tuples so equality checks against step() results work
        # even when the caller passed lists.
        self.grid_size = (n_rows, n_cols)
        self.num_actions = 4  # Up, Down, Left, Right
        self.start_state = tuple(start_state)
        self.goal_state = tuple(goal_state)

    def step(self, state, action):
        """Apply ``action`` in ``state`` and return ``(next_state, reward)``.

        Args:
            state: Current ``(row, col)`` position.
            action: Action code (0 = Up, 1 = Down, 2 = Left, 3 = Right).
                Compared with ``==``, so a float such as ``1.0`` is accepted.
                Any other value leaves the position unchanged.

        Returns:
            A tuple ``(next_state, reward)`` where ``reward`` is 1 when
            ``next_state`` equals the goal state and 0 otherwise.
        """
        row, col = state
        if action == 0:  # Up
            row = max(0, row - 1)
        elif action == 1:  # Down
            row = min(self.grid_size[0] - 1, row + 1)
        elif action == 2:  # Left
            col = max(0, col - 1)
        elif action == 3:  # Right
            col = min(self.grid_size[1] - 1, col + 1)
        next_state = (row, col)
        # Reward of +1 upon reaching the goal state, 0 everywhere else.
        reward = 1 if next_state == self.goal_state else 0
        return next_state, reward

def generate_training_data(grid_world, num_samples):
    """Sample labelled ``(state, action)`` pairs for supervised policy learning.

    States are drawn uniformly at random from the grid. Each state is
    labelled with an action that moves the agent strictly closer to the goal
    (by Manhattan distance), chosen uniformly among all such actions.
    Labelling with an arbitrary random action would make the label
    independent of the state, leaving a classifier nothing to learn.

    If no action brings the agent closer (for example when the sampled state
    is the goal itself), a uniformly random action is used as the label.

    Args:
        grid_world: Environment exposing ``grid_size``, ``num_actions``,
            ``goal_state`` and ``step(state, action)``.
        num_samples: Number of ``(state, action)`` pairs to generate.

    Returns:
        A tuple ``(X, y)`` of float arrays: ``X`` has shape
        ``(num_samples, 2)`` holding ``(row, col)`` states and ``y`` has
        shape ``(num_samples,)`` holding the action labels.
    """
    n_rows, n_cols = grid_world.grid_size
    goal_row, goal_col = grid_world.goal_state

    def distance_to_goal(cell):
        return abs(cell[0] - goal_row) + abs(cell[1] - goal_col)

    # state -> actions that strictly reduce the distance to the goal.
    # Cached because the environment is queried once per action and the
    # number of distinct states is small compared with num_samples.
    improving_actions = {}

    X = np.zeros((num_samples, 2))  # State features
    y = np.zeros((num_samples,))  # Actions
    for i in range(num_samples):
        state = (np.random.randint(n_rows), np.random.randint(n_cols))
        if state not in improving_actions:
            here = distance_to_goal(state)
            improving_actions[state] = [
                action
                for action in range(grid_world.num_actions)
                if distance_to_goal(grid_world.step(state, action)[0]) < here
            ]
        candidates = improving_actions[state]
        if candidates:
            action = candidates[np.random.randint(len(candidates))]
        else:
            # Nothing improves on this state; any label is as good as another.
            action = np.random.randint(grid_world.num_actions)
        X[i] = state
        y[i] = action
    return X, y

# Build the environment the policy will act in.
grid_world = GridWorld()

# Draw labelled (state, action) examples from the environment.
num_samples = 10_000
X_train, y_train = generate_training_data(grid_world, num_samples)

# Fit a decision tree that maps a (row, col) state to an action.
# fit() returns the fitted estimator, so construction and training chain.
model = DecisionTreeClassifier().fit(X_train, y_train)

# Evaluate the learned policy
def evaluate_policy(grid_world, model, max_steps=50):
    """Roll out the model's policy once from the start state.

    At each step the model is asked for the action of the current state,
    the step is printed, and the environment is advanced. The rollout ends
    when the goal state is reached or after ``max_steps`` steps, whichever
    comes first; the cap guards against policies that never reach the goal.

    Args:
        grid_world: Environment exposing ``start_state``, ``goal_state`` and
            ``step(state, action)``.
        model: Object whose ``predict`` takes a list of states and returns
            one action per state.
        max_steps: Maximum number of environment steps to take.

    Returns:
        The sum of the rewards collected during the rollout.
    """
    total_reward = 0
    state = grid_world.start_state
    for steps in range(max_steps):
        if state == grid_world.goal_state:
            break
        # predict() works on batches; wrap the single state and unwrap the result.
        action = model.predict([state])[0]
        print(f"Step {steps}: State = {state}, Action = {action}")
        state, reward = grid_world.step(state, action)
        total_reward += reward
    return total_reward

# Roll out the trained policy once from the start state (each step is printed
# by evaluate_policy) and report the total reward it collected.
total_reward = evaluate_policy(grid_world, model)
print("Total reward obtained by learned policy:", total_reward)

Step 0: State = (0, 0), Action = 1.0
Step 1: State = (1, 0), Action = 3.0
Step 2: State = (1, 1), Action = 1.0
Step 3: State = (2, 1), Action = 3.0
Total reward obtained by learned policy: 1
