In [25]:
import numpy as np

size = 10
class GridWorld:
    def __init__(self, size=10):
        self.size = size
        self.grid = np.zeros((size, size))
        # Define rewards for each cell
        self.grid[3, 3] = 1  # Reward of +1 at cell (3, 3)
        self.grid[7, 7] = -1  # Reward of -1 at cell (7, 7)
        self.state = None
        self.reset()

    def reset(self):
        self.state = (0, 0)
        return self.state

    def step(self, action):
        if action == 0:  # Move right
            self.state = (self.state[0], min(self.state[1] + 1, self.size - 1))
        elif action == 1:  # Move left
            self.state = (self.state[0], max(self.state[1] - 1, 0))
        elif action == 2:  # Move down
            self.state = (min(self.state[0] + 1, self.size - 1), self.state[1])
        elif action == 3:  # Move up
            self.state = (max(self.state[0] - 1, 0), self.state[1])

        reward = self.grid[self.state[0], self.state[1]]
        done = (self.state == (self.size - 1, self.size - 1))  # Terminate if reached bottom-right corner
        return self.state, reward, done, {}

    def render(self):
        for i in range(self.size):
            for j in range(self.size):
                if (i, j) == self.state:
                    print("x", end=" ")
                else:
                    print("-", end=" ")
            print()

In [26]:
# Example usage:
env = GridWorld(size)
print("Initial State:", env.reset())
print("GridWorld:")
env.render()
action = 0  # Move right
print("Taking action:", action)
next_state, reward, done, _ = env.step(action)
print("Next State:", next_state)
print("Reward:", reward)
print("Done:", done)
print("GridWorld:")
env.render()

Initial State: (0, 0)
GridWorld:
x - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 
Taking action: 0
Next State: (0, 1)
Reward: 0.0
Done: False
GridWorld:
- x - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - - - - 


In [27]:
import numpy as np

# Holds a preference for which action to perform in which state
class QTable:
    def __init__(self, num_states, num_actions):
        self.num_states = num_states
        self.num_actions = num_actions
        self.q_table = np.zeros((num_states, num_actions, 4))  # 4 actions: right, left, down, up

    def get_best_action(self, state):
        return np.argmax(self.q_table[state[0], state[1]])

# Interacts with the environment
class SarsaAgent:
    def __init__(self, alpha=0.1, gamma=0.99, epsilon=0.1):
        self.num_states = num_states
        self.num_actions = num_actions

        # These defined by NFT for rarity
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate

        # Sets the learning
        self.last_state = None
        self.last_action = None

    def choose_action(self, state):
        if np.random.uniform(0, 1) < self.epsilon:
            return np.random.randint(self.num_actions)
        else:
            return self.q_table.get_best_action(state)

    def learn(self, state, action, reward, next_state, next_action):
        if self.last_state is not None:
            # Update Q-value using SARSA
            self.q_table.q_table[self.last_state, self.last_action] += self.alpha * (
                    reward + self.gamma * self.q_table.q_table[next_state, next_action] -
                    self.q_table.q_table[self.last_state, self.last_action])
        self.last_state = next_state
        self.last_action = next_action

    def reset(self):
        self.last_state = None
        self.last_action = None

    def add_q_table(self, q_table):
        self.q_table = q_table

# Example usage:
agent = SarsaAgent()
q_table = QTable(size * size, )
state = (0, 0)
action = agent.choose_action(state)
print("Chosen action:", action)

Chosen action: 0


In [28]:
def train_agent(env, agent, num_episodes=100):
    for episode in range(1, num_episodes + 1):
        state = env.reset()
        action = agent.choose_action(state)
        agent.reset()
        done = False

        while not done:
            next_state, reward, done, _ = env.step(action)
            next_action = agent.choose_action(next_state)
            agent.learn(state, action, reward, next_state, next_action)
            state = next_state
            action = next_action


# Define environment and agent
env = GridWorld()
agent = SarsaAgent(num_states=20, num_actions=4)  # Assuming 10x10 grid world

# Train the agent
train_agent(env, agent)

# Evaluate the trained agent
total_rewards = 0
num_episodes = 10
for _ in range(num_episodes):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        state, reward, done, _ = env.step(action)
        total_rewards += reward
print("Average reward per episode after training:", total_rewards / num_episodes)

IndexError: index 4 is out of bounds for axis 1 with size 4