In [2]:
import numpy as np
import random
import gym

class SARSA:
    def __init__(self, env, alpha=0.1, gamma=0.99, epsilon=0.1, max_episodes=2000, max_steps=200):
        self.env = env
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.epsilon = epsilon  # exploration-exploitation tradeoff
        self.max_episodes = max_episodes
        self.max_steps = max_steps
        self.q_table = np.zeros((env.observation_space.n, env.action_space.n))

    def choose_action(self, state):
        if np.random.uniform(0, 1) < self.epsilon:
            return self.env.action_space.sample()  # Explore action space
        else:
            return np.argmax(self.q_table[state, :])  # Exploit learned values

    def update_q_table(self, state, action, reward, next_state, next_action):
        predict = self.q_table[state, action]
        target = reward + self.gamma * self.q_table[next_state, next_action]
        self.q_table[state, action] += self.alpha * (target - predict)

    def train(self):
        rewards = []
        for episode in range(self.max_episodes):
            state = self.env.reset()
            total_reward = 0
            action = self.choose_action(state)
            for step in range(self.max_steps):
                next_state, reward, done, _ = self.env.step(action)
                next_action = self.choose_action(next_state)
                self.update_q_table(state, action, reward, next_state, next_action)
                total_reward += reward
                state = next_state
                action = next_action
                if done:
                    break
            rewards.append(total_reward)
        return rewards

# Create a grid world environment
env = gym.make("FrozenLake-v1")

# Create an instance of SARSA
sarsa_agent = SARSA(env)

# Train SARSA
rewards = sarsa_agent.train()

# Print average rewards
print("Average Rewards:", np.mean(rewards))

Average Rewards: 0.0
