In [1]:
import random
import numpy as np

class VolcanoProblem:
    """Small grid world with fixed reward and penalty cells.

    States are ``(row, col)`` tuples on a grid of ``grid_size`` rows and
    columns. Entering a penalty cell yields ``penalty_value``, entering a
    reward cell yields that cell's entry in ``reward_values``, and every
    other cell yields 0. The class itself never ends an episode; callers
    decide which cells they treat as terminal.
    """

    def __init__(self):
        # Grid dimensions as (rows, cols).
        self.grid_size = (3, 4)
        self.start_state = (0, 0)
        # self.end_states = {(2, 3), (1, 3)}
        # Cells that pay ``penalty_value`` when entered.
        self.penalty_states = {(1, 1), (0, 2)}
        self.penalty_value = -50
        # Cells that pay the amount listed for them in ``reward_values``.
        self.reward_states = {(2, 0), (0, 3)}
        self.reward_values = {(2, 0): 20, (0, 3): 100}
        # The agent's position; replaced by reset() and step().
        self.current_state = self.start_state

    def reset(self):
        """Move the agent to a random cell that is neither reward nor penalty.

        Returns:
            The newly chosen ``(row, col)`` state, which is also stored in
            ``current_state``.
        """
        every_cell = set(self.grid_states())
        ordinary_cells = every_cell - self.reward_states - self.penalty_states
        self.current_state = random.choice(list(ordinary_cells))
        return self.current_state


    def grid_states(self):
        """Return every ``(row, col)`` cell of the grid in row-major order."""
        n_rows, n_cols = self.grid_size
        cells = []
        for r in range(n_rows):
            for c in range(n_cols):
                cells.append((r, c))
        return cells


    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward)``.

        ``current_state`` is updated to the returned state.
        """
        landed_on = self.get_next_state(action)
        payoff = self.get_reward(landed_on)
        self.current_state = landed_on
        return landed_on, payoff

    def get_next_state(self, action):
        """Return the cell reached from ``current_state`` by ``action``.

        ``action`` is one of ``'up'``, ``'down'``, ``'left'`` or ``'right'``.
        A move that would leave the grid, or any other action value, leaves
        the position unchanged. ``current_state`` itself is not modified.
        """
        row, col = self.current_state
        last_row = self.grid_size[0] - 1
        last_col = self.grid_size[1] - 1

        if action == 'up' and row >= 1:
            return row - 1, col
        if action == 'down' and row < last_row:
            return row + 1, col
        if action == 'left' and col >= 1:
            return row, col - 1
        if action == 'right' and col < last_col:
            return row, col + 1
        # Blocked by a wall or unrecognised action: stay put.
        return row, col



    def get_reward(self, state):
        """Return the payoff for entering ``state``.

        Penalty cells take precedence over reward cells; all other cells
        are worth 0.
        """
        if state in self.penalty_states:
            return self.penalty_value
        if state in self.reward_states:
            return self.reward_values[state]
        return 0

In [2]:
# def epsilon_greedy_policy(Q_values, epsilon):
#     if not Q_values:
#         # If Q_values is empty, choose a random action
#         return np.random.choice(['up', 'down', 'left', 'right'])
#     if np.random.rand() < epsilon:
#         # Explore: Choose a random action among available actions
#         return np.random.choice(list(Q_values.keys()))
#     else:
#         # Exploit: Choose the action with the highest Q-value
#         return max(Q_values, key=Q_values.get)


import numpy as np

def model_free_monte_carlo(env, num_episodes=1000, epsilon=0.1, discount=1, alpha=0.1):
    """Estimate action values with every-visit, constant-alpha Monte Carlo.

    Each episode starts from ``env.reset()`` and follows the epsilon-greedy
    policy until the agent enters one of ``env.reward_states``. Penalty
    cells do not end an episode in this function. After the episode, the
    discounted return ``G`` is accumulated backwards and every visited
    (state, action) pair is moved towards it:
    ``Q[s][a] <- Q[s][a] + alpha * (G - Q[s][a])``.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(next_state, reward)``, and a ``reward_states`` container.
        num_episodes: Number of episodes to run.
        epsilon: Exploration probability passed to ``epsilon_greedy_policy``.
        discount: Discount factor applied to future rewards.
        alpha: Step size of the incremental update.

    Returns:
        ``(Q, overall_utilities)`` where ``Q`` maps state -> {action: value}
        and ``overall_utilities`` is a one-element list holding the mean,
        over episodes, of the average Q-value of each episode's start state
        (``[0.0]`` when no episode was run).
    """
    Q = {}
    utilities = []
    overall_utilities = []

    for episode in range(num_episodes):
        state = env.reset()
        initial_state = state  # Remembered for the per-episode utility below
        episode_data = []

        while state not in env.reward_states:
            action = epsilon_greedy_policy(Q.get(tuple(state), {}), epsilon)
            next_state, reward = env.step(action)
            episode_data.append((tuple(state), action, reward))
            state = next_state

        # Walk the trajectory backwards so G is the return from each step.
        G = 0
        for visited_state, taken_action, reward in reversed(episode_data):
            G = discount * G + reward

            action_values = Q.setdefault(visited_state, {})
            current_q = action_values.get(taken_action, 0)
            # Plain assignment is required here: dict.setdefault would keep
            # the first value ever stored and silently drop later updates.
            action_values[taken_action] = current_q + alpha * (G - current_q)

        state_tuple = tuple(initial_state)
        if state_tuple in Q:
            utilities.append(np.mean(list(Q[state_tuple].values())))
        else:
            utilities.append(0)

    # Guard the empty case so zero episodes yields 0.0 instead of nan.
    overall_utilities.append(np.mean(utilities) if utilities else 0.0)
    return Q, overall_utilities







def epsilon_greedy_policy(Q, epsilon):
    """Pick an action for one state from its ``{action: value}`` mapping.

    Args:
        Q: Mapping of action -> estimated value for a single state. May be
            empty when the state has not been seen yet.
        epsilon: Probability of exploring instead of exploiting.

    Returns:
        A uniformly random move from ``'up'``, ``'down'``, ``'left'``,
        ``'right'`` when ``Q`` is empty. Otherwise, with probability
        ``epsilon`` a uniformly random key of ``Q`` (only actions already
        present are sampled), else the key with the highest value; ties go
        to the key that comes first in ``Q``.
    """
    if not Q:
        # Nothing known about this state yet: fall back to the default moves.
        return random.choice(['up', 'down', 'left', 'right'])

    exploring = random.uniform(0, 1) < epsilon
    if exploring:
        known_actions = list(Q.keys())
        return random.choice(known_actions)

    return max(Q, key=Q.get)



def sarsa(env, num_episodes=1000, epsilon=0.1, alpha=0.5, gamma=1):
    """Learn action values with on-policy SARSA.

    An episode begins at ``env.reset()`` and ends as soon as the current
    state is in ``env.reward_states`` or ``env.penalty_states``. Each step
    applies ``Q[s][a] += alpha * (r + gamma * Q[s'][a'] - Q[s][a])`` where
    ``a'`` is the epsilon-greedy action that will be taken next.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``,
            ``reward_states`` and ``penalty_states``; an optional
            ``actions`` attribute overrides the default four moves.
        num_episodes: Number of episodes to run.
        epsilon: Exploration probability for ``epsilon_greedy_policy``.
        alpha: Learning rate.
        gamma: Discount factor.

    Returns:
        ``(Q, average_overall_utility)``: the learned table mapping
        state -> {action: value}, and the mean over episodes of the
        per-episode average of each known state's best Q-value. The average
        is also printed.
    """
    Q = {}
    per_episode_utility = []

    for _ in range(num_episodes):
        state = env.reset()
        # Prefer the environment's own action list when it provides one.
        possible_actions = getattr(env, 'actions', ['up', 'down', 'left', 'right'])

        action = epsilon_greedy_policy(Q.get(state, {}), epsilon)

        while not (state in env.reward_states or state in env.penalty_states):
            next_state, reward = env.step(action)
            next_action = epsilon_greedy_policy(Q.get(next_state, {}), epsilon)

            # Create zero-initialised rows on first contact with a state.
            row = Q.setdefault(state, {a: 0 for a in possible_actions})
            next_row = Q.setdefault(next_state, {a: 0 for a in possible_actions})

            row[action] += alpha * (reward + gamma * next_row[next_action] - row[action])

            state, action = next_state, next_action

        # Utility of this episode: mean of the best value in every known state.
        best_values = [max(row.values()) for row in Q.values()]
        per_episode_utility.append(np.mean(best_values))

    average_overall_utility = np.mean(per_episode_utility)
    print(f"Average Overall Utility: {average_overall_utility}")
    return Q, average_overall_utility






# Q-Learning
def q_learning(env, num_episodes=1000, epsilon=0.1, alpha=0.5, discount=1):
    """Learn action values with off-policy Q-learning.

    An episode begins at ``env.reset()`` and runs until the current state is
    in ``env.reward_states``; penalty cells do not end an episode here. Each
    step applies
    ``Q[s][a] += alpha * (r + discount * max_a' Q[s'][a'] - Q[s][a])``.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``reward_states``; an optional ``actions`` attribute overrides
            the default four moves.
        num_episodes: Number of episodes to run.
        epsilon: Exploration probability for ``epsilon_greedy_policy``.
        alpha: Learning rate.
        discount: Discount factor.

    Returns:
        ``(Q, average_overall_utility)``: the learned table mapping
        state -> {action: value}, and the mean over episodes of the
        per-episode average of each known state's best Q-value. The average
        is also printed.
    """
    Q = {}
    per_episode_utility = []

    for _ in range(num_episodes):
        state = env.reset()

        # Prefer the environment's own action list when it provides one.
        possible_actions = getattr(env, 'actions', ['up', 'down', 'left', 'right'])

        while state not in env.reward_states:
            action = epsilon_greedy_policy(Q.get(state, {}), epsilon)
            next_state, reward = env.step(action)

            # Create zero-initialised rows on first contact with a state.
            row = Q.setdefault(state, {a: 0 for a in possible_actions})
            next_row = Q.setdefault(next_state, {a: 0 for a in possible_actions})

            # Bootstrap from the greedy value of the successor state.
            best_next_value = max(next_row.values())
            row[action] += alpha * (reward + discount * best_next_value - row[action])

            state = next_state

        # Utility of this episode: mean of the best value in every known state.
        best_values = [max(row.values()) for row in Q.values()]
        per_episode_utility.append(np.mean(best_values))

    average_overall_utility = np.mean(per_episode_utility)
    print(f"Average Overall Utility: {average_overall_utility}")

    return Q, average_overall_utility

In [None]:
import tkinter as tk
from tkinter import ttk

class VolcanoGUI:
    """Tkinter front end for running the solvers on a ``VolcanoProblem``.

    The window offers entries for slip probability, epsilon and episode
    count, one button per algorithm (Monte Carlo, SARSA, Q-learning) and a
    canvas on which the grid and the learned Q-values are drawn. Each run
    also opens a separate window listing every (state, action, Q-value).
    """

    def __init__(self, master):
        """Attach the GUI to the Tk container ``master`` and build its widgets."""
        self.master = master
        master.title("Volcano Crossing Solver")

        self.create_widgets()

    def create_widgets(self):
        """Create the parameter entries, the run buttons and the grid canvas."""
        # Slip Probability
        self.slip_label = ttk.Label(self.master, text="Slip Probability:")
        self.slip_label.grid(row=0, column=0, padx=10, pady=10)

        self.slip_var = tk.DoubleVar()
        self.slip_var.set(0.1)  # Initial slip probability

        # Key-level validation: every edit is checked by validate_slip_entry
        # with the prospective entry text ("%P") before it is accepted.
        validate_cmd = self.master.register(self.validate_slip_entry)

        self.slip_entry = ttk.Entry(self.master, textvariable=self.slip_var, validate="key", validatecommand=(validate_cmd, "%P"))
        self.slip_entry.grid(row=0, column=1, padx=10, pady=10)

        # Epsilon
        self.epsilon_label = ttk.Label(self.master, text="Epsilon Value:")
        self.epsilon_label.grid(row=1, column=0, padx=10, pady=10)

        self.epsilon_var = tk.DoubleVar()
        self.epsilon_var.set(0.2)  # Initial epsilon value
        self.epsilon_entry = ttk.Entry(self.master, textvariable=self.epsilon_var)
        self.epsilon_entry.grid(row=1, column=1, padx=10, pady=10)

        # Number of Episodes
        self.episodes_label = ttk.Label(self.master, text="Number of Episodes:")
        self.episodes_label.grid(row=2, column=0, padx=10, pady=10)

        self.episodes_var = tk.IntVar()
        self.episodes_var.set(1000)  # Initial number of episodes
        self.episodes_entry = ttk.Entry(self.master, textvariable=self.episodes_var)
        self.episodes_entry.grid(row=2, column=1, padx=10, pady=10)

        # Run Button for Monte Carlo
        self.run_monte_button = ttk.Button(self.master, text="Run Monte Carlo", command=self.run_monte_carlo)
        self.run_monte_button.grid(row=3, column=0, padx=10, pady=10)

        # Run Button for SARSA
        self.run_sarsa_button = ttk.Button(self.master, text="Run SARSA", command=self.run_sarsa)
        self.run_sarsa_button.grid(row=3, column=1, padx=10, pady=10)

        # Run Button for Q-learning
        self.run_qlearning_button = ttk.Button(self.master, text="Run Q-Learning", command=self.run_qlearning)
        self.run_qlearning_button.grid(row=3, column=2, padx=10, pady=10)

        # Canvas for grid visualization
        self.canvas = tk.Canvas(self.master, width=500, height=320, bg="white")
        self.canvas.grid(row=4, columnspan=3, padx=10, pady=10)


    def validate_slip_entry(self, new_value):
        """Return True only if ``new_value`` parses as a float in [0.0, 0.3].

        Used as the entry's validation command, so text that does not parse
        (including the empty string) is rejected.
        """
        try:
            slip_value = float(new_value)
            return 0.0 <= slip_value <= 0.3
        except ValueError:
            return False


    def draw_grid(self, env, Q_values=None):
        """Redraw the grid for ``env`` on the canvas.

        Penalty cells are red and reward cells green, each labelled with its
        value. When ``Q_values`` (state -> {action: value}) is given, every
        other cell lists the Q-values known for it.

        Args:
            env: Object exposing ``grid_size``, ``penalty_states``,
                ``penalty_value``, ``reward_states`` and ``reward_values``.
            Q_values: Optional Q-table to render inside the plain cells.
        """
        cell_width = 100
        cell_height = 80

        # Drop whatever an earlier run drew; otherwise canvas items pile up
        # on top of each other with every button press.
        self.canvas.delete("all")

        # Calculate the total width and height of the grid area
        total_width = env.grid_size[1] * cell_width
        total_height = env.grid_size[0] * cell_height

        # Calculate the margins to center the grid within the canvas
        left_margin = (self.canvas.winfo_reqwidth() - total_width) // 2
        top_margin = (self.canvas.winfo_reqheight() - total_height) // 2

        for row in range(env.grid_size[0]):
            for col in range(env.grid_size[1]):
                x1, y1 = left_margin + col * cell_width, top_margin + row * cell_height
                x2, y2 = x1 + cell_width, y1 + cell_height
                state = (row, col)
                color = "white"
                if state in env.penalty_states:
                    color = "red"
                elif state in env.reward_states:
                    color = "green"
                self.canvas.create_rectangle(x1, y1, x2, y2, fill=color)

                # Display States with Reward/Penalty values inside the box
                if state in env.reward_values:
                    reward = env.reward_values[state]
                    self.canvas.create_text((x1 + x2) / 2, (y1 + y2) / 2, text=f"Reward: {reward}", fill="black")
                elif state in env.penalty_states:
                    penalty_value = env.penalty_value
                    self.canvas.create_text((x1 + x2) / 2, (y1 + y2) / 2, text=f"Penalty: {penalty_value}", fill="black")
                elif color == "white" and Q_values:
                    # Display Q-values in white boxes
                    q_values = Q_values.get(state, {})
                    actions_text = "\n".join([f"{action}: {value:.2f}" for action, value in q_values.items()])
                    self.canvas.create_text((x1 + x2) / 2, (y1 + y2) / 2, text=actions_text, fill="black")

    def _show_results(self, window_title, utility_caption, env, q_table, overall_utility):
        """Open a result window for one run and redraw the grid with its Q-table.

        Args:
            window_title: Title of the new top-level window.
            utility_caption: Text placed before the utility value in the label.
            env: Environment the run used; forwarded to ``draw_grid``.
            q_table: Mapping state -> {action: value}; states are indexed as
                ``state[0]``, ``state[1]`` for display.
            overall_utility: Value shown after ``utility_caption``.
        """
        result_window = tk.Toplevel(self.master)
        result_window.title(window_title)

        # One tree row per (state, action) pair.
        q_values_tree = ttk.Treeview(result_window)
        q_values_tree["columns"] = ("Action", "Q-value")
        q_values_tree.heading("#0", text="State")
        q_values_tree.heading("Action", text="Action")
        q_values_tree.heading("Q-value", text="Q-value")

        for state, actions in q_table.items():
            state_str = f"{state[0]}, {state[1]}"
            for action, q_value in actions.items():
                q_values_tree.insert("", "end", text=state_str, values=(action, q_value))

        q_values_tree.pack(pady=10)

        overall_utility_label = ttk.Label(result_window, text=f"{utility_caption}: {overall_utility}")
        overall_utility_label.pack(pady=10)

        self.draw_grid(env, q_table)

    def run_sarsa(self):
        """Run SARSA with the entered epsilon and episode count and show the result."""
        epsilon_val = self.epsilon_var.get()
        num_episodes = self.episodes_var.get()

        env = VolcanoProblem()
        Q_sarsa, overall_utility_sarsa = sarsa(env, num_episodes=num_episodes, epsilon=epsilon_val, alpha=0.9)

        self._show_results("SARSA Results", "Overall Average Utility (Sarsa)", env, Q_sarsa, overall_utility_sarsa)

    def run_monte_carlo(self):
        """Run Monte Carlo with the entered epsilon and episode count and show the result."""
        # The slip-probability entry is not read here: VolcanoProblem() is
        # constructed without arguments, so the value would have no effect.
        epsilon_val = self.epsilon_var.get()
        num_episodes = self.episodes_var.get()

        env = VolcanoProblem()
        Q_monte_carlo, overall_utility = model_free_monte_carlo(env, num_episodes=num_episodes, epsilon=epsilon_val, discount=1)

        # model_free_monte_carlo returns the utility wrapped in a one-element
        # list; unwrap it so the label shows a number rather than a list.
        if isinstance(overall_utility, (list, tuple)) and len(overall_utility) == 1:
            overall_utility = overall_utility[0]

        self._show_results("Monte Carlo Results", "Overall Average Utility (Monte Carlo)", env, Q_monte_carlo, overall_utility)

    def run_qlearning(self):
        """Run Q-learning with the entered epsilon and episode count and show the result."""
        epsilon_val = self.epsilon_var.get()
        num_episodes = self.episodes_var.get()

        env = VolcanoProblem()
        Q_qlearning, overall_utility_qlearning = q_learning(env, num_episodes=num_episodes, epsilon=epsilon_val, alpha=0.5, discount=1)

        self._show_results("Q-learning Results", "Overall Average Utility (Q-Learning)", env, Q_qlearning, overall_utility_qlearning)
        
if __name__ == "__main__":
    # Create the Tk root window, attach the solver GUI to it, and hand
    # control to the Tk event loop.
    root = tk.Tk()
    app = VolcanoGUI(root)
    root.mainloop()

Average Overall Utility: 40.106149855824405
Average Overall Utility: 62.79747340961237
Average Overall Utility: 41.61230465478507
Average Overall Utility: 66.96884507556503
Average Overall Utility: 27.888022689877292
Average Overall Utility: 39.78667728640921
Average Overall Utility: 38.83163358004542
Average Overall Utility: 40.45384036829587
Average Overall Utility: 65.817549129872
Average Overall Utility: 40.052056753777144
Average Overall Utility: 67.28165339328692
