### Base (A+B)

In [None]:
class Maze:
    def __init__(self):
        self.map = [[-1, -1, -1, +40],
                    [-1, -1, -10, -10],
                    [-1, -1, -1, -1],
                    [+10, -2, -1, -1]]
        self.num_rows = len(self.map)
        self.num_cols = len(self.map[0])
        self.start_state = (3, 2)  # row, col
        self.terminal_states = [(0, 3), (3, 0)]
        self.agent_pos = self.start_state
        self.actions = {'↑': (-1, 0), '→': (0, 1), '↓': (1, 0), '←': (0, -1)}

    def step(self, action):
        move = self.actions[action]
        row, col = self.agent_pos
        new_row, new_col = row + move[0], col + move[1]
        
        # Check if new location is within map boundaries
        if (0 <= new_row < self.num_rows) and (0 <= new_col < self.num_cols):
            self.agent_pos = (new_row, new_col)
            reward = self.map[new_row][new_col]
            done = self.agent_pos in self.terminal_states
        else:
            # If the new location is outside the map, stay in the same spot and get the same punishment again
            reward = self.map[row][col]
            done = False
        
        return self.agent_pos, reward, done
    
class Agent:
    def __init__(self):
        self.maze = Maze()
        self.policy = Policy(self.maze)
        self.value_function = {}

    def act(self, state):
        action = self.policy.select_action(state)
        next_state, reward, done = self.maze.step(action)
        print("Agent position:", self.maze.agent_pos)
        print("Selected action:", action)
        return next_state, reward, done
    
class Policy:
    def __init__(self, maze):
        self.maze = maze
        
    def select_action(self, state):
        possible_actions = ['↑', '→', '↓', '←']
        return random.choice(possible_actions)


agent = Agent()
state = agent.maze.start_state

for i in range(10):
    action = agent.policy.select_action(state)
    state, reward, done = agent.act(state)

## Poging tot C

In [1]:
class Maze:
    def __init__(self):
        self.map = [[-1, -1, -1, 40],
                    [-1, -1, -10, -10],
                    [-1, -1, -1, -1],
                    [10, -2, -1, -1]]
        self.num_rows = len(self.map)
        self.num_cols = len(self.map[0])
        self.start_state = (3, 2)  # row, col
        self.terminal_states = [(0, 3), (3, 0)]
        self.agent_pos = self.start_state
        self.actions = {'↑': (-1, 0), '→': (0, 1), '↓': (1, 0), '←': (0, -1)}

    def step(self, state, action):
        move = self.actions[action]
        row, col = state
        new_row, new_col = row + move[0], col + move[1]
        
        # Check if new location is within map boundaries
        if (0 <= new_row < self.num_rows) and (0 <= new_col < self.num_cols):
            if self.map[new_row][new_col] != -1:  # not a wall cell
                self.agent_pos = (new_row, new_col)
            reward = self.map[self.agent_pos[0]][self.agent_pos[1]]
            done = self.agent_pos in self.terminal_states
        else:
            # If the new location is outside the map, stay in the same spot and get the same punishment again
            reward = self.map[row][col]
            done = False
        
        return self.agent_pos, reward, done


    
class Agent:
    def __init__(self):
        self.maze = Maze()
        self.policy = Policy(self.maze)
        self.value_function = {}
        self.gamma = 1.0 # discount factor

    def act(self, state):
        action = self.policy.select_action(state)
        next_state, reward, done = self.maze.step(action)
        print("Current state:", state)
        print("Selected action:", action)
        print("Next state:", next_state)
        print("Reward:", reward)
        print("Done:", done)
        return next_state, reward, done

    def value_iteration(self, delta=0.01):
        # Initialize the value of all states to 0
        for row in range(self.maze.num_rows):
            for col in range(self.maze.num_cols):
                state = (row, col)
                self.value_function[state] = 0

        # Repeat until convergence
        for i in range(100000):
            max_change = 0
            #print(max_change)

            # Create a copy of the current value function to compare with later
            old_value_function = self.value_function.copy()

            # For each state, update its value
            for row in range(self.maze.num_rows):
                for col in range(self.maze.num_cols):
                    state = (row, col)

                    # Compute the optimal value for the state
                    max_value = float('-inf')
                    for action in self.maze.actions:
                        next_state, reward, done = self.maze.step(state, action)
                        value = reward + self.gamma * old_value_function[next_state]
                        if value > max_value:
                            max_value = value

                    # Update the value of the state
                    self.value_function[state] = max_value
                    change = abs(max_value - old_value_function[state])
                    if change > max_change:
                        max_change = change

            #print(max_change)

            # Check for convergence
            if max_change < delta:
                break

        print("Value iteration converged in", len(self.value_function), "steps.")


    def visualize(self):
        for row in range(self.maze.num_rows):
            for col in range(self.maze.num_cols):
                state = (row, col)
                if state in self.maze.terminal_states:
                    print('T', end='\t')
                elif self.maze.map[row][col] == -1:
                    print('X', end='\t')
                else:
                    best_action = None
                    best_value = float('-inf')
                    for action in self.maze.actions:
                        next_state, reward, done = self.maze.step(state, action) 
                        value = reward + self.gamma * self.value_function[next_state]
                        if value > best_value:
                            best_value = value
                            best_action = action
                    print(best_action, end='\t')
            print()
        print()
        for row in range(self.maze.num_rows):
            for col in range(self.maze.num_cols):
                state = (row, col)
                if state in self.maze.terminal_states or self.maze.map[row][col] == -1:
                    print('-', end='\t')
                else:
                    print(round(self.value_function[state], 1), end='\t')
            print()



    
class Policy:
    def __init__(self, maze):
        self.maze = maze
        
    def select_action(self, state):
        possible_actions = ['↑', '→', '↓', '←']
        return random.choice(possible_actions)


agent = Agent()
agent.value_iteration()
agent.visualize()

Value iteration converged in 16 steps.
X	X	X	T	
X	X	→	↑	
X	X	X	X	
T	←	X	X	

-	-	-	-	
-	-	2333300.0	2333350.0	
-	-	-	-	
-	2333320.0	-	-	
