In [1]:
import random
import numpy as np

random.seed(10)

# TD-Learning
Opdracht: Implementeer temporal difference learning. Voer de evaluatie uit op de optimale policy π∗ met γ = 1 en γ = 0.5. Visualiseer de uitkomsten en verklaar het resultaat.

Note: Werkt nog niet goed, als ik select_action random wil maken duurt het heel lang om te runnen. voor de rest ook buggie.

In [2]:
class Maze:
    def __init__(self):
        self.map = [[-1, -1, -1, +40],
                    [-1, -1, -10, -10],
                    [-1, -1, -1, -1],
                    [+10, -2, -1, -1]]
        self.num_rows = len(self.map)
        self.num_cols = len(self.map[0])
        self.start_state = (3, 2)  # row, col
        self.terminal_states = [(0, 3), (3, 0)]
        self.agent_pos = self.start_state
        self.actions = {'↑': (-1, 0), '→': (0, 1), '↓': (1, 0), '←': (0, -1)}

    def step(self, action):
        move = self.actions[action]
        row, col = self.agent_pos
        new_row, new_col = row + move[0], col + move[1]

        # Check if new location is within map boundaries
        if (0 <= new_row < self.num_rows) and (0 <= new_col < self.num_cols):
            self.agent_pos = (new_row, new_col)
            reward = self.map[new_row][new_col]
            done = self.agent_pos in self.terminal_states
        else:
            # If the new location is outside the map, stay in the same spot and get the same punishment again
            reward = self.map[row][col]
            done = False

        return self.agent_pos, reward, done

class Policy:
    def __init__(self, agent, epsilon=0.1):
        self.agent = agent
        self.epsilon = epsilon

    def select_action(self, state):
        possible_actions = ['↑', '→', '↓', '←']

        # Epsilon-greedy exploration
        if random.random() < self.epsilon:
            return random.choice(possible_actions)

        # Exploitation: Choose action with highest utility score
        max_utility = float('-inf')
        best_action = None

        for action in possible_actions:
            next_state, reward, done = self.agent.maze.step(action)
            utility = self.agent.value_function.get(next_state, 0)
            if utility > max_utility:
                max_utility = utility
                best_action = action

        return best_action


class Agent:
    def __init__(self, alpha, gamma):
        self.maze = Maze()
        self.policy = Policy(self)
        self.value_function = {}
        self.alpha = alpha
        self.gamma = gamma

        for row in range(self.maze.num_rows):
            for col in range(self.maze.num_cols):
                state = (row, col)
                if state in self.maze.terminal_states:
                    self.value_function[state] = 0
                else:
                    self.value_function[state] = random.random() * 10

    def act(self, state):
        action = self.policy.select_action(state)
        next_state, reward, done = self.maze.step(action)
        self.update_value_function(state, next_state, reward)
        return next_state, reward, done

    def update_value_function(self, state, next_state, reward):
        if next_state in self.maze.terminal_states:
            td_target = reward
        else:
            td_target = reward + self.value_function.get(next_state, 0)

        td_error = td_target - self.value_function.get(state, 0)
        self.value_function[state] += self.alpha * td_error


        
    def reset_value_function(self):
        for row in range(self.maze.num_rows):
            for col in range(self.maze.num_cols):
                state = (row, col)
                if state in self.maze.terminal_states:
                    self.value_function[state] = 0
                else:
                    self.value_function[state] = random.random() * 10
                

def print_utility(agent, gamma):
    print(f"\nUtility (γ = {gamma}):")
    state = agent.maze.start_state

    for i in range(10):
        action = agent.policy.select_action(state)
        state, reward, done = agent.act(state)

    for row in range(agent.maze.num_rows):
        for col in range(agent.maze.num_cols):
            state = (row, col)
            value = agent.value_function.get(state, 0)
            print(f"{value:.1f}", end=' ')
        print()

def print_policy(agent, gamma):
    print(f"\nPolicy (γ = {gamma}):")
    state = agent.maze.start_state

    for i in range(10):
        action = agent.policy.select_action(state)
        state, reward, done = agent.act(state)

    for row in range(agent.maze.num_rows):
        for col in range(agent.maze.num_cols):
            state = (row, col)
            if state == agent.maze.start_state:
                print('○', end=' ')
            elif state in agent.maze.terminal_states:
                print('×', end=' ')
            else:
                action = agent.policy.select_action(state)
                print(action, end=' ')
        print()

agent = Agent(alpha=0.1, gamma=0.9)

# Main training loop
for episode in range(10):
    state = agent.maze.start_state
    done = False
    while not done:
        state, reward, done = agent.act(state)

print_utility(agent, gamma=0.9)
print_policy(agent, gamma=0.9)
    
    
    
agent.reset_value_function()



# Train again with gamma = 1
agent = Agent(alpha=0.1, gamma=1)

for episode in range(10):
    state = agent.maze.start_state
    done = False
    while not done:
        state, reward, done = agent.act(state)

print_utility(agent, gamma=1.0)
print_policy(agent, gamma=1.0)


Utility (γ = 0.9):
-1.0 -1.3 -1.1 0.0 
-1.1 -1.0 -9.2 0.6 
1.8 1.7 -16.0 -17.9 
0.0 -4.3 0.4 -0.4 

Policy (γ = 0.9):
← ↑ ← × 
← ← ← ← 
← ← ← ← 
× ← ○ ← 

Utility (γ = 1.0):
0.4 0.4 -1.8 0.0 
0.4 0.5 -11.4 1.9 
2.3 1.5 -15.8 -18.5 
9.3 9.7 5.6 4.9 

Policy (γ = 1.0):
← ← ← × 
← ← ← ← 
← ← ← ← 
× ← ○ ← 


# Sarsa 
Opdracht: Implementeer SARSA (on-policy TD control). Voer control uit met γ = 1 en γ = 0.9, visualiseer de uitkomsten en verklaar het resultaat.

Antwoord: 
met γ = 1 leert de agent om optimale acties te kiezen met een langetermijnvisie, terwijl γ = 0.9 leidt tot meer kortetermijndenken. Dit is te zien in de output door hogere utility-waarden en policy die meer (direct) naar het doel bewegen bij γ = 1.

In [3]:
class Maze:
    def __init__(self):
        self.map = [[-1, -1, -1, +40],
                    [-1, -1, -10, -10],
                    [-1, -1, -1, -1],
                    [+10, -2, -1, -1]]
        self.num_rows = len(self.map)
        self.num_cols = len(self.map[0])
        self.start_state = (3, 2)  # row, col
        self.terminal_states = [(0, 3), (3, 0)]
        self.agent_pos = self.start_state
        self.actions = {'↑': (-1, 0), '→': (0, 1), '↓': (1, 0), '←': (0, -1)}

    def step(self, state, action):
        move = self.actions[action]
        row, col = state
        new_row, new_col = row + move[0], col + move[1]
        
        # Check if new location is within map boundaries
        if (0 <= new_row < self.num_rows) and (0 <= new_col < self.num_cols):
            next_state = (new_row, new_col)
            reward = self.map[new_row][new_col]
            done = next_state in self.terminal_states
        else:
            # If the new location is outside the map, stay in the same spot and get the same punishment again
            next_state = state
            reward = self.map[row][col]
            done = False
        
        return next_state, reward, done
    
class Agent:
    def __init__(self, gamma, learning_rate, epsilon):
        self.maze = Maze()
        self.q_values = {}
        self.utility = {}  # New attribute to store utility values
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon

        # Initialize Q-values with random numbers for non-terminal states
        for row in range(self.maze.num_rows):
            for col in range(self.maze.num_cols):
                state = (row, col)
                if state not in self.maze.terminal_states:
                    self.q_values[state] = {action: random.uniform(1, 100) for action in self.maze.actions}

    def update_q_value(self, state, action, next_state, next_action, reward):
        current_q_value = self.q_values.get(state, {}).get(action, 0)
        next_q_value = self.q_values.get(next_state, {}).get(next_action, 0)
        new_q_value = (1 - self.learning_rate) * current_q_value + self.learning_rate * (reward + self.gamma * next_q_value)
        self.q_values[state][action] = new_q_value

        # Update utility values based on Q-values
        best_action_value = max(self.q_values[next_state].values(), default=0) if next_state in self.q_values else 0
        self.utility[state] = reward + self.gamma * best_action_value


    def select_action(self, state):
        if random.random() < self.epsilon:
            return random.choice(list(self.maze.actions.keys()))
        else:
            q_values_for_state = self.q_values.get(state, {})
            if q_values_for_state:
                max_q_value = max(q_values_for_state.values())
                best_actions = [action for action, q_value in q_values_for_state.items() if q_value == max_q_value]
                return random.choice(best_actions)
            else:
                return random.choice(list(self.maze.actions.keys()))


    def act(self, state):
        action = self.select_action(state)
        next_state, reward, done = self.maze.step(state, action)
        next_action = self.select_action(next_state)
        self.update_q_value(state, action, next_state, next_action, reward)
        return next_state, reward, done
    
    def reset_q_values(self):
        self.q_values = {}
        for row in range(self.maze.num_rows):
            for col in range(self.maze.num_cols):
                state = (row, col)
                if state not in self.maze.terminal_states:
                    self.q_values[state] = {action: random.uniform(1, 100) for action in self.maze.actions}

def print_utility(agent, gamma):
    maze = agent.maze
    q_values = agent.q_values
    utility = agent.utility

    for i in range(maze.num_rows):
        for j in range(maze.num_cols):
            state = (i, j)
            actions = ["↑", "→", "↓", "←"]
            values = [q_values.get(state, {}).get(action, 0) for action in actions]
            best_action = actions[np.argmax(values)]
            best_value = np.max(values)

            print(f"State ({i}, {j}):", end=" ")
            for action, value in zip(actions, values):
                print(f"({action}): {value:.1f}", end=" ")
            print(f"  Best: ({best_action}): {best_value:.1f}")

        print()

    # Print the 2D map representation of utility
    print("Utility Map:")
    for i in range(maze.num_rows):
        for j in range(maze.num_cols):
            state = (i, j)
            print(f"{utility.get(state, 0) * gamma ** 0:.1f}", end=" ")
        print()


def print_policy(agent, gamma):
    print(f"Policy (γ = {gamma}):")
    for row in range(agent.maze.num_rows):
        for col in range(agent.maze.num_cols):
            state = (row, col)
            if state == agent.maze.start_state:
                best_action = max(agent.q_values[state].keys(), key=lambda k: agent.q_values[state][k])
                print(best_action, end=" ")
            elif state in agent.maze.terminal_states:
                print("×", end=" ")
            else:
                action = agent.select_action(state)
                print(action, end=" ")
        print()


num_episodes = 250000
print_interval = 100

# Run for gamma = 0.9
agent = Agent(gamma=0.9, learning_rate=0.1, epsilon=0.1)
for episode in range(num_episodes):
    state = agent.maze.start_state
    action = agent.select_action(state)
    done = False
    
    while not done:
        next_state, reward, done = agent.maze.step(state, action)
        next_action = agent.select_action(next_state)
        agent.update_q_value(state, action, next_state, next_action, reward)
        state = next_state
        action = next_action
        
    #if episode % print_interval == 0:
        #print(f"Episode: {episode}")
        #print_utility(agent, gamma=0.9)
        
print(f"Episodes: {num_episodes}")
print_utility(agent, gamma=0.9)

print("Final Policy:")
print_policy(agent, gamma=0.9)

agent.reset_q_values()

# Run for gamma = 1
agent = Agent(gamma=1, learning_rate=0.1, epsilon=0.1)
for episode in range(num_episodes):
    state = agent.maze.start_state
    action = agent.select_action(state)
    done = False
    
    while not done:
        next_state, reward, done = agent.maze.step(state, action)
        next_action = agent.select_action(next_state)
        agent.update_q_value(state, action, next_state, next_action, reward)
        state = next_state
        action = next_action
        
    #if episode % print_interval == 0:
        #print(f"Episode: {episode}")
        #print_utility(agent, gamma=1)
print()
print(f"Episodes: {num_episodes}")
print_utility(agent, gamma=1)

print("Final Policy:")
print_policy(agent, gamma=1)


Episodes: 250000
State (0, 0): (↑): 25.0 (→): 28.7 (↓): 21.1 (←): 25.2   Best: (→): 28.7
State (0, 1): (↑): 29.4 (→): 35.0 (↓): 24.1 (←): 24.9   Best: (→): 35.0
State (0, 2): (↑): 34.2 (→): 40.0 (↓): 19.6 (←): 29.0   Best: (→): 40.0
State (0, 3): (↑): 0.0 (→): 0.0 (↓): 0.0 (←): 0.0   Best: (↑): 0.0

State (1, 0): (↑): 24.6 (→): 23.6 (↓): 17.8 (←): 20.4   Best: (↑): 24.6
State (1, 1): (↑): 30.4 (→): 19.8 (↓): 19.6 (←): 21.4   Best: (↑): 30.4
State (1, 2): (↑): 34.9 (→): 24.4 (↓): 17.2 (←): 25.0   Best: (↑): 34.9
State (1, 3): (↑): 40.0 (→): 25.4 (↓): 20.0 (←): 19.8   Best: (↑): 40.0

State (2, 0): (↑): 21.3 (→): 20.3 (↓): 10.0 (←): 17.9   Best: (↑): 21.3
State (2, 1): (↑): 25.6 (→): 17.2 (↓): 16.4 (←): 16.9   Best: (↑): 25.6
State (2, 2): (↑): 19.8 (→): 19.7 (↓): 14.2 (←): 21.2   Best: (←): 21.2
State (2, 3): (↑): 25.6 (→): 21.2 (↓): 17.4 (←): 17.6   Best: (↑): 25.6

State (3, 0): (↑): 0.0 (→): 0.0 (↓): 0.0 (←): 0.0   Best: (↑): 0.0
State (3, 1): (↑): 21.3 (→): 14.7 (↓): 16.0 (←): 10.0 

# Q-learning

Opdracht: Implementeer Q-learning (off-policy TD control). Voer control uit met γ = 1 en γ = 0.9, visualiseer de uitkomsten en verklaar het resultaat.

het antwoord op vraag is eigenlijk hetzelfde als bij sarsa 

In [4]:
class Maze:
    def __init__(self):
        self.map = [[-1, -1, -1, +40],
                    [-1, -1, -10, -10],
                    [-1, -1, -1, -1],
                    [+10, -2, -1, -1]]
        self.num_rows = len(self.map)
        self.num_cols = len(self.map[0])
        self.start_state = (3, 2)  # row, col
        self.terminal_states = [(0, 3), (3, 0)]
        self.agent_pos = self.start_state
        self.actions = {'↑': (-1, 0), '→': (0, 1), '↓': (1, 0), '←': (0, -1)}

    def step(self, state, action):
        move = self.actions[action]
        row, col = state
        new_row, new_col = row + move[0], col + move[1]
        
        # Check if new location is within map boundaries
        if (0 <= new_row < self.num_rows) and (0 <= new_col < self.num_cols):
            next_state = (new_row, new_col)
            reward = self.map[new_row][new_col]
            done = next_state in self.terminal_states
        else:
            # If the new location is outside the map, stay in the same spot and get the same punishment again
            next_state = state
            reward = self.map[row][col]
            done = False
        
        return next_state, reward, done
    
class Agent:
    def __init__(self, gamma, learning_rate, epsilon):
        self.maze = Maze()
        self.q_values = {}
        self.utility = {}
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon

        for row in range(self.maze.num_rows):
            for col in range(self.maze.num_cols):
                state = (row, col)
                if state not in self.maze.terminal_states:
                    self.q_values[state] = {action: random.uniform(1, 100) for action in self.maze.actions}
                    self.utility[state] = 0

    def update_q_value(self, state, action, next_state, reward):
        max_next_q_value = max(self.q_values[next_state].values(), default=0) if next_state in self.q_values else 0
        current_q_value = self.q_values[state].get(action, 0)
        new_q_value = (1 - self.learning_rate) * current_q_value + self.learning_rate * (reward + self.gamma * max_next_q_value)
        self.q_values[state][action] = new_q_value

        # Update utility value
        self.utility[state] = max(self.q_values[state].values())


    def select_action(self, state):
        if random.random() < self.epsilon:
            return random.choice(list(self.maze.actions.keys()))
        else:
            q_values_for_state = self.q_values.get(state, {})
            if q_values_for_state:
                max_q_value = max(q_values_for_state.values())
                best_actions = [action for action, q_value in q_values_for_state.items() if q_value == max_q_value]
                return random.choice(best_actions)
            else:
                return random.choice(list(self.maze.actions.keys()))



    def act(self, state):
        action = self.select_action(state)
        next_state, reward, done = self.maze.step(state, action)
        self.update_q_value(state, action, next_state, reward)
        return next_state, reward, done
    
    def reset_q_values(self):
        self.q_values = {}
        for row in range(self.maze.num_rows):
            for col in range(self.maze.num_cols):
                state = (row, col)
                if state not in self.maze.terminal_states:
                    self.q_values[state] = {action: random.uniform(1, 100) for action in self.maze.actions}

def print_utility(agent, gamma):
    maze = agent.maze
    q_values = agent.q_values
    utility = agent.utility

    for i in range(maze.num_rows):
        for j in range(maze.num_cols):
            state = (i, j)
            actions = ["↑", "→", "↓", "←"]
            values = [q_values.get(state, {}).get(action, 0) for action in actions]
            best_action = actions[np.argmax(values)]
            best_value = np.max(values)

            print(f"State ({i}, {j}):", end=" ")
            for action, value in zip(actions, values):
                print(f"({action}): {value:.1f}", end=" ")
            print(f"  Best: ({best_action}): {best_value:.1f}")

        print()

    # Print the 2D map representation of utility
    print("Utility Map:")
    for i in range(maze.num_rows):
        for j in range(maze.num_cols):
            state = (i, j)
            print(f"{utility.get(state, 0) * gamma ** 0:.1f}", end=" ")
        print()



def print_policy(agent, gamma):
    print(f"Policy (γ = {gamma}):")
    for row in range(agent.maze.num_rows):
        for col in range(agent.maze.num_cols):
            state = (row, col)
            if state == agent.maze.start_state:
                best_action = max(agent.q_values[state].keys(), key=lambda k: agent.q_values[state][k])
                print(best_action, end=" ")
            elif state in agent.maze.terminal_states:
                print("×", end=" ")
            else:
                action = agent.select_action(state)
                print(action, end=" ")
        print()


num_episodes = 250000
print_interval = 100

# Run for gamma = 0.9
agent = Agent(gamma=0.9, learning_rate=0.1, epsilon=0.1)
for episode in range(num_episodes):
    state = agent.maze.start_state
    done = False
    
    while not done:
        state, reward, done = agent.act(state)
        
    #if episode % print_interval == 0:
        #print(f"Episode: {episode}")
        #print_utility(agent, gamma=0.9)      
print(f"Episodes: {num_episodes}")
print_utility(agent, gamma=0.9)

print("Final Policy:")
print_policy(agent, gamma=0.9)

agent.reset_q_values()

# Run for gamma = 1
agent = Agent(gamma=1, learning_rate=0.1, epsilon=0.1)
for episode in range(num_episodes):
    state = agent.maze.start_state
    done = False
    
    while not done:
        state, reward, done = agent.act(state)
        
    #if episode % print_interval == 0:
        #print(f"Episode: {episode}")
        #print_utility(agent, gamma=1)
print()
print(f"Episodes: {num_episodes}")
print_utility(agent, gamma=1)

print("Final Policy:")
print_policy(agent, gamma=1)


Episodes: 250000
State (0, 0): (↑): 26.4 (→): 30.5 (↓): 22.8 (←): 26.5   Best: (→): 30.5
State (0, 1): (↑): 30.5 (→): 35.0 (↓): 26.4 (←): 26.4   Best: (→): 35.0
State (0, 2): (↑): 35.0 (→): 40.0 (↓): 21.5 (←): 30.5   Best: (→): 40.0
State (0, 3): (↑): 0.0 (→): 0.0 (↓): 0.0 (←): 0.0   Best: (↑): 0.0

State (1, 0): (↑): 26.4 (→): 26.4 (↓): 19.5 (←): 22.8   Best: (↑): 26.4
State (1, 1): (↑): 30.5 (→): 21.5 (↓): 22.8 (←): 22.8   Best: (↑): 30.5
State (1, 2): (↑): 35.0 (→): 26.0 (↓): 19.5 (←): 26.4   Best: (↑): 35.0
State (1, 3): (↑): 40.0 (→): 26.0 (↓): 22.4 (←): 21.5   Best: (↑): 40.0

State (2, 0): (↑): 22.8 (→): 22.8 (↓): 10.0 (←): 19.5   Best: (↑): 22.8
State (2, 1): (↑): 26.4 (→): 19.5 (↓): 18.5 (←): 19.5   Best: (↑): 26.4
State (2, 2): (↑): 21.5 (→): 22.4 (↓): 16.6 (←): 22.8   Best: (←): 22.8
State (2, 3): (↑): 26.0 (→): 22.4 (↓): 19.2 (←): 19.5   Best: (↑): 26.0

State (3, 0): (↑): 0.0 (→): 0.0 (↓): 0.0 (←): 0.0   Best: (↑): 0.0
State (3, 1): (↑): 22.8 (→): 16.6 (↓): 18.5 (←): 10.0 