# Reinforcement learning

In [1]:
import gymnasium as gym
import numpy as np
# for allowing abstract methodes (closest thing to interface)
from abc import ABC, abstractmethod
from IPython.display import display, clear_output

class Agent(ABC):

    @abstractmethod
    def _init_weights(self):
        pass

    @abstractmethod
    def copy(self):
        pass

    @abstractmethod
    def get_action(self, observation=None):
        pass

    @abstractmethod
    def mutate(parents=None, mutation_rate=None):
        pass

# simulate single training run
def simulate_env(env, agent):

    # Schrijf dit zelf: voer 1 run uit met de agent in het environment
    # Return de cumulatieve reward (return)

    # start nieuw spel
    observation = env.reset()[0]
    return_value = 0
    
    # speel het spel
    done = False
    while not done:
        # wat is de actie die moet gebeuren?
        action = agent.get_action(observation)

        # voer de actie uit
        observation, reward, terminated, truncated, _ = env.step(action)

        done = terminated or truncated
        return_value += reward
    
    return return_value

# train a reinforcment learning agent
def train_agent(env, agent, population_size = 50, mutation_rate=0.4, num_generations = 100, num_episodes=5):

    # Initialize the population
    population = [agent.copy() for _ in range(population_size)]

    # number of generations in the algorithm
    for generation in range(num_generations):
        # Evaluate each individual in the population
        scores = []
        for current_pop in population:
            # for all agents
            total_return = 0
            
            for i in range(num_episodes):
                # num_episodes runs uitvoeren -> hou de som van de returns bij
                total_return += simulate_env(env, current_pop)
            scores.append(total_return) # /num_episodes niet noodzakelijk
        
        # Select the top-performing individuals
        elite_indices = np.argsort(scores)[-int(0.2 * population_size):] # ga op zoek naar de 20% beste
    
        # Create a new population by mutating and recombining the elite individuals
        best_agent = population[elite_indices[-1]]
        new_population = [best_agent] # keep the best one always
        while len(new_population) < population_size:
            # select a parent
            indices = np.random.choice(elite_indices, size=agent.num_parents) # bij evolutionaire algortimes is size=1
            parents = [population[index] for index in indices]
            # mutate the parent to create a child
            child = parents[0].mutate(parents, mutation_rate)
            # add the child to the population
            new_population.append(child)            
    
        # Print the best score in this generation
        best_score = max(scores)
        print(f"Generation {generation + 1}: Best Score = {best_score}")
    
    # return best individual
    return best_agent


## Evolutionary algorithms


In [2]:
class EvolutionaryAgent(Agent):
    num_parents = 1

    def __init__(self, num_inputs=1, num_outputs=1, hidden_layer_sizes=[]):
        super().__init__()

        # vul de constructor indien nodig aan
        self.num_inputs=num_inputs
        self.num_outputs = num_outputs
        self.hidden_layer_sizes = hidden_layer_sizes
        self.weights = []
        
        self._init_weights()
        
    def _init_weights(self):
        # maak een neuraal netwerk/de gewichten ervan
        for index, hidden_layer in enumerate(self.hidden_layer_sizes):
            if index==0:
                # input layer
                self.weights.append(np.random.randn(self.num_inputs, hidden_layer))
            else:
                # hidden layers
                self.weights.append(np.random.randn(self.hidden_layer_sizes[index-1], hidden_layer))

            if index == len(self.hidden_layer_sizes)-1:
                # output layer
                self.weights.append(np.random.randn(hidden_layer, self.num_outputs))
        pass
                
    def copy(self):
        # maak een nieuwe agent van deze klasse
        agent = EvolutionaryAgent(self.num_inputs, self.num_outputs, self.hidden_layer_sizes)
        return agent
        

    def get_action(self, observation=None):
        # zoek de beste actie (forward pass door de weight vectors)
        output = observation

        for index, hidden in enumerate(self.weights):
            if index == len(self.weights)-1:
                # output layer
                output = np.tanh(np.dot(output, hidden))
            else:
                # hidden layers (hier met relu activatiefunctie)
                x = np.dot(output, hidden)
                output = x * (x > 0) 

        return np.argmax(output) # selecteer de beste actie (exploitation)

    def mutate(self, parents=None, mutation_rate=None):
        # return een nieuw kind op basis van de gewichten van deze agent
        # dit is exploration in evolutionaire algoritmes
        # maak een nieuw child
        child = self.copy()

        # pas de gewichten aan
        child.weights = [layer + mutation_rate * np.random.randn(*layer.shape) for layer in parents[0].weights]

        return child
        

In [4]:
# Define the MountainCar environment
env = gym.make("MountainCar-v0")

# Hyperparameters
population_size = 100
mutation_rate = 0.3
num_generations = 100
num_episodes = 5

# RL agent with internally a NN with a hidden layer of 8 neurons
input_size = env.observation_space.shape[0]
output_size = env.action_space.n
agent = EvolutionaryAgent(num_inputs=input_size, num_outputs=output_size, hidden_layer_sizes=[4, 8, 12])

best_evolutionary_agent = train_agent(env, agent, population_size=population_size, mutation_rate=mutation_rate, num_generations=num_generations, num_episodes=num_episodes)

Generation 1: Best Score = -919.0
Generation 2: Best Score = -696.0
Generation 3: Best Score = -777.0
Generation 4: Best Score = -768.0
Generation 5: Best Score = -767.0
Generation 6: Best Score = -763.0
Generation 7: Best Score = -763.0
Generation 8: Best Score = -840.0
Generation 9: Best Score = -920.0
Generation 10: Best Score = -769.0
Generation 11: Best Score = -765.0
Generation 12: Best Score = -850.0
Generation 13: Best Score = -767.0
Generation 14: Best Score = -844.0
Generation 15: Best Score = -839.0
Generation 16: Best Score = -926.0
Generation 17: Best Score = -841.0
Generation 18: Best Score = -684.0
Generation 19: Best Score = -927.0
Generation 20: Best Score = -706.0
Generation 21: Best Score = -919.0
Generation 22: Best Score = -919.0
Generation 23: Best Score = -683.0
Generation 24: Best Score = -780.0
Generation 25: Best Score = -838.0
Generation 26: Best Score = -763.0
Generation 27: Best Score = -767.0
Generation 28: Best Score = -921.0
Generation 29: Best Score = -

NameError: name 'best_individual' is not defined

In [None]:
# Evaluate the best individual
env = gym.make("MountainCar-v0", render_mode="human")

for episode in range(5):
    score = simulate_env(env, best_evolutionary_agent)
    print(f"Best Individual Score: {score}")

In [None]:
import matplotlib.pyplot as plt

def plot_action_space(agent):

    results = []
    xs = np.arange(-1.2, 0.6, 0.05)
    ys = np.arange(-0.07, 0.07, 0.001)

    for x in xs:
        tmp = []
        for y in ys:
            tmp.append(agent.get_action(np.array([x, y])))
        results.append(tmp)
    results = np.array(results)

    plt.imshow(results, cmap='gray', interpolation='none') 

plot_action_space(best_evolutionary_agent)