# Reinforcement learning

In [11]:
import gymnasium as gym
import numpy as np
# for allowing abstract methods (the closest thing Python has to an interface)
from abc import ABC, abstractmethod
from IPython.display import display, clear_output

class Agent(ABC):
    """Abstract interface that every trainable agent must implement.

    The training loop in this file only relies on ``copy``, ``get_action``
    and ``mutate``; ``_init_weights`` is the hook in which a concrete agent
    builds its internal parameters.
    """

    @abstractmethod
    def _init_weights(self):
        """Create the initial weights/parameters of the agent."""

    @abstractmethod
    def copy(self):
        """Return a new agent of the same class."""

    @abstractmethod
    def get_action(self, observation=None):
        """Return the action to take for ``observation`` (exploitation)."""

    @abstractmethod
    def mutate(self, parents=None, mutation_rate=None):
        """Return a variant of this agent (exploration by trying other weights).

        ``self`` was missing from the original declaration, which made the
        abstract signature disagree with the instance method that subclasses
        implement.
        """

# simulate single training run
def simulate_env(env, agent):
    """Play one episode of ``env`` with ``agent`` and return the summed reward.

    The environment is reset, then stepped with the action chosen by
    ``agent.get_action`` until the step reports ``terminated`` or
    ``truncated``. At least one step is always taken.
    """
    # start a fresh game; the second value returned by reset() is not used
    observation, _ = env.reset()
    total_reward = 0

    while True:
        # let the agent pick its action for the current observation
        chosen_action = agent.get_action(observation)

        # apply the action; the trailing info value is not used
        observation, step_reward, terminated, truncated, _ = env.step(chosen_action)
        total_reward += step_reward  # accumulate the episode return

        if terminated or truncated:
            return total_reward

# train a reinforcement learning agent
def train_agent(env, agent, population_size=50, mutation_rate=0.4, num_generations=100, num_episodes=5):
    """Train an agent with a simple elitist evolutionary loop.

    Args:
        env: environment handed to ``simulate_env`` for every evaluation.
        agent: template agent; the initial population is built by calling
            ``agent.copy()`` ``population_size`` times.
        population_size: number of individuals per generation (>= 1).
        mutation_rate: forwarded to ``mutate`` of the selected parents.
        num_generations: number of evaluate/select/mutate rounds.
        num_episodes: episodes averaged per individual to smooth out the
            randomness of the start position (>= 1).

    Returns:
        The highest-scoring individual of the last evaluated generation, or
        ``agent`` itself when ``num_generations`` is 0.

    Raises:
        ValueError: if ``population_size`` or ``num_episodes`` is below 1.
    """
    if population_size < 1:
        raise ValueError("population_size must be at least 1")
    if num_episodes < 1:
        raise ValueError("num_episodes must be at least 1")

    # Keep the best 20%, but always at least one individual: int(0.2 * n) is 0
    # for n < 5, and slicing with [-0:] would select the whole population.
    elite_count = max(1, int(0.2 * population_size))

    # Initialize the population
    population = [agent.copy() for _ in range(population_size)]

    # Fallback so the function still returns an agent when no generation runs
    best_individual = agent

    for generation in range(num_generations):
        # Evaluate each individual: mean return over num_episodes runs
        scores = [
            sum(simulate_env(env, individual) for _ in range(num_episodes)) / num_episodes
            for individual in population
        ]

        # Indices sorted by ascending score, so the elite sit at the end,
        # e.g. scores 1, 10, 5, 4 -> argsort [0, 3, 2, 1]
        elite_indices = np.argsort(scores)[-elite_count:]

        # Build the next generation; the best individual always survives as is
        best_individual = population[elite_indices[-1]]
        new_population = [best_individual]
        while len(new_population) < population_size:
            # pick a random parent among the elite and add a mutated variant
            parent = population[np.random.choice(elite_indices)]
            new_population.append(parent.mutate(mutation_rate=mutation_rate))

        population = new_population  # switch to the new population

        # Print the best score in this generation
        print(f"Generation {generation + 1}: Best Score = {max(scores)}")

    # return best individual
    return best_individual


## Evolutionary algorithms


In [12]:
class EvolutionaryAgent(Agent):
    """Agent whose policy is a small feed-forward network evolved by mutation.

    ``weights`` is a list of 2-D arrays, one per layer, applied in order by
    ``get_action``. Hidden layers use ReLU, the output layer uses tanh, and
    the chosen action is the index of the largest output.
    """

    num_parents = 1

    def __init__(self, num_inputs=1, num_outputs=1, hidden_layer_sizes=None):
        """Store the network layout and draw random initial weights.

        Args:
            num_inputs: size of the observation vector fed to the network.
            num_outputs: number of network outputs (one per action).
            hidden_layer_sizes: neurons per hidden layer, in order; ``None``
                or an empty sequence means inputs map directly to outputs.
        """
        super().__init__()

        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        # Own copy, so instances never share (or mutate) the caller's list
        self.hidden_layer_sizes = [] if hidden_layer_sizes is None else list(hidden_layer_sizes)

        self.weights = []  # contains the weights of the nn

        self._init_weights()

    def _init_weights(self):
        """(Re)build ``self.weights`` with standard-normal random matrices.

        One matrix is created per pair of consecutive layer sizes
        (inputs -> hidden... -> outputs). The list is rebuilt from scratch, so
        calling this again does not stack extra layers, and a network without
        hidden layers gets exactly one ``(num_inputs, num_outputs)`` matrix.
        """
        layer_sizes = [self.num_inputs, *self.hidden_layer_sizes, self.num_outputs]
        self.weights = [
            np.random.randn(fan_in, fan_out)
            for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:])
        ]

    def copy(self):
        """Return a new agent with the same layout.

        NOTE: the weights are NOT copied; the new agent draws fresh random
        weights in its constructor. ``mutate`` overwrites them afterwards.
        """
        agent = EvolutionaryAgent(self.num_inputs, self.num_outputs, self.hidden_layer_sizes)
        return agent

    def get_action(self, observation=None):
        """Run ``observation`` through the network and return the best action.

        Assumes ``observation`` is an array whose last dimension equals
        ``num_inputs`` (required by the first ``np.dot``).
        """
        output = observation
        last_index = len(self.weights) - 1

        for index, layer in enumerate(self.weights):
            output = np.dot(output, layer)

            if index < last_index:
                # relu activation function
                output = output * (output > 0)
            else:
                # output activation function
                output = np.tanh(output)

        return np.argmax(output)

    def mutate(self, parents=None, mutation_rate=None):
        """Return a child whose weights are this agent's weights plus noise.

        Standard-normal noise scaled by ``mutation_rate`` is added to every
        weight; this noise is what provides exploration. ``parents`` is
        accepted for interface compatibility and is not used.
        """
        child = self.copy()

        # layer.shape is e.g. (5, 10); *layer.shape unpacks it into randn(5, 10)
        child.weights = [
            layer + np.random.randn(*layer.shape) * mutation_rate
            for layer in self.weights
        ]

        return child
    
        

In [13]:
# Define the MountainCar environment
env = gym.make("MountainCar-v0")

# Hyperparameters
population_size = 100  # individuals evaluated per generation
mutation_rate = 0.3  # scale of the noise added to the weights when mutating
num_generations = 100  # number of evaluate/select/mutate rounds
num_episodes = 5  # episodes averaged per individual when scoring

# RL agent with internally a NN with a hidden layer of 8 neurons
input_size = env.observation_space.shape[0]  # length of the observation vector
output_size = env.action_space.n  # number of discrete actions
agent = EvolutionaryAgent(num_inputs=input_size, num_outputs=output_size, hidden_layer_sizes=[8])

# Run the evolutionary training loop; prints the best score of each generation
best_evolutionary_agent = train_agent(env, agent, population_size=population_size, mutation_rate=mutation_rate, num_generations=num_generations, num_episodes=num_episodes)

Generation 1: Best Score = -196.2
Generation 2: Best Score = -169.6
Generation 3: Best Score = -172.6
Generation 4: Best Score = -167.6
Generation 5: Best Score = -154.0
Generation 6: Best Score = -163.8
Generation 7: Best Score = -167.4
Generation 8: Best Score = -122.2
Generation 9: Best Score = -120.4
Generation 10: Best Score = -142.6
Generation 11: Best Score = -119.8
Generation 12: Best Score = -122.8
Generation 13: Best Score = -126.4
Generation 14: Best Score = -119.2
Generation 15: Best Score = -118.0
Generation 16: Best Score = -120.0
Generation 17: Best Score = -124.2
Generation 18: Best Score = -120.6
Generation 19: Best Score = -122.6
Generation 20: Best Score = -121.8
Generation 21: Best Score = -120.6
Generation 22: Best Score = -154.2
Generation 23: Best Score = -152.6
Generation 24: Best Score = -125.8
Generation 25: Best Score = -120.8
Generation 26: Best Score = -121.8
Generation 27: Best Score = -124.4
Generation 28: Best Score = -122.4
Generation 29: Best Score = -

In [None]:
# Evaluate the best individual in a rendered environment
env = gym.make("MountainCar-v0", render_mode="human")

try:
    for episode in range(5):
        score = simulate_env(env, best_evolutionary_agent)
        print(f"Best Individual Score: {score}")
finally:
    # Always release the environment (and its render window), even when an
    # episode raises or the cell is interrupted
    env.close()

In [None]:
import matplotlib.pyplot as plt

def plot_action_space(agent):
    """Show, as a grayscale image, the action ``agent`` picks on a 2-D grid.

    The agent is queried with every pair ``[x, y]`` for x in
    ``arange(-1.2, 0.6, 0.05)`` and y in ``arange(-0.07, 0.07, 0.001)``.
    Image rows follow x and columns follow y.
    NOTE(review): these ranges look like MountainCar's position and velocity
    bounds; confirm before labelling the axes.
    """
    row_inputs = np.arange(-1.2, 0.6, 0.05)
    col_inputs = np.arange(-0.07, 0.07, 0.001)

    # one row per x value, one column per y value
    action_grid = np.array([
        [agent.get_action(np.array([x, y])) for y in col_inputs]
        for x in row_inputs
    ])

    plt.imshow(action_grid, cmap='gray', interpolation='none')

# Plot which action the agent returned by train_agent picks across the grid
plot_action_space(best_evolutionary_agent)