In [6]:
# -*- coding: utf-8 -*-
#
# Laboration 3 Deep reinforcement Learning (Deep Q-learning)
# 
# This laboration is my first experience with the OpenAI gym and the CartPole environment. Gym is basically a Python library that includes several machine learning challenges, 
# in which an autonomous agent should be learned to fulfill different tasks.One of the simplest and most popular challenges is CartPole. It’s basically a 2D game in which the agent has to control, 
# i.e. move left or right, a cart to balance a pole standing perpendicularly on the cart. This is a classical reinforcement learning problem. The agents starts by trying random actions as a consequence 
# to which it gets rewarded (or not). Based on the rewards, it continuously “learns”, which action is good in which specific situation. Doing so, it learns how to master the game without ever being told
# how the game even works. CartPole is basically a binary classification problem with four inputs (state variables):
#
# - position of the cart on the track
# - angle of the pole with the vertical
# - cart velocity
# - rate of change of the angle
#
# The output is binary, i.e. either 0 or 1, corresponding to “left” or “right”. One challenge is the fact that all four features are continuous values (floating point numbers), which, naively, 
# implies an infinitely large feature space.
#
# Having exectuted the program below where the parameter "EPISODES" is changed to 1000 you can see that the score varies quite a bit over the period but the variations become less drastic as the agent learns how to become an expert player. 
# "Epsilon" i.e.  is also coming down drastically as it starts with a random value and then stabilizes. I stopped the running for various reasons after 771 episodes.




import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

EPISODES = 1000  # I have changed the parameter from 30 to 1000.

class DQNAgent:
    def __init__(self, state_size, action_size): # Initializes the class with "state_size" and "action_size" parameters
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # Discount rate.
        self.epsilon = 1.0  # Exploration rate.
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self): # Builds the neural network model using Keras seqwuential model and returns it. The model has two hidden layers with 24 neurons each. The last output layer has an ouput of "auction_size".
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu')) # I am using linear activation.
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',                                          # Mean Square Error (MSE) loss and the Adam optimizer as characteristics of the neural network.
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # Returns action

    def replay(self, batch_size): # A Method that trains the neural network with experiences in the memory.
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0) 
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name): # Loads the model weights with the given name.
        self.model.load_weights(name)

    def save(self, name): # Saves the model weights with the given name.
        self.model.save_weights(name)


if __name__ == "__main__": # The agent is trained for "EPISODES", improving the reward and recalucalting the "epsilon".
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(200):
            action = agent.act(state) # "act(state)" acts based on the previous state and predicts the reward.
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, time, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
        
        

episode: 0/1000, score: 13, e: 1.0
episode: 1/1000, score: 41, e: 0.89
episode: 2/1000, score: 29, e: 0.77
episode: 3/1000, score: 19, e: 0.7
episode: 4/1000, score: 10, e: 0.67
episode: 5/1000, score: 15, e: 0.62
episode: 6/1000, score: 14, e: 0.58
episode: 7/1000, score: 20, e: 0.52
episode: 8/1000, score: 12, e: 0.49
episode: 9/1000, score: 19, e: 0.45
episode: 10/1000, score: 25, e: 0.39
episode: 11/1000, score: 11, e: 0.37
episode: 12/1000, score: 15, e: 0.35
episode: 13/1000, score: 24, e: 0.31
episode: 14/1000, score: 21, e: 0.28
episode: 15/1000, score: 32, e: 0.23
episode: 16/1000, score: 46, e: 0.19
episode: 17/1000, score: 49, e: 0.15
episode: 18/1000, score: 157, e: 0.066
episode: 19/1000, score: 106, e: 0.039
episode: 20/1000, score: 91, e: 0.025
episode: 22/1000, score: 193, e: 0.01
episode: 23/1000, score: 187, e: 0.01
episode: 30/1000, score: 108, e: 0.01
episode: 31/1000, score: 9, e: 0.01
episode: 32/1000, score: 8, e: 0.01
episode: 35/1000, score: 153, e: 0.01
episod

KeyboardInterrupt: ignored