In [None]:
#Codice dell'agente DQN
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from collections import deque
import numpy as np
import random
import os

class DQNAgent():
    """Deep Q-Network agent with a replay memory and a separate target network.

    Two networks with the same architecture are kept: ``train_model`` is
    fitted on minibatches sampled from the replay memory, while
    ``target_model`` supplies the bootstrap values and is synchronised with
    ``train_model`` at the end of every training episode.

    The environment is used through ``reset()``, ``step(action)`` returning a
    4-tuple ``(observation, reward, done, info)``, ``render()``, ``close()``,
    ``observation_space`` and ``action_space``.
    """

    def __init__(self, env, iterations=200, load=False, alpha= 0.001, epsilon=1.0, gamma=0.99, epsilon_reduction=0.05, model_name="cervello_positronico"):
        """Store the hyper-parameters and build both networks.

        Args:
            env: environment the agent interacts with.
            iterations: maximum number of steps per episode.
            load: when true, ``train_model`` starts from the weights stored
                under ``model_name``.
            alpha: learning rate handed to the optimizer.
            epsilon: initial probability of taking a random action.
            gamma: discount factor applied to the bootstrapped future value.
            epsilon_reduction: amount subtracted from ``epsilon`` after each
                training episode.
            model_name: weights file read when ``load`` is true.
        """
        # Environment-derived sizes.
        self.env = env
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.state_shape = self.env.observation_space.shape
        self.max_iterations = iterations

        # Replay memory and minibatch size.
        self.memory = deque(maxlen=20_000)
        self.mem_sample = 32

        # Learning rate.
        self.alpha = alpha

        # Exploration schedule.
        self.epsilon = epsilon
        self.epsilon_min = 0.01
        self.epsilon_reduction = epsilon_reduction

        # Discount factor.
        self.gamma = gamma

        # The trained network and the stable target network start out with
        # identical weights.
        self.model_name = model_name
        self.train_model = self.create_model(load)
        self.target_model = self.create_model(False)
        self.target_model.set_weights(self.train_model.get_weights())

    def create_model(self, load):
        """Build and compile the Q-network (24 -> 48 -> ``action_size``).

        Args:
            load: when true, weights are loaded from ``self.model_name``.

        Returns:
            The compiled model.
        """
        model = Sequential()
        model.add(Dense(24, input_shape=self.state_shape, activation='relu'))
        model.add(Dense(48, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))

        # NOTE(review): recent Keras releases name this argument
        # `learning_rate`; `lr` is kept to match the version this notebook
        # was written against -- confirm before upgrading.
        model.compile(loss='mse', optimizer=Adam(lr=self.alpha))
        if load:
            model.load_weights(self.model_name)

        return model

    def choose_action(self, state):
        """Pick an action epsilon-greedily.

        ``self.epsilon`` is first clamped so it never drops below
        ``self.epsilon_min``. With probability ``epsilon`` a random action is
        sampled from the action space; otherwise the action with the highest
        value predicted by ``train_model`` is returned.

        Args:
            state: observation in the shape ``train_model.predict`` expects
                (callers in this class pass ``(1, state_size)``).
        """
        self.epsilon = max(self.epsilon, self.epsilon_min)
        if random.uniform(0,1) < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.train_model.predict(state)[0])

    def train_from_memory(self):
        """Fit ``train_model`` on one minibatch sampled from the replay memory.

        Does nothing until the memory holds at least ``mem_sample``
        transitions. For each sampled transition the target of the taken
        action becomes ``reward`` when the transition ended the episode, and
        ``reward + gamma * max(target_model(new_state))`` otherwise.
        """
        if len(self.memory) < self.mem_sample:
            return

        minibatch = random.sample(self.memory, self.mem_sample)

        # Stack the stored observations into (batch, state_size) arrays. The
        # width comes from the environment instead of being hard-coded.
        batch_shape = (self.mem_sample, self.state_size)
        states = np.array([transition[0] for transition in minibatch]).reshape(batch_shape)
        new_states = np.array([transition[3] for transition in minibatch]).reshape(batch_shape)

        # One batched prediction per network instead of one call per sample.
        targets = self.train_model.predict(states)
        new_state_targets = self.target_model.predict(new_states)

        for row, (_, action, reward, _, done) in enumerate(minibatch):
            if done:
                targets[row][action] = reward
            else:
                targets[row][action] = reward + self.gamma * np.max(new_state_targets[row])

        self.train_model.fit(states, targets, epochs=1, verbose=0)

    def save_model(self, name):
        """Save ``train_model`` to ``name``, creating its directory if needed."""
        directory = os.path.dirname(name)
        if directory:
            # Keras does not create missing parent directories for .h5 files.
            os.makedirs(directory, exist_ok=True)
        self.train_model.save(name)
        print("___Model Saved___")

    def load_model(self, name):
        """Load the weights stored in ``name`` into both networks."""
        self.target_model.load_weights(name)
        self.train_model.load_weights(name)
        print("___Model Loaded___")

    def memorize(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def fit(self, episode, state, render):
        """Run one training episode.

        Args:
            episode: episode index, used for logging, checkpoint names and to
                decide whether to render (every 50th episode).
            state: initial observation, shaped ``(1, state_size)``.
            render: enables rendering on every 50th episode.

        Returns:
            ``(total_reward, epoch)`` where ``epoch`` is the zero-based index
            of the last step executed (``-1`` if no step was executed).
        """
        total_reward = 0
        record = self.env.observation_space.low[0]

        # Keeps `epoch` defined even when max_iterations is 0.
        epoch = -1
        for epoch in range(self.max_iterations):
            if render and episode % 50 == 0:
                self.env.render()

            action = self.choose_action(state)
            new_state, reward, done, _ = self.env.step(action)
            # Reshape the observation returned by the environment. Reshaping
            # the previous `state` here would freeze the agent's view of the
            # world at the initial observation.
            new_state = new_state.reshape(1,self.state_size)

            # Track the highest first-component value reached this episode.
            if new_state[0][0] > record:
                record = new_state[0][0]

            # Bonus when the first state component reaches 0.5.
            if new_state[0][0] >= 0.5:
                reward += 10

            total_reward += reward

            self.memorize(state, action, reward, new_state, done)
            self.train_from_memory()

            state = new_state

            if done:
                break

        # An episode that used every allowed step counts as a failure.
        if epoch >= self.max_iterations - 1:
            print("Failed to finish task in epsoide {}".format(episode))
        else:
            print("Success in epsoide {}, used {} iterations!".format(episode,epoch))
            self.save_model('cervelli/train_model_ep-{}.h5'.format(episode))

        # Synchronise the target network with the trained one.
        self.target_model.set_weights(self.train_model.get_weights())

        print("--now epsilon is {}, the reward is {} maxPosition is {}".format(max(self.epsilon, self.epsilon_min), total_reward,record))

        self.epsilon -= self.epsilon_reduction

        return total_reward, epoch

    def start_training(self, episodes=400, render=False):
        """Train for ``episodes`` episodes and close the environment afterwards.

        Returns:
            ``(total_rewards, total_epochs)``: per-episode reward sums and
            per-episode step counts.
        """
        total_rewards=[]
        total_epochs=[]

        try:
            for episode in range(episodes):
                # Use the agent's own environment, not a notebook global.
                state = self.env.reset().reshape(1,self.state_size)
                total_reward, epoch = self.fit(episode, state, render)

                total_epochs.append(epoch+1)
                total_rewards.append(total_reward)
        finally:
            self.env.close()

        return total_rewards, total_epochs

    def play(self, trials, filename, render=True):
        """Run greedy evaluation episodes with weights loaded from ``filename``.

        Args:
            trials: number of evaluation episodes.
            filename: weights file passed to ``load_model``.
            render: render every step when true.

        Returns:
            ``(epochs, successes)``: steps taken in each episode and a 0/1
            flag per episode telling whether the first state component
            reached 0.5.
        """
        epochs = []
        successes = []
        self.load_model(filename)
        try:
            for episode in range(trials):
                state = self.env.reset()
                state = np.reshape(state, (1, self.state_size))

                succ = 0
                steps = 0
                for _ in range(self.max_iterations):
                    if render:
                        self.env.render()
                    action = np.argmax(self.train_model.predict(state)[0])

                    next_state, reward, done, _ = self.env.step(action)
                    next_state = np.reshape(next_state, (1, self.state_size))
                    steps += 1

                    if next_state[0][0] >= 0.5:
                        succ = 1
                    state = next_state

                    # Stop once the environment reports the episode is over
                    # instead of stepping a finished environment.
                    if done:
                        break
                print("Completed/Episodes {}/{}, success = {}".format(episode + 1, trials, succ))
                successes.append(succ)
                epochs.append(steps)
        finally:
            self.env.close()

        return epochs, successes

In [None]:
import gym

# Build the MountainCar environment and the agent that will be trained on it.
env = gym.make("MountainCar-v0")
pippo = DQNAgent(
    env,
    alpha=0.001,
    epsilon_reduction=0.05,
)

In [None]:
# Train the agent "Pippo" without rendering and keep the per-episode history.
episodes = 500
total_rewards, total_epochs = pippo.start_training(episodes=episodes, render=False)

In [None]:
import matplotlib.pyplot as plt

# Steps taken in each training episode.
# The x axis is derived from the recorded history itself, so the cell does not
# depend on the separate `episodes` counter still matching the data.
plt.xlabel("Episodes")
plt.ylabel("Epochs")
plt.plot(range(len(total_epochs)), total_epochs)

In [None]:
# Total reward collected in each training episode.
# The x axis is derived from the recorded history itself, so the cell does not
# depend on the separate `episodes` counter still matching the data.
plt.xlabel("Episodes")
plt.ylabel("Rewards")
plt.plot(range(len(total_rewards)), total_rewards)

In [28]:
# Evaluate the pilot: a fresh agent with exploration disabled replays the
# weights stored in "pippo.h5".
trials = 30
pippo = DQNAgent(env, epsilon=0.0)
epochs, successes = pippo.play(trials, filename="pippo.h5")

___Model Loaded___
Completed/Episodes 1/30, success = 0
Completed/Episodes 2/30, success = 0
Completed/Episodes 3/30, success = 0
Completed/Episodes 4/30, success = 0
Completed/Episodes 5/30, success = 0
Completed/Episodes 6/30, success = 0
Completed/Episodes 7/30, success = 0
Completed/Episodes 8/30, success = 0
Completed/Episodes 9/30, success = 0
Completed/Episodes 10/30, success = 0
Completed/Episodes 11/30, success = 0
Completed/Episodes 12/30, success = 0
Completed/Episodes 13/30, success = 0
Completed/Episodes 14/30, success = 0
Completed/Episodes 15/30, success = 0
Completed/Episodes 16/30, success = 0
Completed/Episodes 17/30, success = 0
Completed/Episodes 18/30, success = 0
Completed/Episodes 19/30, success = 0
Completed/Episodes 20/30, success = 0
Completed/Episodes 21/30, success = 0
Completed/Episodes 22/30, success = 0
Completed/Episodes 23/30, success = 0
Completed/Episodes 24/30, success = 0
Completed/Episodes 25/30, success = 0
Completed/Episodes 26/30, success = 0
Co

In [None]:
# Steps taken in each evaluation episode.
# The x axis is derived from the recorded list itself, so the cell does not
# depend on the separate `trials` counter still matching the data.
plt.xlabel("Episodes")
plt.ylabel("Epoches")
plt.plot(range(len(epochs)), epochs)

In [None]:
# Success flag (0 or 1) of each evaluation episode.
plt.ylabel("Successes")
plt.xlabel("Episodes")
plt.plot(range(len(successes)), successes)