In [1]:
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from collections import deque
import random

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [17]:
class mountain_agent():
    """Deep Q-learning agent for the Gym ``MountainCar-v0`` environment.

    The agent keeps a bounded replay memory of transitions, picks actions
    epsilon-greedily from a small dense Keras network, and updates that
    network from a random minibatch of remembered transitions once per
    episode.
    """

    def __init__(self, state_size=2, action_size=3, alpha=0.001, epsilon=1.0, gamma=0.99, episodes=100, load=False):
        """Create the environment, the replay memory and the Q-network.

        Args:
            state_size: Number of values in one observation.
            action_size: Number of discrete actions (network outputs).
            alpha: Learning rate handed to the Adam optimizer.
            epsilon: Initial probability of taking a random action.
            gamma: Discount factor applied to the best next-state Q-value.
            episodes: Number of episodes run by ``fit``.
            load: When true, weights are loaded from ``self.model_name``
                right after the network is built.
        """
        # Environment. ``.env`` takes the object wrapped by ``gym.make``;
        # without Gym's TimeLimit wrapper the only way an episode reports
        # ``done`` is by reaching the goal, and ``fit`` enforces its own
        # step limit.
        self.env = gym.make('MountainCar-v0').env
        self.state_size = state_size
        self.action_size = action_size
        # Replay memory and the minibatch size sampled from it.
        self.memory = deque(maxlen=2000)
        self.mem_sample = 64
        # Total number of training episodes.
        self.episodes = episodes
        # Learning rate.
        self.alpha = alpha
        # Exploration schedule: epsilon is multiplied by ``epsilon_reduction``
        # after each replay until it is no longer above ``epsilon_min``.
        self.epsilon = epsilon
        self.epsilon_min = 0.01
        self.epsilon_reduction = 0.995
        # Discount factor for future rewards.
        self.gamma = gamma
        # File name used both for loading weights and for saving the model.
        self.model_name = "mountain_car_agent_terzo"
        # Per-episode step counts recorded by the most recent ``fit`` call.
        self.episode_lengths = []
        # The agent's Q-network.
        self.model = self.create_model(load)

    def create_model(self, load):
        """Build and compile the Q-network; optionally load stored weights.

        Args:
            load: When true, ``load_weights(self.model_name)`` is called on
                the freshly compiled model.

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        # Linear output: one unbounded Q-value estimate per action.
        model.add(Dense(self.action_size, activation='linear'))

        # No 'accuracy' metric: the network regresses Q-values, so a
        # classification accuracy would be meaningless.
        model.compile(loss='mse', optimizer=Adam(lr=self.alpha))
        if load:
            model.load_weights(self.model_name)
        return model

    def memorize(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def choose_action(self, state):
        """Pick an action epsilon-greedily.

        With probability ``self.epsilon`` a random action is sampled from the
        environment's action space; otherwise the action with the highest
        predicted Q-value for ``state`` is returned.
        """
        if random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])

    def replay(self):
        """Train the network on a random minibatch, then decay epsilon.

        Does nothing until the memory holds at least ``self.mem_sample``
        transitions. Each sampled transition is fitted individually, so the
        predictions for later samples already see the earlier updates.
        """
        if len(self.memory) < self.mem_sample:
            return

        minibatch = random.sample(self.memory, self.mem_sample)
        for state, action, reward, next_state, done in minibatch:
            # Bellman target: terminal transitions have no future term.
            target = reward
            if not done:
                target = reward + self.gamma * np.max(self.model.predict(next_state)[0])

            # Only the taken action's output is moved towards the target; the
            # other outputs keep their current prediction (zero error).
            prediction = self.model.predict(state)
            prediction[0][action] = target
            self.model.fit(state, prediction, epochs=1, verbose=0)

        # Epsilon decay.
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_reduction

    def fit(self, limit=1000, time_to_render=100, render=False):
        """Run ``self.episodes`` training episodes and save the model.

        Args:
            limit: Maximum number of environment steps per episode.
            time_to_render: When ``render`` is true, every episode whose
                index is a multiple of this value is rendered.
            render: Enables rendering during training.

        The number of steps taken in each episode is stored in
        ``self.episode_lengths``.
        """
        self.episode_lengths = []

        for episode in range(self.episodes):
            current_state = np.reshape(self.env.reset(), (1, self.state_size))
            steps = 0
            solved = False
            # ``render`` is tested first so that ``time_to_render`` is only
            # used (and may only be zero-divided) when rendering is requested.
            render_episode = render and episode % time_to_render == 0

            for _step in range(limit):
                if render_episode:
                    self.env.render()

                # Choose and execute an action.
                action = self.choose_action(current_state)
                next_state, reward, done, _info = self.env.step(action)
                next_state = np.reshape(next_state, (1, self.state_size))
                # The environment's reward is stored unchanged. ``done`` here
                # means the goal was reached, so it must not be turned into a
                # penalty.
                self.memorize(current_state, action, reward, next_state, done)

                current_state = next_state
                steps += 1

                if done:
                    solved = True
                    break

            self.replay()
            # Exactly one status line per episode, always reporting the number
            # of steps taken and the epsilon value after this episode's decay.
            status = "Done in time" if solved else "Out of time"
            print("{}. Episode: {}/{}, score: {}, e: {:.2}".format(status, episode, self.episodes, steps, self.epsilon))
            self.episode_lengths.append(steps)
        print('Fitting terminated!')
        self.save_model()

    def save_model(self):
        """Save the model to ``self.model_name`` via Keras ``Model.save``."""
        self.model.save(self.model_name)

    def play(self, max_steps=200):
        """Render one greedy (no exploration) episode.

        Args:
            max_steps: Upper bound on the number of steps; the episode also
                stops as soon as the environment reports ``done``.
        """
        state = np.reshape(self.env.reset(), (1, self.state_size))
        for _step in range(max_steps):
            self.env.render()
            action = np.argmax(self.model.predict(state)[0])

            next_state, _reward, done, _info = self.env.step(action)
            state = np.reshape(next_state, (1, self.state_size))
            if done:
                break


In [19]:
# Training: build the agent with load=True (weights are read from the agent's
# model file) and run 300 episodes without rendering.
pippo = mountain_agent(
    alpha=0.1,
    episodes=300,
    load=True,
)
pippo.fit(render=False)

Out of time. Episode: 0/300, score: 1000, e: 0.99
Out of time. Episode: 1/300, score: 1000, e: 0.99
Out of time. Episode: 2/300, score: 1000, e: 0.99
Out of time. Episode: 3/300, score: 1000, e: 0.98
Out of time. Episode: 4/300, score: 1000, e: 0.98
Out of time. Episode: 5/300, score: 1000, e: 0.97
Out of time. Episode: 6/300, score: 1000, e: 0.97
Out of time. Episode: 7/300, score: 1000, e: 0.96
Out of time. Episode: 8/300, score: 1000, e: 0.96
Out of time. Episode: 9/300, score: 1000, e: 0.95
Out of time. Episode: 10/300, score: 1000, e: 0.95
Out of time. Episode: 11/300, score: 1000, e: 0.94
Out of time. Episode: 12/300, score: 1000, e: 0.94
Out of time. Episode: 13/300, score: 1000, e: 0.93
Out of time. Episode: 14/300, score: 1000, e: 0.93
Out of time. Episode: 15/300, score: 1000, e: 0.92
Out of time. Episode: 16/300, score: 1000, e: 0.92
Out of time. Episode: 17/300, score: 1000, e: 0.91
Out of time. Episode: 18/300, score: 1000, e: 0.91
Out of time. Episode: 19/300, score: 1000

In [16]:
# Render one episode in which the agent always takes its highest-valued action.
pippo.play()