In [1]:
import gym
import numpy as np

env = gym.make('MountainCar-v0')

In [25]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random

class Agent:
    def __init__(self, state_size, action_size, learning_rate = 0.001):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = []
        self.memory_load = 0
        self.gamma = 0.95    
        self.learning_rate = learning_rate
        self.model = self._build_model()
        
    def _build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(48, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model
    
    def memorize(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        self.memory_load += 1
        
    def choose_action(self, state):
        state = np.reshape(state, [1, self.state_size])
        return np.argmax(self.model.predict(state))
    
    def get_samples(self, n):
        if n > len(self.memory):
            return random.sample(self.memory, len(self.memory))
        else:
            return random.sample(self.memory, n)
    
    def replay(self, batch_size):
        if batch_size > memory_load:
            return
        batch = np.array(random.sample(memory, batch_size))
        states, actions, rewards, newstates, dones = np.hsplit(samples, 5)
        states = 
        actions, rewards, newstates, 
        x = np.zeros((len(batch), self.state_size))
        y = np.zeros((len(batch), self.action_size))
        states = np.array([val[0] for val in batch])
        next_states = np.array([val[3] for val in batch])
        
        q_current_arr = self.model.predict(states)
        q_future_arr = self.model.predict(next_states)
        for i, item in enumerate(batch):
            state, action, reward, done = item[0], item[1], item[2], item[4]
            q_current = q_current_arr[i]
            if done:
                q_current[action] = reward
            else:
                q_current[action] = reward + self.gamma * np.amax(q_future_arr[i])
            x[i] = state
            y[i] = q_current
        self.model.fit(x, y, epochs=1, verbose=0)

In [30]:
ep_count = 400
ep_length = 200
epsilon_bounds = [0, 1]
epsilon_interval = (epsilon_bounds[1] - epsilon_bounds[0])/ep_count

epsilon_interval = 0.05

training_epochs = 1
batch_size = 48

In [33]:
from tqdm.notebook import tqdm

def compute_reward(x, x_d, k):
    if x_d < x and x_d < -0.5: 
        return (x+0.5)**2
    if x_d > x and x_d > -0.5:
        return (x+0.5)**2
    return 0
    
    

state_size = 2
action_size = 3
agent = Agent(state_size, action_size)
old_state = -1
old_action = -1
old_distance = 0
reward = 0
total_reward = 0
epsilon = epsilon_bounds[1]
log = {}
for episode in tqdm(range(ep_count)):
    random.seed(episode) #Reproducivity
    state = env.reset()
    for t_step in range(ep_length):
        if t_step != 0:
            agent.memorize(old_state, old_action, reward, state, done)
            
        if random.random() < epsilon:
            action = env.action_space.sample()
        else: 
            action = agent.choose_action(state)
        old_state = state
        old_action = action
        state, _, done, info = env.step(action)
        if done:
            reward += 10
        total_reward += reward
        if t_step != 0:
            agent.replay(batch_size)
        if done:
            break
        
    epsilon -= epsilon_interval
    epsilon = max(0, epsilon)
    log[episode+1] = total_reward
    total_reward = 0
#     for i in range(training_epochs):
#         agent.replay(batch_size)
        

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=400.0), HTML(value='')))




KeyboardInterrupt: 

In [21]:
state = env.reset()

for t_step in range(ep_length):
    env.render()
    
    action = agent.choose_action(state)
    
    state, _, done, info = env.step(action)

    if done:
        print("Finished in {} steps".format(t_step))
        break

Finished in 199 steps


In [18]:
log

{1: 3050,
 2: 5080,
 3: 4960,
 4: 4990,
 5: 4990,
 6: 4970,
 7: 5050,
 8: 4990,
 9: 4930,
 10: 5050,
 11: 5010,
 12: 5030,
 13: 5080,
 14: 5000,
 15: 4990,
 16: 5060,
 17: 5010,
 18: 4950,
 19: 5020,
 20: 5040,
 21: 4920,
 22: 4950,
 23: 5030,
 24: 5040,
 25: 5050,
 26: 5020,
 27: 4980,
 28: 5080,
 29: 4950,
 30: 5030,
 31: 4900,
 32: 4930,
 33: 4890,
 34: 4990,
 35: 4860,
 36: 4980,
 37: 5030,
 38: 5110,
 39: 4970,
 40: 4940,
 41: 4930,
 42: 5060,
 43: 4890,
 44: 5030,
 45: 4950,
 46: 5060,
 47: 5060,
 48: 5010,
 49: 5040,
 50: 5030,
 51: 5080,
 52: 4970,
 53: 4970,
 54: 5090,
 55: 4990,
 56: 4890,
 57: 4870,
 58: 4950,
 59: 5000,
 60: 4990,
 61: 4950,
 62: 4850,
 63: 5060,
 64: 4990,
 65: 4990,
 66: 4940,
 67: 4940,
 68: 5020,
 69: 5080,
 70: 4960,
 71: 4970,
 72: 5080,
 73: 4900,
 74: 5040,
 75: 4930,
 76: 4980,
 77: 4980,
 78: 4870,
 79: 4930,
 80: 4950,
 81: 4840,
 82: 5070,
 83: 4910,
 84: 4830,
 85: 4850,
 86: 5000,
 87: 4980,
 88: 5050,
 89: 4990,
 90: 4930,
 91: 4910,
 92: 494