Dependencies

In [2]:
#dependencies
import random
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers
from keras.optimizers import RMSprop, Adam
import gym


Using TensorFlow backend.


Q(state, action) = R(state, action) + Gamma * Max[Q(next state, all actions)]

The Gamma parameter has a range of 0 to 1 (0 <= Gamma <= 1).  If Gamma is closer to zero, the agent will tend to consider only immediate rewards.  If Gamma is closer to one, the agent will weight future rewards more heavily and be willing to delay the reward.

The epsilon parameter determines the probability with which our agent takes a random action instead of the action the network currently rates best.

## First model
first model 

In [5]:
class Agent:
    '''
    Q-learning agent that approximates Q(state, action) with a small dense
    network and trains it on transitions sampled from a replay memory.

    The agent owns its gym environment (``self.env``), created from ``envName``.
    ``gamma`` and ``epsilon`` default to None and must be supplied before
    ``act``/``replay`` are used.
    '''

    def __init__(self, envName='CartPole-v0', gamma=None, epsilon=None, e_decay=0.995, learning_rate=0.001):
        '''
        :param envName: gym environment id passed to gym.make
        :param gamma: discount factor applied to the best next-state Q value
        :param epsilon: initial probability of taking a random action
        :param e_decay: factor epsilon is multiplied by after every replay
        :param learning_rate: learning rate handed to the Adam optimizer
        '''
        self.envName = envName
        self.env = gym.make(envName)
        self.memory = deque(maxlen=10000)  # oldest transitions are dropped first
        self.gamma = gamma
        self.epsilon = epsilon
        self.e_decay = e_decay
        self.e_min = 0.02  # minimal exploration rate
        self.learning_rate = learning_rate
        self.model = self._build_model()

    def _build_model(self):
        '''
        builds a neural network: observation vector in, one Q value per action out
        :return: model - compiled neural network object
        '''
        model = Sequential()
        model.add(Dense(24, input_dim=self.env.observation_space.shape[0], activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.env.action_space.n, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        '''
        add <s,a,r,s'> vector to memory
        :param state: agent state
        :param action: action that was performed
        :param reward: reward given after the performed action
        :param next_state: next state
        :param done: bool parameter which determines whether next_state is terminal or not
        :return:
        '''
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        '''
        defines how the agent will act in a given state (epsilon-greedy)
        :param state: state of our agent, shaped as a single-row batch
        :return: action to be performed
        '''
        if np.random.rand() <= self.epsilon:
            # explore: uniformly random action
            return random.randrange(self.env.action_space.n)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # exploit: best predicted action

    def replay(self, batch_size):
        '''
        train network by sampling elements from memory, then decay epsilon
        :param batch_size: sample size of agent memory (capped at the memory length)
        :return:
        '''
        batch_size = min(batch_size, len(self.memory))
        if batch_size == 0:
            # nothing has been remembered yet, so there is nothing to fit
            return
        minibatch = random.sample(self.memory, batch_size)

        # Stack the sampled rows so the network is queried twice per replay
        # instead of twice per sample.
        states = np.vstack([sample[0] for sample in minibatch])
        next_states = np.vstack([sample[3] for sample in minibatch])
        targets = np.array(self.model.predict(states))  # copy: rows are edited below
        next_q = self.model.predict(next_states)

        for i, (_, action, reward, _, done) in enumerate(minibatch):
            if done:
                # terminal transition: no future reward to bootstrap from
                targets[i, action] = reward
            else:
                targets[i, action] = reward + self.gamma * np.amax(next_q[i])

        # `epochs` is the keyword the second agent in this notebook already uses
        self.model.fit(states, targets, batch_size=batch_size, epochs=1, verbose=0)
        if self.epsilon > self.e_min:
            # clamp so the exploration rate never drops below its stated minimum
            self.epsilon = max(self.e_min, self.epsilon * self.e_decay)

    def load(self, name):
        '''
        load weights from a file
        :param name: filename
        :return:
        '''
        self.model.load_weights(name)

    def save(self, name):
        '''
        saves weights to a file
        :param name: filename
        :return:
        '''
        self.model.save_weights(name)


### Cartpole

In [7]:
# Train the first agent on CartPole-v0: discount 0.95, start fully exploratory.
agent = Agent('CartPole-v0', .95, 1.0)
#####################################
#uncomment if u want to load weights#
#####################################
# agent.model.load_weights("cartpole3.kek")

#####################################
#uncomment if u want to save scores#
#####################################
# f = open('/data/CPscores.txt', 'w')

# The observation size does not change between episodes, so read it once.
state_size = agent.env.observation_space.shape[0]

# Iterate the game
for e in range(2000):
    # the network expects a single-row batch, hence the reshape
    state = np.reshape(agent.env.reset(), [1, state_size])
    resultReward = 0
    while True:
        agent.env.render()
        action = agent.act(state)
        next_state, reward, done, _ = agent.env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        # the score always accumulates the raw environment reward
        resultReward += reward
        # Penalise episodes that end with a score of 190 or less. This has to
        # happen before the transition is stored, otherwise the penalty never
        # reaches the replay memory.
        if done and resultReward <= 190:
            reward = -1
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            # single-argument print() behaves the same as a statement and as a function
            print("episode: {} score: {}".format(e, resultReward))
            # uncomment together with the open() above to save scores
            # f.write("episode: {} score: {}\n".format(e, resultReward))
            break
    # one training pass over a sampled minibatch per finished episode
    agent.replay(64)
    if e % 10 == 0:
        agent.save("cartpole2.kek")
# uncomment together with the open() above
# f.close()

[2017-10-17 13:03:19,076] Making new env: CartPole-v0


episode: 0 score: 24.0
episode: 1 score: 35.0
episode: 2 score: 26.0
episode: 3 score: 16.0
episode: 4 score: 17.0
episode: 5 score: 18.0
episode: 6 score: 30.0
episode: 7 score: 24.0
episode: 8 score: 32.0
episode: 9 score: 11.0
episode: 10 score: 32.0
episode: 11 score: 27.0
episode: 12 score: 20.0
episode: 13 score: 21.0
episode: 14 score: 17.0
episode: 15 score: 13.0
episode: 16 score: 29.0
episode: 17 score: 31.0
episode: 18 score: 10.0
episode: 19 score: 25.0
episode: 20 score: 22.0
episode: 21 score: 15.0
episode: 22 score: 27.0
episode: 23 score: 11.0
episode: 24 score: 19.0
episode: 25 score: 44.0
episode: 26 score: 38.0
episode: 27 score: 19.0
episode: 28 score: 11.0
episode: 29 score: 11.0
episode: 30 score: 17.0
episode: 31 score: 32.0
episode: 32 score: 14.0
episode: 33 score: 25.0
episode: 34 score: 19.0
episode: 35 score: 19.0
episode: 36 score: 16.0
episode: 37 score: 13.0
episode: 38 score: 13.0
episode: 39 score: 34.0
episode: 40 score: 37.0
episode: 41 score: 17.0
ep

KeyboardInterrupt: 

### Mountain Car

In [6]:
# Train the first agent on MountainCar-v0: no discounting (gamma = 1), slow
# epsilon decay and a smaller learning rate than the class defaults.
agent = Agent('MountainCar-v0', 1., 1.0, 0.9995, 0.0001)
# The observation size does not change between episodes, so read it once.
state_size = agent.env.observation_space.shape[0]

# Iterate the game
# `with` closes the score file even when the run is stopped by hand
# (KeyboardInterrupt); a trailing f.close() would be skipped in that case.
with open('mc1.txt', 'w') as f:
    for e in range(100000):
        # the network expects a single-row batch, hence the reshape
        state = np.reshape(agent.env.reset(), [1, state_size])
        resultReward = 0
        while True:
            agent.env.render()
            action = agent.act(state)
            next_state, reward, done, _ = agent.env.step(action)
            # reward = reward if not done else -1
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            resultReward += reward
            if done:
                # only episodes scoring better than -200 are echoed to the notebook
                if resultReward > -200:
                    print("eeeee {} score {}".format(e, resultReward))
                # every episode is logged to the file along with the current epsilon
                f.write("episode: {} score: {} ep  {}\n".format(e, resultReward, agent.epsilon))
                break
        # one training pass over a sampled minibatch per finished episode
        agent.replay(64)

        if e % 5 == 0:
            agent.save("MountainCar.kek")


[2017-10-16 00:55:17,855] Making new env: MountainCar-v0


KeyboardInterrupt: 

# Second model

In [None]:
class Agent2:
    '''
    Q-learning agent with a separate target network.

    ``model`` is trained on minibatches from the replay memory, while
    ``target_model`` supplies the next-state Q values and is only refreshed
    when ``update_target_model`` is called.
    '''

    def __init__(self, state_size, action_size):
        '''
        :param state_size: length of the observation vector fed to the network
        :param action_size: number of network outputs (one Q value per action)
        '''
        self.render = True

        self.state_size = state_size
        self.action_size = action_size

        self.memory = deque(maxlen=10000) #memory size
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.e_min = 0.005
        self.e_decay = 0.00002  # subtracted from epsilon on every remember()
        self.train_start = 1000  # replay() is a no-op until this many transitions exist

        self.model = self.build_model()
        self.target_model = self.build_model()
        # start with both networks holding identical weights
        self.update_target_model()

    def build_model(self):
        '''
        build neural net
        :return: model - compiled neural net object
        '''
        model = Sequential()
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(16, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))
        model.summary()
        model.compile(loss='mse', optimizer=Adam(lr=0.001))
        return model

    def update_target_model(self):
        '''
        update weights of our target model with the weights of the trained model
        :return:
        '''
        self.target_model.set_weights(self.model.get_weights())

    def act(self, state):
        '''
        perform agent action in a given state (epsilon-greedy)
        :param state: agent state, shaped as a single-row batch
        :return: index of the chosen action
        '''
        if np.random.rand() <= self.epsilon:
            # explore: uniformly random action index
            return random.randrange(self.action_size)
        q_value = self.model.predict(state)
        return np.argmax(q_value[0])

    def remember(self, state, action, reward, next_state, done):
        '''
        add <s,a,r,s'> vector to agent memory and decay epsilon by one step
        :param state: agent state
        :param action: action that was performed
        :param reward: collected reward
        :param next_state: next state after performing the action
        :param done: bool variable which determines whether next_state is terminal or not
        :return:
        '''
        # The mountain car script in this notebook passes the environment
        # action 2 while the network only has outputs 0 and 1; store it as
        # output index 1 so replay() can index the target row.
        if action == 2:
            action = 1
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.e_min:
            # clamp so a decay step can never push epsilon below its minimum
            self.epsilon = max(self.e_min, self.epsilon - self.e_decay)

    def replay(self, batch_size):
        '''
        train network by sampling elements from memory
        :param batch_size: sample size (capped at the memory length)
        :return:
        '''
        if len(self.memory) < self.train_start:
            return
        batch_size = min(batch_size, len(self.memory))
        if batch_size == 0:
            # only reachable when train_start is 0 and nothing is stored yet
            return
        mini_batch = random.sample(self.memory, batch_size)

        # Stack the sampled rows so each network is queried once per replay
        # instead of once per sample.
        update_input = np.vstack([sample[0] for sample in mini_batch])
        next_states = np.vstack([sample[3] for sample in mini_batch])
        update_target = np.array(self.model.predict(update_input))  # copy: rows are edited below
        next_q = self.target_model.predict(next_states)

        for i, (_, action, reward, _, done) in enumerate(mini_batch):
            if done:
                # terminal transition: no future reward to bootstrap from
                update_target[i, action] = reward
            else:
                update_target[i, action] = reward + self.discount_factor * np.amax(next_q[i])

        self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)

    def load(self, name):
        '''
        load weights of pre-trained network
        :param name: filename to load
        :return:
        '''
        self.model.load_weights(name)
        # keep the target network in step with the restored weights; otherwise
        # it keeps its old weights until the next explicit update
        self.update_target_model()

    def save(self, name):
        '''
        store network weights
        :param name: filename to save
        :return:
        '''
        self.model.save_weights(name)



### mountain car

In [None]:
# Train the second agent (with target network) on MountainCar-v0.
env = gym.make('MountainCar-v0')
state_size = env.observation_space.shape[0]
#action_size = env.action_space.n
# reduce action size for mountain car problem: the network only chooses
# between environment actions 0 and 2 (see the mapping in the loop below)
action_size = 2
agent = Agent2(state_size, action_size)
# agent.load("./save/MountainCar1.kek")
# uncomment to save scores to a file
#f= open('scoresCar1.txt', 'w')
for e in range(100000):
    done = False
    score = 0
    # the network expects a single-row batch, hence the reshape
    state = np.reshape(env.reset(), [1, state_size])

    # environment action currently being repeated; starts as action 0
    take_action = 0
    # steps taken since the agent last chose an action
    action_count = 0

    while not done:
        if agent.render:
            env.render()

        action_count = action_count + 1

        # the agent only picks a new action every 4th step; in between the
        # previous environment action is repeated
        if action_count == 4:
            action = agent.act(state)
            action_count = 0

            # map network output index -> environment action
            if action == 0:
                take_action = 0
            elif action == 1:
                take_action = 2

        next_state, reward, done, info = env.step(take_action)
        next_state = np.reshape(next_state, [1, state_size])
        #reward = reward if not done else -100

        agent.remember(state, take_action, reward, next_state, done)
        agent.replay(64)
        score += reward
        state = next_state

        if done:
            # No env.reset() here: the next episode resets the environment
            # itself, so a reset at this point would only be thrown away.
            agent.update_target_model()
            # print only successful episodes
            if score > -200:
                # single-argument print() behaves the same as a statement and as a function
                print("episode: {}   score: {}".format(e, score))
            # uncomment to save scores to a file
            #f.write("episode: {}   score: {}\n".format(e, score))
    # save model weights every 50 iterations
    if e % 50 == 0:
        agent.save("MountainCar1.kek")
# file close, uncomment if you want to use file
# f.close()

# Results