In [1]:
import gym
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
import random

In [14]:
class DQLAgent():
    """Deep Q-learning agent with a small dense Q-network and a replay memory.

    The agent keeps a bounded memory of transitions, picks actions with an
    epsilon-greedy policy, and fits its network one sampled transition at a
    time in `replay`.
    """

    def __init__(self, env):
        """Read the problem dimensions from `env` and build the Q-network.

        Args:
            env: environment exposing `observation_space.shape` and an
                `action_space` with `n` and `sample()`.
        """
        # Keep a handle on the environment so `act` samples random actions
        # from the environment this agent was built for, not from a global.
        self.env = env

        # Hyperparameters
        self.state_size = env.observation_space.shape[0]   # network input dim
        self.action_size = env.action_space.n              # network output dim

        self.gamma = 0.95           # discount applied to the next state's best Q-value
        self.learning_rate = 0.001

        self.epsilon = 1            # probability of taking a random action
        self.epsilon_decay = 0.995  # multiplicative decay applied by adaptive_greedy
        self.epsilon_min = 0.01     # decay stops once epsilon is at or below this

        self.memory = deque(maxlen=1000)  # oldest transitions are dropped first
        self.model = self.build_model()

    def build_model(self):
        """Create and compile the Q-network.

        Returns:
            A compiled Keras `Sequential` model mapping a state vector of
            length `state_size` to `action_size` linear outputs.
        """
        model = Sequential()
        model.add(Dense(48, input_dim=self.state_size))
        model.add(Activation('tanh'))

        model.add(Dense(self.action_size))
        model.add(Activation('linear'))

        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(loss="mse", optimizer=Adam(learning_rate=self.learning_rate))

        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action with an epsilon-greedy policy.

        With probability `epsilon` a random action is sampled from the
        environment's action space; otherwise the action with the highest
        predicted value for `state` is returned.

        Args:
            state: input accepted by `self.model.predict`; only the first row
                of the prediction is used.
        """
        if random.uniform(0, 1) <= self.epsilon:
            # Explore
            return self.env.action_space.sample()

        # Exploit
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Fit the network on `batch_size` transitions sampled from memory.

        Does nothing until the memory holds at least `batch_size` transitions.
        Each sampled transition is fitted individually, so the predictions
        made for later samples already reflect the updates from earlier ones.
        """
        if len(self.memory) < batch_size:
            return
        mini_batch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in mini_batch:
            if done:
                # No future value after a terminal transition.
                target = reward
            else:
                next_q = self.model.predict(next_state, verbose=0)[0]
                target = reward + self.gamma * np.amax(next_q)

            # Only the output of the action actually taken is moved toward
            # the target; the other outputs keep their current predictions.
            train_target = self.model.predict(state, verbose=0)
            train_target[0][action] = target
            self.model.fit(state, train_target, verbose=0)

    def adaptive_greedy(self):
        """Decay `epsilon` by `epsilon_decay` while it is above `epsilon_min`."""
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        
        
    




In [None]:
if __name__ == "__main__":

    # Initialize the gym environment and the agent that learns on it.
    env = gym.make('CartPole-v1')
    agent = DQLAgent(env)

    episodes = 100
    batch_size = 16  # transitions sampled on every replay call

    for eps in range(episodes):

        # Start a new episode.
        # NOTE(review): this loop assumes the classic gym API, where reset()
        # returns only the observation and step() returns four values.
        state = env.reset()
        # The network takes a batch of one row, so reshape to (1, state_size).
        state = np.reshape(state, [1, agent.state_size])

        # Steps taken in this episode (higher is better). Deliberately not
        # called `time`, so the stdlib module name is not shadowed.
        steps = 0

        while True:

            # Select an action
            action = agent.act(state)

            # Step
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, agent.state_size])

            # Store the transition in the replay memory
            agent.remember(state, action, reward, next_state, done)

            # Update state
            state = next_state

            # Train on a sample from the memory
            agent.replay(batch_size)

            # Decrease the exploration rate
            agent.adaptive_greedy()

            steps += 1

            if done:
                print("Episode {}, Time: {}".format(eps, steps))
                break

Episode 0, Time: 18
Episode 1, Time: 10
Episode 2, Time: 17
Episode 3, Time: 29
Episode 4, Time: 9
Episode 5, Time: 18
Episode 6, Time: 13
Episode 7, Time: 31


In [None]:
# Re-imported in this cell so that `time` is guaranteed to be the stdlib
# module here, even if an earlier cell rebound the name to something else.
import time

# Roll out one rendered episode with the agent trained above.
# NOTE(review): `act` is still epsilon-greedy, so a small fraction of the
# actions may be random depending on the agent's current epsilon.
trained_model = agent
state = env.reset()
state = np.reshape(state, [1, trained_model.state_size])
time_t = 0


while True:

    env.render()

    action = trained_model.act(state)
    next_state, reward, done, _ = env.step(action)
    # Reshape the NEW observation; reshaping the old `state` here would keep
    # the agent acting on the initial observation for the whole episode.
    next_state = np.reshape(next_state, [1, trained_model.state_size])
    state = next_state
    time_t += 1
    print(time_t)
    time.sleep(0.4)  # slow the rollout down so the rendering can be followed
    if done:
        break

print("Done")