In [None]:
import gym
import random
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
%matplotlib inline

#class agent
class DQNAgent():
    """Deep Q-learning agent with a replay memory and a separate target network.

    Two identically shaped networks are kept:

    * ``model`` picks the greedy action at each timestep and provides
      Q(s, a) for the loss.
    * ``target_model`` provides max_a' Q(s', a') for the bootstrap term
      ``r + gamma * max_a' Q(s', a')``. Its weights only change when
      ``update_target_model`` is called.
    """

    def __init__(self, state_size, action_size):
        """Create the networks and an empty replay memory.

        Args:
            state_size: number of features in one observation.
            action_size: number of discrete actions.
        """
        self.state_size = state_size
        self.action_size = action_size

        self.discount_factor = 0.99
        self.learning_rate = 0.01
        self.epsilon = 0.1
        self.epsilon_decay_factor = 0.999

        # Online and target networks are built separately so they hold
        # independent weights until update_target_model() copies them over.
        self.model = self.build_model()
        self.target_model = self.build_model()

        self.update_target_model()

        self.memory = deque(maxlen=100000)  # replay memory used for training
        self.batch_size = 60

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the action with the highest predicted Q-value. The forward
        pass is only run when the greedy action is actually needed.

        Args:
            state: observation shaped ``(1, state_size)``.

        Returns:
            The chosen action index as a plain ``int``.
        """
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(state, verbose=0)[0]
        return int(np.argmax(q_values))

    def build_model(self):
        """Build and compile the Q-network (two hidden ReLU layers of 24 units).

        Returns:
            A compiled ``Sequential`` model with ``action_size`` linear
            outputs, trained with mean-squared error and Adam.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(units=self.action_size, activation='linear'))
        model.summary()
        try:
            optimizer = Adam(learning_rate=self.learning_rate)
        except TypeError:
            # Older Keras releases only accept the `lr` spelling.
            optimizer = Adam(lr=self.learning_rate)
        model.compile(loss='mse', optimizer=optimizer)
        return model

    def save_sample(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def _stack_samples(self, samples):
        """Turn a list of stored transitions into per-field batch arrays."""
        count = len(samples)
        states = np.zeros((count, self.state_size))
        next_states = np.zeros((count, self.state_size))
        actions = np.empty(count, dtype=int)
        rewards = np.empty(count)
        dones = np.empty(count, dtype=bool)

        for row, (state, action, reward, next_state, done) in enumerate(samples):
            # Stored states may be (1, state_size) or (state_size,); flatten both.
            states[row] = np.asarray(state).reshape(self.state_size)
            actions[row] = action
            rewards[row] = reward
            next_states[row] = np.asarray(next_state).reshape(self.state_size)
            dones[row] = done

        return states, actions, rewards, next_states, dones

    def _build_targets(self, states, actions, rewards, next_states, dones):
        """Return the online Q-values with the taken action's entry set to the TD target."""
        q_next = self.target_model.predict(next_states, verbose=0)
        q_target = np.array(self.model.predict(states, verbose=0))

        # Terminal transitions get the bare reward; others add the bootstrap.
        bootstrap = self.discount_factor * np.max(q_next, axis=1)
        td_target = rewards + np.where(dones, 0.0, bootstrap)

        # Only the taken action's output changes, so the other outputs
        # contribute zero error to the MSE loss.
        q_target[np.arange(len(actions)), actions] = td_target
        return q_target

    def train_model(self):
        """Fit the online network on one random mini-batch, then decay epsilon.

        Nothing is fitted until the memory holds at least ``batch_size``
        transitions. Epsilon is decayed on every call regardless.
        """
        if len(self.memory) >= self.batch_size:
            # Random sampling breaks the correlation between consecutive steps.
            mini_batch = random.sample(self.memory, self.batch_size)
            states, actions, rewards, next_states, dones = self._stack_samples(mini_batch)
            q_target = self._build_targets(states, actions, rewards, next_states, dones)
            self.model.fit(states, q_target, epochs=1, verbose=0)

        self.epsilon *= self.epsilon_decay_factor

    def evaluate(self, n_sample=1):
        """Report the online network's loss on randomly drawn stored transitions.

        The target is built exactly as in ``train_model`` (the taken action's
        output replaced by the TD target), so the value is comparable to the
        training loss.

        Args:
            n_sample: number of transitions to draw; capped at the current
                memory size.

        Returns:
            The loss value returned by ``model.evaluate``.

        Raises:
            ValueError: if the replay memory is empty.
        """
        if not self.memory:
            raise ValueError("cannot evaluate: replay memory is empty")

        n_sample = min(n_sample, len(self.memory))
        sample = random.sample(self.memory, n_sample)
        states, actions, rewards, next_states, dones = self._stack_samples(sample)
        target = self._build_targets(states, actions, rewards, next_states, dones)
        loss = self.model.evaluate(states, target, verbose=0)

        return loss
        
if __name__ == "__main__":
    # Train a DQN agent on CartPole-v1, appending each episode's total reward
    # and evaluation loss to text.txt, then plot the reward curve.

    env = gym.make('CartPole-v1')
    # Read the dimensions from the environment instead of hard-coding them.
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    number_of_episodes = 400
    total_reward = np.zeros(number_of_episodes)
    loss = np.zeros(number_of_episodes)

    try:
        for i in range(number_of_episodes):
            reset_result = env.reset()
            # Newer gym releases return (observation, info); older ones
            # return the observation alone.
            state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
            state = np.reshape(state, [1, state_size])

            while True:
                # Take one action in the environment.
                action = agent.get_action(state)
                step_result = env.step(action)
                if len(step_result) == 5:
                    # Newer gym API: (obs, reward, terminated, truncated, info).
                    next_state, reward, terminated, truncated, _ = step_result
                    done = terminated or truncated
                else:
                    next_state, reward, done, _ = step_result
                next_state = np.reshape(next_state, [1, state_size])

                if done:
                    # Penalise the step that ends the episode.
                    # NOTE(review): this also fires when the episode ends at
                    # the time limit rather than by failure - confirm intended.
                    reward = -200

                agent.save_sample(state, action, reward, next_state, done)
                agent.train_model()

                state = next_state
                total_reward[i] += reward

                if done:
                    # Sync the target network once per episode.
                    agent.update_target_model()

                    loss[i] = agent.evaluate()

                    with open("text.txt", "a") as text_file:
                        text_file.write('{}\n{}\n'.format(total_reward[i], loss[i]))

                    print("Episode {}, Reward {}".format(i+1, total_reward[i]))

                    break
    finally:
        # Release the environment even if training is interrupted.
        env.close()

    plt.plot(total_reward)
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('Performance',fontsize=18)
    plt.show()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 24)                120       
_________________________________________________________________
dense_1 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_4 (Dense)              (None, 24)                600       
____________________________________