In [2]:
import math
import numpy as np
import tensorflow as tf 
import sys
import os
import random
import time
import flappy_bird_gym
import yaml
import pygame

# Project root: the working directory the script / notebook is started from.
LOCAL_PATH = os.path.abspath('')
MAIN_PATH = LOCAL_PATH

# Sub-directories of the project tree, all anchored at MAIN_PATH.
# Plain "/" concatenation is used (not os.path.join) so the resulting strings
# are the same on every platform.
CONF_PATH = f"{MAIN_PATH}/CONF"
DEMO_PATH = f"{MAIN_PATH}/DEMO"
DOC_PATH = f"{MAIN_PATH}/DOC"
ENV_PATH = f"{MAIN_PATH}/LIB/GAME"
LIB_PATH = f"{MAIN_PATH}/LIB"
LOG_PATH = f"{MAIN_PATH}/LOG"
IMG_PATH = f"{MAIN_PATH}/Images"
POLICIES_PATH = f"{MAIN_PATH}/POLICIES"
TEMP_PATH = f"{MAIN_PATH}/TEMP"
TRN_PATH = f"{MAIN_PATH}/LIB/TRAINING"

# Make the project's library folders importable. Every entry is inserted at
# index 1, so the one inserted last (LIB_PATH) ends up being searched first.
for _extra_dir in (ENV_PATH, TRN_PATH, LIB_PATH):
    sys.path.insert(1, _extra_dir)
del _extra_dir

class DQN(tf.keras.Model):
    """Fully connected Q-network: flatten -> dense (ReLU) -> linear Q-values.

    Args:
        n_actions: Width of the output layer (one Q-value per action).
        fc1_dims: Number of units in the hidden dense layer.
    """

    def __init__(self,n_actions=4,fc1_dims=512):
        super().__init__()
        self.n_actions = n_actions
        self.fc1_dims = fc1_dims
        layers = tf.keras.layers
        # Layers are created in forward-pass order.
        self.d0 = layers.Flatten()
        self.d1 = layers.Dense(fc1_dims, activation='relu')
        self.q_values = layers.Dense(n_actions, activation='linear')

    def call(self, state):
        """Return the Q-values predicted for a batch of states.

        Args:
            state: Anything `tf.convert_to_tensor` accepts; the leading axis
                is the batch axis and all remaining axes are flattened.

        Returns:
            The output of the final linear layer, one row per batch entry.
        """
        flat = self.d0(tf.convert_to_tensor(state))
        hidden = self.d1(flat)
        return self.q_values(hidden)
    
class ReplayBuffer():
    """Fixed-capacity circular store of experience transitions.

    Transitions are kept in pre-allocated NumPy arrays; once `buffer_size`
    transitions have been written, the oldest entries are overwritten.

    Args:
        buffer_size: Maximum number of transitions kept.
        input_shape: Shape of a single state; the state arrays are allocated
            as `(buffer_size, *input_shape)`.
    """

    def __init__(self, buffer_size=50000,input_shape=(0,0,0,0)):
        self.buffer_size = buffer_size
        self.input_shape = input_shape
        state_dims = (self.buffer_size, *self.input_shape)
        self.state_mem = np.zeros(state_dims, dtype=np.float32)
        self.action_mem = np.zeros(self.buffer_size, dtype=np.int32)
        self.reward_mem = np.zeros(self.buffer_size, dtype=np.float32)
        self.next_state_mem = np.zeros(state_dims, dtype=np.float32)
        # np.bool_ instead of the `np.bool` alias, which NumPy 1.24 removed.
        self.done_mem = np.zeros(self.buffer_size, dtype=np.bool_)
        # Total number of transitions ever written (not wrapped).
        self.pointer = 0

    def add_exp(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest slot when full.

        Args:
            state: State before the action (broadcastable to `input_shape`).
            action: Integer action index.
            reward: Scalar reward.
            next_state: State after the action.
            done: Truthy when the transition ended the episode.
        """
        idx = self.pointer % self.buffer_size
        self.state_mem[idx] = state
        self.action_mem[idx] = action
        self.reward_mem[idx] = reward
        self.next_state_mem[idx] = next_state
        # Store the terminal flag itself. The previous `1 - int(done)` stored
        # the inverse, while the consumer of `sample_exp` reads the value as
        # "is terminal", which zeroed the bootstrap on the wrong transitions.
        self.done_mem[idx] = bool(done)
        self.pointer += 1

    def sample_exp(self, batch_size= 64):
        """Draw a uniform random batch of distinct stored transitions.

        Args:
            batch_size: Number of transitions to draw.

        Returns:
            Tuple `(states, actions, rewards, next_states, dones)` of arrays
            whose first axis has length `batch_size`; `dones` is True for
            transitions that ended an episode.

        Raises:
            ValueError: Raised by `np.random.choice` when fewer than
                `batch_size` transitions are stored (sampling is without
                replacement).
        """
        max_mem = min(self.pointer, self.buffer_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        return (
            self.state_mem[batch],
            self.action_mem[batch],
            self.reward_mem[batch],
            self.next_state_mem[batch],
            self.done_mem[batch],
        )

class SimpleQAgent():
    """Epsilon-greedy DQN agent with a replay buffer and a target network.

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        replace: Stored on the instance; not used by this class itself.
        lr: Learning rate of the Adam optimizer.
        n_actions: Number of discrete actions (also the Q-network output width).
        input_shape: Shape of a single state, forwarded to the replay buffer.
        buffer_size: Capacity of the replay buffer.
        batch_size: Number of transitions sampled per training step.
    """

    def __init__(self, gamma=0.99, replace=100, lr=0.05,n_actions=2,input_shape=(0,0,0,0),buffer_size= 50000,batch_size=64):
        self.buffer_size = buffer_size
        self.input_shape=input_shape
        self.n_actions=n_actions
        self.gamma = gamma
        self.epsilon = 1.0
        self.min_epsilon = 0.01
        self.epsilon_decay = 1e-3
        self.replace = replace
        self.trainstep = 0
        self.memory = ReplayBuffer(input_shape=self.input_shape,buffer_size=self.buffer_size)
        self.batch_size = batch_size
        # Size the networks to the agent's action count. DQN() alone builds a
        # 4-output head, so the greedy argmax could return an action index
        # that does not exist when n_actions < 4.
        self.q_net = DQN(n_actions=self.n_actions)
        self.target_net = DQN(n_actions=self.n_actions)
        opt = tf.keras.optimizers.Adam(learning_rate=lr)
        self.q_net.compile(loss='huber', optimizer=opt)
        self.target_net.compile(loss='huber', optimizer=opt)

    def update_mem(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.add_exp(state, action, reward, next_state, done)

    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.target_net.set_weights(self.q_net.get_weights())

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: NumPy array holding a single state (only read on the
                greedy branch, where it is flattened into a batch of one).

        Returns:
            Tuple `(action, from_network)`; `from_network` is False when the
            action was drawn at random and True when it is the argmax of the
            online network's Q-values.
        """
        if np.random.rand() <= self.epsilon:
            # With two actions, exploration keeps the original 93% / 7% bias
            # towards action 0; for any other action count the fixed
            # two-entry probability vector would raise, so draw uniformly.
            probs = [0.93, 0.07] if self.n_actions == 2 else None
            return np.random.choice(self.n_actions, p=probs), False
        state = tf.convert_to_tensor([state.flatten()], dtype=tf.float32)
        q_values = self.q_net(state)
        return np.argmax(q_values), True

    def update_epsilon(self):
        """Decay epsilon by one linear step, never going below `min_epsilon`.

        Returns:
            The new epsilon value.
        """
        self.epsilon = max(self.min_epsilon, self.epsilon - self.epsilon_decay)
        return self.epsilon

    def train_simple(self):
        """Run one gradient step on a batch sampled from the replay buffer.

        The regression target equals the network's own prediction everywhere
        except at the action actually taken, where it is
        `reward + gamma * max_a' Q_target(next_state, a')`. Transitions whose
        `dones` entry (as returned by the replay buffer) is truthy get no
        bootstrap term.

        Returns:
            `(0, 0)` when fewer than `batch_size` transitions have been
            stored; otherwise `(mean_max_q, loss)` as Python floats, where
            `mean_max_q` is the batch mean of the online network's largest
            Q-value and `loss` is the mean squared error that was minimised.
        """
        if self.memory.pointer < self.batch_size:
            return 0,0

        states, actions, rewards, next_states, dones = self.memory.sample_exp(self.batch_size)

        # Bootstrapped value from the target network; no gradient is needed
        # here, so it is computed outside the tape.
        q_next = tf.math.reduce_max(self.target_net(next_states), axis=1).numpy()
        q_next = np.where(dones, 0.0, q_next)
        td_targets = rewards + self.gamma * q_next
        rows = np.arange(len(actions))

        with tf.GradientTape() as tape:
            q_pred = self.q_net(states)
            # NumPy copy of the predictions: constant w.r.t. the tape, so only
            # the entries overwritten below contribute to the gradient.
            q_target = np.copy(q_pred)
            max_q = float(q_target.max(axis=1).mean())
            q_target[rows, actions] = td_targets
            loss = tf.reduce_mean((q_target - q_pred) ** 2)

        gradient = tape.gradient(loss, self.q_net.trainable_variables)
        self.q_net.optimizer.apply_gradients(zip(gradient, self.q_net.trainable_variables))
        self.trainstep += 1
        # Report the real loss (the previous code always returned 1.0).
        return max_q, float(loss.numpy())

class DQL:
    """Deep Q-learning training harness for the `FlappyBird-v0` gym environment.

    On construction it initialises pygame, loads the hyper-parameters from
    `<CONF_PATH>/dql_conf.yaml`, creates the environment and the
    `SimpleQAgent`, and runs one forward pass through both networks so their
    weights exist.
    """

    def __init__(self):
        print("Flappy Bird Training")
        pygame.init()
        pygame.display.set_caption("Flappy Bird")
        self.experiment_name="dql"
        # Fallback equal to SimpleQAgent's default; load_params overwrites it
        # with the configured value.
        self.learning_rate=0.05
        (self.n_actions,
         self.max_epochs,
         self.max_episodes,
         self.max_steps,
         self.max_e,
         self.min_e,
         self.e_decay,
         self.w_updates,
         self.t_updates,
         self.t_steps,
         self.mini_batch_size,
         self.max_queue_length)=self.load_params(CONF_PATH)
        self.environment=flappy_bird_gym.make("FlappyBird-v0",pipe_gap = 150)
        s = self.environment.reset()

        self.environment.step(0)
        # The configured learning rate used to be read and then discarded, so
        # the agent always trained with its built-in default.
        self.agente=SimpleQAgent(lr=self.learning_rate,n_actions=2,input_shape=s.shape,buffer_size=self.max_queue_length,batch_size=self.mini_batch_size)
        self.state=tf.convert_to_tensor([s.flatten()],dtype=tf.float32)
        # One forward pass each so both networks build their weights.
        self.agente.q_net(self.state)
        self.agente.target_net(self.state)
        self.rtrn = []
        self.avg_r = []
        self.loss = []
        self.loss_avg = []
        self.max_q_net = []
        self.max_q_net_avg = []
        self.environment.dt = 30.0/1000.0

    def episode_training(self,mode):
        """Run up to `max_episodes` episodes, then save the online network.

        Args:
            mode: `"random"` keeps epsilon at 1.0 and never trains (random
                baseline). Any other value runs epsilon-greedy training with
                an exponentially decaying, per-episode epsilon.

        Side effects:
            Rewrites `<TEMP_PATH>/<name>.yaml` after every episode and saves
            the online network to `<POLICIES_PATH>/<name>` at the end, where
            `<name>` is `random_episodes` or `<experiment_name>_episodes`.
            Closing the pygame window stops training early; the policy
            trained so far is still saved.
        """
        normal = mode!="random"
        if normal:
            print("Episode training ",self.experiment_name)
            name=self.experiment_name+"_episodes"
        else:
            print("Episode training random")
            name="random_episodes"

        episode=0
        weight_updates=0
        step_num=0          # global step counter, drives the training schedule
        rwd=0
        rwd_nn=0
        avg_r_nn=[0]
        avg_r=[0]
        eoe_avg_r=[0]
        Rtrn=[0]
        eoe_Rtrn=[0]
        Rtrn_nn=[0]
        stop_requested=False

        while episode<self.max_episodes and not stop_requested:
            episode+=1
            d=False
            s=self.environment.reset()
            # NOTE(review): these record the PREVIOUS episode's return, so the
            # logged averages lag one episode behind and the last episode is
            # never included. Kept as-is to preserve the logged data format.
            Rtrn.append(rwd)
            eoe_Rtrn.append(rwd)
            Rtrn_nn.append(rwd_nn)
            rwd=0
            rwd_nn=0
            last_score=0
            episode_steps=0

            # Epsilon depends only on the episode index, so set it once here
            # instead of on every environment step.
            if normal:
                self.agente.epsilon= self.min_e + (self.max_e - self.min_e) * np.exp(-self.e_decay*(episode))
            else:
                self.agente.epsilon=1.0

            while not d:
                # Per-episode step cap. The old check compared a never-reset
                # global counter with `==`, so it fired at most once per run.
                if episode_steps>=self.max_steps:
                    break
                episode_steps+=1
                step_num+=1

                for event in pygame.event.get():
                    if event.type == pygame.QUIT:
                        stop_requested=True
                if stop_requested:
                    # After pygame.quit() the event queue can no longer be
                    # polled, so leave the loop instead of crashing on the
                    # next pygame.event.get().
                    pygame.quit()
                    break

                action,nn=self.agente.act(s)

                s_, r, d, info = self.environment.step(action)

                # Reward shaping on the second observation component
                # (presumably the vertical offset to the pipe gap - confirm
                # against the environment): +1 inside the tolerance band,
                # otherwise a penalty that grows with the offset.
                band=abs(1/(math.e**(s_[1]+0.0)))**2
                if band>1.1 or band<0.9:
                    r=-1-abs(1-(1/(math.e**(s_[1].astype(float)+0.08))))
                else:
                    r=1

                r=float(r)

                # Bonus of 9 on every step where the game score changed.
                if info['score']!=last_score:
                    r+=9
                    last_score=info['score']
                rwd+=r

                if nn:
                    rwd_nn+=r

                state=tf.convert_to_tensor([s],dtype=tf.float32)
                state_=tf.convert_to_tensor([s_],dtype=tf.float32)
                self.agente.update_mem(state, action, r, state_, d)

                if normal and step_num%self.t_steps==0 and step_num>1:
                    self.agente.train_simple()
                    weight_updates+=1
                    # Sync the target network once per `t_updates` training
                    # calls. The old per-step check re-copied identical
                    # weights on every step until the next training call.
                    if weight_updates%self.t_updates==0:
                        self.agente.update_target()
                s=s_

            print("[",name,"]Training Episode ",episode," of ",self.max_episodes)

            if normal:
                avg_r_nn.append(sum(Rtrn_nn)/len(Rtrn_nn))

            eoe_avg_r.append(sum(eoe_Rtrn)/len(eoe_Rtrn))
            avg_r.append(sum(Rtrn)/len(Rtrn))
            self.save_data(TEMP_PATH,name,avg_r,eoe_avg_r,avg_r_nn)

        self.agente.q_net.save(POLICIES_PATH+"/"+name)

    def load_params(self,path):
        """Load the hyper-parameters from `<path>/<experiment_name>_conf.yaml`.

        The configured learning rate is additionally stored on
        `self.learning_rate`; it is not part of the returned tuple so that
        existing callers unpacking twelve values keep working.

        Args:
            path: Directory containing the YAML configuration file.

        Returns:
            Tuple `(n_actions, max_epochs, max_episodes, max_steps, max_e,
            min_e, e_decay, w_updates, t_updates, t_steps, mini_batch_size,
            max_queue_length)`.

        Raises:
            OSError: If the configuration file cannot be opened.
            KeyError: If an expected section or key is missing.
        """
        with open(path+"/"+self.experiment_name+"_conf.yaml") as parameters:
            config_list = yaml.safe_load(parameters)
        training=config_list['training']
        epsilon_cfg=config_list['epsilon']
        rl=config_list['rl']

        #HYPERPARAMETERS
        mini_batch_size=training['mini_batch_size']
        n_actions=training['num_actions']
        self.learning_rate=training['learning_rate']
        max_episodes=training['num_episodes']
        max_epochs=training['num_epochs']
        w_updates=training['weight_updates']
        t_steps=training['train_steps']
        max_e=epsilon_cfg['max_epsilon']
        min_e=epsilon_cfg['min_epsilon']
        e_decay=epsilon_cfg['decay_epsilon']
        max_steps=rl['max_steps_per_episode']
        max_queue_length=rl['max_queue_length']
        t_updates=rl['target_update_episodes']
        return n_actions,max_epochs,max_episodes,max_steps,max_e,min_e,e_decay,w_updates,t_updates,t_steps,mini_batch_size,max_queue_length

    def save_data(self,path,name,avg_r,avg_maxq, avg_loss):
        """Write the running average return to `<path>/<name>.yaml`.

        Args:
            path: Output directory.
            name: File name without extension.
            avg_r: List of average returns; stored under the key `avr_r`
                (spelling kept so existing readers of the file still work).
            avg_maxq: Accepted for interface compatibility; not written.
            avg_loss: Accepted for interface compatibility; not written.
        """
        with open(path+"/"+name+'.yaml', 'w') as outfile:
            data={'avr_r':avg_r}
            yaml.dump(data, outfile, default_flow_style=False)
        return

if __name__ == "__main__":
    print("Training FlappyBirds environment with DQL")
    try:
        env=DQL()
        # To collect a random-policy baseline instead, call
        # env.episode_training("random").
        env.episode_training("qdl")
    finally:
        # Release pygame's resources even when setup or training raises.
        pygame.quit()

Training FlappyBirds environment with DQL
Flappy Bird Training
Episode training  dql
[ dql_episodes ]Training Episode  1  of  2000
[ dql_episodes ]Training Episode  2  of  2000
[ dql_episodes ]Training Episode  3  of  2000
[ dql_episodes ]Training Episode  4  of  2000
[ dql_episodes ]Training Episode  5  of  2000
[ dql_episodes ]Training Episode  6  of  2000
[ dql_episodes ]Training Episode  7  of  2000
[ dql_episodes ]Training Episode  8  of  2000
[ dql_episodes ]Training Episode  9  of  2000
[ dql_episodes ]Training Episode  10  of  2000
[ dql_episodes ]Training Episode  11  of  2000
[ dql_episodes ]Training Episode  12  of  2000
[ dql_episodes ]Training Episode  13  of  2000
[ dql_episodes ]Training Episode  14  of  2000
[ dql_episodes ]Training Episode  15  of  2000
[ dql_episodes ]Training Episode  16  of  2000
[ dql_episodes ]Training Episode  17  of  2000
[ dql_episodes ]Training Episode  18  of  2000
[ dql_episodes ]Training Episode  19  of  2000
[ dql_episodes ]Training Episod