In [4]:
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models     import Sequential
from tensorflow.keras.layers     import Dense, Conv2D, multiply, Input, Lambda, Flatten
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
from time import time
from collections import deque 
from tensorflow.keras.models import load_model, clone_model
from gc import collect
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from time import time, sleep

In [5]:
class DQ:
    def __init__(self, state_size, action_size,depth ):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.depth = depth
        self.build_model()
        #self.model.save('tmp_model')
        #self.guesser = load_model('tmp_model')
        self.guesser = clone_model(self.model)
        self.update_guesser()
        #print(self.model.get_weights())
        #print(self.guesser.get_weights())
    def huber_loss(self,a, b, in_keras=True):
        error = a - b
        quadratic_term = error*error / 2
        linear_term = abs(error) - 1/2
        use_linear_term = (abs(error) > 1.0)
        if in_keras:
            # Keras won't let us multiply floats by booleans, so we explicitly cast the booleans to floats
            use_linear_term = K.cast(use_linear_term, 'float32')
        return use_linear_term * linear_term + (1-use_linear_term) * quadratic_term
    def build_model(self):
        ATARI_SHAPE = (210, 160, 3)

        # With the functional API we need to define the inputs.
        frames_input = Input(ATARI_SHAPE, name='frames')
        actions_input = Input((self.action_size,), name='mask')

        # Assuming that the input frames are still encoded from 0 to 255. Transforming to [0, 1].
        normalized = Lambda(lambda x: x / 255.0)(frames_input)

        # "The first hidden layer convolves 16 8×8 filters with stride 4 with the input image and applies a rectifier nonlinearity."
        conv_1 = Conv2D(16, kernel_size=(8,8), strides=(4,4), activation='relu')(normalized)
        # "The second hidden layer convolves 32 4×4 filters with stride 2, again followed by a rectifier nonlinearity."
        conv_2 = Conv2D(32, kernel_size=(4,4), strides=(2,2), activation='relu')(conv_1)
        # Flattening the second convolutional layer.
        conv_flattened =Flatten()(conv_2)
        # "The final hidden layer is fully-connected and consists of 256 rectifier units."
        hidden = Dense(256, activation='relu')(conv_flattened)
        # "The output layer is a fully-connected linear layer with a single output for each valid action."
        output = Dense(self.action_size)(hidden)
        # Finally, we multiply the output by the mask!
        filtered_output = multiply([output, actions_input])

        self.model = Model(inputs=[frames_input, actions_input], outputs=filtered_output)
        optimizer = RMSprop(lr=0.0005, rho=0.95, epsilon=0.01)
        self.model.compile(optimizer, loss=self.huber_loss)

    def remember(self,state,action,reward,next_state,done):
        self.memory.append((state,action,reward,next_state,done))
        
    def act(self,state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = np.argmax(self.model.predict([state,self.mask()])[0])
        return act_values
    def mask(self,action = -1):
        if not action in range(self.action_size):
            return np.reshape(np.ones((self.action_size,)),(1,self.action_size,))
        full_mask = np.zeros((4,))
        full_mask[action] = 1
        full_mask = np.reshape(full_mask,(1,self.action_size,))
        return np.array(full_mask)
            
            
            
    def update_guesser(self):
        #self.model.save('tmp_model')
        #self.guesser = load_model('tmp_model')
        #collect()
        self.guesser.set_weights(self.model.get_weights()) 
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                act = self.mask(action)
                #print(act.shape)
                target = reward + self.gamma *np.amax(self.guesser.predict([next_state,act])[0])
            target_f = self.guesser.predict([state,self.mask(action)])
            target_f[0][action] = target
            self.model.fit([state,self.mask(action)], target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    

        


In [7]:
class CartPole:
    def __init__(self,depth):
        self.depth = depth
        self.game_name = 'BreakoutDeterministic-v4'
        self.env = gym.make(self.game_name)
        self.state_size = self.env.observation_space.shape
        self.action_size = self.env.action_space.n
        self.scores = []  
        self.network_runs = []
        self.time_predictions = []
        self.time_stamps = []
        self.start = time()
        self.agent = DQ(self.state_size, self.action_size, self.depth)
        self.resized = [1]
        self.update_length = 200
        for i in self.state_size:
            self.resized.append(i)
        
    def train(self,episodes):
        self.env.reset()
        
        for e in range(episodes):
            #print('a')
            state = self.env.reset()
            state = np.reshape(state, self.resized)
            time_t = 0
            done = False
            while not done:
                time_t+=1
                #print('a')
                #self.env.render()
                action = self.agent.act(state)
                next_state, reward, done, _ = self.env.step(action)
                #reward = reward if not done else -.5
                next_state = np.reshape(next_state, self.resized)
                self.agent.remember(state, action, reward, next_state, done)
                state = next_state

                #if done and e % 20 == 0 and not e == 0:
        
            # print the score and break out of the loop
            if e % self.update_length == 1:
                print("episode: {}/{}, score: {}".format(e, episodes, time_t))
                time_per = (time()-self.start)/(e)
                time_pre = (time_per*(episodes-e))/3600
                print("Run Time:",'%.3f'%((time()-self.start)/3600),'hrs')
                print("Time Left:", '%.3f'%float(time_pre), 'hrs')
                self.time_stamps.append('%.3f'%((time()-self.start)/3600))
                self.time_predictions.append(time_pre)
                self.agent.update_guesser()
            self.scores.append(time_t)

        
            
            if(len(self.agent.memory) > 32):
                self.agent.replay(32)
        self.env.close()
        
    def evaluate(self,games = 1,frame_rate = 6):
        
        completed = 0
        env = gym.make(self.game_name)
        state = env.reset()
        state = np.resize(state, self.resized)
        while completed < games:
            start = time()
            env.render()
            action = self.agent.act(state)
            next_state, reward, done, _ = env.step(action)
            #reward = reward if not done else -.5
            next_state = np.reshape(next_state, self.resized)
            self.agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                completed += 1
                env.reset()
            sleep(1/frame_rate-(time()-start))
        env.close()
        
carts = []
#for depth in [2,12,24,64]:
with tf.device("/gpu:0"):
    # Setup operations
    for depth in [32]:
        carts.append(CartPole(depth))
        carts[-1].train(20000)
with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    # Run your code
    carts[0].evaluate()


episode: 1/20000, score: 210
Run Time: 0.002 hrs
Time Left: 46.862 hrs


KeyboardInterrupt: 

In [None]:

class CartPole_Grapher:
    def __init__(self,carts):
        self.carts = carts
        self.max = 500
    def moving_avg(self,lis,tail_p):
        curr = 0
        tail = max([1,int(len(lis)*tail_p)])
        m_lis = []
        for i in range(len(lis)):
            if i >= tail:
                if i > 0:
                    curr *= tail
                    curr -= lis[i-tail]
                    curr += lis[i]
                    curr /= tail
                else:
                    curr = lis[i]

            else:
                if i > 0:
                    curr *= i-1
                    curr += lis[i]
                    curr /= i
                else:
                    curr = lis[i]
            m_lis.append(curr)
        return m_lis
    def sub_graph(self, tail_p, names):
        plot = 0
        rows = np.ceil(len(carts)/2)
        for cart in self.carts:
            
            plt.subplot(2,2,plot+1)
            plt.axis([0, len(cart.scores), 0, self.max])
            plt.plot(self.moving_avg(cart.scores,tail_p))
            plt.title(names[plot])
            plt.figure(figsize=(20,10))
            
            
            plot += 1
        plt.show()  
        
    def graph(self, tail_p, names):
        plot = 0
        rows = np.ceil(len(carts)/2)
        for cart in self.carts:
            
            plt.axis([0, len(cart.scores), 0, self.max])
            plt.plot(self.moving_avg(cart.scores,tail_p))
            plt.title(names[plot])
            plt.figure(figsize=(20,10))

            plot +=1
            plt.show()
    def evaluate(self,games = 1):
        completed = 0
        env = gym.make(self.game_name)
        state = env.reset()
        state = np.resize(state, [1, self.state_size])
        while completed < games:
            start = time()
            env.render()
            action = self.agent.act(state)
            next_state, reward, done, _ = env.step(action)
            #reward = reward if not done else -.5
            next_state = np.reshape(next_state, [1, self.state_size])
            self.agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                completed += 1
                env.reset()
            sleep(.1-(time()-start))
        env.close()
            
cpg = CartPole_Grapher(carts)
cpg.graph(.05, names = [size + ' Network' for size in ['Tiny', 'Small', 'Medium', 'Large']])
#cpg.graph(.01, names = [size + ' Network' for size in ['Medium']])
cpg.chunk_graph(200, names = [size + ' Network' for size in ['Tiny', 'Small', 'Medium', 'Large']])


In [None]:
'''import matplotlib.pyplot as plt
temp = 0
ascores = []
avg = 50
for i in range(len(scores2)):
    if i%avg == 0 and i != 0:
        ascores.append(temp/avg)
        temp = 0
    else:
        temp += scores2[i]
 
def plot(x):
    plt.plot(range(len(x)),x)
    plt.show()
plot(ascores)'''


'''depths = ['Large','Medium','Small']
training_length = ['Short','Medium','Long']
avg = 100
plots = []
plt.figure(figsize=(20,10))
for run in network_runs:
    temp_plot = []
    temp = 0
    for i in range(int(len(run))):
        temp += run[i]
        if i % avg == 0:
            temp_plot.append(temp/avg)
            temp = 0
    plots.append(temp_plot)
for plot in range(len(plots)):
    plt.subplot(3, 3, plot+1)
    plt.plot(plots[plot])
    #title = depths[run%3]+' Nework with '+training_length[int(run/3)]+' Training Period'
    title = depths[plot%3]+' Nework'
    plt.title(title)
    plt.ylabel('Score')
    plt.xlabel('Training_Batches')
plt.show()  '''

In [None]:
'''import numpy as np
import matplotlib.pyplot as plt


x1 = np.linspace(0.0, 5.0)
x2 = np.linspace(0.0, 2.0)

y1 = np.cos(2 * np.pi * x1) * np.exp(-x1)
y2 = np.cos(2 * np.pi * x2)

plt.subplot(2, 2, 1)
plt.plot(x1, y1, 'o-')
plt.title('A tale of 2 subplots')
plt.ylabel('Damped oscillation')

plt.subplot(2, 2, 4)
plt.plot(x2, y2, '.-')
plt.xlabel('time (s)')
plt.ylabel('Undamped')

plt.show()'''

In [None]:
plt.plot(carts[0].time_stamps)
plt.show()
plt.plot(carts[0].time_predictions)
plt.show()
error = []
for i in range(len(carts[0].time_stamps)):
    error.append((float(carts[0].time_stamps[-1])-float(carts[0].time_predictions[i])-float(carts[0].time_stamps[i]))/float(carts[0].time_stamps[-1]))
plt.plot(error)
plt.show()
error_short = error[30:]
for i in error_short:
    print(i)

In [None]:
game_name = 'BreakoutDeterministic-v4'
env = gym.make(game_name)
state_size = env.observation_space.shape
action_size = env.action_space.n
print(state_size)