In [199]:
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense 
from keras.optimizers.legacy import Adam
from keras.initializers import he_normal


import numpy as np
from collections import deque
import random

In [200]:

class Deep_Q:
    def __init__(self, layout, memory_length = 16, sample_size = 16, learning_rate = 0.001, discount = 0.99, target_model_copy_period = 8):
        self.model = Sequential(layout)

        self.model.compile(Adam(learning_rate = learning_rate), loss = 'mse')

        
        self.sample_size = sample_size
        self.discount = discount
        self.target_model_copy_period = target_model_copy_period


        self.target_model = Sequential(layout)

        self.target_model.compile(Adam(learning_rate = learning_rate), loss = 'mse')

        self.replay_memory = deque(maxlen = memory_length)



        self.number_actions = self.model.layers[len(self.model.layers) - 1].output_shape[1]

        self.number_steps_since_copy = 0 
    def predict(self, state, epsilon):
        if random.random() < epsilon:
            action = random.randint(0, self.number_actions - 1)

        else:

            state = np.expand_dims(state, axis = 0)

            q_values = self.model.predict(state, verbose = 0)

            action = np.argmax(q_values)
        return action
    
    def add_experience(self, state, action, reward, new_state, done):
        '''
        state = np.expand_dims(state, axis = 0)
        new_state = np.expand_dims(new_state, axis = 0)

        print(state)
        '''

        self.replay_memory.append((state, action, reward, new_state, done))

    def update_network(self):



        replay_memory_length = len(self.replay_memory)

        if replay_memory_length < self.sample_size:
            return False

        
        if self.number_steps_since_copy > self.target_model_copy_period:
            print('copy target')

            self.target_model.set_weights(self.model.get_weights())
            self.number_steps_since_copy = 0

        batch = random.sample(self.replay_memory, self.sample_size)



        for _ in range(self.sample_size):
            self.replay_memory.popleft()



        states = np.array([experience[0] for experience in batch])
        actions = np.array([experience[1] for experience in batch])
        rewards = np.array([experience[2] for experience in batch])
        new_states = np.array([experience[3] for experience in batch])
        dones = np.array([experience[4] for experience in batch])

        q_values = self.model.predict(states, verbose = 0)

        next_q_values = self.target_model.predict(new_states, verbose = 0)

        target_q_values = q_values.copy()


        for i in range(len(batch)):
            if dones[i]:
                target_q_values[i, actions[i]] = rewards[i]

            else:
                target_q_values[i, actions[i]] = rewards[i] + self.discount * np.max(next_q_values[i])
        '''
        print(f'q_values are {q_values}')
        print(f'next_q_values are {next_q_values}')
        print(f'target_q_values are {target_q_values}')
        '''


        self.model.fit(states, target_q_values, epochs=1, verbose = 1)

        self.number_steps_since_copy += self.sample_size


In [201]:
def normalised_state(state):
    print(state)

In [202]:
import gymnasium as gym

In [203]:
learning_rate = 0.1
discount = 0.95

epsilon = 1
epsilon_decay = 0.01
min_epsilon = 0.1

max_steps = 10000

num_episodes = 10000

render_delay = 1000

render = False

scenario = 'CartPole-v1'

if render:
    env = gym.make(scenario, render_mode='human')

else:
    env = gym.make(scenario)



In [204]:
state, info = env.reset()

state = np.array(state)

dq = Deep_Q([
    Dense(2, input_shape=state.shape, activation='relu'),
    
    Dense(24, activation = 'relu'),
    Dense(24, activation = 'relu'),

    Dense(2, activation = 'linear')],

    memory_length = 1024,
    sample_size = 64,
    target_model_copy_period = 256
    
    )

In [206]:
for episode in range(num_episodes):

    state = env.reset()[0]

    state = np.array(state)
    
    done = False

    steps = 0

    score = 0



    while not done:


        action = dq.predict(state, epsilon)





        new_state, reward, done, truncation, _ = env.step(action)

        dq.add_experience(state, action, reward, new_state, done)



        state = new_state.copy()

        dq.update_network()

        steps += 1


        if epsilon > min_epsilon:
            epsilon -= epsilon_decay


        score += reward

        if steps > max_steps:  
            print('max steps reached')
            break

    print(f'Episode {episode} complete, score: {score}')

    
    if (episode + 1) % render_delay == 0 or render:
        env = gym.make(scenario, render_mode='human')
        env.reset()
    else:
        env = gym.make(scenario)
        env.reset()




Episode 0 complete, score: 10.0
Episode 1 complete, score: 11.0
Episode 2 complete, score: 10.0
Episode 3 complete, score: 9.0
Episode 4 complete, score: 9.0
Episode 5 complete, score: 11.0
Episode 6 complete, score: 12.0
Episode 7 complete, score: 10.0
Episode 8 complete, score: 12.0
Episode 9 complete, score: 9.0
Episode 10 complete, score: 10.0
Episode 11 complete, score: 10.0
Episode 12 complete, score: 11.0
Episode 13 complete, score: 10.0
Episode 14 complete, score: 11.0
Episode 15 complete, score: 9.0
copy target
Episode 16 complete, score: 9.0
Episode 17 complete, score: 10.0
Episode 18 complete, score: 11.0
Episode 19 complete, score: 11.0
Episode 20 complete, score: 9.0
Episode 21 complete, score: 10.0
Episode 22 complete, score: 10.0
Episode 23 complete, score: 10.0
Episode 24 complete, score: 10.0
Episode 25 complete, score: 11.0
Episode 26 complete, score: 11.0
Episode 27 complete, score: 10.0
Episode 28 complete, score: 10.0
Episode 29 complete, score: 9.0
Episode 30 comp

KeyboardInterrupt: 

: 