In [1049]:
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense 
from keras.optimizers import Adam
from keras.initializers import he_normal


import numpy as np
from collections import deque
import random

In [1050]:
class Deep_Q:
    def __init__(self, layout, memory_length = 16, sample_size = 16, learning_rate = 0.001, discount = 0.99, target_model_copy_period = 8):
        self.model = Sequential(layout)

        self.model.compile(Adam(learning_rate = learning_rate), loss = 'mse')

        
        self.sample_size = sample_size
        self.discount = discount
        self.target_model_copy_period = target_model_copy_period


        self.target_model = Sequential(layout)

        self.target_model.compile(Adam(learning_rate = learning_rate), loss = 'mse')

        self.replay_memory = deque(maxlen = memory_length)



        self.number_actions = self.model.layers[len(self.model.layers) - 1].output_shape[1]

        self.number_steps_since_copy = 0 
    def predict(self, state, epsilon):
        if random.random() < epsilon:
            action = random.randint(0, self.number_actions - 1)

        else:
            state = np.array([state])
            state = np.expand_dims(state, axis = 0)

            q_values = self.model.predict(state, verbose = 0)

            action = np.argmax(q_values)
        return action
    
    def add_experience(self, state, action, reward, new_state, done):

        state = np.expand_dims(state, axis = 0)
        new_state = np.expand_dims(new_state, axis = 0)


        self.replay_memory.append((state, action, reward, new_state, done))

    def update_network(self):



        replay_memory_length = len(self.replay_memory)

        if replay_memory_length < self.sample_size:
            return False

        
        if self.number_steps_since_copy > self.target_model_copy_period:
            print('copy target')

            self.target_model.set_weights(self.model.get_weights())
            self.number_steps_since_copy = 0

        batch = random.sample(self.replay_memory, self.sample_size)



        for _ in range(self.sample_size):
            self.replay_memory.popleft()



        states = np.array([experience[0] for experience in batch])
        actions = np.array([experience[1] for experience in batch])
        rewards = np.array([experience[2] for experience in batch])
        new_states = np.array([experience[3] for experience in batch])
        dones = np.array([experience[4] for experience in batch])

        q_values = self.model.predict(states, verbose = 0)

        next_q_values = self.target_model.predict(new_states, verbose = 0)

        target_q_values = q_values.copy()


        for i in range(len(batch)):
            if dones[i]:
                target_q_values[i, actions[i]] = rewards[i]

            else:
                target_q_values[i, actions[i]] = rewards[i] + self.discount * np.max(next_q_values[i])
        '''
        print(f'q_values are {q_values}')
        print(f'next_q_values are {next_q_values}')
        print(f'target_q_values are {target_q_values}')
        '''


        self.model.fit(states, target_q_values, epochs=1, verbose = 1)

        self.number_steps_since_copy += self.sample_size

    






In [1051]:
dq = Deep_Q([
    Dense(2, input_shape=(1,), activation='relu'),
    
    Dense(64, activation = 'relu', kernel_initializer=he_normal()),
    Dense(2, activation = 'linear')
])



In [1052]:
number_episodes = 256
epsilon = 1
epsilon_decay = 0.95

In [1053]:
state = 0
for i in range(number_episodes):
    action = dq.predict(state, epsilon)

    # The agent gets a reward only if the action is not equal to the state
    reward = int(action != state)

    reward = 0

    # Define the new state based on the action
    new_state = 1 if action == 0 else 0



    print(state, reward)


    # Add the experience to the replay memory
    dq.add_experience(state, action, reward, new_state, False)

    # Update the network
    dq.update_network()

    # Update the state and decay epsilon
    state = new_state
    epsilon *= epsilon_decay



0 0
1 0
0 0
1 0
1 0
0 0
0 0
0 0
1 0
1 0
0 0
1 0
0 0
1 0
0 0
1 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
copy target
1 0
1 0
1 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
copy target
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
copy target
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
copy target
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
copy target
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
copy target
0 0
0 0
0 0
0 0


KeyboardInterrupt: 

In [1054]:
dq.predict(1, 0)

0

In [1055]:
import gymnasium

In [1056]:
env = gymnasium.make('MountainCar-v0', render_mode='human')

In [1057]:
env.reset()
while True:
    env.step(0)

KeyboardInterrupt: 

: 