In [54]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow import keras

In [55]:
from collections import deque
memorySize=1000
lr=0.001
df=0.9
syncRate=10
miniBatchSize=32

In [56]:
# Define the model
def build_model():
    model = models.Sequential()
    model.add(layers.InputLayer(input_shape=(16,)))    # Input layer with 16 nodes
    model.add(layers.Dense(16, activation='relu'))     # Hidden layer with 16 nodes and ReLU activation
    model.add(layers.Dense(4, activation='linear'))   # Output layer with 4 nodes and Softmax activation
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr),
                  loss='mse')
    return model


In [57]:
def stateToDQNInput(state,num_states=16):
    one_hot_vector = np.zeros(num_states)
    one_hot_vector[state] = 1
    return one_hot_vector

In [58]:
import gym
env=gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode="rgb_array")
state=env.reset()
state

(0, {'prob': 1})

In [129]:
def optimize(miniBatch,policyDQN,targetDQN):
    numStates=16
    currentQList=[]
    targerQList=[]
    for state,action,newState,reward,terminated in miniBatch:
        if terminated:
            target=reward
        else:
            target=df*np.array(targetDQN(stateToDQNInput(newState).reshape((1,16)))).max()
        #currentQ=policyDQN(stateToDQNInput(state))
        #currentQList.append(currentQ)
        targetQ=np.array(targetDQN(stateToDQNInput(state).reshape((1,16))))[0]
        targetQ[action]=target
        #targerQList.append(targetQ)
        policyDQN.fit(stateToDQNInput(state).reshape((1,16)),targetQ.reshape((1,4)),epochs=1)


In [133]:
import gym
import random
def train(episodes,render=False, is_slippery=False):
    env=gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode="rgb_array")
    num_states=env.observation_space.n
    num_actions=env.action_space.n
    epsilon=1
    replayMemory=deque(maxlen=memorySize)
    policyDQN=build_model()
    targetDQN=build_model()
    targetDQN.set_weights(policyDQN.get_weights())
    stepCount=0
    rewardsPerEpisodes=np.zeros(episodes)
    for i in range(episodes):
        print(i)
        state=env.reset()[0]
        terminated=False
        truncated=False
        while(not truncated and not terminated):
            random_num = random.uniform(0, 1)
            if random_num > epsilon:
                action =np.array(policyDQN(stateToDQNInput(state).reshape((1,16)))).argmax().item()
            else:
                action = env.action_space.sample()
            newState,reward,terminated,truncated,_=env.step(action)
            replayMemory.append((state,action,newState,reward,terminated))
            state=newState
            stepCount=stepCount+1
        if reward== 1:
            rewardsPerEpisodes[i]=1
        if(len(replayMemory)>miniBatchSize and np.sum(rewardsPerEpisodes)>=1):
            miniBatch=random.sample(replayMemory,miniBatchSize)
            optimize(miniBatch,policyDQN,targetDQN)
            epsilon=max(epsilon-1/episodes,0)
            if(stepCount>syncRate):
                targetDQN.set_weights(policyDQN.get_weights())
                stepCount=0
    policyDQN.save('policy_network_model.h5')

In [139]:
train(episodes=100)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 0.0045
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - loss: 0.1324
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 0.0036
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.0193
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 0.0039
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 0.0245
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 0.0274
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 0.0237
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 0.0024
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 0.0201
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0



In [151]:
from tensorflow.keras import losses
images = []

In [152]:
def test():
    env=gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode="rgb_array")
    loaded_model = tf.keras.models.load_model('policy_network_model.h5', custom_objects={'mse': losses.MeanSquaredError()})
   
    terminated = False
    truncated = False
    state, info = env.reset()
    img = env.render()
    images.append(img)
    steps=0
    while not terminated or truncated:
        steps=steps+1
        if(steps>15):
            print("Failed to reach home. Maximum steps taken")
            break
        else:    
            action =np.array(loaded_model(stateToDQNInput(state).reshape(1,16))).argmax().item()
            state, reward, terminated, truncated, info = env.step(
                action
            ) 
            img = env.render()
            images.append(img)
            

In [153]:
test()



In [154]:
import imageio

In [155]:
imageio.mimsave("test.png", [np.array(img) for i, img in enumerate(images)], fps=5)