In [29]:
import numpy as np

In [30]:
import gym
# Deterministic (non-slippery) 4x4 FrozenLake. render_mode="rgb_array" is
# requested so that env.render() can be collected as image frames later on.
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=False,
    render_mode="rgb_array",
)

In [31]:
def stateToDQNInput(state, num_states=16):
    """One-hot encode a discrete state index as a float32 tensor.

    Args:
        state: Integer index of the state, with ``0 <= state < num_states``.
        num_states: Length of the one-hot vector (defaults to 16).

    Returns:
        A 1-D ``torch.float32`` tensor of length ``num_states`` that is 1 at
        position ``state`` and 0 everywhere else.

    Raises:
        IndexError: If ``state`` lies outside ``[0, num_states)``.
    """
    # Reject out-of-range indices explicitly. Without this check a negative
    # state would be silently wrapped by NumPy indexing (-1 -> last state)
    # and the network would receive the encoding of a different state.
    if not 0 <= state < num_states:
        raise IndexError(
            f"state {state} is out of range for {num_states} states"
        )
    one_hot_vector = np.zeros(num_states, dtype=np.float32)
    one_hot_vector[state] = 1
    return torch.tensor(one_hot_vector, dtype=torch.float32)

In [32]:
import torch

# Policy network: 16-dimensional one-hot state in, one hidden layer of 16
# ReLU units, then a softmax over 4 outputs (the action probabilities).
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=16, out_features=16),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=16, out_features=4),
    torch.nn.Softmax(dim=-1),
)

# Adam over all policy parameters with a fixed learning rate of 0.01.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

In [33]:
# Discount rate applied to future rewards when train() computes returns;
# a value of 1 means rewards are not discounted at all.
dr = 1

# NOTE(review): `lr` is not read anywhere in the visible notebook - the Adam
# optimizer is constructed with its own literal lr=0.01. Confirm whether this
# constant is meant to feed the optimizer.
lr = 0.01

In [34]:
from torch.distributions import Categorical

In [35]:
# Sanity check: feed the one-hot encoding of state 0 through the (still
# untrained) policy network and display the resulting action probabilities.
probs=model(stateToDQNInput(0))
probs

tensor([0.2558, 0.2603, 0.2318, 0.2520], grad_fn=<SoftmaxBackward0>)

In [36]:
# Wrap the probabilities in a categorical distribution so that actions can be
# sampled from it and their log-probabilities queried.
m = Categorical(probs)
m

Categorical(probs: torch.Size([4]))

In [37]:
# Draw a single action index from the distribution; the displayed result is
# a 0-dimensional tensor.
action = m.sample()
action

tensor(0)

In [38]:
# Reset the environment; the displayed value is an (observation, info) pair.
env.reset()

(0, {'prob': 1})

In [39]:
# Global buffer that train() appends every rendered frame to.
images_train = []

In [40]:
import imageio

In [41]:
def _discounted_returns(rewards, gamma):
    """Return the discounted return-to-go G_t for every step of one episode.

    Uses the recurrence G_t = r_t + gamma * G_{t+1}, evaluated back to front,
    so the whole episode is processed in a single O(T) pass.
    """
    returns = [0.0] * len(rewards)
    running = 0.0
    for t in range(len(rewards) - 1, -1, -1):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


def train(numEpisodes):
    """Train the global policy ``model`` with REINFORCE for ``numEpisodes`` episodes.

    For each episode the policy is rolled out in the global ``env`` until the
    episode is terminated or truncated, sampling actions from the categorical
    distribution produced by ``model``. The loss is the sum over steps of
    ``-log_prob(action_t) * G_t``, where ``G_t`` is the return-to-go discounted
    by the global ``dr``. One ``optimizer`` step is taken per episode.

    Side effects:
        * Appends every rendered frame (the initial one and one per step) to
          the global ``images_train`` list.
        * Prints the scalar policy loss once per episode.

    Args:
        numEpisodes: Number of episodes to run.
    """
    for _ in range(numEpisodes):
        log_probs = []
        rewards = []

        # env.reset() returns (observation, info); only the observation is used.
        state = env.reset()[0]
        images_train.append(env.render())

        terminated = False
        truncated = False
        while not (terminated or truncated):
            probs = model(stateToDQNInput(state))
            m = Categorical(probs)
            action = m.sample()
            # Keep the log-probability attached to the graph: it is the only
            # path through which the loss reaches the network parameters.
            log_probs.append(m.log_prob(action))
            state, reward, terminated, truncated, _ = env.step(action.item())
            rewards.append(reward)
            images_train.append(env.render())

        returns = torch.tensor(
            _discounted_returns(rewards, dr), dtype=torch.float32
        )
        # REINFORCE objective: minimise -sum_t log pi(a_t | s_t) * G_t.
        policy_loss = -(torch.stack(log_probs) * returns).sum()
        print(policy_loss.item())

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
    

In [43]:
# Run 500 training episodes; train() prints one policy-loss value per episode.
train(500)

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
14.256207466125488
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
17.4809627532959
0.0
0.0
10.003314018249512
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
18.559818267822266
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
9.388357162475586
0.0
0.0
0.0
0.0
0.0
0.0
11.713854789733887
0.0
0.0
0.0
0.0
0.0
11.505023956298828
11.216676712036133
0.0
0.0
0.0
0.0
0.0
0.0
17.457794189453125
0.0
0.0
0.0
0.0
9.786622047424316
5.198916435241699
16.7315731048584
0.0
0.0
10.625507354736328
0.0
0.0
16.738798141479492
0.0
0.0
8.886836051940918
9.228595733642578
0.0
0.0
8.295734405517578
0.0
12.680243492126465
12.121889114379883
0.0
0.0
0.0
0.0
4.9340691566467285
6.975551605224609
7.185401916503906
14.138

In [44]:
# Write all frames collected during training to disk as one multi-frame image.
# NOTE(review): the target has a ".png" extension although several frames are
# written - confirm the installed imageio backend produces the intended
# animation for this extension.
imageio.mimsave(
    "training.png",
    [np.array(frame) for frame in images_train],
    fps=30,
)

In [45]:
# Global buffer that test() appends every rendered frame to.
images_test = []

In [46]:
def test(max_steps=15):
    """Roll out the current policy for one episode and record its frames.

    Actions are sampled from the categorical distribution produced by the
    global ``model``. The rollout stops when the environment reports
    termination or truncation, or once ``max_steps`` steps have been taken,
    in which case a failure message is printed.

    Side effects:
        Appends the initial frame and one frame per step to the global
        ``images_test`` list.

    Args:
        max_steps: Maximum number of environment steps before giving up
            (defaults to 15).
    """
    # env.reset() returns (observation, info); only the observation is used.
    state = env.reset()[0]
    images_test.append(env.render())

    terminated = False
    truncated = False
    steps = 0
    while not (terminated or truncated):
        steps += 1
        if steps > max_steps:
            print("Failed to reach home. Maximum steps taken")
            break

        # Evaluation only: no gradients are needed, so skip building the
        # autograd graph for the forward pass.
        with torch.no_grad():
            probs = model(stateToDQNInput(state))
        action = Categorical(probs).sample()
        state, _, terminated, truncated, _ = env.step(action.item())
        images_test.append(env.render())
            

In [47]:
# Roll out one evaluation episode; its frames are collected in images_test.
test()

In [48]:
# Write the frames of the evaluation rollout to disk as one multi-frame image.
# NOTE(review): the target has a ".png" extension although several frames are
# written - confirm the installed imageio backend produces the intended
# animation for this extension.
imageio.mimsave(
    "test.png",
    [np.array(frame) for frame in images_test],
    fps=5,
)