# Semi-gradient SARSA for CartPole-v1 Task

In [118]:
import torch.nn.functional as F
import torch.nn as nn
import torch

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # 4-dimensional state + 1-dimensional action space
        self.fc1 = nn.Linear(5, 50)
        self.fc2 = nn.Linear(50, 50)
        self.fc3 = nn.Linear(50, 50)
        self.fc4 = nn.Linear(50, 1)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        return x

### Testing Net

In [119]:
device = torch.device('cuda:0')
f = lambda x: torch.sqrt(torch.sum(x * x))
n = 1000
train = torch.randint(0, 10, (n, 5), dtype=torch.float, device=device)
train_answers = [f(x) for x in train]
test = torch.randint(0, 10, (n, 5), dtype=torch.float, device=device)
test_answers = [f(x) for x in test]

net = Net().to(device)
learning_rate = 0.001

for epoch in range(20):
    for x, y in zip(train, train_answers):
        net.zero_grad()
        actual_y = net(x)
        actual_y.backward()
        for f in net.parameters():
            f.data -= learning_rate * (actual_y.item() - y) * f.grad.data

In [120]:
MSE = 0
for x, y in zip(test, test_answers):
    actual_y = net(x).item()
    MSE += (actual_y - y) ** 2
MSE /= n
print(MSE)

tensor(0.0101, device='cuda:0')


In [97]:
net(torch.tensor([1, 2, 3, 4, 5], device=device, dtype=torch.float))

tensor([7.4561], device='cuda:0', grad_fn=<ReluBackward0>)

In [98]:
np.sqrt(1 + 4 + 9 + 16 + 25)

7.416198487095663

In [99]:
from collections import defaultdict
import random
import gym
import numpy as np

env = gym.make('CartPole-v1')
epsilon = 0.05
learning_rate = 0.001
actions = [0, 1]

class ActionValueEstimator:
    def __init__(self):
        self.net = Net()
    
    def estimate(self, state, action):
        value = self.net(self._encode(state, action))
        return value.item()
    
    def update(self, state, action, estimate):
        self.net.zero_grad()
        value = self.net(self._encode(state, action))
        value.backward()
        
        for f in self.net.parameters():
            f.data += learning_rate * (estimate - value.item()) * f.grad.data
    
    def _encode(self, state, action):
        return torch.tensor(np.append(state, action), dtype=torch.float)

def train(av_estimator, number_of_episodes):
    for _ in range(number_of_episodes):
        train_episode(av_estimator)
    
def train_episode(av_estimator):
    state = env.reset()
    action = env.action_space.sample()
    
    while True:
        next_state, reward, done, _ = env.step(action)
        
        if done:
            break
        
        # choose next action epsilon-greedily
        next_action = None
        if random.random() < epsilon:
            next_action = env.action_space.sample()
        else:
            next_action = max(actions, key=lambda a: av_estimator.estimate(next_state, a))
        
        av_estimator.update(state, action, reward + av_estimator.estimate(next_state, next_action))
        state = next_state
        action = next_action

In [100]:
av_estimator = ActionValueEstimator()

In [101]:
train(av_estimator, 100)

In [102]:
def simulate(av_estimator):
    state = env.reset()
    score = 0
    
    while True:
        env.render()
        action = max(actions, key=lambda a: av_estimator.estimate(state, a))
        print(av_estimator.estimate(state, 0))
        print(av_estimator.estimate(state, 1))
        state, _, done, _ = env.step(action)
        
        if done:
            break
        
        score += 1
    
    print(f'Score: {score}')
    env.close()

In [104]:
simulate(av_estimator)

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
Score: 8


In [235]:
env.close()