Copyright **`(c)`** 2024 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free under certain conditions — see the [`license`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

In [1]:
import numpy as np
from icecream import ic
import tqdm

: 

In [2]:
def true_f(x: np.ndarray) -> np.ndarray:
    """Evaluate the reference function on a stack of feature rows.

    Args:
        x: Array whose first axis selects the variable; ``x[0]`` is used
            linearly and ``x[1]`` goes through a sine.

    Returns:
        ``x[0] + sin(x[1]) / 5``, computed element-wise.
    """
    linear_term = x[0]
    damped_sine = np.sin(x[1]) / 5
    return linear_term + damped_sine

In [3]:
# Number of points in the dense validation set and in the training subset
# that is sampled from it (without replacement).
TEST_SIZE = 10_000
TRAIN_SIZE = 1000

# Row 0 is uniform in [-pi, pi), row 1 is uniform in [-1, 1).
first_feature = np.random.random_sample(size=TEST_SIZE) * 2 * np.pi - np.pi
second_feature = np.random.random_sample(size=TEST_SIZE) * 2 - 1
x_validation = np.vstack((first_feature, second_feature))
y_validation = true_f(x_validation)

# The training set is a random subset of the validation columns.
train_indexes = np.random.choice(TEST_SIZE, size=TRAIN_SIZE, replace=False)
x_train = x_validation[:, train_indexes]
y_train = y_validation[train_indexes]

# Sanity check: the sliced targets must match a fresh evaluation.
assert (y_train == true_f(x_train)).all(), "D'ho"

# Persist only the training split as the published problem file.
np.savez('problem_0.npz', x=x_train, y=y_train)

## Evaluation

In [7]:
import d3584

In [8]:
# Read the saved problem back from disk. np.load returns a lazily-read
# NpzFile that holds the archive open, so use it as a context manager: both
# arrays are materialised inside the block and the file is closed afterwards.
with np.load('problem_0.npz') as problem:
    x = problem['x']
    y = problem['y']
# Last expression of the cell: displays the shape of the loaded inputs.
x.shape

In [9]:
# Report 100 * mean squared error of the candidate formula d3584.f,
# first on the training subset and then on the full validation set.
train_squared_error = np.square(y_train - d3584.f(x_train))
print(f"MSE (train): {100 * train_squared_error.sum() / len(y_train):g}")

validation_squared_error = np.square(y_validation - d3584.f(x_validation))
print(f"MSE (real) : {100 * validation_squared_error.sum() / len(y_validation):g}")

In [None]:
# One-dimensional corridor: states 0 .. LENGTH-1, both ends are terminal.
LENGTH = 10
STATES = tuple(range(LENGTH))

# Interior states may step left (-1) or right (+1); terminal states offer no
# action. Empty *sets* are used so every entry of ACTIONS has the same type.
ACTIONS = {s: {-1, +1} for s in range(1, LENGTH - 1)}
ACTIONS[0] = set()
ACTIONS[LENGTH - 1] = set()

# Reward table keyed by (state, action). It has to be a dict (the original
# set comprehension did not support item assignment); every transition
# defaults to 0 and only the two moves that enter a terminal state pay off.
REWARD = {(s, a): 0 for s in STATES for a in ACTIONS[s]}
REWARD[(1, -1)] = 10
REWARD[(LENGTH - 2, 1)] = 14

In [None]:
# Random policy
def random_policy(s):
    """Return the uniform random policy for state ``s``.

    The result is a set of ``(probability, action)`` pairs, one per action
    listed in ``ACTIONS[s]``, each with probability ``1 / len(ACTIONS[s])``.
    A state without actions yields an empty set.
    """
    moves = ACTIONS[s]
    uniform = set()
    for move in moves:
        uniform.add((1 / len(moves), move))
    return uniform

In [None]:
# Greedy policy
def greedy_policy(s):
    """Build the policy that is greedy with respect to a value table.

    For each interior state the one-step return of moving left
    (``REWARD[state, -1] + values[state - 1]``) is compared with the one of
    moving right (``REWARD[state, +1] + values[state + 1]``). The better move
    is taken with probability 1; a tie splits the probability evenly.

    Args:
        s: Mapping from state to its current value estimate. The parameter
            keeps its original name; callers pass the value table
            positionally.

    Returns:
        A dict mapping every state in ``STATES`` to a tuple of
        ``(probability, action)`` pairs. Terminal states (``0`` and
        ``LENGTH - 1``) map to an empty tuple because no move is available.
    """
    values = s
    policy = dict()
    for state in STATES:
        if state == 0 or state == LENGTH - 1:  # Terminal: nothing to choose
            policy[state] = ()
            continue

        left_return = REWARD[state, -1] + values[state - 1]
        right_return = REWARD[state, +1] + values[state + 1]

        if left_return > right_return:  # Going left is more useful
            policy[state] = ((1, -1),)
        elif left_return < right_return:  # Going right is more useful
            policy[state] = ((1, +1),)
        else:  # Tie: pick either direction with equal probability
            policy[state] = ((0.5, -1), (0.5, +1))

    return policy

In [None]:
# Test random policy
# Iterative policy evaluation (undiscounted) of the uniform random policy.
value = {s: 0 for s in STATES}
TRAIN_STEPS = 100

# `tqdm` is imported as a module, so the progress bar is `tqdm.tqdm`.
for _ in tqdm.tqdm(range(TRAIN_STEPS)):
    # Synchronous sweep: the new table is rebuilt from scratch on every
    # iteration using only the previous estimates. Terminal states have no
    # actions, so their sum is empty and they stay at 0.
    new_value = {
        s: sum(p * (REWARD[s, a] + value[s + a]) for (p, a) in random_policy(s))
        for s in STATES
    }
    value = new_value

print(value)  # Value estimate under the random policy (not the optimal one)

: 

In [None]:
# Test greedy policy
# Alternate greedy policy improvement with an in-place evaluation sweep.
value = {s: 0 for s in STATES}
TRAIN_STEPS = 100

# `tqdm` is imported as a module, so the progress bar is `tqdm.tqdm`.
for _ in tqdm.tqdm(range(TRAIN_STEPS)):
    policy = greedy_policy(value)

    for s in STATES:
        # `policy` is a dict keyed by state, holding (probability, action)
        # pairs. Actions are never 0, so value[s + a] is always a
        # neighbour's entry and the in-place overwrite is safe.
        value[s] = sum(p * (REWARD[s, a] + value[s + a]) for (p, a) in policy[s])