In [1]:
from numba_test import dot, slow_dot, np_dot

In [2]:
import numpy as np

In [18]:
from numba import njit

In [11]:
# Two independent float32 test vectors (10000 uniform samples each) shared by
# every dot-product call and timing cell below.  `foo` is drawn first, then `bar`.
foo, bar = (np.random.rand(10000).astype(np.float32) for _ in range(2))

In [12]:
# Evaluate `dot` once and display the result so it can be compared with
# `slow_dot` and `np_dot` further down.
# NOTE(review): `numba_test` is not shown here; if `dot` is JIT-compiled this
# call presumably also warms it up before the timing cell -- confirm.
dot(foo, bar)

2484.2819370024185

In [13]:
%%timeit
# Each timed loop makes 100 back-to-back calls to `dot`, so the reported
# per-loop time covers all 100 calls, not a single one.
for _ in range(100):
    dot(foo, bar)

980 µs ± 10.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
%%timeit
# Same 100-call workload as the `dot` timing cell, using `slow_dot`, so the
# two per-loop figures are directly comparable.
for _ in range(100):
    slow_dot(foo, bar)

397 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%%timeit
# Same 100-call workload again, this time with `np_dot`.
for _ in range(100):
    np_dot(foo, bar)

178 µs ± 664 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [16]:
# Cross-check: `slow_dot` on the same vectors should display the same value
# as `dot` above.
slow_dot(foo,bar)

2484.2819370024185

In [17]:
# Cross-check: `np_dot` on the same vectors.
# NOTE(review): the recorded output has fewer digits and differs slightly from
# `dot`/`slow_dot`; presumably it returns a float32 -- confirm in numba_test.
np_dot(foo,bar)

2484.2822

In [33]:
@njit
def get_beta_vector(T: int,
                    α: float,
                    β: float) -> np.ndarray:
    """Build the (1, T) row of discount weights consumed by `beta_gae`.

    Entry 0 is 1; every later entry equals the previous entry multiplied by
    (α + t) / (α + β + t), where t is the index of that previous entry.

    Args:
        T: number of weights to generate.
        α: first parameter of the per-step ratio.
        β: second parameter of the per-step ratio.

    Returns:
        Array of shape (1, T) holding the cumulative products.
    """
    weights = np.zeros((1, T))

    running = 1
    for step in range(T):
        # Store the weight for this step before advancing the running product.
        weights[0, step] = running
        running *= (α + step) / (α + β + step)

    return weights

@njit
def beta_gae(rewards: np.ndarray,  # [T, N]
             values: np.ndarray,  # [T, N]
             last_values: np.ndarray,  # [1, N]
             dones: np.ndarray,  # [T, N], actually next_dones (whether previous step was terminal)
             final_dones: np.ndarray,  # [1, ]  (reshaped to [1, N] below)
             α: float = 99.,
             β: float = 1.,
             λ: float = 0.95):
    """Compute λ-weighted returns and advantages with `get_beta_vector` discounting.

    The discount applied l steps ahead is entry l of
    `get_beta_vector(T + 1, α, β)` instead of a power of a fixed γ (the
    commented-out lines below show the fixed-γ variant this replaces).

    Args:
        rewards: per-step rewards, [T, N].
        values: value estimates for each step, [T, N].
        last_values: value estimate following the final step; reshaped to [1, N].
        dones: row i is non-zero when the step before i ended an episode.
            Only column 0 is used to locate episode boundaries.
        final_dones: whether the final step ended an episode; reshaped to [1, N].
        α, β: parameters forwarded to `get_beta_vector`.
        λ: mixing coefficient between successive n-step estimates.

    Returns:
        Tuple `(returns, advantages)`: `advantages` is a float32 [T, N] array
        and `returns` is `advantages + values`.
    """
    T = rewards.shape[0]
    N = rewards.shape[1]  # Number of envs

    # TODO: Handle multiple envs separately to work with Hopper AAAGH

    advantages = np.zeros((T, N), dtype=np.float32)

    final_dones = final_dones.reshape((1, N))
    last_values = last_values.reshape((1, N))
    # Shift by one row so that row i describes the step that follows step i.
    next_non_terminal = 1 - np.concatenate((dones, final_dones))[1:]
    next_values = np.concatenate((values, last_values))[1:]

    # Process dones
    # !!! Assume all environments have episode ends simultaneously !!!
    # steps_until_eoe[i]: number of further steps after i before the next
    #   episode boundary (or the end of the batch).
    # is_final[i]: 1 for steps at or after the last boundary, i.e. the segment
    #   that runs up to the end of the batch.
    steps_until_eoe = np.zeros((T,), dtype=np.int32)
    is_final = np.zeros((T,), dtype=np.int32)  # Might be starting too early OOBE
    counter = 0
    final = 1
    done = False
    # Walk backwards; `done` carries the flag of the row processed just before
    # (row i + 1), which marks step i as the last step of its episode.
    for i, d in list(enumerate(dones[:, 0]))[::-1]:
        if done:
            counter = 0
            done = False
            final = 0
        steps_until_eoe[i] = counter
        is_final[i] = final
        counter += 1
        done = d

    Γ = get_beta_vector(T + 1, α, β)
    lambdas = np.array([[λ ** l for l in range(T)]])  # [1, T] powers of λ

    # Fixed-γ alternative kept for reference:
    #     γ = α / (α + β)
    #     Γ = np.array([[γ**l for l in range(T+1)]])

    # Column of per-step value weights for the final segment; built on the
    # first final-segment step and shortened by one row on each later step.
    factor = None

    for i in range(T):
        steps_left = steps_until_eoe[i]

        old_value = -values[i]
        # Rewards from step i to the end of its segment, weighted by λ^l * Γ[l].
        future_rewards = (lambdas[:, :steps_left + 1] * Γ[:, :steps_left + 1]) @ rewards[i:i + steps_left + 1]

        if is_final[i]:
            # Also include the value that follows the segment's last step
            # (the last row of next_values comes from last_values).
            steps_left += 1

            # Fix to properly handle the very last value of an episode
            if factor is None:
                factor = np.array([[1 - λ for i in range(steps_left)]]).T
                # The last value gets weight 1 instead of (1 - λ).
                factor[-1] = 1.
            else:
                factor = factor[1:]

            future_values = (lambdas[:, :steps_left] * Γ[:, 1:steps_left + 1]) @ (
                        next_values[i:i + steps_left] * next_non_terminal[i:i + steps_left] * factor[-steps_left:])

        else:
            # `*` and `@` share precedence, so this is ((1 - λ) * weights) @ values.
            future_values = (1 - λ) * (lambdas[:, :steps_left] * Γ[:, 1:steps_left + 1]) @ (
                        next_values[i:i + steps_left] * next_non_terminal[i:i + steps_left])

        advantages[i] = old_value + future_rewards + future_values

    returns = advantages + values

    return returns, advantages




In [34]:
# Synthetic single-environment rollout used as input for the beta_gae timings.
T = 10000

# Seeded generator so that re-running the notebook reproduces the same inputs.
rng = np.random.default_rng(0)
rewards = rng.random((T, 1))
values = rng.random((T, 1))
final_dones = np.array([[1]])   # the last step ends an episode
last_values = np.array([[0.]])

# Flag an episode boundary every T // 10 steps.  Step 0 is left unflagged,
# which gives nine boundaries and ten equally long episodes.
dones = np.zeros((T, 1))
dones[T // 10::T // 10, 0] = 1

In [35]:
%%timeit
# Time the @njit-decorated beta_gae defined in the cell above.
# Names assigned inside a %%timeit cell are not kept in the notebook
# namespace, so beta_ret / beta_adv are not available to later cells.
beta_ret, beta_adv = beta_gae(rewards, values, last_values, dones, final_dones, 9., 1., 0.95)


33.4 ms ± 1.83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [36]:
# @njit
def get_beta_vector(T: int,
                    α: float,
                    β: float) -> np.ndarray:
    """Return the (1, T) row of discount weights (plain-Python variant).

    This redefinition has its @njit decorator commented out so it can serve as
    the non-jitted baseline in the timing comparison.

    The first weight is 1.  Each subsequent weight is the preceding one scaled
    by (α + t) / (α + β + t), with t the index of the preceding weight.

    Args:
        T: how many weights to produce.
        α: first parameter of the per-step ratio.
        β: second parameter of the per-step ratio.

    Returns:
        A float array of shape (1, T).
    """
    weights = np.zeros((1, T))
    row = weights[0]  # view: writes below land in `weights`

    running = 1
    for step in range(T):
        row[step] = running
        running = running * ((α + step) / (α + β + step))

    return weights

# @njit
def beta_gae(rewards: np.ndarray,  # [T, N]
             values: np.ndarray,  # [T, N]
             last_values: np.ndarray,  # [1, N]
             dones: np.ndarray,  # [T, N], actually next_dones (whether previous step was terminal)
             final_dones: np.ndarray,  # [1, ]
             α: float = 99.,
             β: float = 1.,
             λ: float = 0.95):
    """Plain-Python λ-return / advantage computation with `get_beta_vector` discounts.

    This redefinition has its @njit decorator commented out so it can serve as
    the non-jitted baseline in the timing comparison.

    Rewards l steps ahead are weighted by λ^l * Γ[l] and bootstrapped values by
    λ^l * Γ[l + 1], where Γ = get_beta_vector(T + 1, α, β).  Value terms carry
    an extra (1 - λ), except the very last value of the segment that reaches
    the end of the batch, which is weighted by 1.

    Args:
        rewards: per-step rewards, [T, N].
        values: value estimates per step, [T, N].
        last_values: value estimate after the final step (reshaped to [1, N]).
        dones: row i is non-zero when the step before i ended an episode; only
            column 0 is read, i.e. all environments are assumed to end
            episodes at the same steps.
        final_dones: whether the final step ended an episode (reshaped to [1, N]).
        α, β: parameters passed on to `get_beta_vector`.
        λ: mixing coefficient.

    Returns:
        `(returns, advantages)` where `advantages` is float32 with shape [T, N]
        and `returns = advantages + values`.
    """
    T = rewards.shape[0]
    N = rewards.shape[1]

    # TODO: Handle multiple envs separately to work with Hopper AAAGH

    final_dones = final_dones.reshape((1, N))
    last_values = last_values.reshape((1, N))

    # Append the trailing row, then drop row 0: row i now describes step i + 1.
    dones_with_tail = np.concatenate((dones, final_dones))
    values_with_tail = np.concatenate((values, last_values))
    next_non_terminal = 1 - dones_with_tail[1:]
    next_values = values_with_tail[1:]

    # Episode layout, derived from the first environment only.
    #   steps_until_eoe[i]: steps remaining after i inside its segment
    #   is_final[i]: 1 if i lies in the segment that reaches the end of the batch
    steps_until_eoe = np.zeros((T,), dtype=np.int32)
    is_final = np.zeros((T,), dtype=np.int32)
    first_env_dones = dones[:, 0]
    remaining = 0
    in_last_segment = 1
    boundary_ahead = False
    for idx in range(T - 1, -1, -1):
        if boundary_ahead:
            # Row idx + 1 starts a new episode, so idx closes its own.
            remaining = 0
            in_last_segment = 0
        steps_until_eoe[idx] = remaining
        is_final[idx] = in_last_segment
        remaining += 1
        boundary_ahead = first_env_dones[idx]

    Γ = get_beta_vector(T + 1, α, β)
    lambdas = np.array([[λ ** power for power in range(T)]])

    advantages = np.zeros((T, N), dtype=np.float32)

    # Per-row value weights for the last segment, created lazily on its first
    # step and shortened from the front on every later step.
    tail_weights = None

    for idx in range(T):
        horizon = steps_until_eoe[idx]

        reward_weights = lambdas[:, :horizon + 1] * Γ[:, :horizon + 1]
        future_rewards = reward_weights @ rewards[idx:idx + horizon + 1]

        if not is_final[idx]:
            bootstrap = next_values[idx:idx + horizon] * next_non_terminal[idx:idx + horizon]
            value_weights = (1 - λ) * (lambdas[:, :horizon] * Γ[:, 1:horizon + 1])
            future_values = value_weights @ bootstrap
        else:
            # The last segment also bootstraps from the value after its final step.
            horizon += 1

            if tail_weights is None:
                tail_weights = np.full((horizon, 1), 1 - λ)
                tail_weights[-1] = 1.
            else:
                tail_weights = tail_weights[1:]

            bootstrap = (next_values[idx:idx + horizon]
                         * next_non_terminal[idx:idx + horizon]
                         * tail_weights[-horizon:])
            value_weights = lambdas[:, :horizon] * Γ[:, 1:horizon + 1]
            future_values = value_weights @ bootstrap

        advantages[idx] = -values[idx] + future_rewards + future_values

    return advantages + values, advantages




In [37]:
%%timeit
# Time the plain-Python beta_gae: the cell above redefines both functions with
# @njit commented out, shadowing the jitted versions.
# Names assigned inside a %%timeit cell are not kept in the notebook namespace.
beta_ret, beta_adv = beta_gae(rewards, values, last_values, dones, final_dones, 9., 1., 0.95)


229 ms ± 9.46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
