In [36]:
import torch
import tqdm
import numpy as np


def mle(y, n_iters):
    """Maximum likelihood estimation given dataset y encoded in delta"""
    alpha = 1
    theta = torch.randn(6)

    delta = torch.zeros(y.numel(), 6).scatter(1, y, torch.ones_like(y).float())

    for iter in tqdm.tqdm(range(n_iters)):
        p_theta = torch.nn.Softmax(dim=0)(theta)
        g = torch.mean(p_theta - delta, 0)
        theta = theta - alpha * g

    return theta


def reinforce(R, n_iters, theta=None):
    """REINFORCE with given reward function."""
    np.random.seed(1)

    p_gt = torch.Tensor([1.0 / 12, 2.0 / 12, 3.0 / 12, 3.0 / 12, 2.0 / 12, 1.0 / 12])
    y = (
        torch.from_numpy(np.random.choice(list(range(6)), size=1000, p=p_gt.numpy()))
        .type(torch.int64)
        .view(-1, 1)
    )
    alpha = 1

    if theta is None:
        theta = torch.randn(6)

    for i in tqdm.tqdm(range(n_iters)):

        # current distribution
        p_theta = torch.nn.Softmax(dim=0)(theta)

        # sample from current distribution and compute reward

        # TODO: sample from p_theta, [#samples, 1]
        sample = torch.multinomial(p_theta,1000,replacement=True).reshape(-1,1)
        Reward = torch.tensor([R[i] for i in sample]).reshape(-1,1)
        
        # TODO: use your equation from 4(d) to compute gradient
        delta = torch.zeros(sample.numel(),6).scatter(1,sample,torch.ones_like(sample).float())
        g = torch.mean(Reward*(delta-p_theta), 0)

        # update the parameter
        theta = theta + alpha * g

    return theta


if __name__ == "__main__":

    np.random.seed(1)

    p_gt = torch.Tensor([1.0 / 12, 2.0 / 12, 3.0 / 12, 3.0 / 12, 2.0 / 12, 1.0 / 12])
    y = (
        torch.from_numpy(np.random.choice(list(range(6)), size=1000, p=p_gt.numpy()))
        .type(torch.int64)
        .view(-1, 1)
    )

    n_iters = 10000

    theta_mle = mle(y, n_iters)
    R = p_gt
    theta_rl = reinforce(R, n_iters)

    print(p_gt)
    print(torch.nn.Softmax(dim=0)(theta_mle))
    print(torch.nn.Softmax(dim=0)(theta_rl))


100%|██████████| 10000/10000 [00:00<00:00, 15587.52it/s]
100%|██████████| 10000/10000 [01:06<00:00, 149.76it/s]

tensor([0.0833, 0.1667, 0.2500, 0.2500, 0.1667, 0.0833])
tensor([0.0900, 0.1590, 0.2450, 0.2520, 0.1750, 0.0790])
tensor([1.3994e-04, 2.2268e-04, 1.5821e-03, 9.9768e-01, 2.5116e-04, 1.2774e-04])





In [33]:
y.shape,sample.shape

(torch.Size([1000, 1]), torch.Size([1000, 1]))

In [20]:
theta = torch.randn(6)


# current distribution
p_theta = torch.nn.Softmax(dim=0)(theta)

# sample from current distribution and compute reward

# TODO: sample from p_theta, [#samples, 1]
sample = torch.multinomial(p_theta,1000,replacement=True).reshape(-1,1)
sample += 1

In [30]:
sample

tensor([[3],
        [4],
        [4],
        [1],
        [6],
        [6],
        [3],
        [4],
        [3],
        [2],
        [5],
        [4],
        [6],
        [1],
        [4],
        [3],
        [3],
        [4],
        [3],
        [4],
        [6],
        [6],
        [5],
        [3],
        [2],
        [2],
        [3],
        [4],
        [3],
        [3],
        [4],
        [4],
        [4],
        [4],
        [6],
        [6],
        [3],
        [3],
        [3],
        [4],
        [5],
        [3],
        [6],
        [3],
        [4],
        [4],
        [2],
        [4],
        [6],
        [3],
        [4],
        [3],
        [4],
        [3],
        [4],
        [3],
        [4],
        [4],
        [6],
        [4],
        [3],
        [1],
        [3],
        [3],
        [2],
        [6],
        [3],
        [2],
        [3],
        [4],
        [4],
        [4],
        [4],
        [3],
        [4],
        [4],
        [3],

In [10]:
sample.shape

torch.Size([1000, 1])

In [12]:
p_gt = torch.Tensor([1.0 / 12, 2.0 / 12, 3.0 / 12, 3.0 / 12, 2.0 / 12, 1.0 / 12])
y = (
    torch.from_numpy(np.random.choice(list(range(6)), size=1000, p=p_gt.numpy()))
    .type(torch.int64)
    .view(-1, 1)
)


In [14]:
R = p_gt
R

tensor([0.0833, 0.1667, 0.2500, 0.2500, 0.1667, 0.0833])

In [16]:
p_theta

tensor([0.0579, 0.0428, 0.0844, 0.7294, 0.0336, 0.0520])