<a href="https://colab.research.google.com/github/ThomasWong-ST/Intro-to-RL/blob/main/RL_and_Monte_Carlo_Sim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

# Task
Estimate the integral of $x^{2}$ from 0 to N using the Monte Carlo method, where N can be any number (the cell below uses N = 2, for which the exact value is $8/3 \approx 2.667$).

In [None]:
# MC simulation for x^2: draw the sample points.
# All 10000 uniform samples on [0, 2) come from one vectorized call instead of
# a Python-level append loop; .tolist() keeps X_sample a plain list of floats,
# the same container type the per-sample loop produced.
X_sample = np.random.uniform(0, 2, size=10000).tolist()

def x_squared(sampled_data):
  """Return a new list holding the square of every element of sampled_data.

  The input sequence is left untouched, so calling this twice on the same
  samples gives the same answer (squaring in place would yield x^4 on the
  second call).

  Args:
    sampled_data: iterable of numbers.

  Returns:
    list with element ** 2 for each input element, in the same order.
  """
  return [value ** 2 for value in sampled_data]

# Monte Carlo estimate of the integral of x^2 over [lower, upper]:
# (upper - lower) / (number of samples) * sum of f(sample).
lower, upper = 0, 2
squared_samples = np.array(x_squared(X_sample))
(upper - lower) / len(X_sample) * squared_samples.sum()

np.float64(2.6783640877375077)

# Task
Estimate the integral of $e^{-x^2}$ from -10 to 10 using the Monte Carlo method.

In [None]:
import numpy as np
from math import sqrt, pi

N = 1000000
X = np.random.normal(0.0, 1, size=N)

# Importance sampling: the draws come from the standard normal q, so the
# integral of f equals E_q[f(X) / q(X)]. The normal has support on the whole
# real line; the mass of e^{-x^2} outside [-10, 10] is negligible, so the
# estimate approaches sqrt(pi).
X_sq = X**2
f_vals = np.exp(-X_sq)                            # f(x) = e^{-x^2}
q_vals = (1/np.sqrt(2*np.pi)) * np.exp(-X_sq/2)   # standard normal pdf
weights = f_vals / q_vals                         # = sqrt(2π) * exp(-X^2/2)

I_hat = np.mean(weights)
print(I_hat)

1.7725805232251632


# Task
Grid World MC Predictions.

In [None]:
import numpy as np

UP, DOWN, LEFT, RIGHT = 0, 1, 2, 3
ACTIONS = [UP, DOWN, LEFT, RIGHT]

class GridWorld:
    """Deterministic n x n grid world with one terminal cell in the bottom-right corner.

    States are exposed to callers as integer indices (row * n + col); the
    current position is kept internally as a (row, col) tuple in `self.state`.
    """

    def __init__(self, n=4, step_cost=-0.01, terminal_reward=1.0, seed=0):
        """
        Args:
            n: side length of the grid; must be at least 2.
            step_cost: reward for every move that does not land on the terminal cell.
            terminal_reward: reward for the move that lands on the terminal cell.
            seed: seed for the generator that reset() uses to pick start cells.

        Raises:
            ValueError: if n < 2. With a single cell the only state is the
                terminal one, so reset() could never find a start cell.
        """
        if n < 2:
            raise ValueError(f"GridWorld needs n >= 2 to have a non-terminal cell, got n={n}")
        self.n = n
        self.step_cost = step_cost
        self.terminal = (n-1, n-1)
        self.terminal_reward = terminal_reward
        self.rng = np.random.default_rng(seed)
        self.state = None  # (row, col); set by reset()

    def reset(self):
        """Reset to a random non-terminal cell; return state index."""
        # Rejection sampling: draw (row, col) until it is not the terminal cell.
        while True:
            r = int(self.rng.integers(self.n))
            c = int(self.rng.integers(self.n))
            if (r, c) != self.terminal:
                self.state = (r, c)
                # Row-major index; for n=4: (0,0)->0, (0,1)->1, ..., (2,3)->11, ..., (3,3)->15
                return self._to_index(self.state)

    def step(self, action):
        """
        Apply action, return: next_state_idx, reward, done, info_dict
        Rules:
        - Move in grid if possible, else stay.
        - Reaching terminal yields terminal_reward and done=True.
        - Otherwise reward = step_cost and done=False.

        An action that matches none of UP/DOWN/LEFT/RIGHT leaves the position
        unchanged. reset() must be called first, because self.state starts as
        None and cannot be unpacked. info_dict is always an empty dict.
        """
        r, c = self.state
        nr, nc = r, c
        if action == UP:
            nr = r - 1
        elif action == DOWN:
            nr = r + 1
        elif action == LEFT:
            nc = c - 1
        elif action == RIGHT:
            nc = c + 1
        # A move that would leave the grid is cancelled: the agent stays put.
        if not (0 <= nr < self.n and 0 <= nc < self.n):
            nr, nc = r, c

        self.state = (nr, nc)
        if (nr, nc) == self.terminal:
            reward = self.terminal_reward
            done = True
        else:
            done = False
            reward = self.step_cost
        return self._to_index(self.state), reward, done, {} # a 4-tuple → (int, float, bool, dict)

    # --- helpers ---
    def _to_index(self, rc):
        """Map a (row, col) tuple to its row-major integer index."""
        r, c = rc
        return r * self.n + c

    def _from_index(self, idx):
        """Map an integer state index back to a (row, col) tuple."""
        return divmod(idx, self.n)

    @property
    def nS(self):  # number of states
        return self.n * self.n

    @property
    def nA(self):  # number of actions
        return len(ACTIONS)

In [None]:
def generate_episode(env, policy, max_steps=100):
    """Roll out one episode of `policy` in `env`.

    Starts from env.reset(), then repeatedly asks policy(state) for an action
    and applies it with env.step(action) until the environment reports done or
    max_steps transitions have been taken.

    Returns:
        (states, actions, rewards): three equal-length lists. states[t] is the
        state in which actions[t] was chosen and rewards[t] is the reward that
        followed. The final state reached is not included in states.
    """
    trajectory = []
    current = env.reset()
    for _step in range(max_steps):
        chosen = policy(current)
        following, payoff, finished, _info = env.step(chosen)
        trajectory.append((current, chosen, payoff))
        current = following
        if finished:
            break

    states = [s for s, _, _ in trajectory]
    actions = [a for _, a, _ in trajectory]
    rewards = [r for _, _, r in trajectory]
    return states, actions, rewards

In [None]:
env = GridWorld()

# Roll out one episode under a uniformly random policy. `episode` is the
# (states, actions, rewards) tuple returned by generate_episode; use
# episode[0], episode[1] and episode[2] to inspect each list on its own.
episode = generate_episode(
    env,
    policy=lambda s: np.random.choice(env.nA),
)
print(episode)


([14, 10, 14, 10, 9, 13, 13, 13, 13, 14, 10, 9, 8, 9, 13, 13, 9, 10, 14, 14, 10, 14, 13, 12, 12, 12, 12, 12, 8, 8, 4, 5, 6, 5, 9, 8, 12, 12, 8, 9, 13, 14, 10, 6, 7, 3, 7, 11, 10, 11, 11, 7, 7, 7, 11], [0, 1, 0, 2, 1, 1, 1, 1, 3, 0, 2, 2, 3, 1, 1, 0, 3, 1, 1, 0, 1, 2, 2, 2, 2, 1, 1, 0, 2, 0, 3, 3, 2, 1, 2, 1, 1, 0, 3, 1, 3, 0, 0, 3, 0, 1, 1, 2, 3, 3, 0, 3, 3, 1, 1], [-0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, -0.01, 1.0])


In [None]:
# Textbook alternative: keep every sampled return per state and average them at the end, e.g.
# Returns = {s: [] for s in range(env.nS)}

In [None]:
def mc_prediction(env, policy, num_episodes=1000, gamma=1.0):
    """First-visit Monte Carlo prediction of the state-value function of `policy`.

    For every episode, the return G_t that follows the FIRST occurrence of each
    state is folded into a running average for that state.

    Args:
        env: environment exposing `nS` (number of states); episodes are produced
            by generate_episode(env, policy).
        policy: callable mapping a state index to an action.
        num_episodes: number of episodes to sample.
        gamma: discount factor applied to future rewards.

    Returns:
        numpy array V of length env.nS; states never visited keep the value 0.
    """
    V = np.zeros(env.nS)
    N = np.zeros(env.nS)   # number of first-visit returns averaged into V[s]

    for _ in range(num_episodes):
        states, _actions, rewards = generate_episode(env, policy)

        # Earliest time step at which each state occurs in this episode.
        # The backward scan below meets the LAST occurrence of a state first,
        # so a plain "seen" set would select last visits, not first visits.
        first_visit = {}
        for t, s in enumerate(states):
            first_visit.setdefault(s, t)

        G = 0.0
        # Backward loop so the return is accumulated incrementally.
        for t in reversed(range(len(states))):
            G = gamma * G + rewards[t]
            s = states[t]
            if first_visit[s] == t:
                N[s] += 1
                V[s] += (G - V[s]) / N[s]   # incremental mean
    return V

In [None]:
# Estimate state values under a uniformly random policy. This relies on the
# global `env` created in an earlier cell, so run the notebook top to bottom.
mc_prediction(
    env,
    policy=lambda s: np.random.choice(env.nA),
    num_episodes=10000,
)

NameError: name 'env' is not defined

In [None]:
# ---------- lookahead that does NOT mutate env ----------
def lookahead(env, state, action):
    """Compute the outcome of taking `action` in `state` without touching env.state.

    Applies the same movement rules as stepping the environment: actions
    0/1/2/3 shift the position up/down/left/right by one cell, any other action
    or a move that would leave the grid keeps the position unchanged.

    Returns:
        (next_state, reward, done): the index of the resulting cell, the
        terminal reward if that cell is the terminal cell (else the step
        cost), and whether it is the terminal cell.
    """
    row, col = env._from_index(state)

    # (action code, (row delta, col delta)); first matching code wins.
    moves = ((0, (-1, 0)), (1, (1, 0)), (2, (0, -1)), (3, (0, 1)))
    d_row, d_col = next((delta for code, delta in moves if action == code), (0, 0))

    target = (row + d_row, col + d_col)
    inside_grid = 0 <= target[0] < env.n and 0 <= target[1] < env.n
    if not inside_grid:
        target = (row, col)

    at_terminal = target == env.terminal
    if at_terminal:
        payoff = env.terminal_reward
    else:
        payoff = env.step_cost
    return env._to_index(target), payoff, at_terminal

# ---------- greedy and ε-greedy policies from V ----------
def make_greedy_policy(V, env, gamma=1.0):
    """Build a policy that acts greedily with respect to the value estimate V.

    For each of the env.nA actions the returned policy performs a one-step
    lookahead and scores the action as `reward` if the move ends the episode,
    otherwise `reward + gamma * V[next_state]`. It returns the index of the
    highest score as a plain int (np.argmax picks the first maximum on ties).
    """
    def _one_step_value(state, action):
        next_state, reward, terminal = lookahead(env, state, action)
        return reward if terminal else (reward + gamma * V[next_state])

    def policy(state):
        action_values = [_one_step_value(state, a) for a in range(env.nA)]
        return int(np.argmax(action_values))

    return policy

def make_eps_greedy_policy(V, env, eps=0.1, gamma=1.0, seed=0):
    """Build an ε-greedy policy around the greedy policy derived from V.

    On each call the returned policy draws one uniform number from its own
    seeded generator. If the draw is below `eps` it returns a uniformly random
    action in range(env.nA); otherwise it returns the greedy action.
    """
    exploit = make_greedy_policy(V, env, gamma)
    generator = np.random.default_rng(seed)

    def policy(state):
        explore = generator.random() < eps
        return int(generator.integers(env.nA)) if explore else exploit(state)

    return policy

# ---------- simple runner to evaluate a policy ----------
def run_policy(env, policy, n_episodes=100, max_steps=200, gamma=1.0):
    """Run `policy` for n_episodes episodes and summarize the outcomes.

    Each episode is produced by generate_episode(env, policy, max_steps=...).
    Its score is the discounted return from the first step, and its length is
    the number of actions taken.

    Returns:
        dict with float "avg_return", "std_return", "avg_length" and the
        episode count under "episodes".
    """
    def _return_from_start(reward_seq):
        # Fold rewards back to front: G <- gamma * G + r.
        total = 0.0
        for reward in reversed(reward_seq):
            total = gamma * total + reward
        return total

    episode_returns = []
    episode_lengths = []
    for _episode in range(n_episodes):
        _states, taken, collected = generate_episode(env, policy, max_steps=max_steps)
        episode_returns.append(_return_from_start(collected))
        episode_lengths.append(len(taken))

    summary = {}
    summary["avg_return"] = float(np.mean(episode_returns))
    summary["std_return"] = float(np.std(episode_returns))
    summary["avg_length"] = float(np.mean(episode_lengths))
    summary["episodes"] = n_episodes
    return summary

In [None]:
# ---------- example usage ----------
env = GridWorld(n=4, step_cost=-0.01, terminal_reward=1.0, seed=42)

# Evaluation: estimate V for the uniformly random policy.
V_interataion_1 = mc_prediction(
    env,
    policy=lambda s: np.random.choice(env.nA),
    num_episodes=10000,
)

# Improvement: derive a greedy and an ε-greedy policy from that estimate.
greedy_pol_interataion_1 = make_greedy_policy(V_interataion_1, env, gamma=1.0)
epsgreedy_pol_interataion_1 = make_eps_greedy_policy(
    V_interataion_1, env, eps=0.1, gamma=1.0, seed=42
)

# Compare the two policies over 200 episodes each (greedy first; both share env).
stats_greedy = run_policy(env, greedy_pol_interataion_1, n_episodes=200)
stats_eps = run_policy(env, epsgreedy_pol_interataion_1, n_episodes=200)

print("Greedy policy:", stats_greedy)
print("ε-greedy (ε=0.1):", stats_eps)

Greedy policy: {'avg_return': 0.97775, 'std_return': 0.014471955638406317, 'avg_length': 3.225, 'episodes': 200}
ε-greedy (ε=0.1): {'avg_return': 0.9753499999999999, 'std_return': 0.017772942918942842, 'avg_length': 3.465, 'episodes': 200}


In [None]:
#print(greedy_pol.__closure__, epsgreedy_pol.__closure__, lookahead.__closure__, run_policy.__closure__)

(<cell at 0x7ce7950d6a70: numpy.ndarray object at 0x7ce794a21f50>, <cell at 0x7ce7950d7460: GridWorld object at 0x7ce7950d5f10>, <cell at 0x7ce7950d4040: float object at 0x7ce7950e5010>) (<cell at 0x7ce7950735e0: GridWorld object at 0x7ce7950d5f10>, <cell at 0x7ce7963449a0: float object at 0x7ce796366b90>, <cell at 0x7ce796345510: function object at 0x7ce796362e80>, <cell at 0x7ce7950d5a50: numpy.random._generator.Generator object at 0x7ce796337680>) None None


In [None]:
# With the greedy policy from iteration 1, generate a fresh episode and
# re-estimate the state values under that policy.
greedy_episode_1 = generate_episode(env, greedy_pol_interataion_1, max_steps=200)
print(greedy_episode_1)
greedy_values_1 = mc_prediction(env, policy=greedy_pol_interataion_1, num_episodes=1000)
print(greedy_values_1)

([14], [3], [1.0])
[0.95 0.96 0.97 0.98 0.96 0.97 0.98 0.99 0.97 0.98 0.99 1.   0.98 0.99
 1.   0.  ]


In [None]:
# Policy improvement, round 2: evaluate the ε-greedy policy from iteration 1 to
# obtain V_interataion_2, then act greedily with respect to that new estimate.
V_interataion_2 = mc_prediction(
    env,
    policy=epsgreedy_pol_interataion_1,
    num_episodes=1000,
)
greedy_pol_interataion_2 = make_greedy_policy(V_interataion_2, env, gamma=1.0)

In [None]:
# Roll out the iteration-2 greedy policy once, then re-estimate its state values.
greedy_episode_2 = generate_episode(env, greedy_pol_interataion_2, max_steps=200)
print(greedy_episode_2)
greedy_values_2 = mc_prediction(env, policy=greedy_pol_interataion_2, num_episodes=1000)
print(greedy_values_2)

([9, 10, 14], [3, 1, 3], [-0.01, -0.01, 1.0])
[0.95 0.96 0.97 0.98 0.96 0.97 0.98 0.99 0.97 0.98 0.99 1.   0.98 0.99
 1.   0.  ]


# Test Function with functions
Understand how the variable `state` in the function `lookahead` is defined, and how `def` and local scope work.

1. Local Scope: Variables defined inside a function, like y in level_two, have a local scope. This means they only exist and are accessible within that specific function. Once the function finishes executing, the local scope is destroyed, and the variables defined within it are no longer available.

2. def and Scope: The def keyword is used to define functions, and defining a function creates a new local scope for the variables defined within that function's body. This is a key mechanism for organizing code and preventing unintended interference between different parts of your program.

3. The def keyword establishes a boundary (the local scope). When you refer to a variable name inside the function, Python first looks for it within that function's local scope. If it finds it, it uses that local variable. If it doesn't find it locally, it then looks in progressively wider scopes (like the global scope) following the LEGB rule (Local, Enclosing function locals, Global, Built-in). When you try to access y outside the function, Python only looks in the global scope and doesn't find y defined there, hence the NameError.

4. Function definitions created with def are the most common way to create a new local scope in Python. However, other constructs like classes also create their own namespaces (which are similar to scopes for attributes and methods), and list comprehensions, dictionary comprehensions, set comprehensions, and generator expressions also have their own local scopes. But for defining reusable blocks of code with their own isolated variables, def is the primary tool.

In [None]:
#x = "global x"
def level_two(v):
  """Print v, then return the local variable y.

  y is bound (to "local y") only when v is truthy. Because y is assigned
  somewhere in this function it is a local name, so a falsy v makes the
  return statement raise UnboundLocalError instead of looking for a global y.
  """
  print(v)
  if v:
    y = "local y"  # exists only inside this call's local scope
  return y

In [None]:
# Truthy argument: the function prints True and the cell displays the returned 'local y'.
level_two(True)

True


'local y'

In [None]:
# Intentional failure: `y` was only ever bound inside level_two's local scope,
# so no global `y` exists and this line raises NameError.
print(y)

NameError: name 'y' is not defined

In [None]:
def x1():
  """Return the string "x1" via a local variable named x."""
  x = "x1"  # local to this function; independent of any other variable named x
  return x
def x2():
  """Return the string "x2" via a local variable named x.

  The local x here is a separate variable from an x defined in any other
  function or at module level; only the name is shared.
  """
  x = "x2"  # local to this function
  return x

In [None]:
# Each call returns the value of its own local x; the result is displayed as a tuple.
x1(),x2()

('x1', 'x2')

In [None]:
test_list = [0,1,2,3]  # input values read by test_loop1 and test_loop2
x = []  # module-level list that both functions append to (shared global state)
def test_loop1():
  for i in range(len(test_list)):
    x.append(test_list[i]+1) # Access the element at index i
  return x
def test_loop2():
  for i in range(len(test_list)):
    x.append(test_list[i]+2) # Access the element at index i
  return x

In [None]:
x = []  # rebind the global x to a fresh list so items from earlier runs do not pile up
# Both functions append to and return the SAME global list, and the tuple is
# only built after both calls have run, so both entries show all eight items.
test_loop1(),test_loop2()

([1, 2, 3, 4, 2, 3, 4, 5], [1, 2, 3, 4, 2, 3, 4, 5])