**ReadMe**

The question 1 is answered in part 1.

In [1]:
# This mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
import sys
%cd /content/drive/My\ Drive/Winter\ 2022/CME241

Mounted at /content/drive
/content/drive/My Drive/Winter 2022/CME241


In [9]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from typing import Dict

In [263]:
from collections import defaultdict
from typing import Sequence, Tuple, Mapping

S = str
DataType = Sequence[Sequence[Tuple[S, float]]]
ProbFunc = Mapping[S, Mapping[S, float]]
RewardFunc = Mapping[S, float]
ValueFunc = Mapping[S, float]


def get_state_return_samples(
    data: DataType
) -> Sequence[Tuple[S, float]]:
    """
    prepare sequence of (state, return) pairs.
    Note: (state, return) pairs is not same as (state, reward) pairs.
    """
    return [(s, sum(r for (_, r) in l[i:]))
            for l in data for i, (s, _) in enumerate(l)]


def get_mc_value_function(
    state_return_samples: Sequence[Tuple[S, float]]
) -> ValueFunc:
    """
    Implement tabular MC Value Function compatible with the interface defined above.
    """

    #Initialize container
    TabularValueFunction: ValueFunc = defaultdict(float)
    Count: Mapping[S, int] = defaultdict(int)

    #Find sum
    for state_return_sample in state_return_samples:
      Count[state_return_sample[0]] += 1
      TabularValueFunction[state_return_sample[0]] += state_return_sample[1]
    
    #Find average
    for state in Count.keys():
      TabularValueFunction[state] /= Count[state]

    return TabularValueFunction


def get_state_reward_next_state_samples(
    data: DataType
) -> Sequence[Tuple[S, float, S]]:
    """
    prepare sequence of (state, reward, next_state) triples.
    """
    return [(s, r, l[i+1][0] if i < len(l) - 1 else 'T')
            for l in data for i, (s, r) in enumerate(l)]

#Assume that srs means state reward state
def get_probability_and_reward_functions(
    srs_samples: Sequence[Tuple[S, float, S]]
) -> Tuple[ProbFunc, RewardFunc]:
    """
    Implement code that produces the probability transitions and the
    reward function compatible with the interface defined above.
    """

    #Initialize dict
    Counts: Mapping[S, int] = defaultdict(int)
    ProbFunct: ProbFunc = dict()
    RewardFunct: RewardFunc = defaultdict(float)

    for s, r, next_s in srs_samples:
      #Create a dict in dict if have never seen s before
      if not s in Counts.keys():
        ProbFunct[s] = defaultdict(int) 
      Counts[s] += 1
      ProbFunct[s][next_s] += 1 

      RewardFunct[s] += r

    for s in Counts.keys():
      RewardFunct[s] /= Counts[s]
      for next_s in ProbFunct[s].keys():
        ProbFunct[s][next_s] /= Counts[s]

    return tuple((ProbFunct, RewardFunct))  


def get_mrp_value_function(
    prob_func: ProbFunc,
    reward_func: RewardFunc
) -> ValueFunc:
    """
    Implement code that calculates the MRP Value Function from the probability
    transitions and reward function, compatible with the interface defined above.
    Hint: Use the MRP Bellman Equation and simple linear algebra
    """

    non_terminals = list(prob_func.keys())
    rule = {state: i for i, state in enumerate(non_terminals)}
    num_non_terminals = len(non_terminals)

    #Convert into numpy array form
    A = np.zeros((num_non_terminals,num_non_terminals))
    r = np.zeros(num_non_terminals)

    for state in non_terminals:
      r[rule[state]] = reward_func[state]
      for next_state in prob_func[state].keys():
        if next_state in non_terminals:
          A[rule[state]][rule[next_state]] = prob_func[state][next_state]

    #We want to find v a (nontermial x 1) - matrix such that
    # Av + r = v
    # r = (I-A)v
    # With regulariies condition,

    v = np.linalg.inv(np.identity(num_non_terminals) - A) @ r

    #Convert back to dictionary form
    value_func: ValueFunc = dict()
    for i in range(num_non_terminals):
      value_func[non_terminals[i]] = v[i]

    return value_func


def get_td_value_function(
    srs_samples: Sequence[Tuple[S, float, S]],
    num_updates: int = 300000,
    learning_rate: float = 0.3,
    learning_rate_decay: int = 30
) -> ValueFunc:
    """
    Implement tabular TD(0) (with experience replay) Value Function compatible
    with the interface defined above. Let the step size (alpha) be:
    learning_rate * (updates / learning_rate_decay + 1) ** -0.5
    so that Robbins-Monro condition is satisfied for the sequence of step sizes.
    """
    #Initialize value function as being 0
    value_func: ValueFunc = defaultdict(float)

    num_samples = len(srs_samples)

    for t in range(num_updates):

      #Find learning rate
      alpha = learning_rate * (t / learning_rate_decay + 1) ** -0.5

      #Choose a sample randomly
      s, r, next_s = srs_samples[np.random.randint(num_samples)]

      #Expected
      # r + value_func[next_s]
      #What we have right now
      # value_func[s]
      #Discrepancy
      # r + value_func[next_s] - value_func[s]
      #Update
      # value_func[s] = (1-alpha)*value_func[s] + alpha*(r + value_func[next_s] - value_func[s])

      value_func[s] = (1-2*alpha)*value_func[s] + alpha*(r + value_func[next_s])

    return value_func

def get_lstd_value_function(
    srs_samples: Sequence[Tuple[S, float, S]]
) -> ValueFunc:
    """
    Implement LSTD Value Function compatible with the interface defined above.
    Hint: Tabular is a special case of linear function approx where each feature
    is an indicator variables for a corresponding state and each parameter is
    the value function for the corresponding state.
    """

    non_terminal_states = list(set(s for s,_,_ in srs_samples))
    rule = {state: i for i, state in enumerate(non_terminals)}
    num_non_terminals = len(non_terminals)

    #We expect V[s] = r + V[s']
    #e_s - e_s' V = r

    n = len(srs_samps)
    A = np.zeros((n, num_non_terminals))
    R = np.zeros(n)

    for i, srs in enumerate(srs_samps):
      s, r, s_next = srs
      if s in non_terminals:
        A[i,rule[s]] = 1
      if s_next in non_terminals:
        A[i,rule[s_next]] = -1
      R[i] = r

    #Use pseudo inverse
    V = np.linalg.inv(A.T@A)@A.T@R

    #Convert back to dictionary form
    value_func: ValueFunc = dict()
    for i in range(num_non_terminals):
      value_func[non_terminals[i]] = v[i]

    return value_func



if __name__ == '__main__':
    given_data: DataType = [
        [('A', 2.), ('A', 6.), ('B', 1.), ('B', 2.)],
        [('A', 3.), ('B', 2.), ('A', 4.), ('B', 2.), ('B', 0.)],
        [('B', 3.), ('B', 6.), ('A', 1.), ('B', 1.)],
        [('A', 0.), ('B', 2.), ('A', 4.), ('B', 4.), ('B', 2.), ('B', 3.)],
        [('B', 8.), ('B', 2.)]
    ]

    sr_samps = get_state_return_samples(given_data)

    print("------------- MONTE CARLO VALUE FUNCTION --------------")
    print(get_mc_value_function(sr_samps))

    srs_samps = get_state_reward_next_state_samples(given_data)

    pfunc, rfunc = get_probability_and_reward_functions(srs_samps)
    print("-------------- MRP VALUE FUNCTION ----------")
    print(get_mrp_value_function(pfunc, rfunc))

    print("------------- TD VALUE FUNCTION --------------")
    print(get_td_value_function(srs_samps))

    print("------------- LSTD VALUE FUNCTION --------------")
    print(get_lstd_value_function(srs_samps))

------------- MONTE CARLO VALUE FUNCTION --------------
defaultdict(<class 'float'>, {'A': 9.571428571428571, 'B': 5.642857142857143})
-------------- MRP VALUE FUNCTION ----------
{'A': 12.93333333333333, 'B': 9.599999999999998}
------------- TD VALUE FUNCTION --------------
defaultdict(<class 'float'>, {'B': 2.050786040602105, 'T': 0.0, 'A': 2.541605409072991})
------------- LSTD VALUE FUNCTION --------------
{'A': 12.93333333333333, 'B': 9.599999999999998}
