# Implement Random Walk

In [0]:
import numpy as np
import matplotlib.pyplot as plt

In [0]:
# make one class so that parameters can be shared
class RandomWalk():
    """TD(lambda) value prediction on a continuous random walk with tile coding.

    The state is a position in ``[start, end]``.  Every step moves the walker
    by a uniform random amount in ``[-0.2, 0.2]``.  The episode ends as soon
    as the position leaves the interval; the reward of that last transition is
    the out-of-range position itself and every other reward is 0.

    The value estimate is linear in a binary feature vector built from
    ``tile_num`` tilings.  Each tiling has ``interval + 1`` tiles of width
    ``(end - start) / interval`` (one extra tile so that a shifted tiling
    still covers the whole range), and exactly one tile per tiling is active.

    NOTE(review): ``alpha`` is applied as given and is not divided by
    ``tile_num``; with ``tile_num`` active features the effective step size is
    ``alpha * tile_num``.  Confirm this is the intended scaling.
    """

    def __init__(self,
                 lamda = None, # trace-decay rate for TD(lambda)
                 interval = 10, # number of tiles needed to span [start, end]
                 start = 0, # start of random walk
                 end = 1, # end of random walk
                 tile_num = 10, # number of total tilings
                 alpha = None, # learning rate for TD update
                 discount = 1, # discount for TD update
                 termination = False # indicator if the episode ends
                 ):
        self.interval = interval
        self.start = start
        self.end = end
        self.tile_num = tile_num
        self.alpha = alpha
        self.lamda = lamda
        self.gamma = discount
        self.termination = termination

        # One weight (and one eligibility-trace entry) per tile of every tiling.
        n_features = self.tile_num * (1 + self.interval)
        self.w = np.zeros(n_features)
        self.trace = np.zeros(n_features)

        # Until make_tiling() is called all tilings share the same, unshifted grid.
        self.offset = np.zeros(self.tile_num)
        self.tiling = self._tile_edges(self.offset)

    @staticmethod
    def _as_float(value):
        """Convert a scalar or one-element array to a plain float."""
        return float(np.asarray(value).item())

    def _tile_width(self):
        """Return the width of a single tile."""
        return (self.end - self.start) / self.interval

    def _tile_edges(self, offset):
        """Return the ``(tile_num, interval + 2)`` array of tile edges.

        The unshifted grid starts one tile width below ``start`` so that,
        after adding an offset in ``[0, width)``, every tiling still covers
        the whole ``[start, end]`` range with its ``interval + 1`` tiles.
        """
        width = self._tile_width()
        base = np.linspace(self.start - width, self.end, 2 + self.interval)
        return base[np.newaxis, :] + np.asarray(offset).reshape(self.tile_num, 1)

    def _is_terminal(self, loc):
        """Return True when ``loc`` lies outside ``[start, end]``."""
        return loc < self.start or loc > self.end

    def _features(self, loc):
        """Return the flat binary feature vector (same length as ``self.w``)."""
        loc = self._as_float(loc)
        # Index of the tile whose left edge is the last one strictly below loc.
        index = np.sum(self.tiling < loc, axis=1) - 1
        # Guard against out-of-range positions and floating-point edge effects.
        index = np.clip(index, 0, self.interval)
        feature = np.zeros((self.tile_num, 1 + self.interval))
        # Pair each tiling with its own tile: one active feature per tiling.
        feature[np.arange(self.tile_num), index] = 1
        return feature.ravel()

    def make_tiling(self):
        """Draw a fresh random offset for every tiling and rebuild the edges."""
        self.offset = np.random.uniform(low=0, high=self._tile_width(),
                                        size=self.tile_num)
        self.tiling = self._tile_edges(self.offset)

    # approximate value using tile coding
    def appr_value(self, loc) -> float:
        """Return the current value estimate at position ``loc`` as a float."""
        return float(np.dot(self._features(loc), self.w))

    def next_step(self, loc):
        """Sample one transition from ``loc``.

        Draws from NumPy's global random state (seed it for reproducibility).
        Sets ``self.termination`` to True when the new position leaves
        ``[start, end]``.

        Returns:
            ``(reward, next_loc)`` as floats; the reward is ``next_loc`` on the
            terminating transition and 0 otherwise.
        """
        next_loc = self._as_float(loc) + np.random.uniform(low=-0.2, high=0.2)
        if self._is_terminal(next_loc):
            reward = next_loc # episode terminates
            self.termination = True
        else:
            reward = 0.0
        return reward, next_loc

    # update state values and trace values
    def update(self, loc, reward, next_loc):
        """Apply one TD(lambda) update for the transition ``loc -> next_loc``."""
        features = self._features(loc)
        curr_value = float(np.dot(features, self.w))
        next_loc = self._as_float(next_loc)
        # A position outside the range is terminal, so its value is 0 by definition.
        next_value = 0.0 if self._is_terminal(next_loc) else self.appr_value(next_loc)
        delta = self._as_float(reward) + self.gamma * next_value - curr_value
        # Accumulating trace: decay, then add the gradient of the linear value
        # estimate, which is the feature vector of the current state.
        self.trace = self.gamma * self.lamda * self.trace + features
        self.w += self.alpha * delta * self.trace

    # random walk process, loop until self.termination is set
    def walk(self, loc = 0.5):
        """Run one episode from ``loc``, learning after every step.

        The caller is expected to reset ``self.termination`` and ``self.trace``
        before each episode; if ``self.termination`` is already True the
        method returns immediately.
        """
        while not self.termination:
            reward, next_loc = self.next_step(loc)
            self.update(loc, reward, next_loc)
            loc = next_loc # advance, otherwise the episode can never end

    def train_evaluate(self, episodes = 25) -> float:
        """Train for ``episodes`` episodes and return the mean squared error.

        The error is measured at 21 evenly spaced positions in
        ``[start, end]`` against the position itself, which the evaluation
        uses as the target value.
        """
        self.make_tiling()
        start_loc = (self.start + self.end) / 2
        for _ in range(episodes):
            # Traces and the termination flag are per-episode state.
            self.trace = np.zeros(self.tile_num * (1 + self.interval))
            self.termination = False
            self.walk(loc = start_loc)

        # evaluate stage
        points = np.linspace(self.start, self.end, 21)
        errors = [(point - self.appr_value(point)) ** 2 for point in points]
        return float(np.mean(errors))



# Plot Experiments

In [0]:

# Sweep the step size alpha for several trace-decay rates lambda and plot the
# mean squared value error, averaged over independent runs.
plt.rcParams.update({'font.size': 22})
# NOTE: despite its name, `fig` holds the Axes returned by add_subplot.
fig = plt.figure(figsize=(10, 8)).add_subplot(111)
fig.set_title(r'TD($\lambda$) With Linear Function Approximation')
fig.set_ylabel('Mean Squared Value Error')
fig.set_xlabel(r'$\alpha$')


alpha_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
lamda_list = [0, 0.4, 0.8, 0.9, 0.95, 0.975, 0.99, 1]

runs = 50
for lamda in lamda_list:
    print(f'lambda value is {lamda}')
    # loss[i, run] is the error of run `run` trained with alpha_list[i].
    loss = np.zeros((len(alpha_list), runs))
    for i, alpha in enumerate(alpha_list):
        print(f'\t\t alpha value is {alpha}')
        for run in range(runs):
            # Seeding with the run index gives every (lambda, alpha) setting
            # the same sequence of random streams.
            np.random.seed(run)
            experiment = RandomWalk(lamda = lamda, alpha = alpha)
            result = experiment.train_evaluate()
            loss[i, run] = result
    # One curve per lambda: error averaged over runs as a function of alpha.
    fig.plot(alpha_list, np.mean(loss, axis = 1), label = r'$\lambda$' + '=' + str(lamda))


plt.legend(loc = 'upper right')
plt.savefig('TD.png')
plt.show()


lambda value is 0
		 alpha value is 0.1
