# Part I: Monte Carlo ES

- You are given an executable that simulates a Markov Decision Process (MDP). The executable is named [```MDP```](./MDP).

- You can query the number of states and actions of the MDP with ```./MDP states``` and ```./MDP actions```. The discount factor of the MDP can be obtained with ```./MDP gamma```.

- To start interacting with the MDP, run ```./MDP <starting state>```. At every iteration, the executable displays the current state and the current return of the MDP and asks you to choose an action; it then gives a reward and transitions to a new state.

- You must implement the Monte Carlo ES algorithm that learns the optimal policy of the MDP by simulating episodes with exploring starts.


In [1]:
import subprocess
from subprocess import Popen, PIPE
import string
import random
import tqdm

In [2]:
class MDP:
    """Thin wrapper around the external MDP simulator executable.

    Every method spawns the simulator as a child process, feeds it what it
    needs on the command line / standard input, and parses its standard
    output.  The exit status and standard error of the child are not
    inspected; a malformed reply surfaces as ``ValueError`` from the parser.
    """

    # Not read by any method in this class; kept so that existing code
    # accessing these attributes keeps working.
    ret = 0
    sta = 0
    rew = 0
    # Discount factor; refreshed by get_gamma().
    df = 1.0

    # Characters that may surround a number in the simulator's reply:
    # label letters, whitespace and the punctuation ":\()".
    _NOISE = string.ascii_letters + string.whitespace + ":\\()"

    def __init__(self, executable='./MDP.o'):
        """Remember which simulator binary to run (default: ``./MDP.o``)."""
        self.executable = executable

    def _run(self, argument, stdin_text=''):
        """Run the simulator with one argument and return its decoded stdout.

        ``stdin_text`` is written to the child's standard input, which is
        then closed, so the child sees end-of-file after it.
        """
        finished = subprocess.run(
            [self.executable, str(argument)],
            input=stdin_text.encode(),
            stdout=PIPE,
            stderr=PIPE,
        )
        return finished.stdout.decode(errors='replace')

    @staticmethod
    def _parse_transition(text):
        """Extract ``(next_state, reward)`` from the reply to one action.

        The reward is the number between the ``Reward:`` label and the next
        line that starts with ``Current sta``; the next state is the run of
        digits that follows that ``Current sta...`` label.

        Raises:
            ValueError: if either label or either number is missing.
        """
        reward_at = text.find("Reward:")
        if reward_at == -1:
            raise ValueError("no 'Reward:' label in simulator output")
        state_at = text.find("\nCurrent sta", reward_at)
        if state_at == -1:
            raise ValueError("no state line after the reward in simulator output")

        reward = float(text[reward_at:state_at].strip(MDP._NOISE))

        # Drop the label, then keep the whole leading digit run so that
        # state indices with more than one digit are read correctly.
        tail = text[state_at:].lstrip(MDP._NOISE)
        digits = tail[:len(tail) - len(tail.lstrip(string.digits))]
        if not digits:
            raise ValueError("no state index after the reward in simulator output")
        return int(digits), reward

    def get_num_states(self):
        """Return the number of states reported by ``<executable> states``."""
        return int(self._run('states').strip())

    def get_num_actions(self):
        """Return the number of actions reported by ``<executable> actions``."""
        return int(self._run('actions').strip())

    def get_gamma(self):
        """Return the discount factor from ``<executable> gamma``.

        The value is also stored on the instance as ``self.df``.
        """
        gamma = float(self._run('gamma').strip())
        self.df = gamma
        return gamma

    def act(self, curr_state, action):
        """Take ``action`` in ``curr_state`` and return ``(next_state, reward)``.

        The simulator is started in ``curr_state`` and receives ``action`` as
        its only input, so exactly one transition is simulated per call.

        Raises:
            ValueError: if the simulator's reply cannot be parsed.
        """
        return self._parse_transition(self._run(curr_state, str(action)))

    def episode(self, max_moves=100, start=None):
        """Placeholder: picks a random start state if none is given.

        No episode is simulated and the method returns ``None``; see
        ``MonteCarloES.episode`` for the working implementation.
        """
        if start is None:
            start = random.randint(0, self.get_num_states() - 1)
        return


class MonteCarloES:
    """Tabular every-visit Monte Carlo estimation of action values.

    ``q_arr[s][a]`` holds the running mean of the sampled returns observed
    after taking action ``a`` in state ``s``; ``cnt[s][a]`` is the number of
    samples behind that mean.

    With the default ``train`` settings every episode is a single uniformly
    random move from a random start state, so the table then estimates the
    expected immediate reward of each state-action pair.  Pass a larger
    ``max_moves`` (and, for exploring starts, ``exploring_start=True`` with a
    small ``eps``) to learn from longer episodes.
    """

    def __init__(self, mdp_obj: "MDP") -> None:
        """Query the environment for its sizes and discount, then zero the tables.

        ``mdp_obj`` must provide ``get_num_actions()``, ``get_num_states()``,
        ``get_gamma()`` and ``act(state, action) -> (next_state, reward)``.
        """
        self.na = mdp_obj.get_num_actions()
        self.ns = mdp_obj.get_num_states()
        self.gm = mdp_obj.get_gamma()
        self.q_arr = [[0.0] * self.na for _ in range(self.ns)]
        self.cnt = [[0] * self.na for _ in range(self.ns)]
        self.obj0 = mdp_obj

    def episode(self, max_moves=100, start=None, eps=0.5, exploring_start=False):
        """Simulate one episode and return its trajectory.

        Args:
            max_moves: number of transitions to simulate.
            start: start state; a uniformly random state when ``None``.
                Values outside the state range wrap around modulo ``ns``.
            eps: exploration probability handed to ``ep_greedy``.
            exploring_start: when true, the first action is drawn uniformly
                at random regardless of ``eps``.

        Returns:
            A list of ``[discounted_return_so_far, action, state]`` entries.
            Entry 0 is ``[0.0, 0, start]`` (its action is a placeholder);
            entry ``k + 1`` holds the action taken at step ``k`` and the state
            it led to.
        """
        if start is None:
            start = random.randint(0, self.ns - 1)
        start %= self.ns

        ret = [[0.0, 0, start]]
        for step in range(max_moves):
            state = ret[-1][2]
            if exploring_start and step == 0:
                action = random.randint(0, self.na - 1)
            else:
                action = self.ep_greedy(state, epsilon=eps)
            next_state, reward = self.obj0.act(state, action)
            ret.append([ret[-1][0] + reward * (self.gm ** step), action, next_state])
        return ret

    def update_q(self, ret_arr):
        """Fold one trajectory from ``episode`` into the action-value table.

        For every step ``i`` the sample is the discounted return from step
        ``i`` to the end of the episode, i.e. the remaining part of the
        cumulative return rescaled by ``gamma ** i``.
        """
        if len(ret_arr) < 2:
            return

        final_return = ret_arr[-1][0]
        for i in range(len(ret_arr) - 1):
            scale = self.gm ** i
            if scale == 0.0:
                # gamma is 0 (or gamma ** i underflowed): later rewards were
                # multiplied by zero when recorded and cannot be recovered.
                break
            state, action = ret_arr[i][2], ret_arr[i + 1][1]
            sample = (final_return - ret_arr[i][0]) / scale
            seen = self.cnt[state][action]
            self.q_arr[state][action] = (self.q_arr[state][action] * seen + sample) / (seen + 1)
            self.cnt[state][action] = seen + 1

    def ep_greedy(self, stt, epsilon=0.5):
        """Return a uniformly random action with probability ``epsilon``,
        otherwise the lowest-indexed action with the highest value in ``stt``."""
        if random.random() < epsilon:
            return random.randint(0, self.na - 1)
        return max(range(self.na), key=lambda a: self.q_arr[stt][a])

    def train(self, num_episodes=1000, printin=500, max_moves=1, eps=1.0,
              exploring_start=False):
        """Run ``num_episodes`` episodes, updating the table after each one.

        Args:
            num_episodes: number of episodes to simulate.
            printin: print the table every ``printin`` episodes; a falsy
                value disables printing.
            max_moves, eps, exploring_start: forwarded to ``episode``.
        """
        for i in tqdm.tqdm(range(num_episodes)):
            self.update_q(self.episode(max_moves=max_moves, eps=eps,
                                       exploring_start=exploring_start))

            if printin and (i + 1) % printin == 0:
                print(f"Episodes trained: {i+1}")
                print(self.q_arr)







In [3]:
# Wrap the external simulator (the class runs the ./MDP.o binary from the working directory).
emmette = MDP()
# Build the learner; its constructor queries the simulator for the action count,
# state count and discount factor.
baa = MonteCarloES(emmette)
# Train with the defaults: 1000 episodes, printing the Q-table every 500 episodes.
baa.train()

 60%|██████    | 603/1000 [00:00<00:00, 746.47it/s]

Episodes trained: 500
[[-0.796296, 0.0193528, 0.5757120000000001, -0.20501900000000003, 0.771134], [0.956824, -0.9813140000000001, 0.230381, -1.2606499999999998, -0.46147999999999995], [-0.26165199999999994, -0.9018469999999998, 0.515194, -1.0142799999999996, -0.215068], [-0.47996900000000003, -0.546914, -2.70486, 0.6679079999999998, -0.00622682], [-1.52656, 0.36023199999999994, -0.585457, -0.7638320000000002, -0.08007620000000001], [-0.7123860000000001, -1.63486, -0.8567340000000001, -1.38673, -1.59846], [0.822831, -0.1826249999999999, 1.3308300000000004, -0.8347819999999999, 0.46691800000000006], [0.272681, -0.40601199999999993, 1.1867599999999998, -1.03232, -1.38811], [0.3279100000000001, -0.719341, -0.0804011, -0.6897570000000001, 0.842588], [-0.19968300000000003, -0.04721619999999999, -0.0304208, -0.9994289999999998, -0.189129]]


100%|██████████| 1000/1000 [00:01<00:00, 745.27it/s]

Episodes trained: 1000
[[-0.7962960000000002, 0.0193528, 0.575712, -0.20501900000000003, 0.771134], [0.9568239999999999, -0.9813139999999998, 0.23038099999999997, -1.2606499999999998, -0.46148], [-0.2616519999999999, -0.901847, 0.5151939999999999, -1.0142799999999996, -0.215068], [-0.4799689999999999, -0.5469139999999999, -2.7048599999999987, 0.6679080000000003, -0.0062268199999999975], [-1.5265600000000006, 0.36023199999999994, -0.585457, -0.7638320000000005, -0.08007619999999997], [-0.7123860000000002, -1.63486, -0.8567339999999999, -1.38673, -1.5984599999999995], [0.8228310000000004, -0.18262499999999987, 1.3308299999999995, -0.8347820000000001, 0.46691799999999994], [0.2726810000000001, -0.40601199999999976, 1.1867599999999998, -1.0323200000000001, -1.38811], [0.3279100000000001, -0.719341, -0.0804011, -0.6897570000000001, 0.8425879999999996], [-0.19968299999999997, -0.04721620000000002, -0.03042080000000001, -0.9994289999999996, -0.18912899999999996]]



