In [82]:
import numpy as np
import multiprocessing as mp
import pickle 
import pandas as pd
mp.set_start_method('spawn',True);
import torch
torch.multiprocessing.set_start_method('spawn',True);
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from itertools import product
from tqdm import tqdm
import time

class Machine_Replacement:
    """Tabular MDP with ``nS`` states and ``nA`` actions.

    Per-state costs are ``nS`` evenly spaced values between 0.1 and 0.99.
    ``rep_cost`` is added to ``cost[0]`` for action 1 (presumably the price of
    replacing the machine -- inferred from the names, confirm with the author).
    """

    def __init__(self,rep_cost=0.7,nS=6,nA=2):
        self.nS = nS
        self.nA = nA
        # One cost per state, increasing linearly with the state index.
        self.cost = np.linspace(0.1, 0.99, nS)
        self.rep_cost = rep_cost

    def gen_probability(self):
        """Build and return the transition tensor ``P[action, state, next_state]``.

        Action 0: only moves to states ``j >= i`` are possible; each row is
        normalised so that it sums to one.
        Action 1: moves to state 0 with probability 1 from every state.
        The result is also stored on ``self.P``.
        """
        self.P = np.zeros((self.nA, self.nS, self.nS))
        for src in range(self.nS):
            # Entries with dst < src keep their initial value of zero.
            for dst in range(src, self.nS):
                self.P[0, src, dst] = (2*(src+1))*((dst+1)*(src+1))
            row = self.P[0, src, :]
            self.P[0, src, :] = row/np.sum(row)
            self.P[1, src, 0] = 1
        return self.P

    def gen_reward(self):
        """Return the table ``R[action, state, next_state]``.

        Note: overwrites ``self.R``, which ``gen_expected_reward`` also sets
        (with a different shape).
        """
        self.R = np.zeros((self.nA, self.nS, self.nS))
        for src in range(self.nS):
            # Action 0 pays the current state's cost whatever the next state is.
            self.R[0, src, :] = self.cost[src]
            # Action 1 only has a non-zero entry for next state 0.
            self.R[1, src, 0] = self.rep_cost + self.cost[0]
        return self.R

    def gen_expected_reward(self):
        """Return the table ``R[action, state]``.

        Note: overwrites ``self.R``, which ``gen_reward`` also sets (with a
        different shape).
        """
        self.R = np.zeros((self.nA, self.nS))
        for src in range(self.nS):
            self.R[0, src] = self.cost[src]
            self.R[1, src] = self.rep_cost + self.cost[0]
        return self.R

For 4 states, the ideal setup was to initialise alpha = 0.9, S = np.ones(nPOL) and F = np.ones(nPOL)*2, with lr = 0.1.

In [94]:
class get_hyperparameters:
    """Holds the experiment settings and hands them out as one tuple."""

    # Order in which ret_hyperparameters() returns the settings.
    _FIELDS = ('T', 'runs', 'lr', 'batch_size', 'start', 'nS', 'nA',
               'rep_cost', 'alpha', 'gamma', 'beta')

    def __init__(self):
        # Simulation length, number of independent runs, optimiser step size
        # and number of transitions between two successive updates.
        self.T, self.runs, self.lr, self.batch_size = 50000, 5, 0.01, 50
        # Initial state and the size of the state / action spaces.
        self.start, self.nS, self.nA = 0, 4, 2
        # rep_cost feeds Machine_Replacement; alpha, gamma and beta are not
        # used by the code visible in this notebook.
        self.rep_cost, self.alpha, self.gamma, self.beta = 0.7, 0.9, 0.5, 1

    def ret_hyperparameters(self):
        """Return (T, runs, lr, batch_size, start, nS, nA, rep_cost, alpha, gamma, beta)."""
        return tuple(getattr(self, field) for field in self._FIELDS)

In [95]:
class weights(nn.Module):
    """Small bias-free network mapping a state index to strictly positive outputs.

    Parameters:
        input_size  : length of the one-hot state encoding
        output_size : number of outputs per state
        hidden_size : width of the optional hidden layer; 0 means a single
                      linear layer

    Every linear layer is followed by ``exp`` so outputs are always positive
    (the original author notes that ReLU caused problems). Only the first
    layer's weights are initialised to ones; a hidden-to-output layer keeps
    PyTorch's default initialisation.
    """
    def __init__(self,input_size,output_size,hidden_size = 0):
        super(weights,self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        if(hidden_size!=0):
            self.linear1 = nn.Linear(self.input_size, self.hidden_size, bias=False)
            self.linear2 = nn.Linear(self.hidden_size, self.output_size, bias=False)
        else:
            self.linear1 = nn.Linear(self.input_size, self.output_size, bias=False)
        nn.init.ones_(self.linear1.weight)

    def _to_tensor(self, one_hot):
        """Convert a numpy one-hot encoding to a float32 tensor on the model's device."""
        # Using the layer's own device removes the dependency on a module-level
        # `device` variable that is only defined in a later notebook cell.
        return torch.tensor(one_hot, dtype=torch.float32, device=self.linear1.weight.device)

    def _positive_output(self, encoded):
        """Run the layer(s) on an encoded input, applying ``exp`` after each one."""
        hidden = torch.exp(self.linear1(encoded))
        if(self.hidden_size == 0):
            return hidden
        return torch.exp(self.linear2(hidden))

    def forward(self,state):
        """Return the positive output for a single state index.

        The state is one-hot encoded, converted to a tensor and passed through
        the network. The result has shape ``(output_size,)``.
        """
        s = np.zeros(self.input_size)
        s[state] = 1
        return self._positive_output(self._to_tensor(s))

    def forward_batch(self,states):
        """Return the positive outputs for a sequence of state indices.

        ``states`` may be any indexable sequence of integer-like entries (the
        notebook passes an ``(n, 1)`` integer tensor). The result has shape
        ``(len(states), output_size)``.
        """
        s = np.zeros((len(states),self.input_size))
        for i in range(len(states)):
            s[i][states[i]] = 1
        # Fix: the hidden-layer branch used to reference the undefined name
        # `state` here and raised NameError whenever hidden_size != 0.
        return self._positive_output(self._to_tensor(s))

    def fast_forward(self,s1,ns1,s2,ns2):
        """Evaluate ``forward`` on four states and return the four outputs in order."""
        return self.forward(s1),self.forward(ns1),self.forward(s2),self.forward(ns2)

In [96]:
class Target_Policy:
    '''
        Enumerates threshold-structured deterministic policies for a tabular MDP
        and ranks them by their average reward under the stationary distribution.
    '''
    def __init__(self,S,A,P,R,start):
        '''
            S     : collection of states (only its length and itself are stored)
            A     : collection of actions
            P     : transition tensor indexed as P[action][state][next_state]
            R     : reward table indexed as R[action, state]
            start : start state
        '''
        self.S = S
        self.A = A
        self.nS = len(S)
        self.nA = len(A)
        self.P = P
        self.R = R
        self.K_pol = []  # filled by find_policies()
        self.s_start = start

    def generate_next_state(self,s,a):
        '''
            Sample the next state given the current state `s` and action `a`
            by drawing once from the transition row P[a][s].
        '''
        transition_row = self.P[a][s][:]
        draw = np.random.multinomial(1, transition_row)
        return (np.argmax(draw))

    def plot_data(self,reg_list):
        '''
            Plot the cumulative sum of `reg_list` against its index.
            NOTE(review): `plt` is not imported in any cell visible in this
            notebook, so this method raises NameError unless matplotlib.pyplot
            is imported as `plt` elsewhere -- confirm.
        '''
        plt.plot(
            np.arange(len(reg_list)),
            np.cumsum(np.array(reg_list)),
            marker = '+',
        )

    def find_optimum_policy(self):
        '''
            Evaluate every policy produced by find_policies() and pick the one
            with the smallest average reward.
            Returns (values of all policies, best policy, best value).
        '''
        self.find_policies()
        n_candidates = len(self.K_pol)
        final_R = np.zeros(n_candidates)
        for idx in range(n_candidates):
            candidate = self.K_pol[idx]
            solver = beh_pol_sd(self.P, candidate, self.nS, self.nA)
            stationary = solver.state_distribution_simulated(1)
            # Average reward = sum over states of visit probability x reward
            # of the action the policy takes in that state.
            final_R[idx] = sum(
                stationary[state] * self.R[int(candidate[state]), state]
                for state in range(self.nS)
            )
        for candidate, value in zip(self.K_pol, final_R):
            print(candidate,"    ====>    ",value)
        best_idx = np.argmin(final_R)
        return (final_R, self.K_pol[best_idx], np.min(final_R))

    def find_policies(self):
        '''
            Build the nS+1 threshold policies: start from all zeros and switch
            entries to 1 one at a time, beginning with the last state.
            The result is stored in self.K_pol.
        '''
        self.K_pol = []
        current = np.zeros(self.nS)
        self.K_pol.append(current.copy())
        for idx in range(self.nS - 1, -1, -1):
            current[idx] = 1
            print(current)
            # Store a copy: `current` keeps being modified in place.
            self.K_pol.append(current.copy())
        print(len(self.K_pol)," policies found")

In [97]:
# Module-level torch device; read by weights.forward/forward_batch and by
# average_case_distribution.__init__ when tensors and the model are placed.
device = torch.device("cpu")

class average_case_distribution:
    """Learns a per-state weight w(s) for one target policy from behaviour data.

    The learned weights are returned by the ``*update_state_distribution_ratio``
    methods; the notebook multiplies them by the behaviour policy's state
    distribution and renormalises to estimate the target policy's distribution.

    Parameters:
        nS, nA           : number of states / actions
        behaviour_policy : array indexed as [state, action] with the probabilities
                           of the data-collecting policy
        state            : start state (stored only)
        lr               : Adam learning rate
        batch_size       : number of transitions drawn per gradient step
    """
    def __init__(self,nS,nA,behaviour_policy,state,lr,batch_size):
        self.nS = nS
        self.nA = nA
        self.behaviour_policy = behaviour_policy
        self.state = state
        self.lr = lr
        self.W_loss = 0
        self.weight_obj = weights(nS,1).to(device)
        self.batch_size = batch_size

    def set_target_policy(self,target_pol):
        """Store the target policy ([state, action] probabilities) and create the optimiser."""
        self.target_policy = target_pol
        self.optimizerW = optim.Adam(self.weight_obj.parameters(),lr = self.lr)
        self.batch = []

    def show_policy(self):
        """Print the current target policy."""
        print(self.target_policy)

    def set_batch(self,data):
        """Register the transitions ``[state, action, next_state]`` to sample from."""
        self.data = data
        self.T = len(data)

    def get_batch(self):
        """Return ``batch_size`` transitions drawn from the registered data.

        If no more than ``batch_size`` transitions are available they are all
        returned as-is. Otherwise sampling starts at a random position and walks
        forward (wrapping around), keeping each visited transition with
        probability 0.9 until the batch is full.
        """
        if(self.T<=self.batch_size):
            return self.data
        j = np.random.choice(self.T)
        batch = []
        while(len(batch)<self.batch_size):
            if(np.random.random()<=0.9):
                batch.append([self.data[j][0],self.data[j][1],self.data[j][2]])
            j = (j+1)%self.T
        return batch

    def get_w(self,data,weight_obj,m,pair=0):
        """Evaluate the weight network on a batch.

        pair == 1: ``data`` is a list of transitions; returns the sum of w(state)
                   divided by ``self.batch_size`` plus a tiny epsilon (the
                   normaliser Z).
        pair == 0: ``data`` is a list of transition pairs; returns per-pair lists
                   (state1, state2, w_state1, w_state2, w_next_state1,
                   w_next_state2, beta1, beta2, K) where beta is the
                   target/behaviour probability ratio of the taken action and K
                   tells whether both next states coincide.
        ``m`` is accepted for interface compatibility and is not used.
        """
        eps = torch.tensor(0.00000000001)
        if(pair == 1):
            Z_w_state = torch.tensor(0)
            for sample in data:
                Z_w_state = torch.add(Z_w_state,weight_obj(sample[0]))
            return torch.add((Z_w_state/self.batch_size),eps)
        state1,state2,w_state1,w_state2 = list(),list(),list(),list()
        w_next_state1,w_next_state2,beta1,beta2,K = list(),list(),list(),list(),list()
        for entry in data:
            sample1 = entry[0]
            sample2 = entry[1]
            state1.append(sample1[0])
            w_state1.append(weight_obj(sample1[0]))
            w_next_state1.append(weight_obj(sample1[2]))
            state2.append(sample2[0])
            w_state2.append(weight_obj(sample2[0]))
            w_next_state2.append(weight_obj(sample2[2]))
            beta1.append(self.target_policy[sample1[0],sample1[1]]/self.behaviour_policy[sample1[0],sample1[1]])
            beta2.append(self.target_policy[sample2[0],sample2[1]]/self.behaviour_policy[sample2[0],sample2[1]])
            K.append(sample1[2]==sample2[2])
        return (state1,state2,w_state1,w_state2,w_next_state1,w_next_state2,beta1,beta2,K)

    def get_w_updated(self,data,weight_obj,m,Z_w_state,pair=0):
        """Batched counterpart of ``get_w``.

        pair == 1: returns the mean-style normaliser as a Python float
                   (sum of w(state) / ``self.batch_size``, nudged away from 0).
        pair == 0: returns the ``(1, 1)`` tensor ``delta^T K delta`` where
                   ``delta_i = beta_i * w(s_i)/Z - w(s'_i)/Z`` and
                   ``K[i, j] = 1`` when transitions i and j share the next state.
        ``data`` is a batch of ``[state, action, next_state]`` transitions;
        ``m`` is accepted for interface compatibility and is not used.
        """
        if(pair == 1):
            Z_w_state = 0
            for sample in data:
                Z_w_state += weight_obj(sample[0])
            Z_w_state = Z_w_state.item()/self.batch_size
            if(Z_w_state<0.00000000000005):
                Z_w_state += 0.000000000001
            return Z_w_state
        data = torch.tensor(data)
        # Use the real batch length so that batches shorter than batch_size
        # (all data returned by get_batch) no longer break the reshapes.
        n = data.shape[0]
        current_states = torch.reshape(data[:,0],(n,1))
        next_states = torch.reshape(data[:,2],(n,1))
        # One forward pass for current and next states together.
        w = weight_obj.forward_batch(torch.cat((current_states,next_states),0))
        w = torch.reshape(w/Z_w_state,(2,n))
        w_0 = torch.reshape(w[0],(n,1))
        w_1 = torch.reshape(w[1],(n,1))
        # Importance ratio target/behaviour of the action taken in each transition.
        s_idx = data[:,0].numpy()
        a_idx = data[:,1].numpy()
        ratio = np.asarray(self.target_policy)[s_idx,a_idx]/np.asarray(self.behaviour_policy)[s_idx,a_idx]
        beta = torch.reshape(torch.tensor(ratio,dtype=torch.float32),(n,1))
        # Indicator kernel on next states, built by broadcasting instead of an
        # O(n^2) Python loop over tensor elements.
        nxt = data[:,2]
        K = (nxt.unsqueeze(1) == nxt.unsqueeze(0)).to(torch.float32)
        delta = w_0 * beta - w_1
        return torch.matmul(torch.transpose(delta,0,1),torch.matmul(K,delta))

    def _state_ratios(self):
        """Return the current w(s) for every state as a numpy array."""
        return np.array([self.weight_obj(s).item() for s in range(self.nS)])

    def _gradient_step(self):
        """Apply one optimiser step on ``self.w_loss``."""
        self.optimizerW.zero_grad()
        self.w_loss.backward()
        self.optimizerW.step()
        self.optimizerW.zero_grad()

    def updated_update_state_distribution_ratio(self):
        """Run 500 gradient steps using the explicit pairwise loss; return w(s) per state.

        Each step enumerates all ordered pairs of a fresh batch, which is
        quadratic in the batch size; ``update_state_distribution_ratio`` computes
        the same kind of quadratic form with tensor operations.
        """
        self.loss_episode = []
        for _ in range(500):
            batch = self.get_batch()
            pairs = list(product(batch,repeat=2))
            state1,state2,w_state1,w_state2,w_next_state1,w_next_state2,beta1,beta2,K = self.get_w(pairs, self.weight_obj, len(batch))
            Z_w_state = self.get_w(batch, self.weight_obj, len(batch),1)
            self.w_loss = 0
            for i in range(len(state1)):
                self.w_loss += (beta1[i]*(w_state1[i]/Z_w_state) - (w_next_state1[i]/Z_w_state))*(beta2[i]*(w_state2[i]/Z_w_state)-(w_next_state2[i]/Z_w_state))*K[i]
            self.w_loss /= (self.batch_size)
            self._gradient_step()
        return self._state_ratios()

    def update_state_distribution_ratio(self):
        """Run up to 5 gradient steps with the batched loss; return w(s) per state.

        Stops after the first step when the registered data is exactly one
        batch long (every further batch would be identical).
        """
        self.batch = self.get_batch()
        for _ in range(5):
            batch = self.get_batch()
            Z_w_state = self.get_w(batch, self.weight_obj, len(batch),1)
            quadratic_form = self.get_w_updated(batch,self.weight_obj,len(batch),Z_w_state)
            self.w_loss = quadratic_form/(self.batch_size*self.batch_size)
            self._gradient_step()
            if(self.batch_size==self.T):
                break
        return self._state_ratios()

In [98]:
class beh_pol_sd:
    """Computes the stationary state distribution induced by a policy.

    Parameters:
        P      : transition tensor indexed as P[action, state, next_state]
        policy : either one action index per state (deterministic policy) or
                 an array of action probabilities indexed as [state, action]
        nS, nA : number of states / actions
    """
    def __init__(self,P,policy,nS,nA):
        self.P = P
        self.policy = policy
        self.nS = nS
        self.nA = nA

    def onehot(self):
        """Return the [state, action] one-hot encoding of a deterministic ``self.policy``."""
        pol = np.zeros((self.nS,self.nA))
        for i in range(self.nS):
            pol[i][int(self.policy[i])] = 1
        return pol

    def find_transition_matrix(self,onehot_encode=1):
        """Return the state-to-state transition matrix under the policy.

        With ``onehot_encode == 1`` the stored policy is treated as one action
        index per state and one-hot encoded first; otherwise it is used as a
        [state, action] probability table.

        The stored ``self.policy`` is left untouched. It used to be overwritten
        with its one-hot encoding, which made a second call with
        ``onehot_encode=1`` fail.
        """
        policy = self.onehot() if onehot_encode==1 else self.policy
        T_s_s_next = np.zeros((self.nS,self.nS))
        for a in range(self.nA):
            # Weight every row of action a's transition matrix by the
            # probability of taking a in that row's state.
            T_s_s_next += self.P[a,:self.nS,:self.nS]*policy[:self.nS,a][:,None]
        return T_s_s_next

    def state_distribution_simulated(self,onehot_encode=1):
        """Return the stationary distribution d solving d P = d with sum(d) = 1.

        The system is solved in the least-squares sense: ``P - I`` is augmented
        with a column of ones (the normalisation constraint) and the last
        column of the pseudo-inverse of its transpose is the solution.
        """
        P_policy = self.find_transition_matrix(onehot_encode)
        P_dash = np.append(P_policy - np.eye(self.nS),np.ones((self.nS,1)),axis=1)
        return np.linalg.pinv(np.transpose(P_dash))[:,-1]


In [99]:
def one_hot(target_policy,nS,nA):
    """One-hot encode a stack of deterministic policies.

    target_policy : 2-D integer array, one row per policy and one action
                    index per state
    Returns an array of shape (number of policies, nS, nA) whose entry
    [p, s, a] is 1 when policy p takes action a in state s and 0 otherwise.
    """
    state_ids = np.arange(nS)
    encoded = []
    for p in range(len(target_policy)):
        table = np.zeros((nS, nA))
        # Set one entry per state in a single indexed assignment.
        table[state_ids, target_policy[p, :nS]] = 1
        encoded.append(table)
    return np.array(encoded)

In [100]:
def simulate_episode(T,state,behaviour_policy,P,batch_size,run):
    """Roll out ``T`` transitions under the behaviour policy.

    T                : number of transitions to simulate
    state            : initial state
    behaviour_policy : action probabilities indexed as [state, action]
    P                : transition probabilities indexed as [action, state, next_state]
    batch_size       : a snapshot is taken every ``batch_size`` steps
    run              : suffix of the pickle file 'Data_used_<run>' that receives
                       the full trajectory

    Returns a dict mapping snapshot index k (0-based) to the list of all
    ``[state, action, next_state]`` transitions seen in the first
    ``(k + 1) * batch_size`` steps, i.e. snapshots are cumulative.
    """
    trajectory = []
    snapshots = {}
    step = 0
    while step < T:
        step += 1
        action = np.argmax(np.random.multinomial(1, behaviour_policy[state, :]))
        next_state = np.argmax(np.random.multinomial(1, P[action, state, :]))
        trajectory.append([state, action, next_state])
        state = next_state
        if step % batch_size == 0:
            # Shallow copy so that later appends do not grow this snapshot.
            snapshots[step // batch_size - 1] = list(trajectory)
    with open('Data_used_' + str(run), 'wb') as f:
        pickle.dump(trajectory, f)
    return snapshots

In [101]:
# Unpack the experiment settings and build the MDP model from them.
T,runs,lr,batch_size,start,nS,nA,rep_cost,alpha,gamma,beta = get_hyperparameters().ret_hyperparameters();
# Use the configured rep_cost instead of a second hard-coded 0.7 so that the
# hyperparameter class stays the single source of truth.
mr_obj = Machine_Replacement(rep_cost,nS,nA)
P = mr_obj.gen_probability()  # P[action, state, next_state]
R = mr_obj.gen_expected_reward()  # R[action, state]

In [102]:
# Number of target policies compared in the experiment below.
nPOL = nS - 1
# Behaviour policy: probability 0.5 for every (state, action) entry.
behaviour_policy = np.full((nS, nA), 0.5)
# Stationary state distribution of the behaviour policy; onehot_encode=0
# because the policy is already a [state, action] probability table.
behaviour_sd_solver = beh_pol_sd(P, behaviour_policy, nS, nA)
behaviour_policy_state_distribution = behaviour_sd_solver.state_distribution_simulated(0)
print(behaviour_policy_state_distribution)

[0.52631579 0.05921053 0.11303828 0.30143541]


In [103]:
# Row k takes action 1 in the last k+1 states and action 0 everywhere else.
target_policy = np.zeros((nPOL, nS), dtype=np.int8)
for k in range(nPOL):
    target_policy[k, -(k + 1):] = 1
print(target_policy)
# Exact stationary distribution of each target policy, for reference.
for pol in target_policy:
    dist = beh_pol_sd(P, pol, nS, nA).state_distribution_simulated(1)
    print(dist)

[[0 0 0 1]
 [0 0 1 1]
 [0 1 1 1]]
[0.35308953 0.09079445 0.23833544 0.31778058]
[0.46357616 0.1192053  0.17880795 0.2384106 ]
[0.52631579 0.10526316 0.15789474 0.21052632]


In [104]:
# Off-policy experiment: for every run, simulate one behaviour trajectory,
# then after each new batch update one weight network per target policy,
# estimate each policy's average reward, and record the policy with the
# smallest estimate.
lr = 0.01
one_hot_target_policy = one_hot(target_policy,nS,nA)
n_steps = T//batch_size  # number of batch snapshots per run
policy_sampled=np.zeros((n_steps,runs))
value_sampled=np.zeros((n_steps,runs))
for run in range(runs):
    vf_arr = np.zeros(nPOL)
    # Pass the run index so each run writes its own 'Data_used_<run>' file
    # instead of every run overwriting 'Data_used_0'.
    data = simulate_episode(T,start,behaviour_policy,P,batch_size,run);
    all_network=dict();
    sampled_policy = 0;
    rew_vect=np.zeros(nPOL)
    n=np.zeros(nPOL)  # how often each policy was selected in this run
    for i in range(len(one_hot_target_policy)):
        all_network[i] = average_case_distribution(nS,nA,behaviour_policy,start,lr,batch_size)
        all_network[i].set_target_policy(one_hot_target_policy[i]);
    for t in tqdm(range(1,n_steps+1)):
        for i in range(nPOL):
            all_network[i].set_batch(data[t-1]);
            # Learned ratio times behaviour distribution, renormalised, gives
            # the estimated state distribution of target policy i.
            c=all_network[i].update_state_distribution_ratio()*behaviour_policy_state_distribution;
            sd=c/np.sum(c);
            rew_vect[i] = sum([sd[state] *R[int(target_policy[i,state]),state] for state in range(nS)]);
            if(t%500==0):
                print(i,"=>",sd)
                print(rew_vect)
        sampled_policy = np.argmin(rew_vect);
        value_sampled[t-1,run] = rew_vect[sampled_policy]
        n[sampled_policy]+=1;
        policy_sampled[t-1,run] = sampled_policy
    
pd.DataFrame(policy_sampled).to_excel("Policy_sampled_off_policy1_"+str(nS)+"_states.xlsx")
#[0.46147503 0.01742633 0.02773987 0.04027418 0.05034273 0.06041128 0.07047982 0.08054837 0.09061692 0.10068546]

 50%|████████████████████                    | 500/1000 [02:02<02:05,  3.99it/s]

0 => [0.3792091  0.08724087 0.22059296 0.31295707]
[0.47583656 0.41430797 0.43789966]
1 => [0.4895032  0.10319922 0.15634317 0.25095441]
[0.47583656 0.41572407 0.43789966]
2 => [0.52712627 0.09384142 0.15401095 0.22502136]
[0.47583656 0.41572407 0.43101161]


100%|███████████████████████████████████████| 1000/1000 [04:03<00:00,  4.11it/s]

0 => [0.38676223 0.09022126 0.21537948 0.30763703]
[0.46990339 0.43259923 0.43305229]
1 => [0.46399176 0.10614449 0.17329298 0.25657077]
[0.46990339 0.43239416 0.43305229]
2 => [0.52533772 0.0924042  0.14948252 0.23277555]
[0.46990339 0.43239416 0.43226359]



 50%|████████████████████                    | 500/1000 [01:58<01:58,  4.21it/s]

0 => [0.35518177 0.09749486 0.23893777 0.3083856 ]
[0.48656314 0.42158843 0.43675531]
1 => [0.48018535 0.10463643 0.15647229 0.25870593]
[0.48656314 0.4216669  0.43675531]
2 => [0.52428065 0.09433725 0.15408199 0.22730011]
[0.48656314 0.4216669  0.43300355]


100%|███████████████████████████████████████| 1000/1000 [03:57<00:00,  4.21it/s]

0 => [0.36649097 0.09583738 0.23134825 0.3063234 ]
[0.48012476 0.42605458 0.42822009]
1 => [0.47228932 0.10981137 0.17021691 0.2476824 ]
[0.48012476 0.42510689 0.42822009]
2 => [0.53379214 0.09417927 0.15968147 0.21234711]
[0.48012476 0.42510689 0.4263455 ]



 50%|████████████████████                    | 500/1000 [01:59<01:57,  4.27it/s]

0 => [0.35915481 0.09474402 0.22863669 0.31746449]
[0.4859903  0.43084508 0.4372022 ]
1 => [0.46693301 0.10980422 0.17238661 0.25087616]
[0.4859903  0.42885919 0.4372022 ]
2 => [0.52595626 0.09950127 0.15378577 0.22075669]
[0.4859903  0.42885919 0.43183062]


100%|███████████████████████████████████████| 1000/1000 [03:59<00:00,  4.18it/s]

0 => [0.38849353 0.09344365 0.21588926 0.30217356]
[0.4673374  0.42726909 0.43500759]
1 => [0.47372839 0.1037535  0.17555062 0.24696749]
[0.4673374  0.42654288 0.43500759]
2 => [0.52144372 0.09501916 0.14967209 0.23386502]
[0.4673374  0.42654288 0.4349894 ]



 50%|████████████████████                    | 500/1000 [02:00<02:00,  4.14it/s]

0 => [0.37874761 0.09359694 0.21217502 0.31548042]
[0.4744939  0.42808937 0.42937359]
1 => [0.4759367  0.10157842 0.16808708 0.2543978 ]
[0.4744939  0.42587435 0.42937359]
2 => [0.52688184 0.10443154 0.15300795 0.21567867]
[0.4744939  0.42587435 0.43118271]


100%|███████████████████████████████████████| 1000/1000 [04:00<00:00,  4.15it/s]

0 => [0.38117028 0.08965238 0.22066614 0.3085112 ]
[0.47348329 0.41828977 0.44354943]
1 => [0.48543352 0.10607748 0.16667342 0.24181559]
[0.47348329 0.41741196 0.44354943]
2 => [0.51170918 0.09344923 0.15501578 0.23982581]
[0.47348329 0.41741196 0.44180357]



 50%|████████████████████                    | 500/1000 [01:56<01:56,  4.30it/s]

0 => [0.38174738 0.09602476 0.2105066  0.31172125]
[0.47159281 0.41902    0.4522406 ]
1 => [0.48421882 0.10416407 0.16994118 0.24167593]
[0.47159281 0.41903399 0.4522406 ]
2 => [0.50673698 0.10283328 0.16326218 0.22716756]
[0.47159281 0.41903399 0.44528412]


100%|███████████████████████████████████████| 1000/1000 [03:52<00:00,  4.30it/s]

0 => [0.37824982 0.09734162 0.22935572 0.29505284]
[0.47149939 0.42034713 0.43427566]
1 => [0.47722009 0.1058692  0.17099934 0.24591137]
[0.47149939 0.42324536 0.43427566]
2 => [0.52750466 0.10163908 0.14810891 0.22274735]
[0.47149939 0.42324536 0.43074674]





In [66]:
# Exact reference: evaluate every threshold policy on the true model and
# display (all values, best policy, best value).
tp = Target_Policy(S=np.arange(nS), A=np.arange(nA), P=P, R=R, start=0)
tp.find_optimum_policy()

[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]
[0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]
[0. 0. 0. 0. 0. 0. 1. 1. 1. 1.]
[0. 0. 0. 0. 0. 1. 1. 1. 1. 1.]
[0. 0. 0. 0. 1. 1. 1. 1. 1. 1.]
[0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
[0. 0. 1. 1. 1. 1. 1. 1. 1. 1.]
[0. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
11  policies found
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]     ====>     0.9900000000000165
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]     ====>     0.5980026712414057
[0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]     ====>     0.5152783499018663
[0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]     ====>     0.4731485120073595
[0. 0. 0. 0. 0. 0. 1. 1. 1. 1.]     ====>     0.4503924838042788
[0. 0. 0. 0. 0. 1. 1. 1. 1. 1.]     ====>     0.4394920053578094
[0. 0. 0. 0. 1. 1. 1. 1. 1. 1.]     ====>     0.4363168141033407
[0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]     ====>     0.43798070324723415
[0. 0. 1. 1. 1. 1. 1. 1. 1. 1.]     ====>     0.4421537396121882
[0. 1. 1. 1. 1. 1. 1. 1. 1. 1.]     ====>     0.446788990825688
[1. 1. 1. 1

(array([0.99      , 0.59800267, 0.51527835, 0.47314851, 0.45039248,
        0.43949201, 0.43631681, 0.4379807 , 0.44215374, 0.44678899,
        0.8       ]),
 array([0., 0., 0., 0., 1., 1., 1., 1., 1., 1.]),
 0.4363168141033407)

In [67]:
# Inspect `sd`, the normalised state-distribution estimate left over from the
# last iteration of the experiment loop (relies on that cell having run).
sd

array([0.51172349, 0.01442877, 0.02128451, 0.03497395, 0.04241495,
       0.0457826 , 0.06536528, 0.07298361, 0.08121001, 0.10983283])

In [68]:
# Inspect the index of the policy selected at every step (rows) of every run (columns).
policy_sampled

array([[1.],
       [4.],
       [3.],
       ...,
       [5.],
       [6.],
       [6.]])

In [80]:
# Count how many recorded selections picked policy index 3.
np.sum(np.where(policy_sampled==3,1,0))

60

In [81]:
# Scratch arithmetic: presumably the selection counts of policies 6, 5, 4, 7, 8
# and 3 (the order listed below), summed by hand -- TODO confirm.
#6+5+4+7+8+3
657+415+600+200+67+60

1999