In [1]:
import numpy as np
import multiprocessing as mp
import pickle 
import pandas as pd
mp.set_start_method('spawn',True);
import torch
torch.multiprocessing.set_start_method('spawn',True);
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from itertools import product
from tqdm import tqdm

class Machine_Replacement:
    def __init__(self,rep_cost=0.7,nS=6,nA=2):
        self.nS = nS;
        self.nA = nA;
        self.cost = np.linspace(0.1, 0.99,nS);
        self.rep_cost = rep_cost;
    def gen_probability(self):
        self.P = np.zeros((self.nA,self.nS,self.nS));
        for i in range(self.nS):
            for j in range(self.nS):
                if(i<=j):
                    self.P[0,i,j]=(i+1)*(j+1);
                else:
                    continue;
            self.P[0,i,:]=self.P[0,i,:]/np.sum(self.P[0,i,:])
            self.P[1,i,0]=1;
        return self.P;
    def gen_reward(self):
        self.R=np.zeros((self.nA,self.nS,self.nS));
        for i in range(self.nS):
            self.R[0,i,:] = self.cost[i];
            self.R[1,i,0] = self.rep_cost+self.cost[0];
        return self.R;
    def gen_expected_reward(self):
        self.R = np.zeros((self.nA,self.nS));
        for i in range(self.nS):
            self.R[0,i] = self.cost[i];
            self.R[1,i] = self.rep_cost + self.cost[0];
        return self.R;

In [2]:
class get_hyperparameters:
    def __init__(self):
        self.T = 50000;
        self.runs = 5;
        self.lr = 0.1;
        self.batch_size = 50;
        self.start = 0;
        self.nS = 20;
        self.nA = 2;
        self.rep_cost = 0.7
        self.alpha = 0.5
        self.gamma = 0.95
        self.beta = 1;
    
    def ret_hyperparameters(self):
        return (self.T,self.runs,self.lr,self.batch_size,self.start,self.nS,self.nA,self.rep_cost,self.alpha,self.gamma,self.beta)

In [3]:
class weights(nn.Module):
    def __init__(self,input_size,output_size,hidden_size = 0):
        super(weights,self).__init__()
        self.input_size = input_size;
        self.hidden_size = hidden_size;
        self.output_size = output_size;
        if(hidden_size!=0):
            self.linear1 = nn.Linear(self.input_size, self.hidden_size, bias=False)
            self.linear2 = nn.Linear(self.hidden_size, self.output_size, bias=False)
        else:
            self.linear1 = nn.Linear(self.input_size, self.output_size, bias=False)
    '''
        forward(): We accept a state 's' as input. Then we convert this into one hot encoding which is accomplished by first two lines.
        Further we convert this one_hot vector 's' into pytorch tensor and then pass it through the network to obtain a output which is returned 
    '''
    def forward(self,state):
        s = np.zeros(self.input_size);
        #print(state,end='===>');
        s[state] = 1;
        state = torch.FloatTensor(s).to(device)
        #print(state);
        if(self.hidden_size == 0):
            output = torch.exp(self.linear1(state)) #To ensure that the outputs are always positive. giving Relu will cause problems.
        else:
            output = torch.exp(self.linear2(torch.exp(self.linear1(state))));
        return output
    
    def fast_forward(self,s1,ns1,s2,ns2):
        return self.forward(s1),self.forward(ns1),self.forward(s2),self.forward(ns2);

In [4]:
class Target_Policy:
    '''
        First we create an initiualizer function namely a constructor to initialize the variables
        with initial data values
    '''
    def __init__(self,S,A,P,R,start):
        self.S=S # represant the states of the MDP
        self.nS = len(S) # Reperesants the number of states of the MDP
        self.nA = len(A);# Represants the number of actions in the MDP
        self.P=P # Represants the true Probability function
        self.R=R # Represants the true Reward  function
        self.A=A;# Represnats the Action Space
        self.K_pol = []; # To store all the policies
        self.s_start=start # Store the start state 
    '''
        In the generate_next_state(), we are generating our next state to be visited based on the following input parameters
        s : Current state
        a : Current action
    '''    
    def generate_next_state(self,s,a):
        #p = np.zeros(self.nS)
        p = self.P[a][s][:] # extrcat all the probabilities of the different statestransition given current s and a
        #print(p);
        return (np.argmax(np.random.multinomial(1,p)))
    
    '''
        Single function to find the plot between the cumulative regret generated by different algorithms
        Parameters:
            reg_list : A list containing the regret value at different runs instances averaged over several time
    '''    
    def plot_data(self,reg_list):
        plt.plot(np.arange(len(reg_list)),np.cumsum(np.array(reg_list)),marker = '+'); 
    '''
        Function to find the optimum policy out of the K policies found out.
        Parameters:
            runs : To find for how many runs the current policy to be runned
            T : Each run consisiting of how many time steps to find the average reward for each policy in one run
            Time complexity : O(#(policies) x #(episode runs) x #(number of Time steps in one episode))
    '''
    def find_optimum_policy(self):
        self.find_policies(); #Call the find_policies() to find all the policies and store it in 'self.K' list
        final_R = np.zeros(len(self.K_pol));
        for idx,pol in enumerate(self.K_pol):
            #policy = self.one_hot(pol);
            beh_obj = beh_pol_sd(self.P, pol, self.nS, self.nA)
            state_distribution = beh_obj.state_distribution_simulated(1);
            final_R[idx] = sum([state_distribution[state] *self.R[int(pol[state]),state] for state in range(self.nS)]);
        for l_pol in range(len(self.K_pol)):
            print(self.K_pol[l_pol],"    ====>    ",final_R[l_pol]); # Display the the expected reward for each policy
        return (final_R,self.K_pol[np.argmin(final_R)],np.min(final_R));# Return the minimum reward, the policy number which gives the minimum reward and the policy that gives minimum reward
    
    def find_policies(self):
        self.K_pol = [];
        pol=np.zeros(self.nS) # First policy is all 0's
        self.K_pol.append(np.array(pol)); # append it to our K_policy list namely self.K
        for i in reversed(range(self.nS)):
            pol[i] = 1; # Come from the end and since the structure is thresholding nature so make each position 1 from 0 and append the same
            print(pol);
            self.K_pol.append(np.array(pol));
        print(len(self.K_pol)," policies found");

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class average_case_distribution:
    def __init__(self,nS,nA,behaviour_policy,state,lr,batch_size):
        self.nS = nS
        self.nA = nA
        self.behaviour_policy = behaviour_policy;
        self.state = state;
        self.lr = lr
        self.W_loss = 0
        self.weight_obj = weights(nS,1).to(device);
        self.W_loss = 0;
        self.batch_size = batch_size
    def set_target_policy(self,target_pol):
        self.target_policy = target_pol;
        self.optimizerW = optim.Adam(self.weight_obj.parameters(),lr = self.lr);
        self.batch=[];
    def show_policy(self):
        print(self.target_policy);
    def set_batch(self,data):
        self.data = data;
        self.T = len(data);
    def get_batch(self):
        if(self.T<=self.batch_size):
            return self.data
        else:
            i = 1;
            j = np.random.choice(self.T);
            batch = [];
            while(i<=self.batch_size):
                if(np.random.random()<=0.5):
                    batch.append([self.data[j][0],self.data[j][1],self.data[j][2]])
                    j = (j+1)%self.T;
                    i+=1;
            return batch;
    
    def get_w(self,data,weight_obj,m,pair=0):
        eps = 0.00000000001
        if(pair == 1):
            Z_w_state = 0;
            for i in range(len(data)):
                val = weight_obj(data[i][0]);
                #print(val);
                Z_w_state+=val;
            #print(Z_w_state.detach().numpy()[0]/self.batch_size);
            return (Z_w_state.item()/self.batch_size)+eps;
        else:
            state1,state2,w_state1,w_state2,w_next_state1,w_next_state2,beta1,beta2 = list(),list(),list(),list(),list(),list(),list(),list();
            K = list();
            for i in range(len(data)):
                sample1 = data[i][0];
                sample2 = data[i][1];
                state1.append(sample1[0]);
                #print(sample1);
                w1,wn1,w2,wn2 = weight_obj.fast_forward(sample1[0],sample1[2],sample2[0],sample2[2]);
                w_state1.append(w1);
                w_next_state1.append(wn1);
                state2.append(sample2[0]);
                w_state2.append(w2);
                w_next_state2.append(wn2);
                beta1.append(self.target_policy[sample1[0],sample1[1]]/self.behaviour_policy[sample1[0],sample1[1]]);
                beta2.append(self.target_policy[sample2[0],sample2[1]]/self.behaviour_policy[sample2[0],sample2[1]]);
                K.append(sample1[2]==sample2[2]);
            return (state1,state2,w_state1,w_state2,w_next_state1,w_next_state2,beta1,beta2,K);
    
    def update_state_distribution_ratio(self):
        self.batch = self.get_batch();
        batch=self.batch;
        #eps = 0.0004;
        for _ in range(20):
            pairs = list(product(batch,repeat=2));
            state1,state2,w_state1,w_state2,w_next_state1,w_next_state2,beta1,beta2,K = self.get_w(pairs, self.weight_obj, len(batch));
            Z_w_state = self.get_w(batch, self.weight_obj, len(batch),1);
            self.w_loss = 0
            for i in range(len(state1)):
                self.w_loss+=(beta1[i]*(w_state1[i]/Z_w_state) - (w_next_state1[i]/Z_w_state))*(beta2[i]*(w_state2[i]/Z_w_state)-(w_next_state2[i]/Z_w_state))*K[i];
            self.w_loss/=(2*self.batch_size);
            self.optimizerW.zero_grad();
            self.w_loss.backward();
            self.optimizerW.step();
            self.optimizerW.zero_grad();
            batch = self.get_batch();

In [6]:
class beh_pol_sd:
    def __init__(self,P,policy,nS,nA):
        self.P = P
        self.policy = policy
        self.nS = nS;
        self.nA = nA;
    
    def onehot(self):
        pol = np.zeros((self.nS,self.nA));
        for i in range(self.nS):
            pol[i][int(self.policy[i])]=1;
        return pol;
    def find_transition_matrix(self,onehot_encode=1):
        if(onehot_encode==1):
            self.policy = self.onehot()
        T_s_s_next = np.zeros((self.nS,self.nS));
        for s in range(self.nS):
            for s_next in range(self.nS):
                for a in range(self.nA):
                    #print(s,s_next,a);
                    #print(T[a,s,s_next]);
                    T_s_s_next[s,s_next]+=self.P[a,s,s_next]*self.policy[s,a];
        return T_s_s_next;
    def state_distribution_simulated(self,onehot_encode=1):
        P_policy = self.find_transition_matrix(onehot_encode)
        #print(P_policy);
        P_dash = np.append(P_policy - np.eye(self.nS),np.ones((self.nS,1)),axis=1);
        #print(P_dash);
        P_last = np.linalg.pinv(np.transpose(P_dash))[:,-1]
        return P_last;


In [22]:
def one_hot(target_policy,nS,nA):
  nPOL = len(target_policy);
  one_hot_target_policy = []
  for i in range(nPOL):
    pol=np.zeros((nS,nA));
    for j in range(nS):
      pol[j] = 1;
    one_hot_target_policy.append(pol);
  return np.array(one_hot_target_policy);

In [23]:
def simulate_episode(T,state,behaviour_policy,P,batch_size,run):
  #global P,behaviour_policy,batch_size;
  data={};temp=[];
  for t in range(1,T+1): 
    action = np.argmax(np.random.multinomial(1,behaviour_policy[state,:]))
    next_state = np.argmax(np.random.multinomial(1,P[action,state,:]));
    state = next_state;
    temp.append([state,action,next_state]);
    if(t%batch_size==0):
      #print(t);
      data[int(t/batch_size)-1] = temp[:];
  #print(len(data));
  with open('Data_used_'+str(run),'wb') as f:
    pickle.dump(temp,f);
  f.close();
  return data;

In [24]:
def softmax(theta):
  theta = np.exp(theta);
  sum = np.sum(theta)
  return theta/sum;

In [25]:
def get_rho(obj,nPOL,nS,nA,R,behaviour_policy_state_distribution,target_policy,sampled_policy):
    rho = np.zeros(nPOL)
    state_dist=[]
    for i in range(nS):
        w_state = obj[sampled_policy].weight_obj(i);
        w_state = w_state.item()[0];
        state_dist.append(w_state);
    sd = np.array(state_dist);
    sd = sd * behaviour_policy_state_distribution;
    sd = sd/np.sum(sd);
    pol = sampled_policy
    rho[pol] = sum([sd[s] *R[target_policy[pol,s],s] for s in range(nS)]);
    return rho;

In [26]:
def processing(P,R,nPOL,nS,nA,T,behaviour_policy,behaviour_policy_state_distribution,target_policy,run,lr,beta,batch_size,alpha):
    start = 0;
    one_hot_target_policy = one_hot(target_policy,nS,nA)
    #batch_size = 50;
    obj = [average_case_distribution(nS, nA, behaviour_policy, start, lr,batch_size) for i in range(nPOL)]
    for index,pol in enumerate(one_hot_target_policy):
        obj[index].set_target_policy(pol);
    data = simulate_episode(T*batch_size,start,behaviour_policy,P,batch_size,run);
    #print(data)
    #input();
    #theta = np.ones(nPOL);
    result = []
    sample=np.zeros(T)
    n=np.ones(nPOL)
    S = np.ones(nPOL);
    F = np.ones(nPOL);
    rew = np.zeros(nPOL)
    #data_used=[]
    for t in tqdm(range(1,T+1)):
        if(t<=nPOL):
            sampled_policy = t-1;
        else:
            sampled_policy = np.argmin([np.random.beta(S[j],F[j]) for j in range(nPOL)])
        #sampled_policy = np.argmax(np.random.multinomial(1,P_i))
        obj[sampled_policy].set_batch(data[t-1])
        sample[t-1] = sampled_policy
        obj[sampled_policy].update_state_distribution_ratio()
        rho_vect = get_rho(obj,nPOL,nS,nA,R,behaviour_policy_state_distribution,target_policy,sampled_policy)
        rew[sampled_policy] = (1-alpha)*rew[sampled_policy] + alpha*rho_vect[sampled_policy];
        S[sampled_policy]+=rew[sampled_policy];
        F[sampled_policy]=F[sampled_policy]+t-rew[sampled_policy]
        result.append(rho_vect[sampled_policy]);
    with open('sampled_policy_20_states_'+str(run),'wb') as f:
        pickle.dump(sample,f);
    f.close();
    with open('estimated_value_function_20_states'+str(run),'wb') as f:
        pickle.dump(result,f);
    f.close();

In [27]:
if __name__=='__main__':
    T,runs,lr,batch_size,start,nS,nA,rep_cost,alpha,gamma,beta = get_hyperparameters().ret_hyperparameters();
    nPOL = nS
    T_update = int(T/batch_size);
    mr_obj = Machine_Replacement(rep_cost,nS,nA);
    P,R = mr_obj.gen_probability(),mr_obj.gen_expected_reward()
    behaviour_policy = np.ones((nS,nA))*0.5
    behaviour_policy_state_distribution = beh_pol_sd(P,behaviour_policy,nS,nA).state_distribution_simulated(0);
    #policy_sampled = np.zeros((T_update,runs))
    target_policy = np.ones((nPOL,nS),dtype = np.int8)
    #data_dict={0:[],1:[],2:[],3:[],4:[]};
    #data_used={0:[],1:[],2:[],3:[],4:[]};
    for i in range(nPOL-1,0,-1):
        target_policy[nPOL-i-1][0:i] = 0;
    
    #mp.set_start_method('spawn')
    processing(P,R,nPOL,nS,nA,T_update,behaviour_policy,behaviour_policy_state_distribution,target_policy,0,lr,beta,batch_size,alpha);
    '''p1 = mp.Process(target=processing,args=(P,R,nPOL,nS,nA,T_update,behaviour_policy,behaviour_policy_state_distribution,target_policy,0,lr,beta))
    p2 = mp.Process(target=processing,args=(P,R,nPOL,nS,nA,T_update,behaviour_policy,behaviour_policy_state_distribution,target_policy,1,lr,beta))
    p3 = mp.Process(target=processing,args=(P,R,nPOL,nS,nA,T_update,behaviour_policy,behaviour_policy_state_distribution,target_policy,2,lr,beta))
    p4 = mp.Process(target=processing,args=(P,R,nPOL,nS,nA,T_update,behaviour_policy,behaviour_policy_state_distribution,target_policy,3,lr,beta))
    p5 = mp.Process(target=processing,args=(P,R,nPOL,nS,nA,T_update,behaviour_policy,behaviour_policy_state_distribution,target_policy,4,lr,beta))
    p1.start()
    p2.start()
    p3.start()
    p4.start()
    p5.start()
    p1.join();
    p2.join();
    p3.join();
    p4.join();
    p5.join();'''

100%|████████████████████████████████████| 1000/1000 [13:59:38<00:00, 50.38s/it]


In [28]:
with open('sampled_policy_20_states_'+str(0),'rb') as f:
        sample=pickle.load(f);

In [29]:
pd.DataFrame(sample).to_excel("Thompson_Sampling_Policy_sampled_1st_variant_20_states.xlsx")

In [30]:
print(sample)

[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19. 15. 11. 17. 15. 19. 15.  8. 12. 12. 19. 12. 18. 12. 18. 16. 17.
 16. 18. 11. 12.  7.  7.  7. 12. 11. 17. 16. 15.  7.  7.  8. 19. 14. 12.
  7. 11.  7.  7. 14. 11. 12. 14. 18. 19. 14. 11. 14. 11.  8. 16. 14. 14.
 19. 17.  2. 19. 14. 11.  8.  2. 14.  8.  2.  8.  2.  8. 14.  7. 14.  7.
  8.  2.  7. 17. 18. 17. 16.  2.  4. 11. 11.  8.  2. 11.  7.  4.  2.  1.
  2. 18.  4.  8.  2.  2. 17.  4.  2. 15.  4.  4.  4.  1.  1.  1.  1.  2.
  4.  4.  2.  2.  4.  4.  4.  4. 14.  2.  4.  2.  4.  2. 11.  4. 18.  2.
  4.  4.  4.  1.  1.  1.  1.  4.  4.  1.  4.  2.  1.  6.  4.  2.  1.  1.
  4. 11.  6.  1.  6.  6.  1.  6.  6. 16.  1.  3.  3.  3.  8. 16.  6.  4.
  4.  6.  3.  5.  3.  5.  6.  6.  5.  5.  3. 16.  5.  3.  3.  5.  3.  6.
  5.  3.  5.  6.  1.  6.  6.  4.  5.  5.  4.  1.  1.  6.  3.  3.  4.  3.
  3.  1.  1.  5.  4.  5.  3.  5.  6. 13. 15.  6. 15.  5. 16.  3.  6.  5.
  3. 13.  6.  4.  3.  3.  6.  3.  5.  6. 13.  3.  6