In [1]:
import numpy as np
import multiprocessing as mp
import pickle 
import pandas as pd
mp.set_start_method('spawn',True);
import torch
torch.multiprocessing.set_start_method('spawn',True);
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from itertools import product
#from tqdm import tqdm
import time

class Machine_Replacement:
    def __init__(self,rep_cost=0.7,nS=6,nA=2):
        self.nS = nS;
        self.nA = nA;
        self.cost = self.cost = np.linspace(0.1, 0.99,nS);
        self.rep_cost = rep_cost;
    def gen_probability(self):
        self.P = np.zeros((self.nA,self.nS,self.nS));
        for i in range(self.nS):
            for j in range(self.nS):
                if(i<=j):
                    self.P[0,i,j]=(i+1)*(j+1);
                else:
                    continue;
            self.P[0,i,:]=self.P[0,i,:]/np.sum(self.P[0,i,:])
            self.P[1,i,0]=1;
        return self.P;
    def gen_reward(self):
        self.R=np.zeros((self.nA,self.nS,self.nS));
        for i in range(self.nS):
            self.R[0,i,:] = self.cost[i];
            self.R[1,i,0] = self.rep_cost+self.cost[0];
        return self.R;
    def gen_expected_reward(self):
        self.R = np.zeros((self.nA,self.nS));
        for i in range(self.nS):
            self.R[0,i] = self.cost[i];
            self.R[1,i] = self.rep_cost + self.cost[0];
        return self.R;

For 4 states. Ideal state was initialising alpha = 0.9, S=np.ones(nPOL) and F = np.ones(nPOL)*2; lr=0.1

In [2]:
class get_hyperparameters:
    def __init__(self):
        self.T = 500000;
        self.runs = 5;
        self.lr = 0.01;
        self.batch_size = 50;
        self.start = 0;
        self.nS = 10;
        self.nA = 2;
        self.rep_cost = 0.7
        self.alpha = 0.9
        self.gamma = 0.95
        self.beta = 1;
    
    def ret_hyperparameters(self):
        return (self.T,self.runs,self.lr,self.batch_size,self.start,self.nS,self.nA,self.rep_cost,self.alpha,self.gamma,self.beta)

In [3]:
class weights(nn.Module):
    def __init__(self,input_size,output_size,hidden_size = 0):
        super(weights,self).__init__()
        self.input_size = input_size;
        self.hidden_size = hidden_size;
        self.output_size = output_size;
        if(hidden_size!=0):
            self.linear1 = nn.Linear(self.input_size, self.hidden_size, bias=False)
            self.linear2 = nn.Linear(self.hidden_size, self.output_size, bias=False)
        else:
            self.linear1 = nn.Linear(self.input_size, self.output_size, bias=False)
        #nn.init.ones_(self.linear1.weight);
    '''
        forward(): We accept a state 's' as input. Then we convert this into one hot encoding which is accomplished by first two lines.
        Further we convert this one_hot vector 's' into pytorch tensor and then pass it through the network to obtain a output which is returned 
    '''
    def forward(self,state):
        s = np.zeros(self.input_size);
        #print(state,end='===>');
        s[state] = 1;
        state = torch.FloatTensor(s).to(device)
        #print(state);
        if(self.hidden_size == 0):
            output = torch.exp(self.linear1(state)) #To ensure that the outputs are always positive. giving Relu will cause problems.
        else:
            output = torch.exp(self.linear2(torch.exp(self.linear1(state))));
        return output
    
    def forward_batch(self,states):
        s = np.zeros((len(states),self.input_size));
        #print(state,end='===>');
        for i in range(len(states)):
            s[i][states[i]] = 1
        states = torch.FloatTensor(s).to(device)
        #print(state);
        if(self.hidden_size == 0):
            output = torch.exp(self.linear1(states)) #To ensure that the outputs are always positive. giving Relu will cause problems.
        else:
            output = torch.exp(self.linear2(torch.exp(self.linear1(state))));
        return output
    
    def fast_forward(self,s1,ns1,s2,ns2):
        return self.forward(s1),self.forward(ns1),self.forward(s2),self.forward(ns2);

In [4]:
class Target_Policy:
    '''
        First we create an initiualizer function namely a constructor to initialize the variables
        with initial data values
    '''
    def __init__(self,S,A,P,R,start):
        self.S=S # represant the states of the MDP
        self.nS = len(S) # Reperesants the number of states of the MDP
        self.nA = len(A);# Represants the number of actions in the MDP
        self.P=P # Represants the true Probability function
        self.R=R # Represants the true Reward  function
        self.A=A;# Represnats the Action Space
        self.K_pol = []; # To store all the policies
        self.s_start=start # Store the start state 
    '''
        In the generate_next_state(), we are generating our next state to be visited based on the following input parameters
        s : Current state
        a : Current action
    '''    
    def generate_next_state(self,s,a):
        #p = np.zeros(self.nS)
        p = self.P[a][s][:] # extrcat all the probabilities of the different statestransition given current s and a
        #print(p);
        return (np.argmax(np.random.multinomial(1,p)))
    
    '''
        Single function to find the plot between the cumulative regret generated by different algorithms
        Parameters:
            reg_list : A list containing the regret value at different runs instances averaged over several time
    '''    
    def plot_data(self,reg_list):
        plt.plot(np.arange(len(reg_list)),np.cumsum(np.array(reg_list)),marker = '+'); 
    '''
        Function to find the optimum policy out of the K policies found out.
        Parameters:
            runs : To find for how many runs the current policy to be runned
            T : Each run consisiting of how many time steps to find the average reward for each policy in one run
            Time complexity : O(#(policies) x #(episode runs) x #(number of Time steps in one episode))
    '''
    def find_optimum_policy(self):
        self.find_policies(); #Call the find_policies() to find all the policies and store it in 'self.K' list
        final_R = np.zeros(len(self.K_pol));
        for idx,pol in enumerate(self.K_pol):
            #policy = self.one_hot(pol);
            beh_obj = beh_pol_sd(self.P, pol, self.nS, self.nA)
            state_distribution = beh_obj.state_distribution_simulated(1);
            final_R[idx] = sum([state_distribution[state] *self.R[int(pol[state]),state] for state in range(self.nS)]);
        for l_pol in range(len(self.K_pol)):
            print(self.K_pol[l_pol],"    ====>    ",final_R[l_pol]); # Display the the expected reward for each policy
        return (final_R,self.K_pol[np.argmin(final_R)],np.min(final_R));# Return the minimum reward, the policy number which gives the minimum reward and the policy that gives minimum reward
    
    def find_policies(self):
        self.K_pol = [];
        pol=np.zeros(self.nS) # First policy is all 0's
        self.K_pol.append(np.array(pol)); # append it to our K_policy list namely self.K
        for i in reversed(range(self.nS)):
            pol[i] = 1; # Come from the end and since the structure is thresholding nature so make each position 1 from 0 and append the same
            print(pol);
            self.K_pol.append(np.array(pol));
        print(len(self.K_pol)," policies found");

In [5]:
device = torch.device("cpu")

class average_case_distribution:
    def __init__(self,nS,nA,behaviour_policy,state,lr,batch_size):
        self.nS = nS
        self.nA = nA
        self.behaviour_policy = behaviour_policy;
        self.state = state;
        self.lr = lr
        self.W_loss = 0
        self.weight_obj = weights(nS,1).to(device);
        self.W_loss = 0;
        self.batch_size = batch_size
    def set_target_policy(self,target_pol):
        self.target_policy = target_pol;
        self.optimizerW = optim.Adam(self.weight_obj.parameters(),lr = self.lr);
        self.batch=[];
    def show_policy(self):
        print(self.target_policy);
    def set_batch(self,data):
        self.data = data;
        self.T = len(data);
    def get_batch(self):
        if(self.T<=self.batch_size):
            return self.data
        else:
            i = 1;
            j=np.random.choice(self.T);
            #j = np.random.randint(0,self.T,self.batch_size);
            #batch = [[self.data[k][0],self.data[k][1],self.data[k][2]] for k in j]
            batch=[];
            while(i<=self.batch_size):
                if(np.random.random()<=0.9):
                    batch.append([self.data[j][0],self.data[j][1],self.data[j][2]])
                    i+=1;
                j = (j+1)%self.T;
            return batch; 
    
    def get_w(self,data,weight_obj,m,pair=0):
        eps = 0.00000000001
        if(pair == 1):
            Z_w_state = torch.tensor(0);
            for i in range(len(data)):
                val = weight_obj(data[i][0]);
#                 print(val);
                Z_w_state = torch.add(Z_w_state,val);
            #print(Z_w_state.detach().numpy()[0]/self.batch_size);
#             print((Z_w_state/self.batch_size)+eps)
            return (Z_w_state/self.batch_size)+eps;
#             print(Z_w_state)
        else:
            state1,state2,w_state1,w_state2,w_next_state1,w_next_state2,beta1,beta2 = list(),list(),list(),list(),list(),list(),list(),list();
            K = list();
            for i in range(len(data)):
                sample1 = data[i][0];
                sample2 = data[i][1];
                state1.append(sample1[0]);
                #print(sample1);
                w_state1.append(weight_obj(sample1[0]));
                w_next_state1.append(weight_obj(sample1[2]));
                state2.append(sample2[0]);
                w_state2.append(weight_obj(sample2[0]));
                w_next_state2.append(weight_obj(sample2[2]));
                beta1.append(self.target_policy[sample1[0],sample1[1]]/self.behaviour_policy[sample1[0],sample1[1]]);
                beta2.append(self.target_policy[sample2[0],sample2[1]]/self.behaviour_policy[sample2[0],sample2[1]]);
                K.append(sample1[2]==sample2[2]);
            return (state1,state2,w_state1,w_state2,w_next_state1,w_next_state2,beta1,beta2,K);
    def get_w_updated(self,data,weight_obj,m,Z_w_state,pair=0): #data should be batch. note all inputs to be passed 
        if(pair == 1):
            Z_w_state = torch.Tensor(0);
            for i in range(len(data)):
                val = weight_obj(data[i][0]);
                print(val);
                Z_w_state+=val;
            #print(Z_w_state.detach().numpy()[0]/self.batch_size);
            Z_w_state = Z_w_state.item()/self.batch_size;
            if(Z_w_state<0.00000000000005):
                Z_w_state+=0.000000000001;
            return Z_w_state;
        else:
            data = torch.tensor(data)
#             print(data[:,0])
#             print(data[:,2])
#             print(data[:,0].size())
            new_data_current = torch.reshape(data[:,0],(self.batch_size,1))
#             w_current_state = weight_obj.forward_batch(new_data_current)
            
            new_data_next = torch.reshape(data[:,2],(self.batch_size,1))
            new_data = torch.cat((new_data_current,new_data_next),0)
            w = weight_obj.forward_batch(new_data)
#             print(w)
#             print(Z_w_state)
#             print(w/Z_w_state)
#             print("****")
            w = w/Z_w_state
#             print(w)
#             print(new_data.size())
#             print(w.size())
            w = torch.reshape(w,(2,self.batch_size))
            beta = torch.zeros(self.batch_size,1)
            for i in range(len(data)):
                beta[i] = self.target_policy[data[i][0],data[i][1]]/self.behaviour_policy[data[i][0],data[i][1]]
            
            K = torch.zeros(self.batch_size,self.batch_size)
            for i in range(len(data)):
                for j in range(len(data)):
                    K[i][j] = (data[i][2] == data[j][2])
#             print("here")
#             print(w.size())
#             print(beta.size())
#             print(K.size())
            w_0 = torch.reshape(w[0], (self.batch_size,1))
            w_1 = torch.reshape(w[1], (self.batch_size,1))
#             print(w_0)
#             print(beta)
#             print(w_1)
#             print(w_0 * beta - w_1)
#check here only
            delta = w_0 * beta - w_1
            temp1 = torch.matmul(K,delta) #50,50:50,1 = 50, 1
            final_sum = torch.matmul(torch.transpose(delta,0,1),temp1)
#             print(w_0)
#             print(w_1)
#             print(Z_w_state)
#             print(final_sum)
#             print(".........")
            return final_sum
    
    def update_state_distribution_ratio(self):
        self.batch = self.get_batch();
        batch=self.batch;
        temp = 0
        for _ in range(5):
             start = time.time()
             batch = self.get_batch();
             Z_w_state = self.get_w(batch, self.weight_obj, len(batch),1);
             self.w_loss = 0
             self.w_loss = self.get_w_updated(batch,self.weight_obj,len(batch),Z_w_state)
             self.w_loss/=(self.batch_size*self.batch_size);
             temp += self.w_loss
             start = time.time()
             self.optimizerW.zero_grad();
             self.w_loss.backward(); #Improving the forward pass computation
             self.optimizerW.step();
             self.optimizerW.zero_grad();
             if(self.T == self.batch_size):
                    break;
        state_dist=[];    
        for i in range(self.nS):
            w_state = self.weight_obj(i);
            w_s = w_state.item();
            state_dist.append(w_s);
        return np.array(state_dist);

In [6]:
class beh_pol_sd:
    def __init__(self,P,policy,nS,nA):
        self.P = P
        self.policy = policy
        self.nS = nS;
        self.nA = nA;
    
    def onehot(self):
        pol = np.zeros((self.nS,self.nA));
        for i in range(self.nS):
            pol[i][int(self.policy[i])]=1;
        return pol;
    def find_transition_matrix(self,onehot_encode=1):
        if(onehot_encode==1):
            self.policy = self.onehot()
        T_s_s_next = np.zeros((self.nS,self.nS));
        for s in range(self.nS):
            for s_next in range(self.nS):
                for a in range(self.nA):
                    #print(s,s_next,a);
                    #print(T[a,s,s_next]);
                    T_s_s_next[s,s_next]+=self.P[a,s,s_next]*self.policy[s,a];
        return T_s_s_next;
    def state_distribution_simulated(self,onehot_encode=1):
        P_policy = self.find_transition_matrix(onehot_encode)
        #print(P_policy);
        P_dash = np.append(P_policy - np.eye(self.nS),np.ones((self.nS,1)),axis=1);
        #print(P_dash);
        P_last = np.linalg.pinv(np.transpose(P_dash))[:,-1]
        return P_last;

In [7]:
def one_hot(target_policy,nS,nA):
  nPOL = len(target_policy);
  one_hot_target_policy = []
  for i in range(nPOL):
    pol=np.zeros((nS,nA));
    for s in range(nS):
        pol[s,target_policy[i,s]]=1
    one_hot_target_policy.append(pol);
  return np.array(one_hot_target_policy);

In [8]:
def simulate_episode(T,state,behaviour_policy,P,batch_size,run):
  #global P,behaviour_policy,batch_size;
  data={};temp=[];
  for t in range(1,T+1): 
    action = np.argmax(np.random.multinomial(1,behaviour_policy[state,:]))
    next_state = np.argmax(np.random.multinomial(1,P[action,state,:]));
    temp.append([state,action,next_state]);
    state = next_state;
    if(t%batch_size==0):
      #print(t);
      data[int(t/batch_size)-1] = temp[:];
  #print(len(data));
  with open('Data_used_'+str(run),'wb') as f:
    pickle.dump(temp,f);
  f.close();
  return data;

In [9]:
mr_obj = Machine_Replacement(0.7,10,2)
P = mr_obj.gen_probability()
R = mr_obj.gen_expected_reward()

In [10]:
T,runs,lr,batch_size,start,nS,nA,rep_cost,alpha,gamma,beta = get_hyperparameters().ret_hyperparameters();
nPOL = nS-1
# T = 100000
behaviour_policy = np.ones((nS,nA))*0.5
behaviour_policy_state_distribution = beh_pol_sd(P,behaviour_policy,nS,nA).state_distribution_simulated(0);
print(behaviour_policy_state_distribution)

[0.50458716 0.00934741 0.01443759 0.02006927 0.02656227 0.03445916
 0.04481574 0.06012546 0.08863322 0.19696272]


In [11]:
target_policy = np.zeros((nPOL,nS),dtype = np.int8)
#data_dict={0:[],1:[],2:[],3:[],4:[]};
#data_used={0:[],1:[],2:[],3:[],4:[]};
for i in range(nPOL,0,-1):
    target_policy[nPOL-i][-(nPOL-i+1):] = 1
#print(target_policy)
for pol in target_policy:
    dist = beh_pol_sd(P,pol,nS,nA).state_distribution_simulated(1)
    print(dist)

[0.24909499 0.00940638 0.01497343 0.0217392  0.03057075 0.0431587
 0.063406   0.10297516 0.22010939 0.24456599]
[0.31939735 0.01206116 0.0191994  0.02787468 0.03919877 0.05533943
 0.08130114 0.13203795 0.14854269 0.16504743]
[0.36798539 0.01389595 0.02212009 0.03211509 0.04516184 0.0637579
 0.09366901 0.10705029 0.12043158 0.13381287]
[0.40601655 0.01533209 0.02440619 0.03543417 0.0498293  0.07034725
 0.0820718  0.09379634 0.10552088 0.11724542]
[0.43674001 0.01649228 0.02625302 0.03811549 0.05359991 0.06431989
 0.07503988 0.08575986 0.09647984 0.10719982]
[0.46147503 0.01742633 0.02773987 0.04027418 0.05034273 0.06041128
 0.07047982 0.08054837 0.09061692 0.10068546]
[0.48084049 0.01815761 0.02890396 0.03853861 0.04817326 0.05780791
 0.06744256 0.07707721 0.08671187 0.09634652]
[0.49515235 0.01869806 0.02804709 0.03739612 0.04674515 0.05609418
 0.06544321 0.07479224 0.08414127 0.0934903 ]
[0.50458716 0.01834862 0.02752294 0.03669725 0.04587156 0.05504587
 0.06422018 0.0733945  0.082568

In [73]:
lr = 0.01
#T = 50000
#print(target_policy)
S=np.ones((runs,nPOL);
F=np.ones((runs,nPOL);
n = np.zeros((runs,nPOL))
policy_sampled = np.zeros((int(T/batch_size),runs))
one_hot_target_policy = one_hot(target_policy,nS,nA)
for run in range(runs):
    data = simulate_episode(T,start,behaviour_policy,P,batch_size,0);
    all_networks=dict()
    rho_vect = np.zeros(nPOL)
    for i in range(nPOL):
        pol = one_hot_target_policy[i]
        #print(pol)
        all_networks[i] = average_case_distribution(nS,nA,behaviour_policy,start,lr,batch_size)
        all_networks[i].set_target_policy(pol);
    for t in tqdm(range(1,int(T/batch_size)+1)):
        #sampled_policy = np.argmin(np.array([np.random.beta(S[i],F[i]) for i in range(nPOL)]))
        sampled_policy = pol
        all_networks[sampled_policy].set_batch(data[t-1]);
        c = all_networks[sampled_policy].update_state_distribution_ratio()*behaviour_policy_state_distribution;
        sd = c/np.sum(c);
        rew = sum([sd[state] *R[int(target_policy[sampled_policy,state]),state] for state in range(nS)]);
        rho_vect[sampled_policy] = alpha*rew + (1-alpha)*rho_vect[sampled_policy];
        S[run,sampled_policy]+=rho_vect[sampled_policy];
        F[run,sampled_policy]+=(1-rho_vect[sampled_policy])
        policy_sampled[t-1,run] = sampled_policy;
        n[run,sampled_policy]+=1;
    #print(pol,"========>",S[sampled_policy]/(S[sampled_policy]+F[sampled_policy]))
    #print(sd);
pd.DataFrame(policy_sampled).to_excel('Policy sampled Thompson Sampling final version_1st_var'+str(nS)+'.xlsx');

100%|███████████████████████████████████████| 1000/1000 [13:06<00:00,  1.27it/s]


[0.25527078 0.01247125 0.01692607 0.02756928 0.03633656 0.04573385
 0.06506076 0.10301017 0.20668818 0.2309331 ]


100%|███████████████████████████████████████| 1000/1000 [13:03<00:00,  1.28it/s]


[0.33832551 0.01360815 0.01948294 0.02525056 0.03780359 0.05633297
 0.07083274 0.12925706 0.14155256 0.16755393]


100%|███████████████████████████████████████| 1000/1000 [13:06<00:00,  1.27it/s]


[0.39179399 0.014248   0.01980788 0.02548619 0.04146277 0.05782922
 0.08025371 0.09471993 0.11646887 0.15792944]


100%|███████████████████████████████████████| 1000/1000 [13:08<00:00,  1.27it/s]


[0.41337559 0.01488642 0.02210817 0.03019002 0.04571123 0.06576535
 0.07633778 0.09028289 0.10658713 0.13475542]


100%|███████████████████████████████████████| 1000/1000 [13:44<00:00,  1.21it/s]


[0.45141559 0.01351027 0.02124598 0.02944617 0.04620017 0.05766113
 0.07032715 0.07954814 0.10475341 0.12589198]


100%|███████████████████████████████████████| 1000/1000 [13:50<00:00,  1.20it/s]


[0.48064017 0.01374404 0.02286471 0.0319161  0.04361042 0.05374572
 0.06504951 0.0824424  0.08643887 0.11954806]


100%|███████████████████████████████████████| 1000/1000 [13:24<00:00,  1.24it/s]


[0.48608117 0.01406211 0.0230646  0.03633346 0.03971624 0.04802136
 0.0728385  0.0750586  0.0902665  0.11455746]


100%|███████████████████████████████████████| 1000/1000 [13:24<00:00,  1.24it/s]


[0.49498419 0.0139151  0.02104728 0.03252242 0.04224776 0.05175884
 0.06764219 0.07339245 0.09417463 0.10831514]


100%|███████████████████████████████████████| 1000/1000 [13:24<00:00,  1.24it/s]

[0.49695384 0.01350032 0.02045462 0.03538268 0.03920179 0.05143226
 0.05727784 0.07803726 0.09016857 0.11759082]





In [None]:
S=np.ones((runs,nPOL));
F=np.ones((runs,nPOL));
n = np.zeros((runs,nPOL))
policy_sampled = np.zeros((int(T/batch_size),runs))
one_hot_target_policy = one_hot(target_policy,nS,nA)
for run in range(runs):
    data = simulate_episode(T,start,behaviour_policy,P,batch_size,0);
    all_networks=dict()
    rho_vect = np.zeros(nPOL)
    for i in range(nPOL):
        pol = one_hot_target_policy[i]
        #print(pol)
        all_networks[i] = average_case_distribution(nS,nA,behaviour_policy,start,lr,batch_size)
        all_networks[i].set_target_policy(pol);
    for t in range(1,int(T/batch_size)+1):
        sampled_policy = np.argmin(np.array([np.random.beta(S[run,i],F[run,i]) for i in range(nPOL)]))
        #sampled_policy = pol
        all_networks[sampled_policy].set_batch(data[t-1]);
        c = all_networks[sampled_policy].update_state_distribution_ratio()*behaviour_policy_state_distribution;
        sd = c/np.sum(c);
        rew = sum([sd[state] *R[int(target_policy[sampled_policy,state]),state] for state in range(nS)]);
        rho_vect[sampled_policy] = alpha*rew + (1-alpha)*rho_vect[sampled_policy];
        S[run,sampled_policy]+=rho_vect[sampled_policy];
        F[run,sampled_policy]+=(1-rho_vect[sampled_policy])
        policy_sampled[t-1,run] = sampled_policy;
        n[run,sampled_policy]+=1;
    #print(pol,"========>",S[sampled_policy]/(S[sampled_policy]+F[sampled_policy]))
    #print(sd);
pd.DataFrame(policy_sampled).to_excel('Policy sampled Thompson Sampling final version_1st_var'+str(nS)+'.xlsx');

100%|█████████████████████████████████████| 10000/10000 [12:55<00:00, 12.89it/s]


In [27]:
for i in range(nPOL):
    print("Policy number "+str(i));
    print(n[i]);
    print(S[i]/(S[i]+F[i]))
    print("************");

[ 34.  50.  81. 125. 143. 158. 134. 143. 132.]
