In [None]:
# Colab magic: select TensorFlow 1.x, which Stable Baselines 2.x requires.
%tensorflow_version 1.x

# Install the pinned Stable Baselines release (with the MPI extras) used by the cells below.
!pip install stable-baselines[mpi]==2.10.0

TensorFlow 1.x selected.
Collecting stable-baselines[mpi]==2.10.0
[?25l  Downloading https://files.pythonhosted.org/packages/e5/fe/db8159d4d79109c6c8942abe77c7ba6b6e008c32ae55870a35e73fa10db3/stable_baselines-2.10.0-py3-none-any.whl (248kB)
[K     |████████████████████████████████| 256kB 2.8MB/s 
Installing collected packages: stable-baselines
  Found existing installation: stable-baselines 2.2.1
    Uninstalling stable-baselines-2.2.1:
      Successfully uninstalled stable-baselines-2.2.1
Successfully installed stable-baselines-2.10.0


In [None]:
from stable_baselines.common.env_checker import check_env

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [None]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised environment that follows the gym interface.

    Each step distributes one batch of vaccine vials between the states and
    lowers every state's susceptible count by the vials it received.

    Observation:
        Type: Box(5, 5) -- one row per state (Assam, Delhi, Jh, Maha, Naga)
                                                 Min         Max
        0   Confirmed cases                      0           Inf
        1   Death rate (DR)                      0           100
        2   Recovery rate (RR)                   0           100
        3   Susceptible population               0           Inf
        4   Total population                     0           Inf

    Actions:
        Type: Box(s)
        Vector [n1, n2, ..., ns]; n_i in [0, 100] is the percentage of the
        batch sent to state i.
    """
    metadata = {'render.modes': ['human']}

    # Initial conditions as of Sept 1.
    # Rows:    Assam, Delhi, Jh, Maha, Naga
    # Columns: Confirmed, DR, RR, Susc, Population
    INITIAL_CONDITIONS = np.array([
        (251754, 1.029973704, 45.49679449, 30885978, 31205576),
        (494104, 1.029945113, 46.22893156, 16165867, 16787941),
        (133576, 1.029376535, 45.96409535, 32819208, 32988134),
        (4631183, 1.030103971, 43.0974548, 106380541, 112374333),
        (27048, 1.027802425, 43.34516415, 1943497, 1978502),
    ], dtype=np.float64)

    def __init__(self, s, episodes, total, verbose=True):
        """
        Args:
            s: number of states; must equal the number of rows of
                INITIAL_CONDITIONS.
            episodes: number of steps after which the episode is flagged done.
            total: number of vials available in one batch (batch size).
            verbose: when True (default) step() and get_reward() print their
                progress; pass False to silence them during long training runs.

        Raises:
            ValueError: if `s` does not match the number of rows of
                INITIAL_CONDITIONS.
        """
        n_rows, _ = self.INITIAL_CONDITIONS.shape
        if s != n_rows:
            raise ValueError(
                "s must equal the number of rows in INITIAL_CONDITIONS "
                "({}), got {}".format(n_rows, s))

        self.states = s  # no of independent simulations to be run
        self.verbose = verbose

        # The rate columns hold values such as 1.03 and 45.5, so their upper
        # bound has to be 100; a bound of 1 makes every observation fall
        # outside the observation space. The shape is inferred from the bounds.
        low = np.zeros(self.INITIAL_CONDITIONS.shape, dtype=np.float64)
        high = np.tile(
            np.array([np.inf, 100, 100, np.inf, np.inf], dtype=np.float64),
            (n_rows, 1))
        self.observation_space = spaces.Box(low, high, dtype=np.float64)

        # actions are vectors of the form [n1, n2, n3,...nk] for k states
        # (np.float64 is what the removed np.float alias resolved to)
        self.action_space = spaces.Box(
            low=np.zeros(s, dtype=np.float64),
            high=np.full(s, 100, dtype=np.float64),
            dtype=np.float64)

        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self._batch_size = total  # remembered so reset() can restore it
        self.total = total  # total number of vials available in 1 batch = batch size
        self.episodes = episodes
        self.received = [0] * self.states
        self.states_cond = None  # stays None until reset() is called
        self.action_list = []
        self.gamma = 0.20
        self.epsilon = 0.4  # probability of replacing the action with a random one
        self.susc = [0] * self.states

    def _log(self, *args):
        """Print `args` only when the environment was created with verbose=True."""
        if self.verbose:
            print(*args)

    def get_discrete_int(self, n):
        """Truncate `n` to an int (vials are indivisible)."""
        return int(n)

    def reset(self):
        """
        Restore the initial figures of every state and the batch size given to
        the constructor, and set action_list to an equal split between states.

        Returns:
            The (5, 5) float64 array of initial conditions.
        """
        self.curr_step = 0
        self.done = False
        self.total = self._batch_size
        # Copy so the class-level table is never modified by step().
        self.states_cond = self.INITIAL_CONDITIONS.copy()
        # store the actions in an array
        self.action_list = np.full(self.states, 100 / self.states)
        return self.states_cond

    def step(self, action):
        """
        Distribute one batch according to `action` and update the states.

        Assumptions:
        1. Vaccine has 100% efficacy- it is preventive in its action, not curative.
        2. Vaccine is passive, not antigen based- introduces antibodies when administered.
        3. 1 person requires 1 vial (dose) only.

        Args:
            action: sequence of `s` percentages; values are clipped to the
                action-space bounds. The caller's object is never modified.

        Returns:
            (observation, reward, done, info) as required by the gym interface.

        Raises:
            RuntimeError: if called before reset().
        """
        if self.states_cond is None:
            raise RuntimeError("You need to reset() the environment before calling step()!")

        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
        self._log("Are we done?", self.done)
        self._log('Observation Space for this episode is: ', self.states_cond)

        # Work on a private, clipped copy: the caller's action is left intact
        # and out-of-range values cannot overflow math.exp() in get_reward().
        action = np.clip(np.array(action, dtype=np.float64),
                         self.action_space.low, self.action_space.high)

        # exploration vs exploitation
        if random.uniform(0, 1) < self.epsilon:
            action = np.random.randint(
                0, 100 // self.states, size=self.states).astype(np.float64)

        # action_list stores only the most recently used action values
        self.action_list = action
        self._log("Distribution set: ", self.action_list)

        # no of units distributed to respective states
        self.received = [self.total * self.action_list[i] / 100
                         for i in range(self.states)]

        # simulation: new count of susceptible people
        self.susc = [self.states_cond[i, 3] - self.get_discrete_int(self.received[i])
                     for i in range(self.states)]
        self._log("New Count of Susceptible people: ", self.susc)

        # np.array() copies, so observations returned earlier are not altered.
        self.states_cond = np.array(self.states_cond)
        self.states_cond[:, 3] = self.susc

        # reward is computed on every step
        reward = self.get_reward()

        # increment episode
        self.curr_step += 1

        info = {'action_list': self.action_list, 'episode_number': self.curr_step}
        return self.states_cond, reward, self.done, info

    def get_reward(self):
        """
        Return the sum over states of susceptible * exp(-action share).

        NOTE(review): every term shrinks as the share grows, so the reward is
        largest when no vials are distributed -- confirm this is the intended
        sign before trusting a trained policy.
        """
        reward = [self.states_cond[i, 3] * math.exp(-self.action_list[i])
                  for i in range(self.states)]
        self._log("Reward distribution: ", reward)
        reward = sum(reward)
        self._log("Reward: ", reward)
        return reward

    # render() is not implemented.

    def close(self):
        pass



In [None]:
# Configuration: number of states, steps per episode, vials per batch.
locations, episodes, batch_sz = 5, 10, 10000

# Build the custom environment.
env = StatesEnv(s=locations, episodes=episodes, total=batch_sz)

# Raises an error if the environment does not follow the gym interface.
check_env(env, warn=True)



AssertionError: ignored

In [None]:
print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

# Repeats the same action over and over -- no learning, mere demonstration.
# reset() first so the run starts from the initial conditions instead of
# whatever state an earlier cell left behind.
obs = env.reset()
n_steps = 10
equal_split = [20, 20, 20, 20, 20]
for step in range(n_steps):
    print("Step {}".format(step + 1))
    # Pass a fresh list every time: step() may overwrite entries of the action
    # it receives, which would otherwise change the action for later steps.
    obs, reward, done, info = env.step(list(equal_split))
    print('obs=', obs, 'reward=', reward, 'done=', done)
    if done:
        print("Goal reached!", "reward=", reward)
        break

Box(5, 5)
Box(5,)
[74.94685154 57.02089102  8.71345956 32.67688245 78.75103319]
Step 1
Are we done? True
Observation Space for this episode is:  [[8.01880000e+04 3.19000349e-02 6.14817678e-01 1.67009530e+07
  1.67879410e+07]
 [1.59133000e+05 4.57039081e-02 5.29399936e-01 1.12211300e+08
  1.12374333e+08]
 [6.81600000e+03 1.32042254e-03 6.60211268e-01 3.11945600e+07
  3.12055760e+07]
 [3.87000000e+02 0.00000000e+00 4.23772610e-01 1.97521500e+06
  1.97850200e+06]
 [2.33900000e+03 5.13039761e-03 7.37067123e-01 3.29819950e+07
  3.29881340e+07]]
Distribution set:  [7, 16, 9, 8, 9]
New Count of Susceptible people:  [16700253.0, 112209700.0, 31193660.0, 1974415.0, 32981095.0]
Reward:  [0, 0, 0, 0, 0]
Reward:  [15228.659530897707, 12.62753819469565, 3849.6034693464926, 662.3424444701379, 4070.1904725141667]
Reward:  23823.423455423203
obs= [[8.01880000e+04 3.19000349e-02 6.14817678e-01 1.67002530e+07
  1.67879410e+07]
 [1.59133000e+05 4.57039081e-02 5.29399936e-01 1.12209700e+08
  1.12374333e+0

In [None]:
from stable_baselines import PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env

# Instantiate and wrap the env.
# The factory builds the environment itself rather than returning the outer
# `env` variable, which is rebound to the wrapper on this very line.
env = make_vec_env(lambda: StatesEnv(5, 50, 10000), n_envs=1)



In [None]:
# Train the agent
# The factory builds the environment itself rather than returning the outer
# `env` variable, which is rebound to the wrapper on this very line.
env = make_vec_env(lambda: StatesEnv(5, 50, 10000), n_envs=1)
model = ACKTR('MlpPolicy', env, verbose=1).learn(500)
# Round-trip through disk so the saved file is the model used afterwards.
model.save("acktr_")
model = ACKTR.load("acktr_")

In [None]:
# Test the trained agent
# The factory builds the environment itself rather than returning the outer
# `env` variable, which is rebound to the wrapper on this very line.
env = make_vec_env(lambda: StatesEnv(5, 50, 10000), n_envs=1)

print("Training begins here")
model = ACKTR('MlpPolicy', env, verbose=1).learn(500)
model.save("acktr_")
model = ACKTR.load("acktr_")

obs = env.reset()
n_steps = 50
print("Testing begins here")
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    #env.render(mode='console')
    # The vectorised env returns one done flag per environment; there is
    # exactly one environment here, so test that flag explicitly.
    if done[0]:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Reward:  32926774.806749165
Are we done? False
Observation Space for this episode is:  [[2.51754000e+05 1.02997370e+00 4.54967945e+01 3.08837340e+07
  3.12055760e+07]
 [4.94104000e+05 1.02994511e+00 4.62289316e+01 1.61611120e+07
  1.67879410e+07]
 [1.33576000e+05 1.02937653e+00 4.59640954e+01 3.28186770e+07
  3.29881340e+07]
 [4.63118300e+06 1.03010397e+00 4.30974548e+01 1.06377446e+08
  1.12374333e+08]
 [2.70480000e+04 1.02780242e+00 4.33451642e+01 1.93923800e+06
  1.97850200e+06]]
Distribution set:  [ 3.  5. 16. 11. 12.]
New Count of Susceptible people:  [30883434.0, 16160612.0, 32817077.0, 106376346.0, 1938038.0]
Reward distribution:  [1537595.6399924138, 108889.34712878459, 3.6930754939703796, 1776.6659020516456, 11.907717020819497]
Reward:  1648277.2538157646
Are we done? False
Observation Space for this episode is:  [[2.51754000e+05 1.02997370e+00 4.54967945e+01 3.08834340e+07
  3.12055760e+07]
 [4.94104000e+05 1.02

In [None]:

import os
import sys

# Report the interpreter version. `sys` is imported directly because `os.sys`
# only exists as a side effect of how the os module is implemented.
print(sys.version)

3.6.9 (default, Jul 17 2020, 12:50:27) 
[GCC 8.4.0]




---

Sept 2, 2020

---





In [None]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised environment that follows the gym interface.

    Each step distributes one batch of vaccine vials between the states and
    lowers every state's susceptible count by the vials it received.

    Observation:
        Type: Box(5, 5) -- one row per state (Assam, Delhi, Jh, Maha, Naga)
                                                 Min         Max
        0   Confirmed cases                      0           Inf
        1   Death rate (DR)                      0           100
        2   Recovery rate (RR)                   0           100
        3   Susceptible population               0           Inf
        4   Total population                     0           Inf

    Actions:
        Type: Box(s)
        Vector [n1, n2, ..., ns]; n_i in [0, 100] is the percentage of the
        batch sent to state i.
    """
    metadata = {'render.modes': ['human']}

    # Initial conditions as of Sept 2.
    # Rows:    Assam, Delhi, Jh, Maha, Naga
    # Columns: Confirmed, DR, RR, Susc, Population
    INITIAL_CONDITIONS = np.array([
        (266936, 1.029834867, 45.50566428, 30866744, 31205576),
        (522733, 1.029971324, 46.26051923, 16130100, 16787941),
        (141489, 1.029762031, 45.96894458, 32809212, 32988134),
        (4935595, 1.030088571, 43.14478801, 105991154, 112374333),
        (28826, 1.02685076, 43.36363006, 1941204, 1978502),
    ], dtype=np.float64)

    def __init__(self, s, episodes, total, verbose=True):
        """
        Args:
            s: number of states; must equal the number of rows of
                INITIAL_CONDITIONS.
            episodes: number of steps after which the episode is flagged done.
            total: number of vials available in one batch (batch size).
            verbose: when True (default) step() and get_reward() print their
                progress; pass False to silence them during long training runs.

        Raises:
            ValueError: if `s` does not match the number of rows of
                INITIAL_CONDITIONS.
        """
        n_rows, _ = self.INITIAL_CONDITIONS.shape
        if s != n_rows:
            raise ValueError(
                "s must equal the number of rows in INITIAL_CONDITIONS "
                "({}), got {}".format(n_rows, s))

        self.states = s  # no of independent simulations to be run
        self.verbose = verbose

        # The rate columns hold values such as 1.03 and 45.5, so their upper
        # bound has to be 100; a bound of 1 makes every observation fall
        # outside the observation space. The shape is inferred from the bounds.
        low = np.zeros(self.INITIAL_CONDITIONS.shape, dtype=np.float64)
        high = np.tile(
            np.array([np.inf, 100, 100, np.inf, np.inf], dtype=np.float64),
            (n_rows, 1))
        self.observation_space = spaces.Box(low, high, dtype=np.float64)

        # actions are vectors of the form [n1, n2, n3,...nk] for k states
        # (np.float64 is what the removed np.float alias resolved to)
        self.action_space = spaces.Box(
            low=np.zeros(s, dtype=np.float64),
            high=np.full(s, 100, dtype=np.float64),
            dtype=np.float64)

        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self._batch_size = total  # remembered so reset() can restore it
        self.total = total  # total number of vials available in 1 batch = batch size
        self.episodes = episodes
        self.received = [0] * self.states
        self.states_cond = None  # stays None until reset() is called
        self.action_list = []
        self.gamma = 0.20
        self.epsilon = 0.4  # probability of replacing the action with a random one
        self.susc = [0] * self.states

    def _log(self, *args):
        """Print `args` only when the environment was created with verbose=True."""
        if self.verbose:
            print(*args)

    def get_discrete_int(self, n):
        """Truncate `n` to an int (vials are indivisible)."""
        return int(n)

    def reset(self):
        """
        Restore the initial figures of every state and the batch size given to
        the constructor, and set action_list to an equal split between states.

        Returns:
            The (5, 5) float64 array of initial conditions.
        """
        self.curr_step = 0
        self.done = False
        self.total = self._batch_size
        # Copy so the class-level table is never modified by step().
        self.states_cond = self.INITIAL_CONDITIONS.copy()
        # store the actions in an array
        self.action_list = np.full(self.states, 100 / self.states)
        return self.states_cond

    def step(self, action):
        """
        Distribute one batch according to `action` and update the states.

        Assumptions:
        1. Vaccine has 100% efficacy- it is preventive in its action, not curative.
        2. Vaccine is passive, not antigen based- introduces antibodies when administered.
        3. 1 person requires 1 vial (dose) only.

        Args:
            action: sequence of `s` percentages; values are clipped to the
                action-space bounds. The caller's object is never modified.

        Returns:
            (observation, reward, done, info) as required by the gym interface.

        Raises:
            RuntimeError: if called before reset().
        """
        if self.states_cond is None:
            raise RuntimeError("You need to reset() the environment before calling step()!")

        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
        self._log("Are we done?", self.done)
        self._log('Observation Space for this episode is: ', self.states_cond)

        # Work on a private, clipped copy: the caller's action is left intact
        # and out-of-range values cannot overflow math.exp() in get_reward().
        action = np.clip(np.array(action, dtype=np.float64),
                         self.action_space.low, self.action_space.high)

        # exploration vs exploitation
        if random.uniform(0, 1) < self.epsilon:
            action = np.random.randint(
                0, 100 // self.states, size=self.states).astype(np.float64)

        # action_list stores only the most recently used action values
        self.action_list = action
        self._log("Distribution set: ", self.action_list)

        # no of units distributed to respective states
        self.received = [self.total * self.action_list[i] / 100
                         for i in range(self.states)]

        # simulation: new count of susceptible people
        self.susc = [self.states_cond[i, 3] - self.get_discrete_int(self.received[i])
                     for i in range(self.states)]
        self._log("New Count of Susceptible people: ", self.susc)

        # np.array() copies, so observations returned earlier are not altered.
        self.states_cond = np.array(self.states_cond)
        self.states_cond[:, 3] = self.susc

        # reward is computed on every step
        reward = self.get_reward()

        # increment episode
        self.curr_step += 1

        info = {'action_list': self.action_list, 'episode_number': self.curr_step}
        return self.states_cond, reward, self.done, info

    def get_reward(self):
        """
        Return the sum over states of susceptible * exp(-action share).

        NOTE(review): every term shrinks as the share grows, so the reward is
        largest when no vials are distributed -- confirm this is the intended
        sign before trusting a trained policy.
        """
        reward = [self.states_cond[i, 3] * math.exp(-self.action_list[i])
                  for i in range(self.states)]
        self._log("Reward distribution: ", reward)
        reward = sum(reward)
        self._log("Reward: ", reward)
        return reward

    # render() is not implemented.

    def close(self):
        pass



In [None]:
#Training the agent
# The factory builds the environment itself rather than returning the outer
# `env` variable, which is rebound to the wrapper on this very line.
env = make_vec_env(lambda: StatesEnv(5, 50, 10000), n_envs=1)

print("Training begins here")
model = ACKTR('MlpPolicy', env, verbose=1).learn(500)
model.save("acktr_")
model = ACKTR.load("acktr_")
# Test the trained agent
obs = env.reset()
n_steps = 50
print("Testing begins here")
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    #env.render(mode='console')
    # The vectorised env returns one done flag per environment; there is
    # exactly one environment here, so test that flag explicitly.
    if done[0]:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Observation Space for this episode is:  [[2.66936000e+05 1.02983487e+00 4.55056643e+01 3.08631010e+07
  3.12055760e+07]
 [5.22733000e+05 1.02997132e+00 4.62605192e+01 1.61270350e+07
  1.67879410e+07]
 [1.41489000e+05 1.02976203e+00 4.59689446e+01 3.28086460e+07
  3.29881340e+07]
 [4.93559500e+06 1.03008857e+00 4.31447880e+01 1.05987697e+08
  1.12374333e+08]
 [2.88260000e+04 1.02685076e+00 4.33636301e+01 1.93750000e+06
  1.97850200e+06]]
Distribution set:  [0.02856561 0.         0.         0.         1.16056716]
New Count of Susceptible people:  [30863099.0, 16127035.0, 32808646.0, 105987697.0, 1937384.0]
Reward distribution:  [29993948.816060957, 16127035.0, 32808646.0, 105987697.0, 606998.74532431]
Reward:  185524325.56138527
Are we done? False
Observation Space for this episode is:  [[2.66936000e+05 1.02983487e+00 4.55056643e+01 3.08630990e+07
  3.12055760e+07]
 [5.22733000e+05 1.02997132e+00 4.62605192e+01 1.61270350e+



---

Sept 3, 2020

---



In [None]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised environment that follows the gym interface.

    Each step distributes one batch of vaccine vials between the states and
    lowers every state's susceptible count by the vials it received.

    Observation:
        Type: Box(5, 5) -- one row per state (Assam, Delhi, Jh, Maha, Naga)
                                                 Min         Max
        0   Confirmed cases                      0           Inf
        1   Death rate (DR)                      0           100
        2   Recovery rate (RR)                   0           100
        3   Susceptible population               0           Inf
        4   Total population                     0           Inf

    Actions:
        Type: Box(s)
        Vector [n1, n2, ..., ns]; n_i in [0, 100] is the percentage of the
        batch sent to state i.
    """
    metadata = {'render.modes': ['human']}

    # Initial conditions as of Sept 3.
    # Rows:    Assam, Delhi, Jh, Maha, Naga
    # Columns: Confirmed, DR, RR, Susc, Population
    INITIAL_CONDITIONS = np.array([
        (283026, 1.029940712, 45.51489969, 30846367, 31205576),
        (552959, 1.030094455, 46.29366734, 16092371, 16787941),
        (149868, 1.029572691, 45.97379027, 32798628, 32988134),
        (5258962, 1.030089208, 43.19523511, 105578143, 112374333),
        (30721, 1.02861235, 43.38400443, 1938764, 1978502),
    ], dtype=np.float64)

    def __init__(self, s, episodes, total, verbose=True):
        """
        Args:
            s: number of states; must equal the number of rows of
                INITIAL_CONDITIONS.
            episodes: number of steps after which the episode is flagged done.
            total: number of vials available in one batch (batch size).
            verbose: when True (default) step() and get_reward() print their
                progress; pass False to silence them during long training runs.

        Raises:
            ValueError: if `s` does not match the number of rows of
                INITIAL_CONDITIONS.
        """
        n_rows, _ = self.INITIAL_CONDITIONS.shape
        if s != n_rows:
            raise ValueError(
                "s must equal the number of rows in INITIAL_CONDITIONS "
                "({}), got {}".format(n_rows, s))

        self.states = s  # no of independent simulations to be run
        self.verbose = verbose

        # The rate columns hold values such as 1.03 and 45.5, so their upper
        # bound has to be 100; a bound of 1 makes every observation fall
        # outside the observation space. The shape is inferred from the bounds.
        low = np.zeros(self.INITIAL_CONDITIONS.shape, dtype=np.float64)
        high = np.tile(
            np.array([np.inf, 100, 100, np.inf, np.inf], dtype=np.float64),
            (n_rows, 1))
        self.observation_space = spaces.Box(low, high, dtype=np.float64)

        # actions are vectors of the form [n1, n2, n3,...nk] for k states
        # (np.float64 is what the removed np.float alias resolved to)
        self.action_space = spaces.Box(
            low=np.zeros(s, dtype=np.float64),
            high=np.full(s, 100, dtype=np.float64),
            dtype=np.float64)

        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self._batch_size = total  # remembered so reset() can restore it
        self.total = total  # total number of vials available in 1 batch = batch size
        self.episodes = episodes
        self.received = [0] * self.states
        self.states_cond = None  # stays None until reset() is called
        self.action_list = []
        self.gamma = 0.20
        self.epsilon = 0.4  # probability of replacing the action with a random one
        self.susc = [0] * self.states

    def _log(self, *args):
        """Print `args` only when the environment was created with verbose=True."""
        if self.verbose:
            print(*args)

    def get_discrete_int(self, n):
        """Truncate `n` to an int (vials are indivisible)."""
        return int(n)

    def reset(self):
        """
        Restore the initial figures of every state and the batch size given to
        the constructor, and set action_list to an equal split between states.

        Returns:
            The (5, 5) float64 array of initial conditions.
        """
        self.curr_step = 0
        self.done = False
        self.total = self._batch_size
        # Copy so the class-level table is never modified by step().
        self.states_cond = self.INITIAL_CONDITIONS.copy()
        # store the actions in an array
        self.action_list = np.full(self.states, 100 / self.states)
        return self.states_cond

    def step(self, action):
        """
        Distribute one batch according to `action` and update the states.

        Assumptions:
        1. Vaccine has 100% efficacy- it is preventive in its action, not curative.
        2. Vaccine is passive, not antigen based- introduces antibodies when administered.
        3. 1 person requires 1 vial (dose) only.

        Args:
            action: sequence of `s` percentages; values are clipped to the
                action-space bounds. The caller's object is never modified.

        Returns:
            (observation, reward, done, info) as required by the gym interface.

        Raises:
            RuntimeError: if called before reset().
        """
        if self.states_cond is None:
            raise RuntimeError("You need to reset() the environment before calling step()!")

        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
        self._log("Are we done?", self.done)
        self._log('Observation Space for this episode is: ', self.states_cond)

        # Work on a private, clipped copy: the caller's action is left intact
        # and out-of-range values cannot overflow math.exp() in get_reward().
        action = np.clip(np.array(action, dtype=np.float64),
                         self.action_space.low, self.action_space.high)

        # exploration vs exploitation
        if random.uniform(0, 1) < self.epsilon:
            action = np.random.randint(
                0, 100 // self.states, size=self.states).astype(np.float64)

        # action_list stores only the most recently used action values
        self.action_list = action
        self._log("Distribution set: ", self.action_list)

        # no of units distributed to respective states
        self.received = [self.total * self.action_list[i] / 100
                         for i in range(self.states)]

        # simulation: new count of susceptible people
        self.susc = [self.states_cond[i, 3] - self.get_discrete_int(self.received[i])
                     for i in range(self.states)]
        self._log("New Count of Susceptible people: ", self.susc)

        # np.array() copies, so observations returned earlier are not altered.
        self.states_cond = np.array(self.states_cond)
        self.states_cond[:, 3] = self.susc

        # reward is computed on every step
        reward = self.get_reward()

        # increment episode
        self.curr_step += 1

        info = {'action_list': self.action_list, 'episode_number': self.curr_step}
        return self.states_cond, reward, self.done, info

    def get_reward(self):
        """
        Return the sum over states of susceptible * exp(-action share).

        NOTE(review): every term shrinks as the share grows, so the reward is
        largest when no vials are distributed -- confirm this is the intended
        sign before trusting a trained policy.
        """
        reward = [self.states_cond[i, 3] * math.exp(-self.action_list[i])
                  for i in range(self.states)]
        self._log("Reward distribution: ", reward)
        reward = sum(reward)
        self._log("Reward: ", reward)
        return reward

    # render() is not implemented.

    def close(self):
        pass



In [None]:
#Training the agent
# The factory builds the environment itself rather than returning the outer
# `env` variable, which is rebound to the wrapper on this very line.
env = make_vec_env(lambda: StatesEnv(5, 50, 10000), n_envs=1)

print("Training begins here")
model = ACKTR('MlpPolicy', env, verbose=1).learn(500)
model.save("acktr_")
model = ACKTR.load("acktr_")
# Test the trained agent
obs = env.reset()
n_steps = 50
print("Testing begins here")
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    #env.render(mode='console')
    # The vectorised env returns one done flag per environment; there is
    # exactly one environment here, so test that flag explicitly.
    if done[0]:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Reward:  145103411.0690245
Are we done? False
Observation Space for this episode is:  [[2.83026000e+05 1.02994071e+00 4.55148997e+01 3.08435140e+07
  3.12055760e+07]
 [5.52959000e+05 1.03009445e+00 4.62936673e+01 1.60899590e+07
  1.67879410e+07]
 [1.49868000e+05 1.02957269e+00 4.59737903e+01 3.27959420e+07
  3.29881340e+07]
 [5.25896200e+06 1.03008921e+00 4.31952351e+01 1.05572966e+08
  1.12374333e+08]
 [3.07210000e+04 1.02861235e+00 4.33840044e+01 1.93621600e+06
  1.97850200e+06]]
Distribution set:  [17.  5. 11. 19.  2.]
New Count of Susceptible people:  [30841814.0, 16089459.0, 32794842.0, 105571066.0, 1936016.0]
Reward distribution:  [1.2768318909435643, 108409.92198595866, 547.7296385473816, 0.5914931924918118, 262011.27371061398]
Reward:  370970.79366020346
Are we done? False
Observation Space for this episode is:  [[2.83026000e+05 1.02994071e+00 4.55148997e+01 3.08418140e+07
  3.12055760e+07]
 [5.52959000e+05 1.0300



---
Sept 4, 2020


---



In [None]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised environment that follows the gym interface.

    Each step distributes one batch of vaccine vials between the states and
    lowers every state's susceptible count by the vials it received.

    Observation:
        Type: Box(5, 5) -- one row per state (Assam, Delhi, Jh, Maha, Naga)
                                                 Min         Max
        0   Confirmed cases                      0           Inf
        1   Death rate (DR)                      0           100
        2   Recovery rate (RR)                   0           100
        3   Susceptible population               0           Inf
        4   Total population                     0           Inf

    Actions:
        Type: Box(s)
        Vector [n1, n2, ..., ns]; n_i in [0, 100] is the percentage of the
        batch sent to state i.
    """
    metadata = {'render.modes': ['human']}

    # Initial conditions, presumably as of Sept 4: the notebook section is
    # headed "Sept 4, 2020" and the figures differ from the Sept 3 table, but
    # the original comment here still said "Sept 3" -- TODO confirm.
    # Rows:    Assam, Delhi, Jh, Maha, Naga
    # Columns: Confirmed, DR, RR, Susc, Population
    INITIAL_CONDITIONS = np.array([
        (300075, 1.030075814, 45.5249521, 30824781, 31205576),
        (584861, 1.029988322, 46.32912778, 16052586, 16787941),
        (158741, 1.029979652, 45.97866966, 32787421, 32988134),
        (5602319, 1.030091289, 43.24898314, 105140299, 112374333),
        (32737, 1.029416257, 43.40348841, 1936168, 1978502),
    ], dtype=np.float64)

    def __init__(self, s, episodes, total, verbose=True):
        """
        Args:
            s: number of states; must equal the number of rows of
                INITIAL_CONDITIONS.
            episodes: number of steps after which the episode is flagged done.
            total: number of vials available in one batch (batch size).
            verbose: when True (default) step() and get_reward() print their
                progress; pass False to silence them during long training runs.

        Raises:
            ValueError: if `s` does not match the number of rows of
                INITIAL_CONDITIONS.
        """
        n_rows, _ = self.INITIAL_CONDITIONS.shape
        if s != n_rows:
            raise ValueError(
                "s must equal the number of rows in INITIAL_CONDITIONS "
                "({}), got {}".format(n_rows, s))

        self.states = s  # no of independent simulations to be run
        self.verbose = verbose

        # The rate columns hold values such as 1.03 and 45.5, so their upper
        # bound has to be 100; a bound of 1 makes every observation fall
        # outside the observation space. The shape is inferred from the bounds.
        low = np.zeros(self.INITIAL_CONDITIONS.shape, dtype=np.float64)
        high = np.tile(
            np.array([np.inf, 100, 100, np.inf, np.inf], dtype=np.float64),
            (n_rows, 1))
        self.observation_space = spaces.Box(low, high, dtype=np.float64)

        # actions are vectors of the form [n1, n2, n3,...nk] for k states
        # (np.float64 is what the removed np.float alias resolved to)
        self.action_space = spaces.Box(
            low=np.zeros(s, dtype=np.float64),
            high=np.full(s, 100, dtype=np.float64),
            dtype=np.float64)

        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self._batch_size = total  # remembered so reset() can restore it
        self.total = total  # total number of vials available in 1 batch = batch size
        self.episodes = episodes
        self.received = [0] * self.states
        self.states_cond = None  # stays None until reset() is called
        self.action_list = []
        self.gamma = 0.20
        self.epsilon = 0.4  # probability of replacing the action with a random one
        self.susc = [0] * self.states

    def _log(self, *args):
        """Print `args` only when the environment was created with verbose=True."""
        if self.verbose:
            print(*args)

    def get_discrete_int(self, n):
        """Truncate `n` to an int (vials are indivisible)."""
        return int(n)

    def reset(self):
        """
        Restore the initial figures of every state and the batch size given to
        the constructor, and set action_list to an equal split between states.

        Returns:
            The (5, 5) float64 array of initial conditions.
        """
        self.curr_step = 0
        self.done = False
        self.total = self._batch_size
        # Copy so the class-level table is never modified by step().
        self.states_cond = self.INITIAL_CONDITIONS.copy()
        # store the actions in an array
        self.action_list = np.full(self.states, 100 / self.states)
        return self.states_cond

    def step(self, action):
        """
        Distribute one batch according to `action` and update the states.

        Assumptions:
        1. Vaccine has 100% efficacy- it is preventive in its action, not curative.
        2. Vaccine is passive, not antigen based- introduces antibodies when administered.
        3. 1 person requires 1 vial (dose) only.

        Args:
            action: sequence of `s` percentages; values are clipped to the
                action-space bounds. The caller's object is never modified.

        Returns:
            (observation, reward, done, info) as required by the gym interface.

        Raises:
            RuntimeError: if called before reset().
        """
        if self.states_cond is None:
            raise RuntimeError("You need to reset() the environment before calling step()!")

        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
        self._log("Are we done?", self.done)
        self._log('Observation Space for this episode is: ', self.states_cond)

        # Work on a private, clipped copy: the caller's action is left intact
        # and out-of-range values cannot overflow math.exp() in get_reward().
        action = np.clip(np.array(action, dtype=np.float64),
                         self.action_space.low, self.action_space.high)

        # exploration vs exploitation
        if random.uniform(0, 1) < self.epsilon:
            action = np.random.randint(
                0, 100 // self.states, size=self.states).astype(np.float64)

        # action_list stores only the most recently used action values
        self.action_list = action
        self._log("Distribution set: ", self.action_list)

        # no of units distributed to respective states
        self.received = [self.total * self.action_list[i] / 100
                         for i in range(self.states)]

        # simulation: new count of susceptible people
        self.susc = [self.states_cond[i, 3] - self.get_discrete_int(self.received[i])
                     for i in range(self.states)]
        self._log("New Count of Susceptible people: ", self.susc)

        # np.array() copies, so observations returned earlier are not altered.
        self.states_cond = np.array(self.states_cond)
        self.states_cond[:, 3] = self.susc

        # reward is computed on every step
        reward = self.get_reward()

        # increment episode
        self.curr_step += 1

        info = {'action_list': self.action_list, 'episode_number': self.curr_step}
        return self.states_cond, reward, self.done, info

    def get_reward(self):
        """
        Return the sum over states of susceptible * exp(-action share).

        NOTE(review): every term shrinks as the share grows, so the reward is
        largest when no vials are distributed -- confirm this is the intended
        sign before trusting a trained policy.
        """
        reward = [self.states_cond[i, 3] * math.exp(-self.action_list[i])
                  for i in range(self.states)]
        self._log("Reward distribution: ", reward)
        reward = sum(reward)
        self._log("Reward: ", reward)
        return reward

    # render() is not implemented.

    def close(self):
        pass



In [None]:
# Training the agent
# The factory builds a fresh StatesEnv(s=5, episodes=50, total=10000) every time
# it is called. The previous `lambda: env` closed over the very name that this
# statement rebinds to the vectorised wrapper, so it only worked because the
# factory happened to be invoked before the assignment completed.
env = make_vec_env(lambda: StatesEnv(5, 50, 10000), n_envs=1)

print("Training begins here")
model = ACKTR('MlpPolicy', env, verbose=1).learn(500)
# Save the trained model and immediately reload it from disk.
model.save("acktr_")
model = ACKTR.load("acktr_")

# Test the trained agent
obs = env.reset()
n_steps = 50
print("Testing begins here")
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    #env.render(mode='console')
    if done:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Reward:  6608630.244921634
Are we done? False
Observation Space for this episode is:  [[3.00075000e+05 1.03007581e+00 4.55249521e+01 3.08177410e+07
  3.12055760e+07]
 [5.84861000e+05 1.02998832e+00 4.63291278e+01 1.60484250e+07
  1.67879410e+07]
 [1.58741000e+05 1.02997965e+00 4.59786697e+01 3.27830890e+07
  3.29881340e+07]
 [5.60231900e+06 1.03009129e+00 4.32489831e+01 1.05134944e+08
  1.12374333e+08]
 [3.27370000e+04 1.02941626e+00 4.34034884e+01 1.92813400e+06
  1.97850200e+06]]
Distribution set:  [ 0. 13. 10. 18. 16.]
New Count of Susceptible people:  [30817741.0, 16047125.0, 32782089.0, 105133144.0, 1926534.0]
Reward distribution:  [30817741.0, 36.27178853500085, 1488.3045380675273, 1.601175653617956, 0.21680284029259314]
Reward:  30819267.394305095
Are we done? False
Observation Space for this episode is:  [[3.00075000e+05 1.03007581e+00 4.55249521e+01 3.08177410e+07
  3.12055760e+07]
 [5.84861000e+05 1.02998832e+00



---

Sept 5, 2020

---



In [None]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Custom gym environment for splitting one batch of vaccine vials between states.

    Observation: Box of shape (5, 5), one row per state, with columns
        0  Confirmed cases            [0, inf)
        1  Death rate                 [0, 100]
        2  Recovery rate              [0, 100]
        3  Susceptible population     [0, inf)
        4  Total population           [0, inf)

    Action: Box of shape (s,). Entry i is the percentage (0-100) of the batch
    that is sent to state i.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, s, episodes, total):
        """
        Build the observation/action spaces and the bookkeeping attributes.

        s        -- number of states. reset() hard-codes five rows, so only
                    s == 5 is consistent with the rest of the class.
        episodes -- number of step() calls after which `done` is reported.
        total    -- number of vials available in one batch.
        """
        self.states = s
        low = np.zeros((5, 5))
        # The rate columns hold values such as 1.03 and 45.5 (see reset()), so
        # their upper bound is 100; a bound of 1 made every observation fall
        # outside the declared space.
        high = np.array([np.inf, 100, 100, np.inf, np.inf] * 5).reshape((5, 5))
        # np.float64 is the dtype the removed `np.float` alias used to resolve to.
        self.observation_space = spaces.Box(low, high, shape=(5, 5), dtype=np.float64)
        # Actions are vectors of the form [n1, n2, ..., nk] for k states.
        self.action_space = spaces.Box(low=np.zeros((s,)), high=np.full((s,), 100.0),
                                       shape=(s,), dtype=np.float64)

        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self.total = total  # total number of vials available in 1 batch = batch size
        self.episodes = episodes
        self.received = [0] * self.states
        # None (not an empty list) until reset() runs, so that step() can
        # actually detect a missing reset().
        self.states_cond = None
        self.action_list = []
        self.gamma = 0.20
        self.epsilon = 0.4
        self.susc = [0] * self.states

    def get_discrete_int(self, n):
        """Return `n` truncated to an int (vials are handed out in whole units)."""
        return int(n)

    def reset(self):
        """
        Start a new episode.

        Resets the step counter and `done` flag, reloads the hard-coded initial
        figures for the five states, sets `action_list` to an equal split, and
        returns the initial observation matrix. The batch size passed to the
        constructor is kept as-is.
        """
        self.curr_step = 0
        self.done = False

        # Initial conditions, one row per state.
        # Columns: Confirmed, DR, RR, Susc, Population
        # Rows:    Assam, Delhi, Jh, Maha, Naga
        # NOTE(review): the original comment here said "Sept 3" in every dated
        # cell; presumably these are the figures for the date in this cell's
        # heading - confirm.
        self.states_cond = np.array([
            (318139, 1.030052901, 45.53544206, 30801916, 31205576),
            (618526, 1.030029457, 46.36636132, 16010643, 16787941),
            (168138, 1.030106222, 45.98425103, 32775557, 32988134),
            (5966744, 1.030092794, 43.30624877, 104676379, 112374333),
            (34882, 1.029184106, 43.42640904, 1933407, 1978502),
        ])
        # Equal split between all states until the first action arrives.
        self.action_list = np.array([100 / self.states] * self.states)

        return self.states_cond

    def step(self, action):
        """
        Apply one distribution of the batch and return (obs, reward, done, info).

        Assumptions:
        1. Vaccine has 100% efficacy - it is preventive in its action, not curative.
        2. Vaccine is passive, not antigen based - introduces antibodies when administered.
        3. 1 person requires 1 vial (dose) only.

        action -- sequence of `s` percentages; it is copied, never modified.
        Raises Exception if reset() has not been called yet.
        """
        if self.states_cond is None:
            raise Exception("You need to reset() the environment before calling step()!")

        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
        print("Are we done?", self.done)
        print('Observation Space for this episode is: ', self.states_cond)

        # Work on a private copy so the caller's array is left untouched.
        action = np.array(action)

        # exploration vs exploitation
        # NOTE(review): with probability epsilon the environment itself replaces
        # the agent's action with random percentages; the policy's own
        # exploration is not involved. Also, nothing forces the percentages to
        # sum to 100, so more than `total` vials can be handed out - confirm
        # both are intended.
        if random.uniform(0, 1) < self.epsilon:
            upper = int(100 / self.states)  # randint expects an integer bound
            for i in range(self.states):
                action[i] = np.random.randint(0, upper)
        self.action_list = action
        print("Distribution set: ", self.action_list)

        # no of units distributed to respective states
        for i in range(self.states):
            self.received[i] = self.total * self.action_list[i] / 100

        # simulation: every whole vial received removes one susceptible person.
        # A fresh array is built so previously returned observations stay intact.
        self.states_cond = np.array(self.states_cond)
        for i in range(self.states):
            remaining = self.states_cond[i, 3] - self.get_discrete_int(self.received[i])
            # The susceptible count cannot drop below the space's lower bound of 0.
            self.susc[i] = max(remaining, 0.0)
        print("New Count of Susceptible people: ", self.susc)
        self.states_cond[:, 3] = self.susc

        reward = self.get_reward()

        # increment episode
        self.curr_step += 1

        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode_number': self.curr_step}

    def get_reward(self):
        """
        Return the sum over states of susceptible_count * exp(-action).

        NOTE(review): this value grows when less vaccine is allocated, so a
        reward-maximising agent is pushed towards distributing nothing -
        confirm that this is the intended objective.
        """
        reward = [self.states_cond[i, 3] * math.exp(-self.action_list[i])
                  for i in range(self.states)]
        print("Reward distribution: ", reward)
        reward = sum(reward)
        print("Reward: ", reward)
        return reward

    #def render(self, mode='human', close= False):

    def close(self):
        """No-op: this environment performs no cleanup when it is closed."""
        pass



In [None]:
# Training the agent
# The factory builds a fresh StatesEnv(s=5, episodes=50, total=10000) every time
# it is called. The previous `lambda: env` closed over the very name that this
# statement rebinds to the vectorised wrapper, so it only worked because the
# factory happened to be invoked before the assignment completed.
env = make_vec_env(lambda: StatesEnv(5, 50, 10000), n_envs=1)

print("Training begins here")
model = ACKTR('MlpPolicy', env, verbose=1).learn(500)
# Save the trained model and immediately reload it from disk.
model.save("acktr_")
model = ACKTR.load("acktr_")

# Test the trained agent
obs = env.reset()
n_steps = 50
print("Testing begins here")
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    #env.render(mode='console')
    if done:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break

Training begins here




Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.





[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Reward:  86148808.66954133
Are we done? False
Observation Space for this episode is:  [[3.18139000e+05 1.03005290e+00 4.55354421e+01 3.07997160e+07
  3.12055760e+07]
 [6.18526000e+05 1.03002946e+00 4.63663613e+01 1.60073410e+07
  1.67879410e+07]
 [1.68138000e+05 1.03010622e+00 4.59842510e+01 3.27705490e+07
  3.29881340e+07]
 [5.96674400e+06 1.03009279e+00 4.33062488e+01 1.04672625e+08
  1.12374333e+08]
 [3.48820000e+04 1.02918411e+00 4.34264090e+01 1.93023200e+06
  1.97850200e+06]]
Distribution set:  [16.  6.  2. 12. 10.]
New Count of Susceptible people:  [30798116.0, 16006741.0, 32770349.0, 104671425.0, 1929232.0]
Reward distribution:  [3.4658713650840096, 39676.74409508464, 4434984.4636776475, 643.1234625254672, 87.58699729553818]
Reward:  4475395.384103918
Are we done? False
Observation Space for this episode is:  [[3.18139000e+05 1.03005290e+00 4.55354421e+01 3.07981160e+07
  3.12055760e+07]
 [6.18526000e+05 1.0300294



---
Sept 6, 2020


---



---




In [None]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Custom gym environment for splitting one batch of vaccine vials between states.

    Observation: Box of shape (5, 5), one row per state, with columns
        0  Confirmed cases            [0, inf)
        1  Death rate                 [0, 100]
        2  Recovery rate              [0, 100]
        3  Susceptible population     [0, inf)
        4  Total population           [0, inf)

    Action: Box of shape (s,). Entry i is the percentage (0-100) of the batch
    that is sent to state i.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, s, episodes, total):
        """
        Build the observation/action spaces and the bookkeeping attributes.

        s        -- number of states. reset() hard-codes five rows, so only
                    s == 5 is consistent with the rest of the class.
        episodes -- number of step() calls after which `done` is reported.
        total    -- number of vials available in one batch.
        """
        self.states = s
        low = np.zeros((5, 5))
        # The rate columns hold values such as 1.03 and 45.5 (see reset()), so
        # their upper bound is 100; a bound of 1 made every observation fall
        # outside the declared space.
        high = np.array([np.inf, 100, 100, np.inf, np.inf] * 5).reshape((5, 5))
        # np.float64 is the dtype the removed `np.float` alias used to resolve to.
        self.observation_space = spaces.Box(low, high, shape=(5, 5), dtype=np.float64)
        # Actions are vectors of the form [n1, n2, ..., nk] for k states.
        self.action_space = spaces.Box(low=np.zeros((s,)), high=np.full((s,), 100.0),
                                       shape=(s,), dtype=np.float64)

        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self.total = total  # total number of vials available in 1 batch = batch size
        self.episodes = episodes
        self.received = [0] * self.states
        # None (not an empty list) until reset() runs, so that step() can
        # actually detect a missing reset().
        self.states_cond = None
        self.action_list = []
        self.gamma = 0.20
        self.epsilon = 0.4
        self.susc = [0] * self.states

    def get_discrete_int(self, n):
        """Return `n` truncated to an int (vials are handed out in whole units)."""
        return int(n)

    def reset(self):
        """
        Start a new episode.

        Resets the step counter and `done` flag, reloads the hard-coded initial
        figures for the five states, sets `action_list` to an equal split, and
        returns the initial observation matrix. The batch size passed to the
        constructor is kept as-is.
        """
        self.curr_step = 0
        self.done = False

        # Initial conditions, one row per state.
        # Columns: Confirmed, DR, RR, Susc, Population
        # Rows:    Assam, Delhi, Jh, Maha, Naga
        # NOTE(review): the original comment here said "Sept 3" in every dated
        # cell; presumably these are the figures for the date in this cell's
        # heading - confirm.
        self.states_cond = np.array([
            (337277, 1.030013905, 45.5465389, 30777699, 31205576),
            (654042, 1.030056174, 46.40573541, 15966441, 16787941),
            (178085, 1.029845299, 45.98983631, 32762997, 32988134),
            (6353348, 1.030102554, 43.36724511, 104185113, 112374333),
            (37165, 1.027848782, 43.44948204, 1930471, 1978502),
        ])
        # Equal split between all states until the first action arrives.
        self.action_list = np.array([100 / self.states] * self.states)

        return self.states_cond

    def step(self, action):
        """
        Apply one distribution of the batch and return (obs, reward, done, info).

        Assumptions:
        1. Vaccine has 100% efficacy - it is preventive in its action, not curative.
        2. Vaccine is passive, not antigen based - introduces antibodies when administered.
        3. 1 person requires 1 vial (dose) only.

        action -- sequence of `s` percentages; it is copied, never modified.
        Raises Exception if reset() has not been called yet.
        """
        if self.states_cond is None:
            raise Exception("You need to reset() the environment before calling step()!")

        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
        print("Are we done?", self.done)
        print('Observation Space for this episode is: ', self.states_cond)

        # Work on a private copy so the caller's array is left untouched.
        action = np.array(action)

        # exploration vs exploitation
        # NOTE(review): with probability epsilon the environment itself replaces
        # the agent's action with random percentages; the policy's own
        # exploration is not involved. Also, nothing forces the percentages to
        # sum to 100, so more than `total` vials can be handed out - confirm
        # both are intended.
        if random.uniform(0, 1) < self.epsilon:
            upper = int(100 / self.states)  # randint expects an integer bound
            for i in range(self.states):
                action[i] = np.random.randint(0, upper)
        self.action_list = action
        print("Distribution set: ", self.action_list)

        # no of units distributed to respective states
        for i in range(self.states):
            self.received[i] = self.total * self.action_list[i] / 100

        # simulation: every whole vial received removes one susceptible person.
        # A fresh array is built so previously returned observations stay intact.
        self.states_cond = np.array(self.states_cond)
        for i in range(self.states):
            remaining = self.states_cond[i, 3] - self.get_discrete_int(self.received[i])
            # The susceptible count cannot drop below the space's lower bound of 0.
            self.susc[i] = max(remaining, 0.0)
        print("New Count of Susceptible people: ", self.susc)
        self.states_cond[:, 3] = self.susc

        reward = self.get_reward()

        # increment episode
        self.curr_step += 1

        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode_number': self.curr_step}

    def get_reward(self):
        """
        Return the sum over states of susceptible_count * exp(-action).

        NOTE(review): this value grows when less vaccine is allocated, so a
        reward-maximising agent is pushed towards distributing nothing -
        confirm that this is the intended objective.
        """
        reward = [self.states_cond[i, 3] * math.exp(-self.action_list[i])
                  for i in range(self.states)]
        print("Reward distribution: ", reward)
        reward = sum(reward)
        print("Reward: ", reward)
        return reward

    #def render(self, mode='human', close= False):

    def close(self):
        """No-op: this environment performs no cleanup when it is closed."""
        pass



In [None]:
# Training the agent
# The factory builds a fresh StatesEnv(s=5, episodes=50, total=10000) every time
# it is called. The previous `lambda: env` closed over the very name that this
# statement rebinds to the vectorised wrapper, so it only worked because the
# factory happened to be invoked before the assignment completed.
env = make_vec_env(lambda: StatesEnv(5, 50, 10000), n_envs=1)

print("Training begins here")
model = ACKTR('MlpPolicy', env, verbose=1).learn(500)
# Save the trained model and immediately reload it from disk.
model.save("acktr_")
model = ACKTR.load("acktr_")

# Test the trained agent
obs = env.reset()
n_steps = 50
print("Testing begins here")
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    #env.render(mode='console')
    if done:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Reward:  147040548.22862944
Are we done? False
Observation Space for this episode is:  [[3.37277000e+05 1.03001390e+00 4.55465389e+01 3.07752990e+07
  3.12055760e+07]
 [6.54042000e+05 1.03005617e+00 4.64057354e+01 1.59643350e+07
  1.67879410e+07]
 [1.78085000e+05 1.02984530e+00 4.59898363e+01 3.27608810e+07
  3.29881340e+07]
 [6.35334800e+06 1.03010255e+00 4.33672451e+01 1.04182828e+08
  1.12374333e+08]
 [3.71650000e+04 1.02784878e+00 4.34494820e+01 1.92896400e+06
  1.97850200e+06]]
Distribution set:  [0.         0.         0.         1.10422873 0.        ]
New Count of Susceptible people:  [30775299.0, 15964335.0, 32760881.0, 104182718.0, 1928964.0]
Reward distribution:  [30775299.0, 15964335.0, 32760881.0, 34533073.82701043, 1928964.0]
Reward:  115962552.82701042
Are we done? False
Observation Space for this episode is:  [[3.37277000e+05 1.03001390e+00 4.55465389e+01 3.07752990e+07
  3.12055760e+07]
 [6.54042000e+05 1.0



---
Sept 7, 2020


---



---




In [None]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Custom gym environment for splitting one batch of vaccine vials between states.

    Observation: Box of shape (5, 5), one row per state, with columns
        0  Confirmed cases            [0, inf)
        1  Death rate                 [0, 100]
        2  Recovery rate              [0, 100]
        3  Susceptible population     [0, inf)
        4  Total population           [0, inf)

    Action: Box of shape (s,). Entry i is the percentage (0-100) of the batch
    that is sent to state i.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, s, episodes, total):
        """
        Build the observation/action spaces and the bookkeeping attributes.

        s        -- number of states. reset() hard-codes five rows, so only
                    s == 5 is consistent with the rest of the class.
        episodes -- number of step() calls after which `done` is reported.
        total    -- number of vials available in one batch.
        """
        self.states = s
        low = np.zeros((5, 5))
        # The rate columns hold values such as 1.03 and 45.5 (see reset()), so
        # their upper bound is 100; a bound of 1 made every observation fall
        # outside the declared space.
        high = np.array([np.inf, 100, 100, np.inf, np.inf] * 5).reshape((5, 5))
        # np.float64 is the dtype the removed `np.float` alias used to resolve to.
        self.observation_space = spaces.Box(low, high, shape=(5, 5), dtype=np.float64)
        # Actions are vectors of the form [n1, n2, ..., nk] for k states.
        self.action_space = spaces.Box(low=np.zeros((s,)), high=np.full((s,), 100.0),
                                       shape=(s,), dtype=np.float64)

        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self.total = total  # total number of vials available in 1 batch = batch size
        self.episodes = episodes
        self.received = [0] * self.states
        # None (not an empty list) until reset() runs, so that step() can
        # actually detect a missing reset().
        self.states_cond = None
        self.action_list = []
        self.gamma = 0.20
        self.epsilon = 0.4
        self.susc = [0] * self.states

    def get_discrete_int(self, n):
        """Return `n` truncated to an int (vials are handed out in whole units)."""
        return int(n)

    def reset(self):
        """
        Start a new episode.

        Resets the step counter and `done` flag, reloads the hard-coded initial
        figures for the five states, sets `action_list` to an equal split, and
        returns the initial observation matrix. The batch size passed to the
        constructor is kept as-is.
        """
        self.curr_step = 0
        self.done = False

        # Initial conditions, one row per state.
        # Columns: Confirmed, DR, RR, Susc, Population
        # Rows:    Assam, Delhi, Jh, Maha, Naga
        # NOTE(review): the original comment here said "Sept 3" in every dated
        # cell; presumably these are the figures for the date in this cell's
        # heading - confirm.
        self.states_cond = np.array([
            (357552, 1.030059963, 45.55840829, 30752051, 31205576),
            (691499, 1.030081027, 46.44750029, 15919872, 16787941),
            (188618, 1.029594206, 45.99614035, 32749700, 32988134),
            (6763277, 1.030092365, 43.4322001, 103665204, 112374333),
            (39594, 1.027933525, 43.47375865, 1927348, 1978502),
        ])
        # Equal split between all states until the first action arrives.
        self.action_list = np.array([100 / self.states] * self.states)

        return self.states_cond

    def step(self, action):
        """
        Apply one distribution of the batch and return (obs, reward, done, info).

        Assumptions:
        1. Vaccine has 100% efficacy - it is preventive in its action, not curative.
        2. Vaccine is passive, not antigen based - introduces antibodies when administered.
        3. 1 person requires 1 vial (dose) only.

        action -- sequence of `s` percentages; it is copied, never modified.
        Raises Exception if reset() has not been called yet.
        """
        if self.states_cond is None:
            raise Exception("You need to reset() the environment before calling step()!")

        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
        print("Are we done?", self.done)
        print('Observation Space for this episode is: ', self.states_cond)

        # Work on a private copy so the caller's array is left untouched.
        action = np.array(action)

        # exploration vs exploitation
        # NOTE(review): with probability epsilon the environment itself replaces
        # the agent's action with random percentages; the policy's own
        # exploration is not involved. Also, nothing forces the percentages to
        # sum to 100, so more than `total` vials can be handed out - confirm
        # both are intended.
        if random.uniform(0, 1) < self.epsilon:
            upper = int(100 / self.states)  # randint expects an integer bound
            for i in range(self.states):
                action[i] = np.random.randint(0, upper)
        self.action_list = action
        print("Distribution set: ", self.action_list)

        # no of units distributed to respective states
        for i in range(self.states):
            self.received[i] = self.total * self.action_list[i] / 100

        # simulation: every whole vial received removes one susceptible person.
        # A fresh array is built so previously returned observations stay intact.
        self.states_cond = np.array(self.states_cond)
        for i in range(self.states):
            remaining = self.states_cond[i, 3] - self.get_discrete_int(self.received[i])
            # The susceptible count cannot drop below the space's lower bound of 0.
            self.susc[i] = max(remaining, 0.0)
        print("New Count of Susceptible people: ", self.susc)
        self.states_cond[:, 3] = self.susc

        reward = self.get_reward()

        # increment episode
        self.curr_step += 1

        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode_number': self.curr_step}

    def get_reward(self):
        """
        Return the sum over states of susceptible_count * exp(-action).

        NOTE(review): this value grows when less vaccine is allocated, so a
        reward-maximising agent is pushed towards distributing nothing -
        confirm that this is the intended objective.
        """
        reward = [self.states_cond[i, 3] * math.exp(-self.action_list[i])
                  for i in range(self.states)]
        print("Reward distribution: ", reward)
        reward = sum(reward)
        print("Reward: ", reward)
        return reward

    #def render(self, mode='human', close= False):

    def close(self):
        """No-op: this environment performs no cleanup when it is closed."""
        pass



In [None]:
# Training the agent
# The factory builds a fresh StatesEnv(s=5, episodes=50, total=10000) every time
# it is called. The previous `lambda: env` closed over the very name that this
# statement rebinds to the vectorised wrapper, so it only worked because the
# factory happened to be invoked before the assignment completed.
env = make_vec_env(lambda: StatesEnv(5, 50, 10000), n_envs=1)

print("Training begins here")
model = ACKTR('MlpPolicy', env, verbose=1).learn(500)
# Save the trained model and immediately reload it from disk.
model.save("acktr_")
model = ACKTR.load("acktr_")

# Test the trained agent
obs = env.reset()
n_steps = 50
print("Testing begins here")
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    #env.render(mode='console')
    if done:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Reward:  157845784.71427903
Are we done? False
Observation Space for this episode is:  [[3.57552000e+05 1.03005996e+00 4.55584083e+01 3.07496120e+07
  3.12055760e+07]
 [6.91499000e+05 1.03008103e+00 4.64475003e+01 1.59170150e+07
  1.67879410e+07]
 [1.88618000e+05 1.02959421e+00 4.59961403e+01 3.27465560e+07
  3.29881340e+07]
 [6.76327700e+06 1.03009237e+00 4.34322001e+01 1.03663130e+08
  1.12374333e+08]
 [3.95940000e+04 1.02793352e+00 4.34737587e+01 1.92554800e+06
  1.97850200e+06]]
Distribution set:  [0.         0.05043521 0.68989432 0.         0.05336864]
New Count of Susceptible people:  [30749612.0, 15917010.0, 32746488.0, 103663130.0, 1925543.0]
Reward distribution:  [30749612.0, 15134140.2765583, 16426590.62044351, 103663130.0, 1825473.4224570394]
Reward:  167798946.31945884
Are we done? False
Observation Space for this episode is:  [[3.57552000e+05 1.03005996e+00 4.55584083e+01 3.07496120e+07
  3.12055760e+07]
 [6.



---
Sept 8, 2020


---



---




In [None]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Custom gym environment for splitting one batch of vaccine vials between states.

    Observation: Box of shape (5, 5), one row per state, with columns
        0  Confirmed cases            [0, inf)
        1  Death rate                 [0, 100]
        2  Recovery rate              [0, 100]
        3  Susceptible population     [0, inf)
        4  Total population           [0, inf)

    Action: Box of shape (s,). Entry i is the percentage (0-100) of the batch
    that is sent to state i.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, s, episodes, total):
        """
        Build the observation/action spaces and the bookkeeping attributes.

        s        -- number of states. reset() hard-codes five rows, so only
                    s == 5 is consistent with the rest of the class.
        episodes -- number of step() calls after which `done` is reported.
        total    -- number of vials available in one batch.
        """
        self.states = s
        low = np.zeros((5, 5))
        # The rate columns hold values such as 1.03 and 45.5 (see reset()), so
        # their upper bound is 100; a bound of 1 made every observation fall
        # outside the declared space.
        high = np.array([np.inf, 100, 100, np.inf, np.inf] * 5).reshape((5, 5))
        # np.float64 is the dtype the removed `np.float` alias used to resolve to.
        self.observation_space = spaces.Box(low, high, shape=(5, 5), dtype=np.float64)
        # Actions are vectors of the form [n1, n2, ..., nk] for k states.
        self.action_space = spaces.Box(low=np.zeros((s,)), high=np.full((s,), 100.0),
                                       shape=(s,), dtype=np.float64)

        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self.total = total  # total number of vials available in 1 batch = batch size
        self.episodes = episodes
        self.received = [0] * self.states
        # None (not an empty list) until reset() runs, so that step() can
        # actually detect a missing reset().
        self.states_cond = None
        self.action_list = []
        self.gamma = 0.20
        self.epsilon = 0.4
        self.susc = [0] * self.states

    def get_discrete_int(self, n):
        """Return `n` truncated to an int (vials are handed out in whole units)."""
        return int(n)

    def reset(self):
        """
        Start a new episode.

        Resets the step counter and `done` flag, reloads the hard-coded initial
        figures for the five states, sets `action_list` to an equal split, and
        returns the initial observation matrix. The batch size passed to the
        constructor is kept as-is.
        """
        self.curr_step = 0
        self.done = False

        # Initial conditions, one row per state.
        # Columns: Confirmed, DR, RR, Susc, Population
        # Rows:    Assam, Delhi, Jh, Maha, Naga
        # NOTE(review): the original comment here said "Sept 3" in every dated
        # cell; presumably these are the figures for the date in this cell's
        # heading - confirm.
        self.states_cond = np.array([
            (379028, 1.03000306, 45.57077577, 30724892, 31205576),
            (730991, 1.029971641, 46.49154367, 15870828, 16787941),
            (199769, 1.029689291, 46.00263304, 32735625, 32988134),
            (7197710, 1.030091515, 43.50134973, 103115337, 112374333),
            (42179, 1.028948055, 43.49794922, 1924029, 1978502),
        ])
        # Equal split between all states until the first action arrives.
        self.action_list = np.array([100 / self.states] * self.states)

        return self.states_cond

    def step(self, action):
        """
        Apply one distribution of the batch and return (obs, reward, done, info).

        Assumptions:
        1. Vaccine has 100% efficacy - it is preventive in its action, not curative.
        2. Vaccine is passive, not antigen based - introduces antibodies when administered.
        3. 1 person requires 1 vial (dose) only.

        action -- sequence of `s` percentages; it is copied, never modified.
        Raises Exception if reset() has not been called yet.
        """
        if self.states_cond is None:
            raise Exception("You need to reset() the environment before calling step()!")

        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
        print("Are we done?", self.done)
        print('Observation Space for this episode is: ', self.states_cond)

        # Work on a private copy so the caller's array is left untouched.
        action = np.array(action)

        # exploration vs exploitation
        # NOTE(review): with probability epsilon the environment itself replaces
        # the agent's action with random percentages; the policy's own
        # exploration is not involved. Also, nothing forces the percentages to
        # sum to 100, so more than `total` vials can be handed out - confirm
        # both are intended.
        if random.uniform(0, 1) < self.epsilon:
            upper = int(100 / self.states)  # randint expects an integer bound
            for i in range(self.states):
                action[i] = np.random.randint(0, upper)
        self.action_list = action
        print("Distribution set: ", self.action_list)

        # no of units distributed to respective states
        for i in range(self.states):
            self.received[i] = self.total * self.action_list[i] / 100

        # simulation: every whole vial received removes one susceptible person.
        # A fresh array is built so previously returned observations stay intact.
        self.states_cond = np.array(self.states_cond)
        for i in range(self.states):
            remaining = self.states_cond[i, 3] - self.get_discrete_int(self.received[i])
            # The susceptible count cannot drop below the space's lower bound of 0.
            self.susc[i] = max(remaining, 0.0)
        print("New Count of Susceptible people: ", self.susc)
        self.states_cond[:, 3] = self.susc

        reward = self.get_reward()

        # increment episode
        self.curr_step += 1

        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode_number': self.curr_step}

    def get_reward(self):
        """
        Return the sum over states of susceptible_count * exp(-action).

        NOTE(review): this value grows when less vaccine is allocated, so a
        reward-maximising agent is pushed towards distributing nothing -
        confirm that this is the intended objective.
        """
        reward = [self.states_cond[i, 3] * math.exp(-self.action_list[i])
                  for i in range(self.states)]
        print("Reward distribution: ", reward)
        reward = sum(reward)
        print("Reward: ", reward)
        return reward

    #def render(self, mode='human', close= False):

    def close(self):
        """No-op: this environment performs no cleanup when it is closed."""
        pass



In [None]:
#Training the agent
# Environment / training settings, named so they can be tuned in one place.
N_STATES = 5             # number of states the batch is split between
STEPS_PER_EPISODE = 50   # steps after which the environment reports done
BATCH_SIZE = 10000       # vials available in one batch
TRAIN_TIMESTEPS = 500    # timesteps passed to learn()

# Keep the raw environment under its own name: the factory handed to
# make_vec_env must not close over `env`, because `env` is rebound to the
# vectorised wrapper on the next line.
base_env = StatesEnv(N_STATES, STEPS_PER_EPISODE, BATCH_SIZE)
env = make_vec_env(lambda: base_env, n_envs=1)

print("Training begins here")
model = ACKTR('MlpPolicy', env, verbose=1).learn(TRAIN_TIMESTEPS)
# Round-trip through disk so the test below runs on the persisted model.
model.save("acktr_")
model = ACKTR.load("acktr_")
# Test the trained agent
obs = env.reset()
n_steps = 50
print("Testing begins here")
for step in range(n_steps):
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  obs, reward, done, info = env.step(action)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  #env.render(mode='console')
  # With n_envs=1 the vectorised env returns one-element arrays for
  # reward/done, so the truth test below is unambiguous.
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered
    print("Goal reached!", "reward=", reward)
    break



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Are we done? False
Observation Space for this episode is:  [[3.79028000e+05 1.03000306e+00 4.55707758e+01 3.07203190e+07
  3.12055760e+07]
 [7.30991000e+05 1.02997164e+00 4.64915437e+01 1.58644490e+07
  1.67879410e+07]
 [1.99769000e+05 1.02968929e+00 4.60026330e+01 3.27309250e+07
  3.29881340e+07]
 [7.19771000e+06 1.03009152e+00 4.35013497e+01 1.03112437e+08
  1.12374333e+08]
 [4.21790000e+04 1.02894806e+00 4.34979492e+01 1.92067700e+06
  1.97850200e+06]]
Distribution set:  [0.         0.         0.03474879 0.05659324 2.61417866]
New Count of Susceptible people:  [30720319.0, 15864449.0, 32730922.0, 103112432.0, 1920416.0]
Reward distribution:  [30720319.0, 15864449.0, 31613096.242575537, 97439018.4108571, 140628.04846371597]
Reward:  175777510.70189634
Are we done? False
Observation Space for this episode is:  [[3.79028000e+05 1.03000306e+00 4.55707758e+01 3.07203190e+07
  3.12055760e+07]
 [7.30991000e+05 1.02997164e+00 



---

Sept 9, 2020

---



In [None]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised Environment that follows gym interface.

    Each step splits one batch of ``total`` vaccine vials between ``s`` states
    according to the action (a percentage share per state) and removes the
    vaccinated people from each state's susceptible count.

    Observation:
        Type: Box(5, 5), one row per state. Column order follows the data
        table in reset():
                                                Min         Max
        0   Confirmed cases                      0           Inf
        1   Death rate                           0           100
        2   Recovery rate                        0           100
        3   Susceptible population               0           Inf
        4   Total population                     0           Inf

    Actions:
        Type: Box(s)
        Vector [n1, n2, ..., ns]; n_i in [0, 100] is the percentage of the
        batch sent to state i. The shares are not normalised to sum to 100.
    """
    metadata = {'render.modes':['human']}

    def __init__(self, s, episodes, total):
        """
        Args:
            s: number of states the batch is distributed between.
               NOTE(review): the observation space and the table in reset()
               are hard-coded for 5 states, so only s == 5 is consistent.
            episodes: number of steps after which step() reports done.
            total: number of vials available in one batch (batch size).
        """
        self.states = s #no of independent simulations to be run
        low = np.zeros((5,5))
        # Rate columns are bounded by 100: the table in reset() holds values
        # such as 1.03 and 45.58, which an upper bound of 1 would exclude.
        high = np.array([np.inf, 100, 100, np.inf, np.inf]*5).reshape((5,5))
        # np.float64 is the dtype the removed alias np.float used to resolve to.
        self.observation_space = spaces.Box(low, high, shape=(5, 5), dtype = np.float64)
        #actions are vectors of the form [n1, n2, n3,...nk] for k states
        self.action_space = spaces.Box(low = np.zeros((s, ), dtype = int), high = np.array([100]*(s)), shape = (s, ), dtype = np.float64)

        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self.total = total #total number of vials available in 1 batch = batch size
        self.episodes = episodes
        self.received = [0]*self.states
        self.states_cond = []   # filled by reset(); empty means "not reset yet"
        self.action_list = []
        self.gamma = 0.20
        self.epsilon = 0.4      # probability that step() replaces the action with a random one
        self.susc = [0]*self.states

    def get_discrete_int(self, n):
        """Truncate ``n`` to an int (vials are handed out in whole units)."""
        discrete_int = int(n)
        return discrete_int

    def reset(self):
        """
        Start a new episode.

        Restores the per-state condition matrix to its initial figures, resets
        the step counter and done flag, and sets ``action_list`` to an equal
        split between all states. ``self.total`` keeps the batch size given to
        the constructor.

        Returns:
            The initial (5, 5) observation matrix.
        """
        self.curr_step = 0
        self.done = False
        # Declare the Initial Conditions for the States
        self.states_cond =  np.array([(401776,	1.029927124,	45.58410657,	30696135, 31205576),
                              (772619,	1.030003145,	46.53820318,	15819196, 16787941),
                              (211575,	1.029894836,	46.00921659,	32720726, 32988134),
                              (7657858,	1.030092227,	43.57495268,	102534183, 112374333),
                              (44926,	1.028357744,	43.52490763,	1920501, 1978502)])
                               # Confirmed DR RR Susc Population
                               # Assam, Delhi, Jh, Maha, Naga
                               # Sept 3 -- NOTE(review): confirm the snapshot date of these figures
        #store the actions in an array
        self.action_list = np.array([100/(self.states)]*(self.states))

        return self.states_cond

    def step(self, action):
        """
        Distribute one batch according to ``action`` and update the state matrix.

        Assumptions:
        1. Vaccine has 100% efficacy- it is preventive in its action, not curative.
        2. Vaccine is passive, not antigen based- introduces antibodies when administered.
        3. 1 person requires 1 vial (dose) only.

        With probability ``self.epsilon`` the requested action is replaced by
        random integer shares; the caller's array is never modified.

        Args:
            action: sequence of ``self.states`` percentage shares.

        Returns:
            (observation, reward, done, info) where info carries the action
            actually applied and the incremented step counter.

        Raises:
            Exception: if called before reset().
        """
        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
        print("Are we done?", self.done)

        # __init__ leaves states_cond as an empty list, so test for emptiness
        # as well as None to detect a missing reset().
        if self.states_cond is None or len(self.states_cond) == 0:
            raise Exception("You need to reset() the environment before calling step()!")
        print('Observation Space for this episode is: ', self.states_cond)

        # Work on a copy so exploration never rewrites the caller's array.
        action = np.array(action)

        #exploration vs exploitation
        if random.uniform(0, 1) < self.epsilon:
            upper = int(100 / self.states)   # exclusive integer bound for a random share
            for i in range(self.states):
                action[i] = np.random.randint(0, upper)

        #update action_list to store only the most recently used action values
        self.action_list = action
        print("Distribution set: ",self.action_list)

        #no of units distrbuted to respective states
        for i in range(self.states):
            self.received[i] = self.total*self.action_list[i]/100

        #simulation
        for i in range(self.states):
            self.susc[i] = self.states_cond[i, 3]-self.get_discrete_int(self.received[i])  #new count of susc people
        print("New Count of Susceptible people: ", self.susc)
        # Copy before writing so previously returned observations stay untouched.
        self.states_cond = np.array(self.states_cond)
        self.states_cond[:, 3] = self.susc                            #update values in states_cond matrix

        reward = self.get_reward()

        # increment episode
        self.curr_step += 1

        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode_number': self.curr_step}

    def get_reward(self):
        """
        Return the sum over states of ``states_cond[i, 3] * exp(-action_list[i])``,
        printing the per-state terms and the total.
        """
        reward = [0]*self.states
        for i in range(self.states):
            reward[i] = self.states_cond[i, 3]*math.exp(-self.action_list[i])
        print("Reward distribution: ", reward)
        reward = sum(reward)
        print("Reward: ", reward)
        return reward

    # render() is not implemented for this environment.

    def close(self):
        """Nothing to release."""
        pass



In [None]:
#Training the agent
# Environment / training settings, named so they can be tuned in one place.
N_STATES = 5             # number of states the batch is split between
STEPS_PER_EPISODE = 50   # steps after which the environment reports done
BATCH_SIZE = 10000       # vials available in one batch
TRAIN_TIMESTEPS = 500    # timesteps passed to learn()

# Keep the raw environment under its own name: the factory handed to
# make_vec_env must not close over `env`, because `env` is rebound to the
# vectorised wrapper on the next line.
base_env = StatesEnv(N_STATES, STEPS_PER_EPISODE, BATCH_SIZE)
env = make_vec_env(lambda: base_env, n_envs=1)

print("Training begins here")
model = ACKTR('MlpPolicy', env, verbose=1).learn(TRAIN_TIMESTEPS)
# Round-trip through disk so the test below runs on the persisted model.
model.save("acktr_")
model = ACKTR.load("acktr_")
# Test the trained agent
obs = env.reset()
n_steps = 50
print("Testing begins here")
for step in range(n_steps):
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  obs, reward, done, info = env.step(action)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  #env.render(mode='console')
  # With n_envs=1 the vectorised env returns one-element arrays for
  # reward/done, so the truth test below is unambiguous.
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered
    print("Goal reached!", "reward=", reward)
    break



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Reward:  16579153.978772026
Are we done? False
Observation Space for this episode is:  [[4.01776000e+05 1.02992712e+00 4.55841066e+01 3.06937470e+07
  3.12055760e+07]
 [7.72619000e+05 1.03000315e+00 4.65382032e+01 1.58170370e+07
  1.67879410e+07]
 [2.11575000e+05 1.02989484e+00 4.60092166e+01 3.27173910e+07
  3.29881340e+07]
 [7.65785800e+06 1.03009223e+00 4.35749527e+01 1.02531565e+08
  1.12374333e+08]
 [4.49260000e+04 1.02835774e+00 4.35249076e+01 1.91748900e+06
  1.97850200e+06]]
Distribution set:  [19.  9. 17. 10. 10.]
New Count of Susceptible people:  [30691847.0, 15816137.0, 32715691.0, 102530565.0, 1916489.0]
Reward distribution:  [0.17196017103303887, 1951.866368578084, 1.354409231670204, 4654.8804495078875, 87.00846599057483]
Reward:  6695.28165347925
Are we done? False
Observation Space for this episode is:  [[4.01776000e+05 1.02992712e+00 4.55841066e+01 3.06918470e+07
  3.12055760e+07]
 [7.72619000e+05 1.030003



---

Sept 10, 2020

---



In [None]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised Environment that follows gym interface.

    Each step splits one batch of ``total`` vaccine vials between ``s`` states
    according to the action (a percentage share per state) and removes the
    vaccinated people from each state's susceptible count.

    Observation:
        Type: Box(5, 5), one row per state. Column order follows the data
        table in reset():
                                                Min         Max
        0   Confirmed cases                      0           Inf
        1   Death rate                           0           100
        2   Recovery rate                        0           100
        3   Susceptible population               0           Inf
        4   Total population                     0           Inf

    Actions:
        Type: Box(s)
        Vector [n1, n2, ..., ns]; n_i in [0, 100] is the percentage of the
        batch sent to state i. The shares are not normalised to sum to 100.
    """
    metadata = {'render.modes':['human']}

    def __init__(self, s, episodes, total):
        """
        Args:
            s: number of states the batch is distributed between.
               NOTE(review): the observation space and the table in reset()
               are hard-coded for 5 states, so only s == 5 is consistent.
            episodes: number of steps after which step() reports done.
            total: number of vials available in one batch (batch size).
        """
        self.states = s #no of independent simulations to be run
        low = np.zeros((5,5))
        # Rate columns are bounded by 100: the table in reset() holds values
        # such as 1.03 and 45.60, which an upper bound of 1 would exclude.
        high = np.array([np.inf, 100, 100, np.inf, np.inf]*5).reshape((5,5))
        # np.float64 is the dtype the removed alias np.float used to resolve to.
        self.observation_space = spaces.Box(low, high, shape=(5, 5), dtype = np.float64)
        #actions are vectors of the form [n1, n2, n3,...nk] for k states
        self.action_space = spaces.Box(low = np.zeros((s, ), dtype = int), high = np.array([100]*(s)), shape = (s, ), dtype = np.float64)

        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self.total = total #total number of vials available in 1 batch = batch size
        self.episodes = episodes
        self.received = [0]*self.states
        self.states_cond = []   # filled by reset(); empty means "not reset yet"
        self.action_list = []
        self.gamma = 0.20
        self.epsilon = 0.4      # probability that step() replaces the action with a random one
        self.susc = [0]*self.states

    def get_discrete_int(self, n):
        """Truncate ``n`` to an int (vials are handed out in whole units)."""
        discrete_int = int(n)
        return discrete_int

    def reset(self):
        """
        Start a new episode.

        Restores the per-state condition matrix to its initial figures, resets
        the step counter and done flag, and sets ``action_list`` to an equal
        split between all states. ``self.total`` keeps the batch size given to
        the constructor.

        Returns:
            The initial (5, 5) observation matrix.
        """
        self.curr_step = 0
        self.done = False
        # Declare the Initial Conditions for the States
        self.states_cond =  np.array([(425867,	1.029899006,	45.59803882,	30665691, 31205576),
                              (816482,	1.030028831,	46.58743242,	15764860, 16787941),
                              (224073,	1.030021466,	46.01625363,	32704956, 32988134),
                              (8144955,	1.030097772,	43.65326757,	101920403, 112374333),
                              (47849,	1.028234655,	43.55367928,	1916753, 1978502)])
                               # Confirmed DR RR Susc Population
                               # Assam, Delhi, Jh, Maha, Naga
                               # Sept 3 -- NOTE(review): confirm the snapshot date of these figures
        #store the actions in an array
        self.action_list = np.array([100/(self.states)]*(self.states))

        return self.states_cond

    def step(self, action):
        """
        Distribute one batch according to ``action`` and update the state matrix.

        Assumptions:
        1. Vaccine has 100% efficacy- it is preventive in its action, not curative.
        2. Vaccine is passive, not antigen based- introduces antibodies when administered.
        3. 1 person requires 1 vial (dose) only.

        With probability ``self.epsilon`` the requested action is replaced by
        random integer shares; the caller's array is never modified.

        Args:
            action: sequence of ``self.states`` percentage shares.

        Returns:
            (observation, reward, done, info) where info carries the action
            actually applied and the incremented step counter.

        Raises:
            Exception: if called before reset().
        """
        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
        print("Are we done?", self.done)

        # __init__ leaves states_cond as an empty list, so test for emptiness
        # as well as None to detect a missing reset().
        if self.states_cond is None or len(self.states_cond) == 0:
            raise Exception("You need to reset() the environment before calling step()!")
        print('Observation Space for this episode is: ', self.states_cond)

        # Work on a copy so exploration never rewrites the caller's array.
        action = np.array(action)

        #exploration vs exploitation
        if random.uniform(0, 1) < self.epsilon:
            upper = int(100 / self.states)   # exclusive integer bound for a random share
            for i in range(self.states):
                action[i] = np.random.randint(0, upper)

        #update action_list to store only the most recently used action values
        self.action_list = action
        print("Distribution set: ",self.action_list)

        #no of units distrbuted to respective states
        for i in range(self.states):
            self.received[i] = self.total*self.action_list[i]/100

        #simulation
        for i in range(self.states):
            self.susc[i] = self.states_cond[i, 3]-self.get_discrete_int(self.received[i])  #new count of susc people
        print("New Count of Susceptible people: ", self.susc)
        # Copy before writing so previously returned observations stay untouched.
        self.states_cond = np.array(self.states_cond)
        self.states_cond[:, 3] = self.susc                            #update values in states_cond matrix

        reward = self.get_reward()

        # increment episode
        self.curr_step += 1

        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode_number': self.curr_step}

    def get_reward(self):
        """
        Return the sum over states of ``states_cond[i, 3] * exp(-action_list[i])``,
        printing the per-state terms and the total.
        """
        reward = [0]*self.states
        for i in range(self.states):
            reward[i] = self.states_cond[i, 3]*math.exp(-self.action_list[i])
        print("Reward distribution: ", reward)
        reward = sum(reward)
        print("Reward: ", reward)
        return reward

    # render() is not implemented for this environment.

    def close(self):
        """Nothing to release."""
        pass



In [None]:
#Training the agent
# Environment / training settings, named so they can be tuned in one place.
N_STATES = 5             # number of states the batch is split between
STEPS_PER_EPISODE = 50   # steps after which the environment reports done
BATCH_SIZE = 10000       # vials available in one batch
TRAIN_TIMESTEPS = 500    # timesteps passed to learn()

# Keep the raw environment under its own name: the factory handed to
# make_vec_env must not close over `env`, because `env` is rebound to the
# vectorised wrapper on the next line.
base_env = StatesEnv(N_STATES, STEPS_PER_EPISODE, BATCH_SIZE)
env = make_vec_env(lambda: base_env, n_envs=1)

print("Training begins here")
model = ACKTR('MlpPolicy', env, verbose=1).learn(TRAIN_TIMESTEPS)
# Round-trip through disk so the test below runs on the persisted model.
model.save("acktr_")
model = ACKTR.load("acktr_")
# Test the trained agent
obs = env.reset()
n_steps = 50
print("Testing begins here")
for step in range(n_steps):
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  obs, reward, done, info = env.step(action)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  #env.render(mode='console')
  # With n_envs=1 the vectorised env returns one-element arrays for
  # reward/done, so the truth test below is unambiguous.
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered
    print("Goal reached!", "reward=", reward)
    break



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Are we done? False
Observation Space for this episode is:  [[4.25867000e+05 1.02989901e+00 4.55980388e+01 3.06629110e+07
  3.12055760e+07]
 [8.16482000e+05 1.03002883e+00 4.65874324e+01 1.57626410e+07
  1.67879410e+07]
 [2.24073000e+05 1.03002147e+00 4.60162536e+01 3.27035330e+07
  3.29881340e+07]
 [8.14495500e+06 1.03009777e+00 4.36532676e+01 1.01918704e+08
  1.12374333e+08]
 [4.78490000e+04 1.02823466e+00 4.35536793e+01 1.91559200e+06
  1.97850200e+06]]
Distribution set:  [0.         1.090832   0.68406039 0.         0.        ]
New Count of Susceptible people:  [30662911.0, 15762532.0, 32703465.0, 101918704.0, 1915592.0]
Reward distribution:  [30662911.0, 5295215.814619426, 16500994.318667594, 101918704.0, 1915592.0]
Reward:  156293417.133287
Are we done? False
Observation Space for this episode is:  [[4.25867000e+05 1.02989901e+00 4.55980388e+01 3.06629110e+07
  3.12055760e+07]
 [8.16482000e+05 1.03002883e+00 4.6587432



---
Sept 11, 2020


---




In [None]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised Environment that follows gym interface.

    Each step splits one batch of ``total`` vaccine vials between ``s`` states
    according to the action (a percentage share per state) and removes the
    vaccinated people from each state's susceptible count.

    Observation:
        Type: Box(5, 5), one row per state. Column order follows the data
        table in reset():
                                                Min         Max
        0   Confirmed cases                      0           Inf
        1   Death rate                           0           100
        2   Recovery rate                        0           100
        3   Susceptible population               0           Inf
        4   Total population                     0           Inf

    Actions:
        Type: Box(s)
        Vector [n1, n2, ..., ns]; n_i in [0, 100] is the percentage of the
        batch sent to state i. The shares are not normalised to sum to 100.
    """
    metadata = {'render.modes':['human']}

    def __init__(self, s, episodes, total):
        """
        Args:
            s: number of states the batch is distributed between.
               NOTE(review): the observation space and the table in reset()
               are hard-coded for 5 states, so only s == 5 is consistent.
            episodes: number of steps after which step() reports done.
            total: number of vials available in one batch (batch size).
        """
        self.states = s #no of independent simulations to be run
        low = np.zeros((5,5))
        # Rate columns are bounded by 100: the table in reset() holds values
        # such as 1.03 and 45.61, which an upper bound of 1 would exclude.
        high = np.array([np.inf, 100, 100, np.inf, np.inf]*5).reshape((5,5))
        # np.float64 is the dtype the removed alias np.float used to resolve to.
        self.observation_space = spaces.Box(low, high, shape=(5, 5), dtype = np.float64)
        #actions are vectors of the form [n1, n2, n3,...nk] for k states
        self.action_space = spaces.Box(low = np.zeros((s, ), dtype = int), high = np.array([100]*(s)), shape = (s, ), dtype = np.float64)

        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self.total = total #total number of vials available in 1 batch = batch size
        self.episodes = episodes
        self.received = [0]*self.states
        self.states_cond = []   # filled by reset(); empty means "not reset yet"
        self.action_list = []
        self.gamma = 0.20
        self.epsilon = 0.4      # probability that step() replaces the action with a random one
        self.susc = [0]*self.states

    def get_discrete_int(self, n):
        """Truncate ``n`` to an int (vials are handed out in whole units)."""
        discrete_int = int(n)
        return discrete_int

    def reset(self):
        """
        Start a new episode.

        Restores the per-state condition matrix to its initial figures, resets
        the step counter and done flag, and sets ``action_list`` to an equal
        split between all states. ``self.total`` keeps the batch size given to
        the constructor.

        Returns:
            The initial (5, 5) observation matrix.
        """
        self.curr_step = 0
        self.done = False
        # Declare the Initial Conditions for the States
        self.states_cond =  np.array([(451381,	1.029950308,	45.61290794,	30633463, 31205576),
                              (862685,	1.030039933,	46.63938749,	15707702, 16787941),
                              (237304,	1.029902572,	46.02366585,	32688266, 32988134),
                              (8660256,	1.03009657,	43.73657084,	101272663, 112374333),
                              (50956,	1.028338174,	43.5826988,	1912771, 1978502)])
                               # Confirmed DR RR Susc Population
                               # Assam, Delhi, Jh, Maha, Naga
                               # Sept 3 -- NOTE(review): confirm the snapshot date of these figures
        #store the actions in an array
        self.action_list = np.array([100/(self.states)]*(self.states))

        return self.states_cond

    def step(self, action):
        """
        Distribute one batch according to ``action`` and update the state matrix.

        Assumptions:
        1. Vaccine has 100% efficacy- it is preventive in its action, not curative.
        2. Vaccine is passive, not antigen based- introduces antibodies when administered.
        3. 1 person requires 1 vial (dose) only.

        With probability ``self.epsilon`` the requested action is replaced by
        random integer shares; the caller's array is never modified.

        Args:
            action: sequence of ``self.states`` percentage shares.

        Returns:
            (observation, reward, done, info) where info carries the action
            actually applied and the incremented step counter.

        Raises:
            Exception: if called before reset().
        """
        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
        print("Are we done?", self.done)

        # __init__ leaves states_cond as an empty list, so test for emptiness
        # as well as None to detect a missing reset().
        if self.states_cond is None or len(self.states_cond) == 0:
            raise Exception("You need to reset() the environment before calling step()!")
        print('Observation Space for this episode is: ', self.states_cond)

        # Work on a copy so exploration never rewrites the caller's array.
        action = np.array(action)

        #exploration vs exploitation
        if random.uniform(0, 1) < self.epsilon:
            upper = int(100 / self.states)   # exclusive integer bound for a random share
            for i in range(self.states):
                action[i] = np.random.randint(0, upper)

        #update action_list to store only the most recently used action values
        self.action_list = action
        print("Distribution set: ",self.action_list)

        #no of units distrbuted to respective states
        for i in range(self.states):
            self.received[i] = self.total*self.action_list[i]/100

        #simulation
        for i in range(self.states):
            self.susc[i] = self.states_cond[i, 3]-self.get_discrete_int(self.received[i])  #new count of susc people
        print("New Count of Susceptible people: ", self.susc)
        # Copy before writing so previously returned observations stay untouched.
        self.states_cond = np.array(self.states_cond)
        self.states_cond[:, 3] = self.susc                            #update values in states_cond matrix

        reward = self.get_reward()

        # increment episode
        self.curr_step += 1

        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode_number': self.curr_step}

    def get_reward(self):
        """
        Return the sum over states of ``states_cond[i, 3] * exp(-action_list[i])``,
        printing the per-state terms and the total.
        """
        reward = [0]*self.states
        for i in range(self.states):
            reward[i] = self.states_cond[i, 3]*math.exp(-self.action_list[i])
        print("Reward distribution: ", reward)
        reward = sum(reward)
        print("Reward: ", reward)
        return reward

    # render() is not implemented for this environment.

    def close(self):
        """Nothing to release."""
        pass



In [None]:
#Training the agent
# Environment / training settings, named so they can be tuned in one place.
N_STATES = 5             # number of states the batch is split between
STEPS_PER_EPISODE = 50   # steps after which the environment reports done
BATCH_SIZE = 10000       # vials available in one batch
TRAIN_TIMESTEPS = 500    # timesteps passed to learn()

# Keep the raw environment under its own name: the factory handed to
# make_vec_env must not close over `env`, because `env` is rebound to the
# vectorised wrapper on the next line.
base_env = StatesEnv(N_STATES, STEPS_PER_EPISODE, BATCH_SIZE)
env = make_vec_env(lambda: base_env, n_envs=1)

print("Training begins here")
model = ACKTR('MlpPolicy', env, verbose=1).learn(TRAIN_TIMESTEPS)
# Round-trip through disk so the test below runs on the persisted model.
model.save("acktr_")
model = ACKTR.load("acktr_")
# Test the trained agent
obs = env.reset()
n_steps = 50
print("Testing begins here")
for step in range(n_steps):
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  obs, reward, done, info = env.step(action)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  #env.render(mode='console')
  # With n_envs=1 the vectorised env returns one-element arrays for
  # reward/done, so the truth test below is unambiguous.
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered
    print("Goal reached!", "reward=", reward)
    break



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Reward:  11270601.540687438
Are we done? False
Observation Space for this episode is:  [[4.51381000e+05 1.02995031e+00 4.56129079e+01 3.06296190e+07
  3.12055760e+07]
 [8.62685000e+05 1.03003993e+00 4.66393875e+01 1.57025020e+07
  1.67879410e+07]
 [2.37304000e+05 1.02990257e+00 4.60236659e+01 3.26822340e+07
  3.29881340e+07]
 [8.66025600e+06 1.03009657e+00 4.37365708e+01 1.01269665e+08
  1.12374333e+08]
 [5.09560000e+04 1.02833817e+00 4.35826988e+01 1.90867100e+06
  1.97850200e+06]]
Distribution set:  [0.         0.27046475 0.53045142 0.         0.10496917]
New Count of Susceptible people:  [30629619.0, 15702475.0, 32682181.0, 101269665.0, 1908661.0]
Reward distribution:  [30629619.0, 11981377.808102017, 19228212.255794737, 101269665.0, 1718467.2768272227]
Reward:  164827341.340724
Are we done? False
Observation Space for this episode is:  [[4.51381000e+05 1.02995031e+00 4.56129079e+01 3.06296190e+07
  3.12055760e+07]
 [8



---
Sept 12, 2020


---



In [None]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised Environment that follows gym interface.

    Each step splits one batch of ``total`` vaccine vials between ``s`` states
    according to the action (a percentage share per state) and removes the
    vaccinated people from each state's susceptible count.

    Observation:
        Type: Box(5, 5), one row per state. Column order follows the data
        table in reset():
                                                Min         Max
        0   Confirmed cases                      0           Inf
        1   Death rate                           0           100
        2   Recovery rate                        0           100
        3   Susceptible population               0           Inf
        4   Total population                     0           Inf

    Actions:
        Type: Box(s)
        Vector [n1, n2, ..., ns]; n_i in [0, 100] is the percentage of the
        batch sent to state i. The shares are not normalised to sum to 100.
    """
    metadata = {'render.modes':['human']}

    def __init__(self, s, episodes, total):
        """
        Args:
            s: number of states the batch is distributed between.
               NOTE(review): the observation space and the table in reset()
               are hard-coded for 5 states, so only s == 5 is consistent.
            episodes: number of steps after which step() reports done.
            total: number of vials available in one batch (batch size).
        """
        self.states = s #no of independent simulations to be run
        low = np.zeros((5,5))
        # Rate columns are bounded by 100: the table in reset() holds values
        # such as 1.03 and 45.63, which an upper bound of 1 would exclude.
        high = np.array([np.inf, 100, 100, np.inf, np.inf]*5).reshape((5,5))
        # np.float64 is the dtype the removed alias np.float used to resolve to.
        self.observation_space = spaces.Box(low, high, shape=(5, 5), dtype = np.float64)
        #actions are vectors of the form [n1, n2, n3,...nk] for k states
        self.action_space = spaces.Box(low = np.zeros((s, ), dtype = int), high = np.array([100]*(s)), shape = (s, ), dtype = np.float64)

        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self.total = total #total number of vials available in 1 batch = batch size
        self.episodes = episodes
        self.received = [0]*self.states
        self.states_cond = []   # filled by reset(); empty means "not reset yet"
        self.action_list = []
        self.gamma = 0.20
        self.epsilon = 0.4      # probability that step() replaces the action with a random one
        self.susc = [0]*self.states

    def get_discrete_int(self, n):
        """Truncate ``n`` to an int (vials are handed out in whole units)."""
        discrete_int = int(n)
        return discrete_int

    def reset(self):
        """
        Start a new episode.

        Restores the per-state condition matrix to its initial figures, resets
        the step counter and done flag, and sets ``action_list`` to an equal
        split between all states. ``self.total`` keeps the batch size given to
        the constructor.

        Returns:
            The initial (5, 5) observation matrix.
        """
        self.curr_step = 0
        self.done = False
        # Declare the Initial Conditions for the States
        self.states_cond =  np.array([(478396,	1.029899916,	45.62851696,	30599352, 31205576),
                              (911336,	1.030026247,	46.69441348,	15647602, 16787941),
                              (251308,	1.029812023,	46.03156286,	32670602, 32988134),
                              (9205037,	1.030099064,	43.8251579,	100589632, 112374333),
                              (54259,	1.028400818,	43.61304115,	1908542, 1978502)])
                               # Confirmed DR RR Susc Population
                               # Assam, Delhi, Jh, Maha, Naga
                               # Sept 3 -- NOTE(review): confirm the snapshot date of these figures
        #store the actions in an array
        self.action_list = np.array([100/(self.states)]*(self.states))

        return self.states_cond

    def step(self, action):
        """
        Distribute one batch according to ``action`` and update the state matrix.

        Assumptions:
        1. Vaccine has 100% efficacy- it is preventive in its action, not curative.
        2. Vaccine is passive, not antigen based- introduces antibodies when administered.
        3. 1 person requires 1 vial (dose) only.

        With probability ``self.epsilon`` the requested action is replaced by
        random integer shares; the caller's array is never modified.

        Args:
            action: sequence of ``self.states`` percentage shares.

        Returns:
            (observation, reward, done, info) where info carries the action
            actually applied and the incremented step counter.

        Raises:
            Exception: if called before reset().
        """
        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
        print("Are we done?", self.done)

        # __init__ leaves states_cond as an empty list, so test for emptiness
        # as well as None to detect a missing reset().
        if self.states_cond is None or len(self.states_cond) == 0:
            raise Exception("You need to reset() the environment before calling step()!")
        print('Observation Space for this episode is: ', self.states_cond)

        # Work on a copy so exploration never rewrites the caller's array.
        action = np.array(action)

        #exploration vs exploitation
        if random.uniform(0, 1) < self.epsilon:
            upper = int(100 / self.states)   # exclusive integer bound for a random share
            for i in range(self.states):
                action[i] = np.random.randint(0, upper)

        #update action_list to store only the most recently used action values
        self.action_list = action
        print("Distribution set: ",self.action_list)

        #no of units distrbuted to respective states
        for i in range(self.states):
            self.received[i] = self.total*self.action_list[i]/100

        #simulation
        for i in range(self.states):
            self.susc[i] = self.states_cond[i, 3]-self.get_discrete_int(self.received[i])  #new count of susc people
        print("New Count of Susceptible people: ", self.susc)
        # Copy before writing so previously returned observations stay untouched.
        self.states_cond = np.array(self.states_cond)
        self.states_cond[:, 3] = self.susc                            #update values in states_cond matrix

        reward = self.get_reward()

        # increment episode
        self.curr_step += 1

        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode_number': self.curr_step}

    def get_reward(self):
        """
        Return the sum over states of ``states_cond[i, 3] * exp(-action_list[i])``,
        printing the per-state terms and the total.
        """
        reward = [0]*self.states
        for i in range(self.states):
            reward[i] = self.states_cond[i, 3]*math.exp(-self.action_list[i])
        print("Reward distribution: ", reward)
        reward = sum(reward)
        print("Reward: ", reward)
        return reward

    # render() is not implemented for this environment.

    def close(self):
        """Nothing to release."""
        pass



In [None]:
#Training the agent
# NOTE: `StatesEnv`, `make_vec_env` and `ACKTR` are not imported in this cell;
# they must already be in the kernel namespace from earlier cells.
# StatesEnv(s, episodes, total): 5 states, 50 steps per episode, 10000 vials per batch.
base_env = StatesEnv(5,50,10000)
# Keep the raw environment under its own name so the factory keeps returning the
# StatesEnv instance even after `env` is rebound to the vectorised wrapper.
env = make_vec_env(lambda: base_env, n_envs=1)

print("Training begins here")
model = ACKTR('MlpPolicy', env, verbose=1).learn(500)
# Round-trip through disk so the test below runs on the saved model.
model.save("acktr_")
model = ACKTR.load("acktr_")
# Test the trained agent
obs = env.reset()
n_steps = 50
print("Testing begins here")
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    #env.render(mode='console')
    # The vectorised env reports one `done` flag per wrapped environment;
    # reduce explicitly instead of relying on the truth value of an array.
    if np.any(done):
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Observation Space for this episode is:  [[4.78396000e+05 1.02989992e+00 4.56285170e+01 3.05992330e+07
  3.12055760e+07]
 [9.11336000e+05 1.03002625e+00 4.66944135e+01 1.56473260e+07
  1.67879410e+07]
 [2.51308000e+05 1.02981202e+00 4.60315629e+01 3.26699150e+07
  3.29881340e+07]
 [9.20503700e+06 1.03009906e+00 4.38251579e+01 1.00589164e+08
  1.12374333e+08]
 [5.42590000e+04 1.02840082e+00 4.36130412e+01 1.90734900e+06
  1.97850200e+06]]
Distribution set:  [0.         0.30744845 0.19809872 0.         0.84641641]
New Count of Susceptible people:  [30599233.0, 15647296.0, 32669896.0, 100589164.0, 1907265.0]
Reward distribution:  [30599233.0, 11505781.817398924, 26798752.103735305, 100589164.0, 818120.0971698526]
Reward:  170311051.01830408
Are we done? False
Observation Space for this episode is:  [[4.78396000e+05 1.02989992e+00 4.56285170e+01 3.05992330e+07
  3.12055760e+07]
 [9.11336000e+05 1.03002625e+00 4.66944135e+01 1.



---
Sept 13, 2020


---



In [None]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised Environment that follows gym interface.
    Describes relevant properties of the state and action spaces.

    Every step takes a vector of per-state percentages of the vial batch,
    subtracts the vials received from each state's susceptible count and
    returns the updated condition matrix together with a scalar reward.
    """
    metadata = {'render.modes':['human']}

    def __init__(self, s, episodes, total, verbose=True):
        """
        Parameters:
            s: number of states, i.e. the length of the action vector.
            episodes: the `episodes`-th call to step() after a reset reports done=True.
            total: number of vials in one batch; each action entry is a percentage of it.
            verbose: when True (default, the original behaviour) step() and
                get_reward() print their progress; pass False to silence them.

        Observation:
        Type: Box(5, 5) -- one row per state, columns:
                                        Min         Max
        0   Confirmed Cases              0           Inf
        1   Death Rate                   0           100
        2   Recovery Rate                0           100
        3   Susceptible                  0           Inf
        4   Population                   0           Inf

        Actions:
        Type: Box(s)
        Vector [n1, ..., ns]; ni in [0, 100] is the percentage of `total` sent to state i.
        """
        self.states = s #no of independent simulations to be run
        self.verbose = verbose
        # NOTE: the observation shape is fixed at 5x5 because reset() loads data for
        # exactly five states; step() only works end-to-end when s == 5.
        low = np.zeros((5, 5))
        # Rate columns are bounded by 100 (they are stored as percentages, e.g. ~45 for
        # the recovery rate); the previous bound of 1 put every observation out of range.
        high = np.array([np.inf, 100, 100, np.inf, np.inf]*5).reshape((5, 5))
        # np.float64 is the dtype the removed `np.float` alias resolved to.
        self.observation_space = spaces.Box(low, high, shape=(5, 5), dtype=np.float64)
        #actions are vectors of the form [n1, n2, n3,...nk] for k states
        self.action_space = spaces.Box(low=np.zeros((s, ), dtype=int), high=np.array([100]*(s)), shape=(s, ), dtype=np.float64)

        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))  # not read anywhere in this class
        self.total = total #total number of vials available in 1 batch = batch size
        self._initial_total = total  # restored by reset()
        self.episodes = episodes
        self.received = [0]*self.states
        self.states_cond = []   # empty until reset() is called
        self.action_list = []
        self.gamma = 0.20       # not read anywhere in this class
        self.epsilon = 0.4      # probability that step() replaces the action with a random one
        self.susc = [0]*self.states

    def _log(self, *parts):
        """Print `parts` unless the environment was created with verbose=False."""
        if getattr(self, "verbose", True):
            print(*parts)

    def get_discrete_int(self, n):
        """Truncate `n` to an int (whole vials only)."""
        return int(n)

    def reset(self):
        """
        Start a new episode.

        Resets the step counter, the done flag and the batch size, reloads the
        initial condition matrix and sets action_list to an equal split.

        Returns:
            The initial 5x5 condition matrix (one row per state).
        """
        self.curr_step = 0
        self.done = False
        # Restore the batch size given to the constructor (previously hard-coded to 10000).
        self.total = self._initial_total
        # Declare the Initial Conditions for the States
        # Columns: Confirmed, DR, RR, Susc, Population
        # Rows:    Assam, Delhi, Jh, Maha, Naga
        # NOTE(review): every copy of this cell carried the comment "Sept 3" although the
        # figures differ between copies -- confirm the snapshot date of this data.
        self.states_cond = np.array([(506999, 1.029982308, 45.64506044, 30563253, 31205576),
                                     (962544, 1.030082781, 46.75246015, 15584437, 16787941),
                                     (266132, 1.02994003, 46.03993507, 32651909, 32988134),
                                     (9780582, 1.030102299, 43.91932914, 99870003, 112374333),
                                     (57768, 1.029981997, 43.64527074, 1904052, 1978502)])
        #store the actions in an array
        self.action_list = np.array([100/(self.states)]*(self.states))

        return self.states_cond

    def step(self, action):
        """
        Apply one allocation of the vial batch.

        Assumptions:
        1. Vaccine has 100% efficacy- it is preventive in its action, not curative.
        2. Vaccine is passive, not antigen based- introduces antibodies when administered.
        3. 1 person requires 1 vial (dose) only.

        Parameters:
            action: length-`s` vector of percentages of `total` per state. It is
                copied, never modified in place.

        Returns:
            (states_cond, reward, done, info) where info carries the allocation
            actually applied ('action_list') and the step counter ('episode_number').

        Raises:
            RuntimeError: if reset() has not been called yet.
        """
        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
        self._log("Are we done?", self.done)

        # __init__ leaves states_cond as an empty list, so test for emptiness as well
        # as None; otherwise the guard never fires and indexing fails obscurely below.
        if self.states_cond is None or len(self.states_cond) == 0:
            raise RuntimeError("You need to reset() the environment before calling step()!")
        self._log('Observation Space for this episode is: ', self.states_cond)

        # exploration vs exploitation
        # (The former "start with equal distribution" branch was dead code: its result
        # was always overwritten here, so it has been removed.)
        if random.uniform(0, 1) < self.epsilon:
            # Random integer percentage in [0, 100/s) per state, as before, but drawn
            # into a fresh array with an integer upper bound.
            allocation = np.random.randint(0, 100 // self.states, size=self.states).astype(np.float64)
        else:
            allocation = np.array(action, dtype=np.float64)
        # NOTE(review): nothing forces the agent's percentages to sum to <= 100, so more
        # than `total` vials can be handed out in one step -- confirm whether intended.
        self.action_list = allocation
        self._log("Distribution set: ", self.action_list)

        #no of units distrbuted to respective states
        self.received = [self.total*self.action_list[i]/100 for i in range(self.states)]

        #simulation
        # new count of susc people; clamped at 0 to stay inside the observation space
        self.susc = [
            max(0.0, self.states_cond[i, 3] - self.get_discrete_int(self.received[i]))
            for i in range(self.states)
        ]
        self._log("New Count of Susceptible people: ", self.susc)
        # Work on a copy so observations returned earlier are not modified.
        self.states_cond = np.array(self.states_cond)
        self.states_cond[:, 3] = self.susc                            #update values in states_cond matrix

        # reward is computed on every step
        reward = self.get_reward()

        # increment episode
        self.curr_step += 1

        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode_number': self.curr_step}

    def get_reward(self):
        """
        Return the sum over states of susceptible_count * exp(-allocation).

        NOTE(review): this value is largest when nothing is allocated, so a
        reward-maximising agent is pushed towards zero allocations -- confirm the
        intended sign/shape of the reward.
        """
        per_state = [
            self.states_cond[i, 3]*math.exp(-self.action_list[i])
            for i in range(self.states)
        ]
        self._log("Reward distribution: ", per_state)
        reward = sum(per_state)
        self._log("Reward: ", reward)
        return reward

    #def render(self, mode='human', close= False):

    def close(self):
        """No-op: performs no cleanup."""
        pass



In [None]:
#Training the agent
# NOTE: `StatesEnv`, `make_vec_env` and `ACKTR` are not imported in this cell;
# they must already be in the kernel namespace from earlier cells.
# StatesEnv(s, episodes, total): 5 states, 50 steps per episode, 10000 vials per batch.
base_env = StatesEnv(5,50,10000)
# Keep the raw environment under its own name so the factory keeps returning the
# StatesEnv instance even after `env` is rebound to the vectorised wrapper.
env = make_vec_env(lambda: base_env, n_envs=1)

print("Training begins here")
model = ACKTR('MlpPolicy', env, verbose=1).learn(500)
# Round-trip through disk so the test below runs on the saved model.
model.save("acktr_")
model = ACKTR.load("acktr_")
# Test the trained agent
obs = env.reset()
n_steps = 50
print("Testing begins here")
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    #env.render(mode='console')
    # The vectorised env reports one `done` flag per wrapped environment;
    # reduce explicitly instead of relying on the truth value of an array.
    if np.any(done):
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Reward:  165846072.24990952
Are we done? False
Observation Space for this episode is:  [[5.06999000e+05 1.02998231e+00 4.56450604e+01 3.05604730e+07
  3.12055760e+07]
 [9.62544000e+05 1.03008278e+00 4.67524601e+01 1.55814450e+07
  1.67879410e+07]
 [2.66132000e+05 1.02994003e+00 4.60399351e+01 3.26511420e+07
  3.29881340e+07]
 [9.78058200e+06 1.03010230e+00 4.39193291e+01 9.98679030e+07
  1.12374333e+08]
 [5.77680000e+04 1.02998200e+00 4.36452707e+01 1.90116000e+06
  1.97850200e+06]]
Distribution set:  [0.         0.         0.47615054 0.35513103 0.        ]
New Count of Susceptible people:  [30560473.0, 15581445.0, 32651095.0, 99867868.0, 1901160.0]
Reward distribution:  [30560473.0, 15581445.0, 20281879.464265313, 70015522.13073355, 1901160.0]
Reward:  138340479.59499887
Are we done? False
Observation Space for this episode is:  [[5.06999000e+05 1.02998231e+00 4.56450604e+01 3.05604730e+07
  3.12055760e+07]
 [9.62544000e



---

Sept 14, 2020

---



In [None]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised Environment that follows gym interface.
    Describes relevant properties of the state and action spaces.

    Every step takes a vector of per-state percentages of the vial batch,
    subtracts the vials received from each state's susceptible count and
    returns the updated condition matrix together with a scalar reward.
    """
    metadata = {'render.modes':['human']}

    def __init__(self, s, episodes, total, verbose=True):
        """
        Parameters:
            s: number of states, i.e. the length of the action vector.
            episodes: the `episodes`-th call to step() after a reset reports done=True.
            total: number of vials in one batch; each action entry is a percentage of it.
            verbose: when True (default, the original behaviour) step() and
                get_reward() print their progress; pass False to silence them.

        Observation:
        Type: Box(5, 5) -- one row per state, columns:
                                        Min         Max
        0   Confirmed Cases              0           Inf
        1   Death Rate                   0           100
        2   Recovery Rate                0           100
        3   Susceptible                  0           Inf
        4   Population                   0           Inf

        Actions:
        Type: Box(s)
        Vector [n1, ..., ns]; ni in [0, 100] is the percentage of `total` sent to state i.
        """
        self.states = s #no of independent simulations to be run
        self.verbose = verbose
        # NOTE: the observation shape is fixed at 5x5 because reset() loads data for
        # exactly five states; step() only works end-to-end when s == 5.
        low = np.zeros((5, 5))
        # Rate columns are bounded by 100 (they are stored as percentages, e.g. ~45 for
        # the recovery rate); the previous bound of 1 put every observation out of range.
        high = np.array([np.inf, 100, 100, np.inf, np.inf]*5).reshape((5, 5))
        # np.float64 is the dtype the removed `np.float` alias resolved to.
        self.observation_space = spaces.Box(low, high, shape=(5, 5), dtype=np.float64)
        #actions are vectors of the form [n1, n2, n3,...nk] for k states
        self.action_space = spaces.Box(low=np.zeros((s, ), dtype=int), high=np.array([100]*(s)), shape=(s, ), dtype=np.float64)

        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))  # not read anywhere in this class
        self.total = total #total number of vials available in 1 batch = batch size
        self._initial_total = total  # restored by reset()
        self.episodes = episodes
        self.received = [0]*self.states
        self.states_cond = []   # empty until reset() is called
        self.action_list = []
        self.gamma = 0.20       # not read anywhere in this class
        self.epsilon = 0.4      # probability that step() replaces the action with a random one
        self.susc = [0]*self.states

    def _log(self, *parts):
        """Print `parts` unless the environment was created with verbose=False."""
        if getattr(self, "verbose", True):
            print(*parts)

    def get_discrete_int(self, n):
        """Truncate `n` to an int (whole vials only)."""
        return int(n)

    def reset(self):
        """
        Start a new episode.

        Resets the step counter, the done flag and the batch size, reloads the
        initial condition matrix and sets action_list to an equal split.

        Returns:
            The initial 5x5 condition matrix (one row per state).
        """
        self.curr_step = 0
        self.done = False
        # Restore the batch size given to the constructor (previously hard-coded to 10000).
        self.total = self._initial_total
        # Declare the Initial Conditions for the States
        # Columns: Confirmed, DR, RR, Susc, Population
        # Rows:    Assam, Delhi, Jh, Maha, Naga
        # NOTE(review): every copy of this cell carried the comment "Sept 3" although the
        # figures differ between copies -- confirm the snapshot date of this data.
        self.states_cond = np.array([(537277, 1.030008729, 45.66266563, 30525056, 31205576),
                                     (1016423, 1.030082948, 46.81377733, 15518081, 16787941),
                                     (281822, 1.030082818, 46.04856966, 32632129, 32988134),
                                     (10388178, 1.030103643, 44.0194036, 99112496, 112374333),
                                     (61497, 1.029318503, 43.68017952, 1899287, 1978502)])
        #store the actions in an array
        self.action_list = np.array([100/(self.states)]*(self.states))

        return self.states_cond

    def step(self, action):
        """
        Apply one allocation of the vial batch.

        Assumptions:
        1. Vaccine has 100% efficacy- it is preventive in its action, not curative.
        2. Vaccine is passive, not antigen based- introduces antibodies when administered.
        3. 1 person requires 1 vial (dose) only.

        Parameters:
            action: length-`s` vector of percentages of `total` per state. It is
                copied, never modified in place.

        Returns:
            (states_cond, reward, done, info) where info carries the allocation
            actually applied ('action_list') and the step counter ('episode_number').

        Raises:
            RuntimeError: if reset() has not been called yet.
        """
        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
        self._log("Are we done?", self.done)

        # __init__ leaves states_cond as an empty list, so test for emptiness as well
        # as None; otherwise the guard never fires and indexing fails obscurely below.
        if self.states_cond is None or len(self.states_cond) == 0:
            raise RuntimeError("You need to reset() the environment before calling step()!")
        self._log('Observation Space for this episode is: ', self.states_cond)

        # exploration vs exploitation
        # (The former "start with equal distribution" branch was dead code: its result
        # was always overwritten here, so it has been removed.)
        if random.uniform(0, 1) < self.epsilon:
            # Random integer percentage in [0, 100/s) per state, as before, but drawn
            # into a fresh array with an integer upper bound.
            allocation = np.random.randint(0, 100 // self.states, size=self.states).astype(np.float64)
        else:
            allocation = np.array(action, dtype=np.float64)
        # NOTE(review): nothing forces the agent's percentages to sum to <= 100, so more
        # than `total` vials can be handed out in one step -- confirm whether intended.
        self.action_list = allocation
        self._log("Distribution set: ", self.action_list)

        #no of units distrbuted to respective states
        self.received = [self.total*self.action_list[i]/100 for i in range(self.states)]

        #simulation
        # new count of susc people; clamped at 0 to stay inside the observation space
        self.susc = [
            max(0.0, self.states_cond[i, 3] - self.get_discrete_int(self.received[i]))
            for i in range(self.states)
        ]
        self._log("New Count of Susceptible people: ", self.susc)
        # Work on a copy so observations returned earlier are not modified.
        self.states_cond = np.array(self.states_cond)
        self.states_cond[:, 3] = self.susc                            #update values in states_cond matrix

        # reward is computed on every step
        reward = self.get_reward()

        # increment episode
        self.curr_step += 1

        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode_number': self.curr_step}

    def get_reward(self):
        """
        Return the sum over states of susceptible_count * exp(-allocation).

        NOTE(review): this value is largest when nothing is allocated, so a
        reward-maximising agent is pushed towards zero allocations -- confirm the
        intended sign/shape of the reward.
        """
        per_state = [
            self.states_cond[i, 3]*math.exp(-self.action_list[i])
            for i in range(self.states)
        ]
        self._log("Reward distribution: ", per_state)
        reward = sum(per_state)
        self._log("Reward: ", reward)
        return reward

    #def render(self, mode='human', close= False):

    def close(self):
        """No-op: performs no cleanup."""
        pass



In [None]:
#Training the agent
# NOTE: `StatesEnv`, `make_vec_env` and `ACKTR` are not imported in this cell;
# they must already be in the kernel namespace from earlier cells.
# StatesEnv(s, episodes, total): 5 states, 50 steps per episode, 10000 vials per batch.
base_env = StatesEnv(5,50,10000)
# Keep the raw environment under its own name so the factory keeps returning the
# StatesEnv instance even after `env` is rebound to the vectorised wrapper.
env = make_vec_env(lambda: base_env, n_envs=1)

print("Training begins here")
model = ACKTR('MlpPolicy', env, verbose=1).learn(500)
# Round-trip through disk so the test below runs on the saved model.
model.save("acktr_")
model = ACKTR.load("acktr_")
# Test the trained agent
obs = env.reset()
n_steps = 50
print("Testing begins here")
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    #env.render(mode='console')
    # The vectorised env reports one `done` flag per wrapped environment;
    # reduce explicitly instead of relying on the truth value of an array.
    if np.any(done):
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Reward:  162732527.70294744
Are we done? False
Observation Space for this episode is:  [[5.37277000e+05 1.03000873e+00 4.56626656e+01 3.05229510e+07
  3.12055760e+07]
 [1.01642300e+06 1.03008295e+00 4.68137773e+01 1.55151550e+07
  1.67879410e+07]
 [2.81822000e+05 1.03008282e+00 4.60485697e+01 3.26290710e+07
  3.29881340e+07]
 [1.03881780e+07 1.03010364e+00 4.40194036e+01 9.91109530e+07
  1.12374333e+08]
 [6.14970000e+04 1.02931850e+00 4.36801795e+01 1.89765200e+06
  1.97850200e+06]]
Distribution set:  [0.         0.30249003 0.         1.2124548  0.11191201]
New Count of Susceptible people:  [30522951.0, 15515125.0, 32629071.0, 99110832.0, 1897641.0]
Reward distribution:  [30522951.0, 11465302.823524682, 32629071.0, 29482118.978796896, 1696724.3234892478]
Reward:  105796168.12581083
Are we done? False
Observation Space for this episode is:  [[5.37277000e+05 1.03000873e+00 4.56626656e+01 3.05229510e+07
  3.12055760e+07]
 [1



---
September 15, 2020


---



In [None]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised Environment that follows gym interface.
    Describes relevant properties of the state and action spaces.

    Every step takes a vector of per-state percentages of the vial batch,
    subtracts the vials received from each state's susceptible count and
    returns the updated condition matrix together with a scalar reward.
    """
    metadata = {'render.modes':['human']}

    def __init__(self, s, episodes, total, verbose=True):
        """
        Parameters:
            s: number of states, i.e. the length of the action vector.
            episodes: the `episodes`-th call to step() after a reset reports done=True.
            total: number of vials in one batch; each action entry is a percentage of it.
            verbose: when True (default, the original behaviour) step() and
                get_reward() print their progress; pass False to silence them.

        Observation:
        Type: Box(5, 5) -- one row per state, columns:
                                        Min         Max
        0   Confirmed Cases              0           Inf
        1   Death Rate                   0           100
        2   Recovery Rate                0           100
        3   Susceptible                  0           Inf
        4   Population                   0           Inf

        Actions:
        Type: Box(s)
        Vector [n1, ..., ns]; ni in [0, 100] is the percentage of `total` sent to state i.
        """
        self.states = s #no of independent simulations to be run
        self.verbose = verbose
        # NOTE: the observation shape is fixed at 5x5 because reset() loads data for
        # exactly five states; step() only works end-to-end when s == 5.
        low = np.zeros((5, 5))
        # Rate columns are bounded by 100 (they are stored as percentages, e.g. ~45 for
        # the recovery rate); the previous bound of 1 put every observation out of range.
        high = np.array([np.inf, 100, 100, np.inf, np.inf]*5).reshape((5, 5))
        # np.float64 is the dtype the removed `np.float` alias resolved to.
        self.observation_space = spaces.Box(low, high, shape=(5, 5), dtype=np.float64)
        #actions are vectors of the form [n1, n2, n3,...nk] for k states
        self.action_space = spaces.Box(low=np.zeros((s, ), dtype=int), high=np.array([100]*(s)), shape=(s, ), dtype=np.float64)

        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))  # not read anywhere in this class
        self.total = total #total number of vials available in 1 batch = batch size
        self._initial_total = total  # restored by reset()
        self.episodes = episodes
        self.received = [0]*self.states
        self.states_cond = []   # empty until reset() is called
        self.action_list = []
        self.gamma = 0.20       # not read anywhere in this class
        self.epsilon = 0.4      # probability that step() replaces the action with a random one
        self.susc = [0]*self.states

    def _log(self, *parts):
        """Print `parts` unless the environment was created with verbose=False."""
        if getattr(self, "verbose", True):
            print(*parts)

    def get_discrete_int(self, n):
        """Truncate `n` to an int (whole vials only)."""
        return int(n)

    def reset(self):
        """
        Start a new episode.

        Resets the step counter, the done flag and the batch size, reloads the
        initial condition matrix and sets action_list to an equal split.

        Returns:
            The initial 5x5 condition matrix (one row per state).
        """
        self.curr_step = 0
        self.done = False
        # Restore the batch size given to the constructor (previously hard-coded to 10000).
        self.total = self._initial_total
        # Declare the Initial Conditions for the States
        # Columns: Confirmed, DR, RR, Susc, Population
        # Rows:    Assam, Delhi, Jh, Maha, Naga
        # NOTE(review): every copy of this cell carried the comment "Sept 3" although the
        # figures differ between copies -- confirm the snapshot date of this data.
        self.states_cond = np.array([(569327, 1.029988039, 45.68130442, 30484645, 31205576),
                                     (1073088, 1.030018041, 46.87854118, 15448410, 16787941),
                                     (298427, 1.030067655, 46.05782989, 32611200, 32988134),
                                     (11029114, 1.030100877, 44.12571128, 98315871, 112374333),
                                     (65457, 1.029683609, 43.71724949, 1894230, 1978502)])
        #store the actions in an array
        self.action_list = np.array([100/(self.states)]*(self.states))

        return self.states_cond

    def step(self, action):
        """
        Apply one allocation of the vial batch.

        Assumptions:
        1. Vaccine has 100% efficacy- it is preventive in its action, not curative.
        2. Vaccine is passive, not antigen based- introduces antibodies when administered.
        3. 1 person requires 1 vial (dose) only.

        Parameters:
            action: length-`s` vector of percentages of `total` per state. It is
                copied, never modified in place.

        Returns:
            (states_cond, reward, done, info) where info carries the allocation
            actually applied ('action_list') and the step counter ('episode_number').

        Raises:
            RuntimeError: if reset() has not been called yet.
        """
        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
        self._log("Are we done?", self.done)

        # __init__ leaves states_cond as an empty list, so test for emptiness as well
        # as None; otherwise the guard never fires and indexing fails obscurely below.
        if self.states_cond is None or len(self.states_cond) == 0:
            raise RuntimeError("You need to reset() the environment before calling step()!")
        self._log('Observation Space for this episode is: ', self.states_cond)

        # exploration vs exploitation
        # (The former "start with equal distribution" branch was dead code: its result
        # was always overwritten here, so it has been removed.)
        if random.uniform(0, 1) < self.epsilon:
            # Random integer percentage in [0, 100/s) per state, as before, but drawn
            # into a fresh array with an integer upper bound.
            allocation = np.random.randint(0, 100 // self.states, size=self.states).astype(np.float64)
        else:
            allocation = np.array(action, dtype=np.float64)
        # NOTE(review): nothing forces the agent's percentages to sum to <= 100, so more
        # than `total` vials can be handed out in one step -- confirm whether intended.
        self.action_list = allocation
        self._log("Distribution set: ", self.action_list)

        #no of units distrbuted to respective states
        self.received = [self.total*self.action_list[i]/100 for i in range(self.states)]

        #simulation
        # new count of susc people; clamped at 0 to stay inside the observation space
        self.susc = [
            max(0.0, self.states_cond[i, 3] - self.get_discrete_int(self.received[i]))
            for i in range(self.states)
        ]
        self._log("New Count of Susceptible people: ", self.susc)
        # Work on a copy so observations returned earlier are not modified.
        self.states_cond = np.array(self.states_cond)
        self.states_cond[:, 3] = self.susc                            #update values in states_cond matrix

        # reward is computed on every step
        reward = self.get_reward()

        # increment episode
        self.curr_step += 1

        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode_number': self.curr_step}

    def get_reward(self):
        """
        Return the sum over states of susceptible_count * exp(-allocation).

        NOTE(review): this value is largest when nothing is allocated, so a
        reward-maximising agent is pushed towards zero allocations -- confirm the
        intended sign/shape of the reward.
        """
        per_state = [
            self.states_cond[i, 3]*math.exp(-self.action_list[i])
            for i in range(self.states)
        ]
        self._log("Reward distribution: ", per_state)
        reward = sum(per_state)
        self._log("Reward: ", reward)
        return reward

    #def render(self, mode='human', close= False):

    def close(self):
        """No-op: performs no cleanup."""
        pass



In [None]:
#Training the agent
# NOTE: `StatesEnv`, `make_vec_env` and `ACKTR` are not imported in this cell;
# they must already be in the kernel namespace from earlier cells.
# StatesEnv(s, episodes, total): 5 states, 50 steps per episode, 10000 vials per batch.
base_env = StatesEnv(5,50,10000)
# Keep the raw environment under its own name so the factory keeps returning the
# StatesEnv instance even after `env` is rebound to the vectorised wrapper.
env = make_vec_env(lambda: base_env, n_envs=1)

print("Training begins here")
model = ACKTR('MlpPolicy', env, verbose=1).learn(500)
# Round-trip through disk so the test below runs on the saved model.
model.save("acktr_")
model = ACKTR.load("acktr_")
# Test the trained agent
obs = env.reset()
n_steps = 50
print("Testing begins here")
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    #env.render(mode='console')
    # The vectorised env reports one `done` flag per wrapped environment;
    # reduce explicitly instead of relying on the truth value of an array.
    if np.any(done):
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Reward:  74565439.64250393
Are we done? False
Observation Space for this episode is:  [[5.69327000e+05 1.02998804e+00 4.56813044e+01 3.04822060e+07
  3.12055760e+07]
 [1.07308800e+06 1.03001804e+00 4.68785412e+01 1.54437550e+07
  1.67879410e+07]
 [2.98427000e+05 1.03006766e+00 4.60578299e+01 3.26078540e+07
  3.29881340e+07]
 [1.10291140e+07 1.03010088e+00 4.41257113e+01 9.83115990e+07
  1.12374333e+08]
 [6.54570000e+04 1.02968361e+00 4.37172495e+01 1.89228600e+06
  1.97850200e+06]]
Distribution set:  [0.         2.14539719 0.90768051 0.49164662 0.07685197]
New Count of Susceptible people:  [30482206.0, 15443541.0, 32607764.0, 98311550.0, 1892279.0]
Reward distribution:  [30482206.0, 1807227.0815099524, 13155894.48568484, 60129159.08813899, 1752301.2977196851]
Reward:  107326787.95305347
Are we done? False
Observation Space for this episode is:  [[5.69327000e+05 1.02998804e+00 4.56813044e+01 3.04822060e+07
  3.12055760e+07