In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random

In [7]:
# --- Environment dynamics -------------------------------------------------
# recyclingRobot's transition table uses alpha as p(s'=high | s=high, search)
# and 1 - beta as p(s'=high | s=low, search).
alpha = 0.6
beta = 0.8

# Chance that a can is collected during one "search" / one "wait" step.
find_search_prob = 0.5
find_wait_prob = 0.2

# --- Rewards --------------------------------------------------------------
canReward = 1        # a can was collected
notFoundReward = 0   # nothing was collected
rechargeReward = 0   # the robot recharged
penality = -3        # first outcome listed for "search" in the "low" state

# --- Problem size ---------------------------------------------------------
numStates = 2    # same count as recyclingRobot.states ("high", "low")
numActions = 3   # same count as the action names ("search", "wait", "recharge")

In [36]:
class recyclingRobot:
    """Recycling-robot MDP: two battery states and three actions.

    The instance holds both the *model* (transition, reward and outcome
    probability tables built in ``__init__``) and a small *simulator*
    (``search``, ``wait``, ``recharge``) that mutates ``self.state`` and
    returns the sampled reward.

    Naming gotcha: the module-level ``alpha`` / ``beta`` are transition
    probabilities of the environment, whereas ``self.alpha`` is the learning
    step size. They are unrelated.
    """

    def __init__(self):
        # Action *names*. updateActions() later replaces this attribute with
        # a list of bound methods.
        self.actions = ["search", "wait", "recharge"]
        self.states = ["high", "low"]
        self.state = self.states[0]

        # p(a|s): one row per action, one column per state (each column sums to 1).
        # NOTE(review): the first column (presumably "high") gives "recharge"
        # probability 0.33 and the second gives 0.00, whereas updateActions()
        # only offers recharge in the "low" state -- confirm the column order.
        self.next_action_probability = np.array([
            [0.34, 0.50],
            [0.33, 0.50],
            [0.33, 0.00],
        ])

        # p(s' = "high" | s, a), indexed as [action][current state].
        self.next_state_probability = {
            "search": {"high": alpha, "low": 1 - beta},
            "wait": {"high": 1, "low": 0},
            "recharge": {"high": 1, "low": 1},
        }

        # r(s, a, s'), indexed as [state][action][next state]; filled in by
        # calculate_expected_rewards_table() at the end of the constructor.
        self.expected_rewards_table = {
            "high": {"search": {}, "wait": {}, "recharge": {}},
            "low": {"search": {}, "wait": {}, "recharge": {}},
        }

        self.alpha = 0.1    # step size
        self.gamma = 0.9    # discount factor
        self.epsilon = 0.1  # greedy-probability

        # Possible reward outcomes for every (state, action) ...
        self.rewards = {
            "high": {
                "search": [canReward, notFoundReward],
                "wait": [canReward, notFoundReward],
                "recharge": [rechargeReward],
            },
            "low": {
                "search": [penality, canReward, notFoundReward],
                "wait": [canReward, notFoundReward],
                "recharge": [rechargeReward],
            },
        }
        # ... and p(r|s,a), position-aligned with self.rewards.
        self.probabilities = {
            "high": {
                "search": [find_search_prob, 1 - find_search_prob],
                "wait": [find_wait_prob, 1 - find_wait_prob],
                "recharge": [1],
            },
            "low": {
                "search": [1 - beta, beta * find_search_prob, beta * (1 - find_search_prob)],
                "wait": [find_wait_prob, 1 - find_wait_prob],
                "recharge": [1],
            },
        }
        self.calculate_expected_rewards_table()

    def calculate_expected_rewards_table(self):
        """Fill ``self.expected_rewards_table[state][action][next_state]``.

        Action names are taken from the keys of ``self.next_state_probability``
        rather than from ``self.actions``, because ``updateActions()`` replaces
        ``self.actions`` with bound methods that cannot index the tables.

        NOTE(review): the next-state probability is multiplied into every term
        and then divided out again, so each reachable entry equals the expected
        reward of (state, action) regardless of the next state; unreachable
        next states get 0. The arithmetic is kept as-is so stored results do
        not change.
        """
        for state in self.states:
            for action in self.next_state_probability:
                p_high = self.next_state_probability[action][state]
                outcomes = list(zip(self.rewards[state][action],
                                    self.probabilities[state][action]))
                for nextState in self.states:
                    p_next = p_high if nextState == "high" else (1 - p_high)
                    weighted = sum([reward * p_next * p_reward
                                    for reward, p_reward in outcomes])
                    self.expected_rewards_table[state][action][nextState] = (
                        weighted / p_next if p_next != 0 else 0
                    )

    def search(self, successProbability):
        """Simulate one "search" step and return the sampled reward.

        The battery transition is sampled from
        ``self.next_state_probability["search"]`` so that the simulation
        follows the same dynamics as the model tables:

        * in "high", the robot stays "high" with the tabulated probability and
          drops to "low" otherwise;
        * in any other state, the robot moves to "high" with the tabulated
          probability and ``penality`` is returned immediately.

        Unless the penalty was returned, the result is ``canReward`` with
        probability ``successProbability`` and 0 otherwise.
        """
        p_high = self.next_state_probability["search"][
            "high" if self.state == "high" else "low"
        ]
        if self.state == "high":
            # rand() lies in [0, 1): the battery drops with probability 1 - p_high.
            if np.random.rand() >= p_high:
                self.state = "low"
        elif np.random.rand() < p_high:
            self.state = "high"
            return penality

        if np.random.rand() < successProbability:
            return canReward
        return 0

    def wait(self, successProbability):
        """Simulate one "wait" step; the battery state is left untouched.

        Returns ``canReward`` with probability ``successProbability``, else 0.
        """
        if np.random.rand() < successProbability:
            return canReward
        return 0

    def recharge(self):
        """Set the battery state to "high" and return ``rechargeReward``."""
        self.state = "high"
        return rechargeReward

    def updateActions(self):
        """Replace ``self.actions`` with the bound methods for the current state.

        "high" offers search and wait; any other state also offers recharge.
        After this call ``self.actions`` no longer holds action names.
        """
        if self.state == "high":
            self.actions = [self.search, self.wait]
        else:
            self.actions = [self.search, self.wait, self.recharge]

In [37]:
# Build the robot (its constructor fills in expected_rewards_table) and
# display the resulting table as the cell output.
robot = recyclingRobot()
robot.expected_rewards_table

{'high': {'search': {'high': 0.5, 'low': 0.5},
  'wait': {'high': 0.2, 'low': 0},
  'recharge': {'high': 0.0, 'low': 0}},
 'low': {'search': {'high': -0.19999999999999982, 'low': -0.1999999999999999},
  'wait': {'high': 0, 'low': 0.2},
  'recharge': {'high': 0.0, 'low': 0}}}