<a href="https://colab.research.google.com/github/SIVAGORAM/DEEPLEARNING/blob/main/Markov_Decision_Process.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 9921005015
# Markov decision process (MDP) solved with tabular Q-learning
import numpy as np

class SimpleMDP:
    """Three-state, two-action MDP with fixed transition and reward tables.

    Attributes:
        num_states: Number of states (3).
        num_actions: Number of actions available in every state (2).
        transitions: Array of shape (num_states, num_actions, num_states);
            ``transitions[s, a, s2]`` is the probability of moving from
            state ``s`` to state ``s2`` when action ``a`` is taken.
        rewards: Array of shape (num_states, num_actions); ``rewards[s, a]``
            is the reward returned for taking action ``a`` in state ``s``.
    """

    def __init__(self):
        # Indexed as [current state][action][next state].
        self.transitions = np.array([
            [[0.7, 0.3, 0.0], [0.0, 0.9, 0.1]],
            [[0.0, 1.0, 0.0], [0.8, 0.2, 0.0]],
            # State 2 is absorbing: both actions lead back to state 2.
            [[0.0, 0.0, 1.0], [0.0, 0.0, 1.0]],
        ])
        # Indexed as [current state][action].
        self.rewards = np.array([
            [0.0, 0.0],
            [1.0, 1.0],
            [10.0, 10.0],
        ])
        self.num_states, self.num_actions = self.rewards.shape

    def step(self, state, action):
        """Sample one transition from ``state`` under ``action``.

        Args:
            state: Index of the current state.
            action: Index of the action to take.

        Returns:
            A ``(next_state, reward)`` pair. ``next_state`` is drawn from
            ``transitions[state, action]`` using NumPy's global random
            state; ``reward`` is ``rewards[state, action]`` and depends only
            on the current state and action, not on the sampled next state.
        """
        sampled_state = np.random.choice(
            self.num_states, p=self.transitions[state, action]
        )
        return sampled_state, self.rewards[state, action]

class QLearningAgent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Attributes:
        num_states: Number of rows in the Q-table.
        num_actions: Number of columns in the Q-table.
        learning_rate: Step size applied to each temporal-difference error.
        discount_factor: Weight given to the next state's best Q-value.
        epsilon: Probability of picking a uniformly random action.
        q_table: Array of shape (num_states, num_actions), initialised to 0.
    """

    def __init__(self, num_states, num_actions, learning_rate=0.1, discount_factor=0.9, epsilon=0.1):
        self.num_states, self.num_actions = num_states, num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.q_table = np.zeros((num_states, num_actions))

    def choose_action(self, state):
        """Return an action index for ``state`` using epsilon-greedy choice.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the largest Q-value in ``state`` is
        returned (``np.argmax`` resolves ties to the lowest index).
        """
        explore = np.random.rand() < self.epsilon
        if explore:
            return np.random.choice(self.num_actions)
        return np.argmax(self.q_table[state])

    def update_q_table(self, state, action, reward, next_state):
        """Apply one Q-learning update for an observed transition.

        Moves ``q_table[state, action]`` towards
        ``reward + discount_factor * max(q_table[next_state])`` by a
        fraction ``learning_rate`` of the difference.
        """
        best_next_value = np.max(self.q_table[next_state])
        current_value = self.q_table[state, action]
        target = reward + self.discount_factor * best_next_value
        self.q_table[state, action] = current_value + self.learning_rate * (target - current_value)

# Environment and learner shared by the training and evaluation phases.
mdp = SimpleMDP()
agent = QLearningAgent(num_states=mdp.num_states, num_actions=mdp.num_actions)

# State 2 is absorbing in SimpleMDP; reaching it ends an episode.
TERMINAL_STATE = 2

# Safety cap on episode length. Action 0 in state 1 self-loops with
# probability 1 and reward 1, so a greedy policy can stay there forever and
# termination would otherwise depend entirely on epsilon exploration.
MAX_STEPS_PER_EPISODE = 10000

# NOTE: SimpleMDP.step returns the reward of the state being left, and an
# episode stops as soon as state 2 is entered, so the reward of 10 assigned
# to state 2 is never observed by the agent.


def _run_episode(mdp, agent, start_state, learn, max_steps=MAX_STEPS_PER_EPISODE):
    """Roll out one episode and return the sum of rewards collected.

    Args:
        mdp: Environment exposing ``step(state, action)``.
        agent: Agent exposing ``choose_action`` and ``update_q_table``.
        start_state: State index the episode begins in.
        learn: When true, the agent's Q-table is updated after every step.
        max_steps: Maximum number of steps before the episode is cut off.

    Returns:
        Total reward accumulated before reaching ``TERMINAL_STATE`` or
        hitting ``max_steps``. A warning is issued if the cap is hit.
    """
    state = start_state
    episode_return = 0.0
    steps_taken = 0
    while state != TERMINAL_STATE and steps_taken < max_steps:
        action = agent.choose_action(state)
        next_state, reward = mdp.step(state, action)
        if learn:
            agent.update_q_table(state, action, reward, next_state)
        episode_return += reward
        state = next_state
        steps_taken += 1
    if state != TERMINAL_STATE:
        import warnings

        warnings.warn(
            "Episode truncated after %d steps without reaching the terminal state"
            % max_steps
        )
    return episode_return


# Training: every episode starts from a uniformly random state.
num_episodes = 1000
for episode in range(num_episodes):
    _run_episode(mdp, agent, np.random.randint(mdp.num_states), learn=True)

# Evaluation: always start in state 0. Actions are still chosen through
# agent.choose_action, so epsilon exploration remains active here.
total_rewards = 0
num_eval_episodes = 100
for episode in range(num_eval_episodes):
    total_rewards += _run_episode(mdp, agent, 0, learn=False)

average_reward = total_rewards / num_eval_episodes
print("Average reward over", num_eval_episodes, "evaluation episodes:", average_reward)


Average reward over 100 evaluation episodes: 221.5
