<a href="https://colab.research.google.com/github/NithinReddychallagonda/RFML-AIML/blob/main/RLML_LAB_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import gymnasium as gym
import numpy as np
from collections import defaultdict

In [16]:
env = gym.make('Blackjack-v1', sab=True)

# Function to create a random policy
def create_random_policy(env):
    return lambda state: np.random.choice(env.action_space.n)

# Function to create a greedy policy based on Q-values
def create_greedy_policy(Q):
    def policy_fn(state):
        return np.argmax(Q[state])
    return policy_fn

In [17]:
def mc_policy_evaluation(policy, env, num_episodes, gamma=1.0):
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    V = defaultdict(float)

    for _ in range(num_episodes):
        episode = []
        state = env.reset()[0]
        done = False

        while not done:
            action = policy(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated # Blackjack env can be terminated or truncated
            episode.append((state, action, reward))
            state = next_state

        visited_states = set()
        G = 0
        for state, action, reward in reversed(episode):
            G = gamma * G + reward
            if state not in visited_states:
                returns_sum[state] += G
                returns_count[state] += 1
                V[state] = returns_sum[state] / returns_count[state]
                visited_states.add(state)
    return V

In [18]:
def mc_control_epsilon_greedy(env, num_episodes, gamma=1.0, epsilon=0.1):
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    def policy_fn(state):
        if np.random.rand() < epsilon:
            return np.random.choice(env.action_space.n)
        else:
            return np.argmax(Q[state])

    for _ in range(num_episodes):
        episode = []
        state = env.reset()[0]
        done = False

        while not done:
            action = policy_fn(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated # Blackjack env can be terminated or truncated
            episode.append((state, action, reward))
            state = next_state

        visited_state_action_pairs = set()
        G = 0
        for state, action, reward in reversed(episode):
            G = gamma * G + reward
            if (state, action) not in visited_state_action_pairs:
                old_q = Q[state][action]
                Q[state][action] = old_q + (G - old_q) / (1 + sum(1 for s, a, r in episode if s == state and a == action))
                visited_state_action_pairs.add((state, action))

    return Q, create_greedy_policy(Q)

In [20]:
if __name__ == "__main__":
    random_policy = create_random_policy(env)

    print("Evaluating random policy...")
    V = mc_policy_evaluation(random_policy, env, num_episodes=50000)
    print("Value function for random policy (sample):")
    for i, (state, value) in enumerate(list(V.items())[:10]):
        print(f"State: {state}, Value: {value:.2f}")

    print("\nTraining control policy with epsilon-greedy strategy...")
    Q, greedy_policy = mc_control_epsilon_greedy(env, num_episodes=500000)
    print("Learned Q-values (sample):")
    for i, (state, actions) in enumerate(list(Q.items())[:10]):
        print(f"State: {state}, Actions: {actions}")

Evaluating random policy...
Value function for random policy (sample):
State: (13, 4, 0), Value: -0.40
State: (13, 2, 0), Value: -0.41
State: (12, 2, 0), Value: -0.38
State: (18, 1, 0), Value: -0.60
State: (21, 6, 0), Value: 0.09
State: (21, 6, 1), Value: 0.47
State: (20, 6, 1), Value: 0.05
State: (9, 6, 0), Value: -0.25
State: (18, 2, 0), Value: -0.30
State: (21, 1, 0), Value: -0.09

Training control policy with epsilon-greedy strategy...
Learned Q-values (sample):
State: (8, 7, 0), Actions: [ 0.05518437 -0.90878566]
State: (20, 10, 0), Actions: [ 0.43015276 -0.99987793]
State: (8, 10, 0), Actions: [-0.99803923  0.28521755]
State: (10, 6, 0), Actions: [-0.97872804  0.97436158]
State: (20, 7, 0), Actions: [ 0.43624878 -0.98242187]
State: (18, 3, 0), Actions: [-0.46004596 -0.96874999]
State: (18, 1, 0), Actions: [-0.85539124 -0.99214935]
State: (15, 4, 1), Actions: [-0.79238863 -0.85382517]
State: (13, 9, 0), Actions: [-0.99748707 -0.2484118 ]
State: (10, 9, 0), Actions: [-0.99943345  0