<a href="https://colab.research.google.com/github/Sathvikapolepelly/REINFORCEMENT-LEARNING/blob/main/lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import gymnasium as gym
import numpy as np
from collections import defaultdict
import random
import random
from typing import Dict, Tuple, List



In [12]:
def make_epsilon_greedy_policy(Q: Dict, nA: int, epsilon: float):
    """Build an epsilon-greedy action distribution on top of the values in Q.

    Args:
        Q: Mapping from state to a sequence of ``nA`` action values. It is
            looked up on every call, so later in-place updates to Q are
            reflected by the returned policy.
        nA: Number of available actions.
        epsilon: Probability mass that is spread uniformly over all actions.

    Returns:
        A function that maps a state to a length-``nA`` array of action
        probabilities: ``epsilon / nA`` for every action plus an extra
        ``1 - epsilon`` on the highest-valued action (the first one on ties,
        as chosen by ``np.argmax``).
    """

    def policy_fn(state):
        # Start from the uniform exploration share for every action ...
        action_probs = np.full(nA, epsilon / nA, dtype=float)
        # ... then give the remaining mass to the current greedy action.
        greedy_action = np.argmax(Q[state])
        action_probs[greedy_action] += 1.0 - epsilon
        return action_probs

    return policy_fn

def generate_episode(env, policy):
    """Roll out a single episode by sampling actions from ``policy``.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        policy: Callable mapping a state to a vector of action probabilities.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step taken,
        where ``state`` is the state the action was chosen in.
    """
    trajectory = []
    current_state, _ = env.reset()
    while True:
        action_probs = policy(current_state)
        chosen = np.random.choice(len(action_probs), p=action_probs)
        following_state, reward, terminated, truncated, _ = env.step(chosen)
        trajectory.append((current_state, chosen, reward))
        # The episode ends on either a terminal state or a truncation signal.
        if terminated or truncated:
            return trajectory
        current_state = following_state


In [15]:
def first_visit_mc_policy_evaluation(env, policy_fn, gamma=1.0, num_episodes=500000):
    """Estimate state values under a fixed policy with first-visit Monte Carlo.

    For each sampled episode, the discounted return that follows the first
    occurrence of every state is added to a running average for that state.

    Args:
        env: Environment passed through to ``generate_episode``.
        policy_fn: Callable mapping a state to action probabilities; it is
            the policy being evaluated.
        gamma: Discount factor applied per step to future rewards.
        num_episodes: Number of episodes to sample.

    Returns:
        A ``defaultdict(float)`` mapping each visited state to the mean of
        its first-visit returns. States never visited read as ``0.0``.
    """
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    V = defaultdict(float)

    for i_episode in range(1, num_episodes + 1):
        if i_episode % 10000 == 0:
            print(f"Episode {i_episode}/{num_episodes}")

        episode = generate_episode(env, policy_fn)

        # Compute the discounted return from every time step in one backward
        # pass (G_t = r_t + gamma * G_{t+1}) instead of re-summing the tail of
        # the episode for each first visit, which was quadratic in its length.
        returns = [0.0] * len(episode)
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            G = episode[t][2] + gamma * G
            returns[t] = G

        # Walk forward so only the FIRST occurrence of each state contributes,
        # and so states enter V in the order they were first encountered.
        visited_states = set()
        for (state, _, _), G in zip(episode, returns):
            if state in visited_states:
                continue
            visited_states.add(state)
            returns_sum[state] += G
            returns_count[state] += 1.0
            V[state] = returns_sum[state] / returns_count[state]
    return V

def mc_control_epsilon_greedy(env, num_episodes=500000, gamma=1.0, epsilon=0.1, alpha=0.01):
    """Learn action values with on-policy first-visit Monte Carlo control.

    Episodes are generated with an epsilon-greedy policy over the current
    ``Q``. After each episode, the value of every (state, action) pair is
    moved toward the return that followed its first occurrence using a
    constant step size.

    Args:
        env: Environment passed to ``generate_episode``; ``env.action_space.n``
            gives the number of actions.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied per step to future rewards.
        epsilon: Exploration rate of the epsilon-greedy behaviour policy.
        alpha: Constant step size of the incremental update
            ``Q += alpha * (G - Q)``. Defaults to the previously hard-coded
            value of 0.01.

    Returns:
        A ``defaultdict`` mapping each state to a NumPy array of action
        values; unseen states read as an all-zero array.
    """
    nA = env.action_space.n
    Q = defaultdict(lambda: np.zeros(nA))

    # The policy closure looks Q up on every call, so it always reflects the
    # latest values and only needs to be built once rather than per episode.
    policy = make_epsilon_greedy_policy(Q, nA, epsilon)

    for i_episode in range(1, num_episodes + 1):
        if i_episode % 10000 == 0:
            print(f"Episode {i_episode}/{num_episodes}")

        episode = generate_episode(env, policy)

        # Discounted return from every time step in one backward pass
        # (G_t = r_t + gamma * G_{t+1}) instead of re-summing each tail.
        returns = [0.0] * len(episode)
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            G = episode[t][2] + gamma * G
            returns[t] = G

        # Forward pass: update only on the first visit of each pair.
        visited_state_actions = set()
        for (state, action, _), G in zip(episode, returns):
            if (state, action) in visited_state_actions:
                continue
            visited_state_actions.add((state, action))
            Q[state][action] += alpha * (G - Q[state][action])
    return Q

# Script entry point: create the Blackjack environment that the evaluation
# and control runs below operate on.
if __name__ == "__main__":
    # NOTE(review): sab=True presumably selects the Sutton & Barto rule
    # variant of Blackjack-v1 — confirm against the Gymnasium docs.
    env = gym.make("Blackjack-v1", sab=True)

In [16]:
    # Fixed, deterministic policy expressed as a probability vector so it can
    # be sampled by generate_episode: all mass on action 0 when the player's
    # score is 20 or more, otherwise all mass on action 1.
    # NOTE(review): presumably action 0 = stick and 1 = hit in Blackjack-v1 —
    # confirm against the environment docs. `dealer` and `usable_ace` are
    # unpacked only to name the state layout; they are not used.
    def simple_policy(state):
        score, dealer, usable_ace = state
        return np.array([1.0, 0.0]) if score >= 20 else np.array([0.0, 1.0])

    # Policy evaluation: estimate state values under simple_policy from
    # 100,000 sampled episodes.
    V = first_visit_mc_policy_evaluation(env, simple_policy, num_episodes=100000)
    print("Policy Value Estimates for simple_policy:")
    # Show only the first ten (state, value) pairs, in insertion order.
    print(list(V.items())[:10])

    # Example: MC control — learn action values with an epsilon-greedy
    # behaviour policy, then read off a policy from them.
    Q = mc_control_epsilon_greedy(env, num_episodes=500000)
    # Greedy policy w.r.t. the learned Q: the highest-valued action per state.
    optimal_policy = {state: np.argmax(actions) for state, actions in Q.items()}
    print("Sample learned policy entries:")
    # Show only the first ten (state, action) pairs, in insertion order.
    print(list(optimal_policy.items())[:10])

Episode 10000/100000
Episode 20000/100000
Episode 30000/100000
Episode 40000/100000
Episode 50000/100000
Episode 60000/100000
Episode 70000/100000
Episode 80000/100000
Episode 90000/100000
Episode 100000/100000
Policy Value Estimates for simple_policy:
[((20, 3, 0), 0.616135328562134), ((21, 4, 1), 0.9761904761904762), ((11, 8, 0), -0.03529411764705882), ((12, 8, 0), -0.5407035175879397), ((17, 8, 0), -0.6569821930646673), ((20, 10, 0), 0.43704680290046144), ((19, 2, 0), -0.7127272727272728), ((10, 9, 0), -0.1095890410958904), ((21, 9, 1), 0.9910913140311804), ((10, 10, 0), -0.21257861635220127)]
Episode 10000/500000
Episode 20000/500000
Episode 30000/500000
Episode 40000/500000
Episode 50000/500000
Episode 60000/500000
Episode 70000/500000
Episode 80000/500000
Episode 90000/500000
Episode 100000/500000
Episode 110000/500000
Episode 120000/500000
Episode 130000/500000
Episode 140000/500000
Episode 150000/500000
Episode 160000/500000
Episode 170000/500000
Episode 180000/500000
Episode 1