# **Monte Carlo (1000 episodes)**

In [None]:
import random
import gym
import numpy as np

# Set up the CartPole environment ('CartPole-v1' from gym). The training loop
# below drives it through env.reset()/env.step() and closes it at the end.
env = gym.make('CartPole-v1')

# Helper function to discretize the state
def discretize(x):
    """Discretize an observation by scaling each component and truncating.

    Each of the four components of ``x`` is divided by a fixed step size
    (0.25, 0.25, 0.01, 0.1) and truncated toward zero to an integer.

    Args:
        x: Array-like with four numeric components.

    Returns:
        Tuple of four integers, usable as a dictionary key.
    """
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
    return tuple((x / np.array([0.25, 0.25, 0.01, 0.1])).astype(int))

# Create bins for discretization
def create_bins(i, num):
    """Return ``num + 1`` evenly spaced bin edges covering the interval ``i``.

    Args:
        i: Pair ``(low, high)`` with the interval bounds.
        num: Number of bins to split the interval into.

    Returns:
        NumPy array of edges starting at ``i[0]`` and ending at ``i[1]``
        (up to floating-point rounding).
    """
    low = i[0]
    span = i[1] - low
    steps = np.arange(num + 1)
    return steps * span / num + low

# (low, high) range used to bin each of the four observation components
ints = [(-5, 5), (-2, 2), (-0.5, 0.5), (-2, 2)]
# Number of bins for the matching entry of `ints`
nbins = [20, 20, 10, 10]
# One array of bin edges per observation component
bins = [create_bins(interval, count) for interval, count in zip(ints, nbins)]

def discretize_bins(x):
    """Map a four-component observation to a tuple of bin indices.

    Component ``k`` of ``x`` is located among the edges in ``bins[k]`` with
    ``np.digitize``.

    Args:
        x: Indexable observation with at least four components.

    Returns:
        Tuple of four bin indices, usable as a dictionary key.
    """
    indices = []
    for dim in range(4):
        indices.append(np.digitize(x[dim], bins[dim]))
    return tuple(indices)

# Action space: the two discrete actions passed to env.step()
actions = (0, 1)  # Left, right

# Hyperparameters
gamma = 0.9  # Discount factor applied when accumulating the return
epsilon = 0.6  # Initial exploration probability; the main loop multiplies it by 0.4 every 100 episodes (floor 0.01)

# Per-run statistics
cum_rewards = []  # Total reward of every finished episode, in order
avg_reward_intervals = []  # Mean reward of each consecutive 100-episode block

# Stopping criteria for the training loop
num_episodes = 1000  # Maximum number of episodes to run
threshold = 600  # Training stops early once a single episode's total reward exceeds this
# NOTE(review): CartPole-v1 is normally capped at 500 steps per episode, so a
# per-episode reward above 600 looks unreachable — confirm the intended value.

# Function to choose action using epsilon-greedy policy
def choose_action(s, epsilon):
    """Pick an action for state ``s`` with an epsilon-greedy rule.

    Args:
        s: Discretized state (hashable).
        epsilon: Probability of picking a uniformly random action.

    Returns:
        A random element of ``actions`` when exploring, otherwise the action
        whose ``Q`` entry is largest (missing entries count as 0; ties go to
        the first action).
    """
    explore = random.uniform(0, 1) < epsilon
    if not explore:
        # Greedy branch: compare the stored value of every action
        action_values = [Q.get((s, candidate), 0) for candidate in actions]
        return actions[np.argmax(action_values)]
    return random.choice(actions)

# Initialize Q table: maps (state, action) -> estimated return. Missing
# entries are read as 0 through Q.get(..., 0).
Q = {}

# Function to update Q table using Monte Carlo
def update_Q_with_MC(episode_transitions, gamma, alpha=0.1):
    """Apply a first-visit Monte Carlo update to the global ``Q`` table.

    Walks the episode backwards accumulating the discounted return ``G`` and,
    at the first occurrence of each (state, action) pair in the episode, moves
    ``Q[(state, action)]`` toward ``G`` by a fraction ``alpha``.

    Args:
        episode_transitions: List of ``(state, action, reward)`` tuples in the
            order they occurred.
        gamma: Discount factor.
        alpha: Step size of the update. It is an explicit keyword with a
            default because this cell defines no ``alpha`` global.

    Returns:
        None. ``Q`` is modified in place.
    """
    # Record where each (state, action) pair first occurs in a single pass,
    # instead of re-scanning the episode prefix at every step.
    first_visit = {}
    for t, (s, a, _) in enumerate(episode_transitions):
        first_visit.setdefault((s, a), t)

    G = 0  # Discounted return from step t to the end of the episode
    for t in reversed(range(len(episode_transitions))):
        s, a, r = episode_transitions[t]
        G = r + gamma * G
        if first_visit[(s, a)] == t:  # Only the first visit updates Q
            old_q = Q.get((s, a), 0)
            Q[(s, a)] = old_q + alpha * (G - old_q)

# Main loop: run up to num_episodes episodes, recording each episode's
# (state, action, reward) transitions and applying a Monte Carlo update to Q
# once the episode has finished.
for episode in range(num_episodes):
    # NOTE(review): assumes the older gym API where reset() returns only the
    # observation and step() returns a 4-tuple — confirm the installed version.
    obs = env.reset()  # Initialize environment and get the first observation
    s = discretize_bins(obs)  # Discretize the state
    done = False
    total_reward = 0
    episode_transitions = []  # (state, action, reward) for every step of this episode

    # Run through an episode
    while not done:
        # Choose an action using epsilon-greedy policy
        a = choose_action(s, epsilon)

        # Perform the action and observe the result
        obs, rew, done, _ = env.step(a)
        total_reward += rew

        # Discretize the new observation
        s_new = discretize_bins(obs)

        # Store the transition taken from the state the action was chosen in
        episode_transitions.append((s, a, rew))

        # Update the current state
        s = s_new

    cum_rewards.append(total_reward)

    # Check if the threshold is reached. Breaking here skips the Q update
    # further down for this final episode.
    if total_reward > threshold:
        print(f"Threshold reached! Total reward: {total_reward}")
        break  # Exit the training loop

    # Calculate average reward for every 100 consecutive episodes
    if (episode + 1) % 100 == 0:
        avg_reward = np.mean(cum_rewards[episode - 99:episode + 1])
        avg_reward_intervals.append(avg_reward)
        print(f"Average reward for last 100 episodes ({episode - 99}-{episode}): {avg_reward}")

        # Shrink the exploration rate once per 100-episode block
        epsilon = max(0.01, epsilon * 0.4)  # Multiply by 0.4, never below 0.01

    # Update Q table using Monte Carlo
    update_Q_with_MC(episode_transitions, gamma)

# Display the total reward for the last episode
print(f"Total reward for the final episode: {total_reward}")

env.close()  # Close the environment when done


Average reward for last 100 episodes (0-99): 27.84
Average reward for last 100 episodes (100-199): 64.48
Average reward for last 100 episodes (200-299): 145.49
Average reward for last 100 episodes (300-399): 173.09
Average reward for last 100 episodes (400-499): 121.17
Average reward for last 100 episodes (500-599): 102.64
Average reward for last 100 episodes (600-699): 122.27
Average reward for last 100 episodes (700-799): 112.27
Average reward for last 100 episodes (800-899): 102.18
Average reward for last 100 episodes (900-999): 123.24
Total reward for the final episode: 144.0


# **Monte Carlo (3800 episodes)**

In [None]:
import random
import gym
import numpy as np

# Set up the CartPole environment ('CartPole-v1' from gym). The training loop
# below drives it through env.reset()/env.step() and closes it at the end.
env = gym.make('CartPole-v1')

# Helper function to discretize the state
def discretize(x):
    """Discretize an observation by scaling each component and truncating.

    Each of the four components of ``x`` is divided by a fixed step size
    (0.25, 0.25, 0.01, 0.1) and truncated toward zero to an integer.

    Args:
        x: Array-like with four numeric components.

    Returns:
        Tuple of four integers, usable as a dictionary key.
    """
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
    return tuple((x / np.array([0.25, 0.25, 0.01, 0.1])).astype(int))

# Create bins for discretization
def create_bins(i, num):
    """Return ``num + 1`` evenly spaced bin edges covering the interval ``i``.

    Args:
        i: Pair ``(low, high)`` with the interval bounds.
        num: Number of bins to split the interval into.

    Returns:
        NumPy array of edges starting at ``i[0]`` and ending at ``i[1]``
        (up to floating-point rounding).
    """
    low = i[0]
    span = i[1] - low
    steps = np.arange(num + 1)
    return steps * span / num + low

# (low, high) range used to bin each of the four observation components
ints = [(-5, 5), (-2, 2), (-0.5, 0.5), (-2, 2)]
# Number of bins for the matching entry of `ints`
nbins = [20, 20, 10, 10]
# One array of bin edges per observation component
bins = [create_bins(interval, count) for interval, count in zip(ints, nbins)]

def discretize_bins(x):
    """Map a four-component observation to a tuple of bin indices.

    Component ``k`` of ``x`` is located among the edges in ``bins[k]`` with
    ``np.digitize``.

    Args:
        x: Indexable observation with at least four components.

    Returns:
        Tuple of four bin indices, usable as a dictionary key.
    """
    indices = []
    for dim in range(4):
        indices.append(np.digitize(x[dim], bins[dim]))
    return tuple(indices)

# Action space: the two discrete actions passed to env.step()
actions = (0, 1)  # Left, right

# Hyperparameters
gamma = 0.9  # Discount factor applied when accumulating the return
epsilon = 0.9  # Initial exploration probability
epsilon_decay = 0.015  # Amount subtracted from epsilon at each decay step (floor 0.01)
epsilon_decay_interval = 20  # Number of episodes between two decay steps

# Per-run statistics
cum_rewards = []  # Total reward of every finished episode, in order
avg_reward_intervals = []  # Mean reward of each consecutive 100-episode block

# Stopping criteria for the training loop
num_episodes =6000  # Maximum number of episodes to run
threshold = 195  # Training stops once the latest 100-episode average reward exceeds this

# Function to choose action using epsilon-greedy policy
def choose_action(s, epsilon):
    """Pick an action for state ``s`` with an epsilon-greedy rule.

    Args:
        s: Discretized state (hashable).
        epsilon: Probability of picking a uniformly random action.

    Returns:
        A random element of ``actions`` when exploring, otherwise the action
        whose ``Q`` entry is largest (missing entries count as 0; ties go to
        the first action).
    """
    explore = random.uniform(0, 1) < epsilon
    if not explore:
        # Greedy branch: compare the stored value of every action
        action_values = [Q.get((s, candidate), 0) for candidate in actions]
        return actions[np.argmax(action_values)]
    return random.choice(actions)

# Initialize Q table: maps (state, action) -> estimated return. Missing
# entries are read as 0 through Q.get(..., 0).
Q = {}

# Function to update Q table using Monte Carlo
def update_Q_with_MC(episode_transitions, gamma):
    """Apply a first-visit Monte Carlo update to the global ``Q`` table.

    Walks the episode backwards accumulating the discounted return ``G`` and,
    at the first occurrence of each (state, action) pair in the episode, sets
    ``Q[(state, action)]`` to ``old_q + (G - old_q)``. There is no step size
    here, so the stored value is effectively replaced by this episode's
    return.

    Args:
        episode_transitions: List of ``(state, action, reward)`` tuples in the
            order they occurred.
        gamma: Discount factor.

    Returns:
        None. ``Q`` is modified in place.
    """
    # Record where each (state, action) pair first occurs in a single pass,
    # instead of re-scanning the episode prefix at every step.
    first_visit = {}
    for t, (s, a, _) in enumerate(episode_transitions):
        first_visit.setdefault((s, a), t)

    G = 0  # Discounted return from step t to the end of the episode
    for t in reversed(range(len(episode_transitions))):
        s, a, r = episode_transitions[t]
        G = r + gamma * G
        if first_visit[(s, a)] == t:  # Only the first visit updates Q
            old_q = Q.get((s, a), 0)
            Q[(s, a)] = old_q + (G - old_q)

# Main loop: run up to num_episodes episodes, recording each episode's
# (state, action, reward) transitions and applying a Monte Carlo update to Q
# once the episode has finished.

# The threshold check at the bottom of the loop reads avg_reward on every
# episode, but it is only computed every 100 episodes; start it at 0 so the
# first 99 episodes do not raise NameError.
avg_reward = 0

for episode in range(num_episodes):
    # NOTE(review): assumes the older gym API where reset() returns only the
    # observation and step() returns a 4-tuple — confirm the installed version.
    obs = env.reset()  # Initialize environment and get the first observation
    s = discretize_bins(obs)  # Discretize the state
    done = False
    total_reward = 0
    episode_transitions = []  # (state, action, reward) for every step of this episode

    # Run through an episode
    while not done:
        # Choose an action using epsilon-greedy policy
        a = choose_action(s, epsilon)

        # Perform the action and observe the result
        obs, rew, done, _ = env.step(a)
        total_reward += rew

        # Discretize the new observation
        s_new = discretize_bins(obs)

        # Store the transition taken from the state the action was chosen in
        episode_transitions.append((s, a, rew))

        # Update the current state
        s = s_new

    cum_rewards.append(total_reward)

    # Print reward at each episode
    print(f"Episode {episode + 1} - Total Reward: {total_reward}")

    # Calculate average reward for every 100 consecutive episodes
    if (episode + 1) % 100 == 0:
        avg_reward = np.mean(cum_rewards[episode - 99:episode + 1])
        avg_reward_intervals.append(avg_reward)
        print(f"Average reward for last 100 episodes ({episode - 99}-{episode}): {avg_reward}")

    # Decay epsilon every epsilon_decay_interval episodes, never below 0.01
    if (episode + 1) % epsilon_decay_interval == 0:
        epsilon = max(0.01, epsilon - epsilon_decay)

    # Update Q table using Monte Carlo
    update_Q_with_MC(episode_transitions, gamma)

    # Stop once the most recent 100-episode average exceeds the threshold
    if avg_reward > threshold:
        print(f"Threshold reached! avg_reward : {avg_reward}")
        break  # Exit the training loop

# Display the most recent 100-episode average reward
print(f"avg_reward for the final episode: {avg_reward}")

env.close()  # Close the environment when done

Episode 1 - Total Reward: 12.0
Episode 2 - Total Reward: 12.0
Episode 3 - Total Reward: 24.0
Episode 4 - Total Reward: 19.0
Episode 5 - Total Reward: 14.0
Episode 6 - Total Reward: 18.0
Episode 7 - Total Reward: 20.0
Episode 8 - Total Reward: 19.0
Episode 9 - Total Reward: 15.0
Episode 10 - Total Reward: 26.0
Episode 11 - Total Reward: 28.0
Episode 12 - Total Reward: 25.0
Episode 13 - Total Reward: 13.0
Episode 14 - Total Reward: 12.0
Episode 15 - Total Reward: 13.0
Episode 16 - Total Reward: 25.0
Episode 17 - Total Reward: 15.0
Episode 18 - Total Reward: 45.0
Episode 19 - Total Reward: 20.0
Episode 20 - Total Reward: 35.0
Episode 21 - Total Reward: 29.0
Episode 22 - Total Reward: 14.0
Episode 23 - Total Reward: 19.0
Episode 24 - Total Reward: 11.0
Episode 25 - Total Reward: 20.0
Episode 26 - Total Reward: 32.0
Episode 27 - Total Reward: 31.0
Episode 28 - Total Reward: 23.0
Episode 29 - Total Reward: 24.0
Episode 30 - Total Reward: 15.0
Episode 31 - Total Reward: 24.0
Episode 32 - Tota

# **SARSA**

In [None]:
import random
import gym
import numpy as np

# Set up the CartPole environment ('CartPole-v1' from gym). The training loop
# below drives it through env.reset()/env.step() and closes it at the end.
env = gym.make('CartPole-v1')

# Helper function to discretize the state
def discretize(x):
    """Discretize an observation by scaling each component and truncating.

    Each of the four components of ``x`` is divided by a fixed step size
    (0.25, 0.25, 0.01, 0.1) and truncated toward zero to an integer.

    Args:
        x: Array-like with four numeric components.

    Returns:
        Tuple of four integers, usable as a dictionary key.
    """
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
    return tuple((x / np.array([0.25, 0.25, 0.01, 0.1])).astype(int))

# Create bins for discretization
def create_bins(i, num):
    """Return ``num + 1`` evenly spaced bin edges covering the interval ``i``.

    Args:
        i: Pair ``(low, high)`` with the interval bounds.
        num: Number of bins to split the interval into.

    Returns:
        NumPy array of edges starting at ``i[0]`` and ending at ``i[1]``
        (up to floating-point rounding).
    """
    low = i[0]
    span = i[1] - low
    steps = np.arange(num + 1)
    return steps * span / num + low

# (low, high) range used to bin each of the four observation components
ints = [(-5, 5), (-2, 2), (-0.5, 0.5), (-2, 2)]
# Number of bins for the matching entry of `ints`
nbins = [20, 20, 10, 10]
# One array of bin edges per observation component
bins = [create_bins(interval, count) for interval, count in zip(ints, nbins)]

def discretize_bins(x):
    """Map a four-component observation to a tuple of bin indices.

    Component ``k`` of ``x`` is located among the edges in ``bins[k]`` with
    ``np.digitize``.

    Args:
        x: Indexable observation with at least four components.

    Returns:
        Tuple of four bin indices, usable as a dictionary key.
    """
    indices = []
    for dim in range(4):
        indices.append(np.digitize(x[dim], bins[dim]))
    return tuple(indices)

# Action space: the two discrete actions passed to env.step()
actions = (0, 1)  # Left, right

# Hyperparameters
gamma = 0.9  # Discount factor applied to the next state-action value
epsilon =0.9  # Initial exploration probability
epsilon_decay = 0.02  # Amount subtracted from epsilon at each decay step (floor 0.01)
epsilon_decay_interval = 50  # Number of episodes between two decay steps
alpha = 0.1  # Learning rate (step size of the SARSA update)

# Per-run statistics
cum_rewards = []  # Total reward of every finished episode, in order
avg_reward_intervals = []  # Mean reward of each consecutive 100-episode block

# Stopping criteria for the training loop
num_episodes = 3000  # Maximum number of episodes to run
threshold = 195  # Training stops once the latest 100-episode average reward exceeds this

# Function to choose action using epsilon-greedy policy
def choose_action(s, epsilon):
    """Pick an action for state ``s`` with an epsilon-greedy rule.

    Args:
        s: Discretized state (hashable).
        epsilon: Probability of picking a uniformly random action.

    Returns:
        A random element of ``actions`` when exploring, otherwise the action
        whose ``Q`` entry is largest (missing entries count as 0; ties go to
        the first action).
    """
    explore = random.uniform(0, 1) < epsilon
    if not explore:
        # Greedy branch: compare the stored value of every action
        action_values = [Q.get((s, candidate), 0) for candidate in actions]
        return actions[np.argmax(action_values)]
    return random.choice(actions)

# Initialize Q table: maps (state, action) -> estimated return. Missing
# entries are read as 0 through Q.get(..., 0).
Q = {}

# Function to update Q table using SARSA
def update_Q_with_SARSA(s, a, r, s_new, a_new):
    """Apply one SARSA update to the global ``Q`` table.

    Moves ``Q[(s, a)]`` toward ``r + gamma * Q[(s_new, a_new)]`` by a fraction
    ``alpha``. ``alpha`` and ``gamma`` are read from module globals, and
    missing ``Q`` entries count as 0.

    Args:
        s: State the action was taken in.
        a: Action taken.
        r: Reward received.
        s_new: State reached after the action.
        a_new: Action chosen for ``s_new``.

    Returns:
        None. ``Q`` is modified in place.
    """
    key = (s, a)
    current = Q.get(key, 0)
    bootstrap = Q.get((s_new, a_new), 0)
    # NOTE(review): the bootstrap value is used even when s_new ended the
    # episode; the caller passes no done flag — confirm this is intended.
    td_target = r + gamma * bootstrap
    Q[key] = current + alpha * (td_target - current)

# Main loop: run up to num_episodes episodes, updating Q with SARSA after
# every environment step.

# The threshold check at the bottom of the loop reads avg_reward on every
# episode, but it is only computed every 100 episodes; start it at 0 so the
# first 99 episodes do not raise NameError.
avg_reward = 0

for episode in range(num_episodes):
    # NOTE(review): assumes the older gym API where reset() returns only the
    # observation and step() returns a 4-tuple — confirm the installed version.
    obs = env.reset()  # Initialize environment and get the first observation
    s = discretize_bins(obs)  # Discretize the state
    done = False
    total_reward = 0

    # Choose the first action using epsilon-greedy policy
    a = choose_action(s, epsilon)

    # Run through an episode
    while not done:
        # Perform the action and observe the result
        obs, rew, done, _ = env.step(a)
        total_reward += rew

        # Discretize the new observation
        s_new = discretize_bins(obs)

        # Choose the next action using epsilon-greedy policy; SARSA updates
        # toward the value of the action that will actually be taken next
        a_new = choose_action(s_new, epsilon)

        # Update Q table using SARSA
        update_Q_with_SARSA(s, a, rew, s_new, a_new)

        # Update the current state and action
        s = s_new
        a = a_new

    cum_rewards.append(total_reward)

    # Print reward at each episode
    print(f"Episode {episode + 1} - Total Reward: {total_reward}")

    # Calculate average reward for every 100 consecutive episodes
    if (episode + 1) % 100 == 0:
        avg_reward = np.mean(cum_rewards[episode - 99:episode + 1])
        avg_reward_intervals.append(avg_reward)
        print(f"Average reward for last 100 episodes ({episode - 99}-{episode}): {avg_reward}")

    # Decay epsilon every epsilon_decay_interval episodes, never below 0.01
    if (episode + 1) % epsilon_decay_interval == 0:
        epsilon = max(0.01, epsilon - epsilon_decay)

    # Stop once the most recent 100-episode average exceeds the threshold
    if avg_reward > threshold:
        print(f"Threshold reached! avg_reward : {avg_reward}")
        break  # Exit the training loop

# Display the most recent 100-episode average reward
print(f"avg_reward for the final 100 episode: {avg_reward}")

env.close()  # Close the environment when done

Episode 1 - Total Reward: 19.0
Episode 2 - Total Reward: 14.0
Episode 3 - Total Reward: 44.0
Episode 4 - Total Reward: 21.0
Episode 5 - Total Reward: 11.0
Episode 6 - Total Reward: 13.0
Episode 7 - Total Reward: 15.0
Episode 8 - Total Reward: 28.0
Episode 9 - Total Reward: 20.0
Episode 10 - Total Reward: 36.0
Episode 11 - Total Reward: 18.0
Episode 12 - Total Reward: 24.0
Episode 13 - Total Reward: 17.0
Episode 14 - Total Reward: 48.0
Episode 15 - Total Reward: 23.0
Episode 16 - Total Reward: 13.0
Episode 17 - Total Reward: 27.0
Episode 18 - Total Reward: 30.0
Episode 19 - Total Reward: 10.0
Episode 20 - Total Reward: 14.0
Episode 21 - Total Reward: 19.0
Episode 22 - Total Reward: 58.0
Episode 23 - Total Reward: 39.0
Episode 24 - Total Reward: 21.0
Episode 25 - Total Reward: 27.0
Episode 26 - Total Reward: 27.0
Episode 27 - Total Reward: 16.0
Episode 28 - Total Reward: 50.0
Episode 29 - Total Reward: 36.0
Episode 30 - Total Reward: 20.0
Episode 31 - Total Reward: 24.0
Episode 32 - Tota

# **Q-learning (1000 episodes)**

In [None]:
import random
import gym
import numpy as np

# Set up the CartPole environment ('CartPole-v1' from gym). The training loop
# below drives it through env.reset()/env.step() and closes it at the end.
env = gym.make('CartPole-v1')

# Helper function to discretize the state
def discretize(x):
    """Discretize an observation by scaling each component and truncating.

    Each of the four components of ``x`` is divided by a fixed step size
    (0.25, 0.25, 0.01, 0.1) and truncated toward zero to an integer.

    Args:
        x: Array-like with four numeric components.

    Returns:
        Tuple of four integers, usable as a dictionary key.
    """
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
    return tuple((x / np.array([0.25, 0.25, 0.01, 0.1])).astype(int))

# Create bins for discretization
def create_bins(i, num):
    """Return ``num + 1`` evenly spaced bin edges covering the interval ``i``.

    Args:
        i: Pair ``(low, high)`` with the interval bounds.
        num: Number of bins to split the interval into.

    Returns:
        NumPy array of edges starting at ``i[0]`` and ending at ``i[1]``
        (up to floating-point rounding).
    """
    low = i[0]
    span = i[1] - low
    steps = np.arange(num + 1)
    return steps * span / num + low

# (low, high) range used to bin each of the four observation components
ints = [(-5, 5), (-2, 2), (-0.5, 0.5), (-2, 2)]
# Number of bins for the matching entry of `ints`
nbins = [20, 20, 10, 10]
# One array of bin edges per observation component
bins = [create_bins(interval, count) for interval, count in zip(ints, nbins)]

def discretize_bins(x):
    """Map a four-component observation to a tuple of bin indices.

    Component ``k`` of ``x`` is located among the edges in ``bins[k]`` with
    ``np.digitize``.

    Args:
        x: Indexable observation with at least four components.

    Returns:
        Tuple of four bin indices, usable as a dictionary key.
    """
    indices = []
    for dim in range(4):
        indices.append(np.digitize(x[dim], bins[dim]))
    return tuple(indices)

# Action space and Q-table. Q maps (state, action) -> estimated return;
# missing entries are read as 0 through Q.get(..., 0).
actions = (0, 1)  # Left, right
Q = {}

# Hyperparameters
alpha = 0.3  # Learning rate (step size of the Q-learning update)
gamma = 0.9  # Discount factor applied to the best next-state value
epsilon = 0.8  # Initial exploration probability; the main loop multiplies it by 0.4 every 100 episodes (floor 0.01)

# Function to choose an action using epsilon-greedy policy
def choose_action(state, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    Args:
        state: Discretized state (hashable).
        epsilon: Probability of picking a uniformly random action.

    Returns:
        A random element of ``actions`` when exploring, otherwise the action
        whose ``Q`` entry is largest (missing entries count as 0; ties go to
        the first action).
    """
    explore = random.random() < epsilon
    if not explore:
        # Greedy branch: compare the stored value of every action
        action_values = [Q.get((state, candidate), 0) for candidate in actions]
        return actions[np.argmax(action_values)]
    return random.choice(actions)

# Per-run statistics
cum_rewards = []  # Total reward of every finished episode, in order
avg_reward_intervals = []  # Mean reward of each consecutive 100-episode block
avg_reward=0  # Latest 100-episode average; 0 until the first block completes

# Stopping criteria for the training loop
num_episodes = 1000  # Maximum number of episodes to run
threshold = 600  # Training stops early once a single episode's total reward exceeds this
# NOTE(review): CartPole-v1 is normally capped at 500 steps per episode, so a
# per-episode reward above 600 looks unreachable — confirm the intended value.

# Main loop: run up to num_episodes episodes, applying the Q-learning update
# after every environment step.
for episode in range(num_episodes):
    # NOTE(review): assumes the older gym API where reset() returns only the
    # observation and step() returns a 4-tuple — confirm the installed version.
    obs = env.reset()  # Initialize environment and get the first observation
    s = discretize_bins(obs)  # Discretize the state
    done = False
    total_reward = 0

    # Run through an episode
    while not done:
        # Choose an action using epsilon-greedy policy
        a = choose_action(s, epsilon)

        # Perform the action and observe the result
        obs, rew, done, _ = env.step(a)
        total_reward += rew

        # Discretize the new observation
        s_new = discretize_bins(obs)

        # Q-learning update rule. The `a` inside the comprehension is scoped
        # to the comprehension, so the chosen action `a` is unchanged below.
        max_next_q = max([Q.get((s_new, a), 0) for a in actions])  # Get the max Q-value for the next state
        old_q = Q.get((s, a), 0)  # Get the current Q-value
        Q[(s, a)] = old_q + alpha * (rew + gamma * max_next_q - old_q)  # Move Q toward the one-step target
        # NOTE(review): max_next_q is also used when `done` is True, i.e. the
        # target bootstraps from the final state — confirm this is intended.

        # Update the current state
        s = s_new

    cum_rewards.append(total_reward)

    # Print average reward after every 100 episodes
    if (episode + 1) % 100 == 0:
        avg_reward = np.mean(cum_rewards[max(0, episode - 99):episode + 1])
        avg_reward_intervals.append(avg_reward)
        print(f"Average reward for last 100 episodes (Episode {episode+1}): {avg_reward}")

        # Shrink the exploration rate once per 100-episode block
        epsilon = max(0.01, epsilon * 0.4)  # Multiply by 0.4, never below 0.01

    # Check if the threshold is reached
    if total_reward > threshold:
        print(f"Threshold reached! Total reward: {total_reward}")
        break  # Exit the training loop

# Display the total reward for the last episode
print(f"Total reward for the final episode: {total_reward}")

env.close()  # Close the environment when done

Average reward for last 100 episodes (Episode 100): 25.67
Average reward for last 100 episodes (Episode 200): 43.31
Average reward for last 100 episodes (Episode 300): 65.37
Average reward for last 100 episodes (Episode 400): 80.89
Average reward for last 100 episodes (Episode 500): 96.12
Average reward for last 100 episodes (Episode 600): 121.09
Average reward for last 100 episodes (Episode 700): 106.64
Average reward for last 100 episodes (Episode 800): 111.88
Average reward for last 100 episodes (Episode 900): 128.59
Average reward for last 100 episodes (Episode 1000): 127.62
Total reward for the final episode: 122.0


# **Q-learning (2700 episodes)**

In [None]:
import random
import gym
import numpy as np

# Set up the CartPole environment ('CartPole-v1' from gym). The training loop
# below drives it through env.reset()/env.step() and closes it at the end.
env = gym.make('CartPole-v1')

# Helper function to discretize the state
def discretize(x):
    """Discretize an observation by scaling each component and truncating.

    Each of the four components of ``x`` is divided by a fixed step size
    (0.25, 0.25, 0.01, 0.1) and truncated toward zero to an integer.

    Args:
        x: Array-like with four numeric components.

    Returns:
        Tuple of four integers, usable as a dictionary key.
    """
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
    return tuple((x / np.array([0.25, 0.25, 0.01, 0.1])).astype(int))

# Create bins for discretization
def create_bins(i, num):
    """Return ``num + 1`` evenly spaced bin edges covering the interval ``i``.

    Args:
        i: Pair ``(low, high)`` with the interval bounds.
        num: Number of bins to split the interval into.

    Returns:
        NumPy array of edges starting at ``i[0]`` and ending at ``i[1]``
        (up to floating-point rounding).
    """
    low = i[0]
    span = i[1] - low
    steps = np.arange(num + 1)
    return steps * span / num + low

# (low, high) range used to bin each of the four observation components
ints = [(-5, 5), (-2, 2), (-0.5, 0.5), (-2, 2)]
# Number of bins for the matching entry of `ints`
nbins = [20, 20, 10, 10]
# One array of bin edges per observation component
bins = [create_bins(interval, count) for interval, count in zip(ints, nbins)]

def discretize_bins(x):
    """Map a four-component observation to a tuple of bin indices.

    Component ``k`` of ``x`` is located among the edges in ``bins[k]`` with
    ``np.digitize``.

    Args:
        x: Indexable observation with at least four components.

    Returns:
        Tuple of four bin indices, usable as a dictionary key.
    """
    indices = []
    for dim in range(4):
        indices.append(np.digitize(x[dim], bins[dim]))
    return tuple(indices)

# Action space and Q-table. Q maps (state, action) -> estimated return;
# missing entries are read as 0 through Q.get(..., 0).
actions = (0, 1)  # Left, right
Q = {}

# Hyperparameters
alpha = 0.2  # Learning rate (step size of the Q-learning update)
gamma = 0.9  # Discount factor applied to the best next-state value
epsilon = 0.95  # Initial exploration probability; the main loop multiplies it by 0.3 every 100 episodes (floor 0.01)

# Function to choose an action using epsilon-greedy policy
def choose_action(state, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    Args:
        state: Discretized state (hashable).
        epsilon: Probability of picking a uniformly random action.

    Returns:
        A random element of ``actions`` when exploring, otherwise the action
        whose ``Q`` entry is largest (missing entries count as 0; ties go to
        the first action).
    """
    explore = random.random() < epsilon
    if not explore:
        # Greedy branch: compare the stored value of every action
        action_values = [Q.get((state, candidate), 0) for candidate in actions]
        return actions[np.argmax(action_values)]
    return random.choice(actions)

# Per-run statistics
cum_rewards = []  # Total reward of every finished episode, in order
avg_reward_intervals = []  # Mean reward of each consecutive 100-episode block
avg_reward=0  # Latest 100-episode average; 0 until the first block completes

# Stopping criteria for the training loop
num_episodes = 9000  # Maximum number of episodes to run
threshold =195  # Training stops once the latest 100-episode average reward exceeds this

# Main loop: run up to num_episodes episodes, applying the Q-learning update
# after every environment step.
for episode in range(num_episodes):
    # NOTE(review): assumes the older gym API where reset() returns only the
    # observation and step() returns a 4-tuple — confirm the installed version.
    obs = env.reset()  # Initialize environment and get the first observation
    s = discretize_bins(obs)  # Discretize the state
    done = False
    total_reward = 0

    # Run through an episode
    while not done:
        # Choose an action using epsilon-greedy policy
        a = choose_action(s, epsilon)

        # Perform the action and observe the result
        obs, rew, done, _ = env.step(a)
        total_reward += rew

        # Discretize the new observation
        s_new = discretize_bins(obs)

        # Q-learning update rule. The `a` inside the comprehension is scoped
        # to the comprehension, so the chosen action `a` is unchanged below.
        max_next_q = max([Q.get((s_new, a), 0) for a in actions])  # Get the max Q-value for the next state
        old_q = Q.get((s, a), 0)  # Get the current Q-value
        Q[(s, a)] = old_q + alpha * (rew + gamma * max_next_q - old_q)  # Move Q toward the one-step target
        # NOTE(review): max_next_q is also used when `done` is True, i.e. the
        # target bootstraps from the final state — confirm this is intended.

        # Update the current state
        s = s_new

    cum_rewards.append(total_reward)

    # Print average reward after every 100 episodes
    if (episode + 1) % 100 == 0:
        avg_reward = np.mean(cum_rewards[max(0, episode - 99):episode + 1])
        avg_reward_intervals.append(avg_reward)
        print(f"Average reward for last 100 episodes (Episode {episode+1}): {avg_reward}")

        # Shrink the exploration rate once per 100-episode block
        epsilon = max(0.01, epsilon * 0.3)  # Multiply by 0.3, never below 0.01

    # Stop once the most recent 100-episode average exceeds the threshold.
    # avg_reward only changes every 100 episodes, so this can only trigger
    # right after a block average has been computed.
    if avg_reward > threshold:
        print(f"Threshold reached! avg_reward: {avg_reward}")
        break  # Exit the training loop


env.close()  # Close the environment when done

Average reward for last 100 episodes (Episode 100): 22.04
Average reward for last 100 episodes (Episode 200): 48.72
Average reward for last 100 episodes (Episode 300): 94.08
Average reward for last 100 episodes (Episode 400): 118.03
Average reward for last 100 episodes (Episode 500): 126.02
Average reward for last 100 episodes (Episode 600): 135.88
Average reward for last 100 episodes (Episode 700): 135.12
Average reward for last 100 episodes (Episode 800): 177.37
Average reward for last 100 episodes (Episode 900): 150.2
Average reward for last 100 episodes (Episode 1000): 131.57
Average reward for last 100 episodes (Episode 1100): 139.13
Average reward for last 100 episodes (Episode 1200): 128.26
Average reward for last 100 episodes (Episode 1300): 132.59
Average reward for last 100 episodes (Episode 1400): 137.17
Average reward for last 100 episodes (Episode 1500): 140.44
Average reward for last 100 episodes (Episode 1600): 124.46
Average reward for last 100 episodes (Episode 1700): 