In [None]:
# Install the base Gymnasium package via a shell escape.
!pip install gymnasium


In [None]:
# Install Gymnasium together with its box2D extra; the backslashes stop the
# shell from interpreting the square brackets itself.
!pip3 install gymnasium\[box2D\]

In [None]:
# Install swig through Homebrew, then Gymnasium with the box2D extra (quoted so
# the shell leaves the brackets alone).
# NOTE(review): presumably swig is needed to build the Box2D bindings — confirm.
!brew install swig
!pip install "gymnasium[box2D]"


In [6]:
import gymnasium as gym
import numpy as np

class LinearValueFunction:
    """State-value estimate that is linear in the state vector: V(s) = w . s."""

    def __init__(self, state_dim, learning_rate=0.01):
        """Start from an all-zero weight vector of length ``state_dim``.

        Args:
            state_dim: Number of components in a state vector.
            learning_rate: Step size applied in :meth:`update`.
        """
        self.learning_rate = learning_rate
        self.weights = np.zeros(state_dim)

    def predict(self, state):
        """Return the dot product of the weight vector with ``state``."""
        return np.dot(self.weights, state)

    def update(self, state, target):
        """Shift the weights toward ``target`` for the given ``state``.

        The weights are modified in place by
        ``learning_rate * (target - prediction) * state``.
        """
        residual = target - self.predict(state)
        step = self.learning_rate * residual * state
        self.weights += step

def linear_value_function_approximation(env, num_episodes=50000, gamma=0.9, learning_rate=0.01):
    """Estimate the value of a uniformly random policy with a linear value function.

    Every step samples an action from ``env.action_space``, forms the one-step
    target ``reward + gamma * V(next_state)`` and updates the weights toward it.

    Args:
        env: Gymnasium-style environment whose ``reset()`` returns
            ``(state, info)`` and whose ``step()`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        num_episodes: Number of episodes to run.
        gamma: Discount factor applied to the bootstrapped next-state value.
        learning_rate: Step size passed to ``LinearValueFunction``.

    Returns:
        Tuple ``(value_function, rewards)``: the trained ``LinearValueFunction``
        and a list holding each episode's total reward.
    """
    state_dim = env.observation_space.shape[0]
    value_function = LinearValueFunction(state_dim, learning_rate)
    rewards = []

    for episode in range(num_episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False

        while not done:
            action = env.action_space.sample()
            next_state, reward, terminated, truncated, _ = env.step(action)
            # The episode loop stops on either signal...
            done = terminated or truncated
            total_reward += reward

            # ...but only a true termination means there is no future return.
            # A truncated episode (e.g. time limit) ends in an ordinary state,
            # so its value must still be bootstrapped into the target.
            target = reward
            if not terminated:
                target += gamma * value_function.predict(next_state)

            value_function.update(state, target)
            state = next_state

        rewards.append(total_reward)
        # Progress report every 1000 episodes.
        if (episode + 1) % 1000 == 0:
            print(f'Episode {episode + 1}, Total Reward: {total_reward}')

    return value_function, rewards

# Train without a render window: render_mode="human" draws every step on screen,
# which makes a 50,000-episode run impractically slow (this cell previously had
# to be interrupted). Create a separate env with render_mode="human" only when
# you want to watch a handful of episodes.
env_cont = gym.make("LunarLander-v2")
value_function, rewards_cont = linear_value_function_approximation(env_cont)
# Release the environment's resources once training is finished.
env_cont.close()


KeyboardInterrupt: 

In [None]:
# def create_bins(num_bins, lower_bound, upper_bound):
#     return np.linspace(lower_bound, upper_bound, num_bins + 1)[1:-1]

# def digitize_state(state, bins):
#     digitized_state = []
#     for i in range(len(state)):
#         digitized_state.append(np.digitize(state[i], bins[i]))
#     return tuple(digitized_state)

# class QLearningAgent:
#     def __init__(self, state_bins, action_space, learning_rate=0.1, discount_factor=0.99, exploration_rate=1.0):
#         self.state_bins = state_bins
#         self.action_space = action_space
#         self.learning_rate = learning_rate
#         self.discount_factor = discount_factor
#         self.exploration_rate = exploration_rate
#         # self.exploration_decay = exploration_decay
#         # self.min_exploration_rate = min_exploration_rate
#         self.q_table = np.zeros(state_bins + (action_space.n,))

#     def choose_action(self, state):
#         if np.random.rand() < self.exploration_rate:
#             return self.action_space.sample()
#         return np.argmax(self.q_table[state])

#     def update_q_value(self, state, action, reward, next_state):
#         best_next_action = np.argmax(self.q_table[next_state])
#         td_target = reward + self.discount_factor * self.q_table[next_state][best_next_action]
#         td_error = td_target - self.q_table[state][action]
#         self.q_table[state][action] += self.learning_rate * td_error

#     # def decay_exploration_rate(self):
#     #     if self.exploration_rate > self.min_exploration_rate:
#     #         self.exploration_rate *= self.exploration_decay

# def q_learning(env, num_episodes=50000, num_bins = 24, learning_rate=0.1, gamma=0.9):
#     bins = [
#         create_bins(num_bins, -1, 1),    # x position
#         create_bins(num_bins, -1, 1),    # y position
#         create_bins(num_bins, -1, 1),    # x velocity
#         create_bins(num_bins, -1, 1),    # y velocity
#         create_bins(num_bins, -1, 1),    # angle
#         create_bins(num_bins, -1, 1),    # angular velocity
#         create_bins(num_bins, 0, 1),     # left leg contact
#         create_bins(num_bins, 0, 1)      # right leg contact
#     ]
#     agent = QLearningAgent(tuple([num_bins] * len(bins)), env.action_space, learning_rate, gamma)
#     rewards = []

#     for episode in range(num_episodes):
#         state, _ = env.reset()
#         state = digitize_state(state, bins)
#         total_reward = 0
#         done = False

#         while not done:
#             action = agent.choose_action(state)
#             next_state, reward, terminated, truncated, _ = env.step(action)
#             done = terminated or truncated
#             next_state = digitize_state(next_state, bins)
#             total_reward += reward

#             agent.update_q_value(state, action, reward, next_state)
#             state = next_state
#         rewards.append(total_reward)
#         if (episode + 1) % 1000 == 0:
#             print(f'Episode {episode + 1}, Total Reward: {total_reward}')

#     return agent, rewards

In [3]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Environment ------------------------------------------------------------
# LunarLander instance used by the tabular Q-learning cell below.
env = gym.make("LunarLander-v2")
env.reset()

# --- Hyperparameters --------------------------------------------------------
episodes = 50_000  # number of training episodes
alpha = 0.1        # learning rate of the Q-table update
gamma = 0.9        # discount factor
epsilon = 0.1      # probability of taking a random exploratory action

# --- Per-episode results ----------------------------------------------------
# One slot per episode; filled in by the training loop.
rewards = np.zeros(shape=episodes)


In [7]:

# Bin edges for discretizing one dimension of the state space
def create_bins(num_bins, lower_bound, upper_bound):
    """Return the interior edges splitting [lower_bound, upper_bound] into equal bins.

    ``num_bins + 1`` evenly spaced boundaries are generated and the two outer
    ones are dropped, leaving ``num_bins - 1`` edges.
    """
    boundaries = np.linspace(lower_bound, upper_bound, num_bins + 1)
    interior_edges = boundaries[1:-1]
    return interior_edges

# Convert a continuous state into a tuple of bin indices
def digitize_state(state, bins):
    """Map every component of ``state`` to its bin index.

    Component ``i`` is digitized against ``bins[i]``; the indices are returned
    as a tuple so the result can index a multi-dimensional table directly.
    """
    return tuple(
        np.digitize(component, bins[dim]) for dim, component in enumerate(state)
    )

# Number of bins used for every state dimension
num_bins = 4

# One array of bin edges per observation component, in observation order:
#   x position, y position, x velocity, y velocity, angle, angular velocity
#   are binned over [-1, 1]; left and right leg contact over [0, 1].
bins = [
    create_bins(num_bins, low, high)
    for low, high in [(-1, 1)] * 6 + [(0, 1)] * 2
]

# Q-table: one axis of length num_bins per state dimension, plus a final axis
# with one entry per action.
num_columns = env.action_space.n
qtable = np.zeros((num_bins,) * len(bins) + (num_columns,))

# Function to run each episode
def episode():
    """Run one epsilon-greedy tabular Q-learning episode.

    Reads the module-level ``env``, ``bins``, ``epsilon``, ``alpha`` and
    ``gamma`` and updates the module-level ``qtable`` in place.

    Returns:
        The episode's reward averaged over its steps (total reward divided by
        the number of steps taken) -- not the episode total.
    """
    total_reward = 0
    steps = 0
    state = digitize_state(env.reset()[0], bins)
    done = False
    while not done:
        steps += 1
        # Choose action by epsilon-greedy policy. The greedy action is only
        # used when the best Q-value of the state is positive; otherwise a
        # random action is sampled.
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        elif np.max(qtable[state]) > 0:
            action = np.argmax(qtable[state])
        else:
            action = env.action_space.sample()

        new_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated  # Becomes true when episode terminates or gets truncated
        total_reward += reward
        new_state = digitize_state(new_state, bins)

        # Update Q-table. A terminal transition has no future return, so the
        # bootstrap term is dropped. Without this, values stored in the
        # discretised cell shared with non-terminal states leak into the
        # target. A truncated episode still bootstraps.
        future_value = 0.0 if terminated else np.max(qtable[new_state])
        td_target = reward + gamma * future_value
        qtable[state + (action,)] += alpha * (td_target - qtable[state + (action,)])

        state = new_state

    # The loop body always runs at least once, so steps >= 1 here.
    return total_reward / steps

# Run the episodes
# Each call to episode() updates the shared qtable in place and returns that
# episode's per-step average reward, which is stored at the episode's index.
for epi in range(episodes):
    rewards[epi] = episode()

# Show the Q-table after training.
print(qtable)

# Plot reward vs episode curves
def plot_rewards(rewards, title):
    """Draw ``rewards`` as a line plot against episode index.

    Args:
        rewards: Sequence of per-episode values to plot.
        title: Text shown above the plot.
    """
    _, ax = plt.subplots(figsize=(10, 5))
    ax.plot(rewards)
    ax.set_title(title)
    ax.set_xlabel('Episode')
    ax.set_ylabel('Total Reward')
    plt.show()

# NOTE(review): `rewards` holds the values returned by episode(), i.e. the reward
# averaged per step, so the 'Total Reward' axis label set by plot_rewards does
# not match what is plotted here.
plot_rewards(rewards, 'Reward vs Episode (Discrete State, Q-Learning)')




In [None]:
import matplotlib.pyplot as plt

def plot_rewards(rewards, title):
    """Draw ``rewards`` as a line plot against episode index.

    Args:
        rewards: Sequence of per-episode values to plot.
        title: Text shown above the plot.
    """
    _, ax = plt.subplots(figsize=(10, 5))
    ax.plot(rewards)
    ax.set_title(title)
    ax.set_xlabel('Episode')
    ax.set_ylabel('Total Reward')
    plt.show()

# rewards_cont is produced by the linear value-function training cell above.
plot_rewards(rewards_cont, 'Reward vs Episode (Continuous State, Linear Value Function)')
# rewards_disc is only assigned in the linear Q-learning cell further down, so
# that cell has to be run before this line.
# NOTE(review): the title says "Discrete State", but rewards_disc comes from
# q_learning, which uses LinearQFunction on the raw state -- confirm which
# series was meant here.
plot_rewards(rewards_disc, 'Reward vs Episode (Discrete State, Q-Learning)')


In [None]:
import seaborn as sns

def plot_heatmap(weights, title):
    """Show ``weights`` as a single-row annotated heatmap.

    Args:
        weights: Array of weights; it is reshaped to one row for display.
        title: Text shown above the heatmap.
    """
    _, ax = plt.subplots(figsize=(10, 5))
    single_row = weights.reshape(1, -1)
    sns.heatmap(single_row, annot=True, fmt=".2f", cmap="viridis", ax=ax)
    ax.set_title(title)
    plt.show()

# value_function is the LinearValueFunction returned by the training cell above;
# its weight vector has one entry per state component.
plot_heatmap(value_function.weights, 'Linear Value Function Weights')


In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

class LinearQFunction:
    """Action-value estimate linear in the state, with one weight column per action."""

    def __init__(self, state_dim, action_dim, learning_rate=0.01):
        """Start from an all-zero ``(state_dim, action_dim)`` weight matrix.

        Args:
            state_dim: Number of components in a state vector.
            action_dim: Number of discrete actions.
            learning_rate: Step size applied in :meth:`update`.
        """
        self.learning_rate = learning_rate
        self.weights = np.zeros((state_dim, action_dim))

    def predict(self, state):
        """Return the Q-values of all actions: the dot product of ``state`` with the weights."""
        return np.dot(state, self.weights)

    def update(self, state, action, target):
        """Shift the weight column of ``action`` toward ``target`` for ``state``.

        Only that column is modified, in place, by
        ``learning_rate * (target - Q(state, action)) * state``.
        """
        current_q = self.predict(state)[action]
        residual = target - current_q
        self.weights[:, action] += self.learning_rate * residual * state

class QLearningAgent:
    """Epsilon-greedy Q-learning agent whose Q-values come from a ``LinearQFunction``.

    ``exploration_rate`` is never changed by this class, so with the default of
    1.0 every action returned by :meth:`choose_action` is random.
    """

    def __init__(self, state_dim, action_dim, learning_rate=0.1, discount_factor=0.99, exploration_rate=1.0):
        """Store the hyperparameters and create the underlying linear Q-function."""
        self.state_dim, self.action_dim = state_dim, action_dim
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.q_function = LinearQFunction(state_dim, action_dim, learning_rate)

    def choose_action(self, state):
        """Return a random action with probability ``exploration_rate``, else the greedy one."""
        explore = np.random.rand() < self.exploration_rate
        if explore:
            return np.random.randint(self.action_dim)
        q_values = self.q_function.predict(state)
        return np.argmax(q_values)

    def update_q_value(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the transition.

        The target is ``reward + discount_factor * max_a Q(next_state, a)``;
        the bootstrapped part is multiplied by ``not done`` so it vanishes when
        ``done`` is true.
        """
        next_q_values = self.q_function.predict(next_state)
        best_next_q = np.max(next_q_values)
        discounted_future = self.discount_factor * best_next_q * (not done)
        self.q_function.update(state, action, reward + discounted_future)

def q_learning(env, num_episodes=50000, learning_rate=0.1, gamma=0.9):
    """Train a ``QLearningAgent`` with a linear Q-function on ``env``.

    Args:
        env: Gymnasium-style environment with a discrete action space, whose
            ``reset()`` returns ``(state, info)`` and whose ``step()`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        num_episodes: Number of episodes to run.
        learning_rate: Step size passed to the agent.
        gamma: Discount factor passed to the agent.

    Returns:
        Tuple ``(agent, rewards)``: the trained agent and a list holding each
        episode's total reward.
    """
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = QLearningAgent(state_dim, action_dim, learning_rate, gamma)
    rewards = []

    for episode in range(num_episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False

        while not done:
            action = agent.choose_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            # Either signal ends the episode loop.
            done = terminated or truncated
            total_reward += reward

            # Only a true termination removes the bootstrapped future value.
            # A truncated episode (e.g. time limit) ends in an ordinary state,
            # so the agent is told "not terminal" in that case.
            agent.update_q_value(state, action, reward, next_state, terminated)
            state = next_state
        rewards.append(total_reward)
        # Progress report every 1000 episodes.
        if (episode + 1) % 1000 == 0:
            print(f'Episode {episode + 1}, Total Reward: {total_reward}')

    return agent, rewards

# Train the linear Q-learning agent on a LunarLander instance created without a
# render mode. Despite the "_disc" suffix, q_learning works on the raw
# observation vector (no discretization).
env_disc = gym.make("LunarLander-v2")
agent, rewards_disc = q_learning(env_disc)

# Plot reward vs episode curves
def plot_rewards(rewards, title):
    """Draw ``rewards`` as a line plot against episode index.

    Args:
        rewards: Sequence of per-episode values to plot.
        title: Text shown above the plot.
    """
    _, ax = plt.subplots(figsize=(10, 5))
    ax.plot(rewards)
    ax.set_title(title)
    ax.set_xlabel('Episode')
    ax.set_ylabel('Total Reward')
    plt.show()

# rewards_disc holds the total reward of each episode returned by q_learning.
plot_rewards(rewards_disc, 'Reward vs Episode (Continuous State, Linear Q-learning)')

# Heatmap of the Q-function weights for a sample action (e.g., action 0)
# Note: this three-argument version replaces the two-argument plot_heatmap
# defined in an earlier cell once this cell has run.
def plot_heatmap(weights, action, title):
    """Show the weight column of one action as a single-row annotated heatmap.

    Args:
        weights: 2-D weight array indexed as ``weights[:, action]``.
        action: Index of the action whose column is displayed.
        title: Text shown above the heatmap.
    """
    _, ax = plt.subplots(figsize=(10, 5))
    action_row = weights[:, action].reshape(1, -1)
    sns.heatmap(action_row, annot=True, fmt=".2f", cmap="viridis", ax=ax)
    ax.set_title(title)
    plt.show()

# Visualize the learned weights of action 0 from the agent's linear Q-function.
plot_heatmap(agent.q_function.weights, 0, 'Linear Q-learning Weights for Action 0')


In [5]:
# Release the environment bound to `env`. Only `env` is closed in this cell;
# other environments created in the notebook are not touched here.
env.close()