In [1]:
from copy import deepcopy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gymnasium as gym
from gymnasium.core import Env
from collections import deque, OrderedDict
from tqdm import tqdm
from typing import Dict, Optional, Callable, List, Generator, Tuple, Literal, overload, Any
import math
#from env_sets import EnvSet, possible_envs
from scipy.stats import norm, uniform
import wandb
import random

In [2]:
class NormalizeEnv(gym.Env):
    def __init__(self, env):
        self.env = env
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space

        # Running mean and variance for normalization
        self.obs_mean = np.zeros(self.observation_space.shape)
        self.obs_var = np.ones(self.observation_space.shape)
        self.epsilon = 1e-8
        self.clip_obs = 10.0
        self.count = 0

    def update_obs_stats(self, obs):
        # Increment count
        self.count += 1
        # Update mean and variance
        #print(obs[0].shape, self.obs_mean.shape)
        delta = obs - self.obs_mean
        self.obs_mean += delta / self.count
        delta2 = obs - self.obs_mean
        self.obs_var += delta * delta2

    def normalize_obs(self, obs):
        return np.clip((obs - self.obs_mean) / (np.sqrt(self.obs_var / self.count) + self.epsilon), -self.clip_obs, self.clip_obs)

    def denormalize_obs(self, obs):
        return obs * (np.sqrt(self.obs_var / self.count) + self.epsilon) + self.obs_mean

    def get_normalized_stats(self):
        return self.obs_mean, np.sqrt(self.obs_var / self.count)

    def set_normalized_stats(self, obs_mean, obs_std):
        self.obs_mean = obs_mean
        self.obs_var = obs_std ** 2
        self.count = 1  # Reset count to 1 since we are setting stats manually

    def step(self, action):
        truncated  =False
        obs, reward, done, truncated, info = self.env.step(action)
        
        self.update_obs_stats(obs)
        obs_normalized = self.normalize_obs(obs)
        return obs_normalized, reward, done, truncated, info

    def reset(self):
        obs = self.env.reset()[0]
        self.update_obs_stats(obs)
        obs_normalized = self.normalize_obs(obs)
        return obs_normalized

    def render(self, mode='human'):
        return self.env.render(mode=mode)

    def close(self):
        return self.env.close()

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*batch)
        return state, action, reward, next_state, done
        #return (np.array(states), np.array(actions), np.array(rewards),np.array(next_states), np.array(dones))
    
    def __len__(self):
        return len(self.buffer)

class ContinuousMountainCarCustom(gym.Env):
    def __init__(self, gravity, mountain_height, slope):
        self.env = gym.make('MountainCar-v0')  #, render_mode="human")
        self.env.gravity = gravity
        self.env.height = mountain_height
        self.env.slope = slope
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space

    def step(self, action):
        return self.env.step(action)

    def reset(self):
        return self.env.reset()

    def render(self):
        self.env.render()

    def close(self):
        self.env.close()

In [3]:
def generate_tasks(gravity_range, height_range, slope_range, num_samples=10):
    gravity = np.random.uniform(*gravity_range, size=num_samples)
    height = np.random.uniform(*height_range, size=num_samples)
    slope = np.random.uniform(*slope_range, size=num_samples)
    return list(zip(gravity, height, slope))

# Define parameter ranges
gravity_range = [0.002, 0.003]
height_range = [0.35, 0.55]
slope_range = [2.0, 4.0]

# Create task pool
task_pool = generate_tasks(gravity_range, height_range, slope_range, num_samples=10)
random.shuffle(task_pool)

# Train and test splits
train_tasks = task_pool[:7]
test_tasks = task_pool[7:]

## MAML for Mountain car

In [4]:
def maml_train(meta_model, tasks, outer_steps=25, inner_steps=10, lr_inner=0.01, lr_outer=0.001, gamma=0.99, capacity = 1000, max_steps=500):
    meta_optimizer = optim.Adam(meta_model.parameters(), lr=lr_outer)
    task_models = []
    
    for outer_step in range(outer_steps):
        meta_gradients = None

        for task_idx, task_params in enumerate(tasks):
            gravity, height, slope = task_params
            env = ContinuousMountainCarCustom(gravity, height, slope)
            env = NormalizeEnv(env)
            model = deepcopy(meta_model)
            optimizer = optim.Adam(model.parameters(), lr=lr_inner)

            # Inner loop
            buffer = ReplayBuffer(capacity=10000)
            
            for _ in range(inner_steps):
                state = env.reset()
                done = False
                for _ in range(max_steps):
                    #print(state) #, torch.FloatTensor(state), torch.FloatTensor(state).detach())
                    #action = model(torch.FloatTensor(state)).detach().numpy()      # for continuous action space
                    action_values = model(torch.FloatTensor(state)).detach().numpy()
                    action = np.argmax(action_values)
                    #print(action)
                    next_state, reward, done, truncated, _ = env.step(action)
                    buffer.add(state, action, reward, next_state, done)
                    state = next_state

                print(f"Testing on task {task_idx + 1}/{len(test_tasks)}: {task_params}")
                # we can update 
                #task_rewards.append(sum(rewards))
                # Update model with sampled experiences
                states, actions, rewards, next_states, dones = buffer.sample(batch_size=32)
                states, actions, rewards, next_states, dones = map(torch.FloatTensor, (states, actions, rewards, next_states, dones))
                q_values = model(states).gather(1, actions.long().unsqueeze(1)).squeeze(1)
                next_q_values = model(next_states).max(1)[0]
                target_q_values = rewards + (1 - dones) * gamma * next_q_values
                loss = nn.MSELoss()(q_values, target_q_values)
                # for applying DDQN like meta model we have to make one target_meta model as well

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Compute gradients for the meta-model
            task_gradients = torch.autograd.grad(loss, meta_model.parameters(), retain_graph=True)
            if meta_gradients is None:
                meta_gradients = task_gradients
            else:
                meta_gradients = [g + tg for g, tg in zip(meta_gradients, task_gradients)]

        # Outer loop update
        meta_optimizer.zero_grad()
        for param, grad in zip(meta_model.parameters(), meta_gradients):
            param.grad = grad / len(tasks)
        meta_optimizer.step()

        print(f"Outer step {outer_step + 1}/{outer_steps} completed.")


In [5]:
def maml_test(meta_model, tasks, num_episodes=10):
    rewards = []

    for task_params in tasks:
        gravity, height, slope = task_params
        env = ContinuousMountainCarCustom(gravity, height, slope)
        env = NormalizeEnv(env)
        total_reward = 0

        for _ in range(num_episodes):
            state = env.reset()
            done = False
            while not done:
                with torch.no_grad():
                    #action = meta_model(torch.FloatTensor(state)).numpy()     # for cont action space
                    action_values = model(torch.FloatTensor(state)).detach().numpy()
                    action = np.argmax(action_values)
                state, reward, done, _ = env.step(action)
                total_reward += reward

        rewards.append(total_reward / num_episodes)

    return np.mean(rewards)


In [None]:
Env = gym.make('MountainCar-v0')

state_dim = Env.observation_space.shape[0]  # MountainCar state space
action_dim = Env.action_space.n  
meta_model = DQN(state_dim, action_dim)

# Train and test
maml_train(meta_model, train_tasks)
meta_reward = maml_test(meta_model, test_tasks)

print(f"Meta-Learning Test Reward: {meta_reward}")

## Non-MAML training of agent for mountain car

In [None]:
class ReplayBuffer:
    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)
    
    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample(self, batch_size):
        samples = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*samples)
        return (
            np.array(states), 
            np.array(actions), 
            np.array(rewards), 
            np.array(next_states), 
            np.array(dones)
        )
    
    def size(self):
        return len(self.buffer)

class DQN(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

def train_agent(model, train_tasks, episodes=2000, lr=0.001, gamma=0.99, epsilon_start=1.0, epsilon_end=0.1,epsilon_decay=0.995, batch_size=64, buffer_capacity=50000, update_freq=4, max_steps=500):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    replay_buffer = ReplayBuffer(capacity=buffer_capacity)
    epsilon = epsilon_start
    step_count = 0
    
    for episode in range(episodes):
        # Randomly sample a task at each episode
        task_idx = np.random.randint(0, len(train_tasks))
        gravity, height, slope = train_tasks[task_idx]
        env = ContinuousMountainCarCustom(gravity, height, slope)
        env = NormalizeEnv(env)

        state = env.reset()
        state = np.concatenate([state, [gravity, height, slope]])  # Include task-specific parameters
        done = False
        episode_reward = 0

        for _ in range(max_steps):
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = model(torch.FloatTensor(state)).argmax().item()

            next_state, reward, done, truncated, _ = env.step(action)
            next_state = np.concatenate([next_state, [gravity, height, slope]])  # Include task-specific parameters
            replay_buffer.add(state, action, reward, next_state, done)
            state = next_state
            episode_reward += reward
            step_count += 1

            # Perform training on the replay buffer
            if step_count % update_freq == 0 and replay_buffer.size() > batch_size:
                states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
                states = torch.FloatTensor(states)
                actions = torch.LongTensor(actions)
                rewards = torch.FloatTensor(rewards)
                next_states = torch.FloatTensor(next_states)
                dones = torch.FloatTensor(dones)
    
                q_values = model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
                next_q_values = model(next_states).max(1)[0]
                target_q_values = rewards + (1 - dones) * gamma * next_q_values
    
                loss = nn.MSELoss()(q_values, target_q_values)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Reduce epsilon
                epsilon = max(epsilon_end, epsilon * epsilon_decay)
        print(step_count)
        if (episode + 1) % 20 == 0:
            print(f"Episode {episode + 1}, Task {task_idx + 1}/{len(train_tasks)}, Reward: {episode_reward}")

# Test the agent
def test_agent(model, test_tasks, episodes=100):
    rewards = []
    for episode in range(episodes):
        # Randomly sample a task at each episode
        task_idx = np.random.randint(0, len(test_tasks))
        gravity, height, slope = test_tasks[task_idx]
        env = ContinuousMountainCarCustom(gravity, height, slope)
        env = NormalizeEnv(env)
        
        state = env.reset()
        state = np.concatenate([state, [gravity, height, slope]])  # Include task-specific parameters
        done = False
        episode_reward = 0
            
        while not done:
            action = model(torch.FloatTensor(state)).argmax().item()
            next_state, reward, done, truncated, _ = env.step(action)
            next_state = np.concatenate([next_state, [gravity, height, slope]])  # Include task-specific parameters
            state = next_state
            episode_reward += reward

        rewards.append(episode_reward)
        print(f"Task {task_idx + 1}, Episode {episode + 1}, Reward: {episode_reward}")
    
    average_reward = np.mean(rewards)
    print(f"Average Reward across all test tasks: {average_reward}")

# Main script
if __name__ == "__main__":

    state_dim = 2 + 3  # State features + task-specific parameters (gravity, height, slope)
    action_dim = 3  # Continuous Mountain Car has 3 discrete actions

    model = DQN(state_dim, action_dim)

    print("Starting Training...")
    train_agent(model, train_tasks)

    print("\nStarting Testing...")
    test_agent(model, test_tasks)


Starting Training...
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
Episode 20, Task 4/7, Reward: -500.0
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
Episode 40, Task 1/7, Reward: -500.0
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
Episode 60, Task 4/7, Reward: -500.0
30500
31000
31500
32000
32500
33000
33500
34000
34500
35000
35500
36000
36500
37000
37500
38000
38500
39000
39500
40000
Episode 80, Task 2/7, Reward: -500.0
40500
41000
41500
42000
42500
43000
43500
44000
44500
45000
45500
46000
46500
47000
47500
48000
48500
49000
49500
50000
Episode 100, Task 1/7, Reward: -500.0
50500
51000
51500
52000
52500
53000
53500
54000
54500
55000
55500
56000
56500
57000
57500
58000
58500
59000
59500
60000
Episode 120, Task 2/7, Reward: -500.0
60500
61000
61500
62000
62500
63000
63500
64000
64500
6