### Imports

In [1]:
#This package
from multiagent_wrapper import Multiagent_wrapper
from multiagent_critic import Multiagent_critic
from m3ddpg import M3DDPG

#The environment
import laserhockey.hockey_env as h_env

#Things used to make implementations for the interface
import gym
import torch
from torch import nn
import numpy as np

#just used for the sake of clarity here
from typing import List, Union, Dict

## 1. Implementing the Multiagent_wrapper interface for the specific environment

In [2]:
class Multiagent_laserhockey_wrapper(Multiagent_wrapper):
    """Two-agent adapter around the laser-hockey ``HockeyEnv``.

    Implements the hooks of the ``Multiagent_wrapper`` interface for this
    environment: joining per-agent actions into one environment action,
    splitting the environment state into per-agent observations, deriving
    per-agent rewards, and preprocessing the state handed to the critics.
    """

    def __init__(self):
        #MANDATORY: gather all necessary information to initialize the super class:
        env = h_env.HockeyEnv()
        state_space = env.observation_space
        num_agents = 2
        # each agent controls a 4-dimensional action bounded to [-1, 1]
        action_spaces = [gym.spaces.Box(-1.0, 1.0, [4], np.float32)]*2
        # both agents observe vectors from the environment's observation space
        observation_spaces = [env.observation_space]*2
        super().__init__(env, state_space, num_agents, action_spaces, observation_spaces)

        #OPTIONAL: specific scaling factors for this environment.
        # One divisor per observation component (18 entries); states and
        # observations are divided element-wise by this vector.
        self.scaling_vector = np.array([1.0, 1.0, 0.5, 4.0, 4.0, 4.0, 1.0, 1.0, 0.5, 4.0, 4.0, 4.0, 2.0, 2.0, 10.0, 10.0, 4.0, 4.0])

    def _build_joint_action(self, actions: List[np.array]) -> np.array:
        """MANDATORY: combine the per-agent actions into the single flat array passed to the gym environment."""
        return np.hstack(actions)

    def _build_observations(self, state: np.array) -> List[np.array]:
        """MANDATORY: split the environment state into one scaled observation per agent.

        Agent 1 receives the scaled ``state``; agent 2 receives the scaled result
        of ``env.obs_agent_two()`` (presumably the same situation seen from the
        second player's side - confirm against the environment).
        """
        return [state/self.scaling_vector, self.env.obs_agent_two()/self.scaling_vector]

    def _build_rewards(self, state: np.array, reward: float, info: Union[None, Dict]) -> List[float]:
        """MANDATORY: compute the reward of each agent and return them in a list.

        The ``reward_closeness_to_puck`` term reported in ``info`` is removed
        from the environment reward. Agent 1 gets that value, agent 2 gets its
        negation, and both are bounded from below at -1.

        ``info`` is annotated as optional; when it is ``None`` there is no
        closeness term to remove and the raw reward is used.
        """
        closeness_term = 0.0 if info is None else info["reward_closeness_to_puck"]
        pure_reward_p1 = reward - closeness_term
        reward_p1 = max(-1., pure_reward_p1)
        reward_p2 = max(-1., -pure_reward_p1)
        return [reward_p1, reward_p2]

    def _build_state(self, state: np.array) -> np.array:
        """OPTIONAL: preprocess the state used by the critics. Here it is only scaled."""
        return state/self.scaling_vector
    
# Initialise the wrapped two-agent hockey environment. The network sizes, the
# burn-in policies, the trainer and the rollout cell below all read from `env`.
env = Multiagent_laserhockey_wrapper()

## 2. Creating the actor module (a regular `torch.nn.Module`)

In [3]:
class HockeyActorNet(nn.Module):
    """Actor network: a small MLP whose output is clipped to the action bounds.

    Args:
        in_dim: size of the observation vector fed to the network.
        out_dim: size of the action vector produced by the network.
        min_value: lower bound applied to the network output.
        max_value: upper bound applied to the network output.
        hidden_dim: width of the two hidden layers. Defaults to 32, the value
            that was previously hard-coded, so existing callers and saved
            checkpoints are unaffected.
    """

    def __init__(self, in_dim, out_dim, min_value, max_value, hidden_dim=32):
        super().__init__()

        # The attribute name and the layer order determine the state_dict keys
        # ("layers.0.weight", ...), so they must stay as they are for saved
        # models to remain loadable.
        self.layers = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim)
        )

        # Buffers (not parameters): they are part of the state_dict and follow
        # the module across devices, but are never updated by an optimizer.
        self.register_buffer('min_value', torch.tensor(min_value, requires_grad=False, dtype=torch.float32))
        self.register_buffer('max_value', torch.tensor(max_value, requires_grad=False, dtype=torch.float32))

    def forward(self, x):
        """Map observation(s) ``x`` to action(s) clipped to ``[min_value, max_value]``."""
        return torch.clip(self.layers(x), self.min_value, self.max_value)

# Initialise one actor per agent, in agent order. Each network is sized from
# that agent's observation/action space and bounded by the smallest `low` and
# the largest `high` entry of its action space.
actor1, actor2 = (
    HockeyActorNet(
        in_dim=env.observation_spaces[agent_idx].shape[0],
        out_dim=env.action_spaces[agent_idx].shape[0],
        min_value=min(env.action_spaces[agent_idx].low),
        max_value=max(env.action_spaces[agent_idx].high),
    )
    for agent_idx in (0, 1)
)

## 3. Creating Critic module inheriting the Multiagent_critic interface

In [4]:
class HockeyCriticNet(Multiagent_critic):
    """Critic network: an MLP that maps a state plus all agents' actions to one value."""

    def __init__(self, in_dim):
        super().__init__()
        # Built in the same order as before so the state_dict keys
        # ("layers.0.weight", ...) are unchanged.
        stack = [
            nn.Linear(in_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, state: torch.tensor, actions: List[torch.tensor]) -> torch.tensor:
        #MANDATORY: This function describes how to process the state and the list
        #of per-agent actions: they are stacked horizontally into a single input
        #tensor, which is then passed through the network.
        critic_input = torch.hstack([state] + list(actions))
        return self.layers(critic_input)
    
# Initialise the critics. Both take the same input: the state concatenated
# with the actions, so they share one input width.
# NOTE(review): `env.action_space` is not defined in this notebook - presumably
# it is provided by the wrapper base class as the joint action space; confirm
# its size equals the sum of the per-agent action sizes.
critic_in_dim = env.state_space.shape[0]+env.action_space.shape[0]
critic1 = HockeyCriticNet(in_dim=critic_in_dim)
critic2 = HockeyCriticNet(in_dim=critic_in_dim)

### Optional: creating burn-in policies

In [5]:
# Built-in `BasicOpponent` controllers that ship with the hockey environment,
# in a strong and a weak variant. The burn-in policies defined further down
# pick one of the two at random on every call and add clipped Gaussian noise
# to its action, for a better exploration effect.
strong_opponent = h_env.BasicOpponent(weak=False)
weak_opponent = h_env.BasicOpponent(weak=True)

def _add_noise_to_action(action, noise_level, noise_clip, action_low, action_high):
    action = torch.tensor(action, dtype=torch.float32, requires_grad=False)
    action_high = torch.tensor(action_high, dtype=torch.float32, requires_grad=False)
    action_low =torch.tensor(action_low, dtype=torch.float32, requires_grad=False)
    noise = (torch.randn_like(action) * noise_level).clip(-noise_clip, noise_clip)
    return torch.max(torch.min(action + noise, action_high), action_low).numpy()

def _make_burnin_policy(agent_idx, get_raw_obs, noise_level=0.2, noise_clip=1.):
    """Build the burn-in policy of one agent.

    On every call the policy picks the strong or the weak built-in opponent at
    random, lets it act on the observation returned by ``get_raw_obs()`` and
    adds clipped Gaussian noise bounded to that agent's action space.

    Args:
        agent_idx: index of the agent, used to look up its action bounds.
        get_raw_obs: zero-argument callable returning the unscaled environment
            observation for this agent.
        noise_level: scale of the added noise.
        noise_clip: limit applied to the scaled noise.
    """
    def policy(obs):
        # `obs` is deliberately unused: the wrapper divides its observations by
        # `scaling_vector`, so the built-in opponents are fed the observation
        # read directly from the underlying environment instead.
        opponent = np.random.choice([strong_opponent,weak_opponent])
        return _add_noise_to_action(
            opponent.act(get_raw_obs()),
            noise_level,
            noise_clip,
            env.action_spaces[agent_idx].low,
            env.action_spaces[agent_idx].high,
        )
    return policy


burnin_policies = [
    # agent 1 acts on the environment's own observation
    _make_burnin_policy(0, lambda: env.env._get_obs()),
    # agent 2 must act on its own view, the same source the wrapper uses for
    # agent 2's observation, not on agent 1's observation
    _make_burnin_policy(1, lambda: env.env.obs_agent_two()),
]

## 4. Choose hyperparameters

In [6]:
# Choose the hyperparameters and pass the previously initialised environment,
# networks and burn-in policies; see the docstring of the M3DDPG class for a
# short description of each parameter.
# List-valued settings have two entries (presumably one per agent).
m3ddpg_kwargs = dict(
    env=env,
    actor_models=[actor1, actor2],
    critic_models=[critic1, critic2],
    actor_learning_rates=[1e-3] * 2,
    critic_learning_rates=[1e-3] * 2,
    device="cpu",
    discounts=[0.99] * 2,
    taus=[0.005] * 2,
    noise_levels=[0.2] * 2,
    critic_noise_levels=[0.02] * 2,
    noise_clips=[0.5] * 2,
    epsilons=[0.2] * 2,
    alphas=[1.] * 2,
    batch_size=64,
    burnin_steps=100_000,
    burnin_policies=burnin_policies,
    max_replay_buffer_size=1_000_000,
    update_target_nets_frequency=2,
)
m3ddpg = M3DDPG(**m3ddpg_kwargs)

## 5. Start training

In [7]:
# Run the training loop and keep what `train` returns.
# NOTE(review): the recorded progress bar counts 100100 iterations, i.e.
# apparently the 100000 burn-in steps configured above plus these 100 training
# steps - so this run is almost entirely burn-in; raise num_train_steps for
# real training.
rewards = m3ddpg.train(num_train_steps=100)

100%|████████████████████████████████████████████████████████████████████████| 100100/100100 [01:49<00:00, 914.25it/s]


### Load pretrained agents

In [8]:
# The actual load_status function just takes a directory and lists of file names;
# the constants below are only a convenient way to create those file name lists.

# Naming scheme of the stored checkpoints
PREFIX = "M3DDPG"
DIR_PATH = "./models/"
NUM_ACTORS = 2
ITERATIONS = 4_000_000

# One file name per agent for each kind of checkpoint
actor_file_names = [f'{PREFIX}_actor{i}_{ITERATIONS}its.pt' for i in range(NUM_ACTORS)]
critic_file_names = [f'{PREFIX}_critic{i}_{ITERATIONS}its.pt' for i in range(NUM_ACTORS)]
actor_optimizer_file_names = [f'{PREFIX}_actor{i}_optimizer_{ITERATIONS}its.pt' for i in range(NUM_ACTORS)]
critic_optimizer_file_names = [f'{PREFIX}_critic{i}_optimizer_{ITERATIONS}its.pt' for i in range(NUM_ACTORS)]

# Load models and optimizers
m3ddpg.load_status(
    dir_path=DIR_PATH,
    actor_file_names=actor_file_names,
    critic_file_names=critic_file_names,
    actor_optimizer_file_names=actor_optimizer_file_names,
    critic_optimizer_file_names=critic_optimizer_file_names,
)

### Get policies

In [9]:
# One callable policy per agent, in agent order.
actors = list(map(m3ddpg.get_policy, range(NUM_ACTORS)))

### Observe policies

In [10]:
# Play 10 rendered episodes with the loaded policies.
try:
    for _ in range(10):
        state, obs = env.reset()
        done = False
        while not done:
            # every agent acts on its own observation
            actions = [actors[i](obs[i]) for i in range(NUM_ACTORS)]
            # The per-step rewards are not needed here; they are bound to a
            # throwaway name so the `rewards` returned by training stay intact.
            state, obs, _step_rewards, done, info = env.step(actions)
            env.render()
finally:
    # Always close the environment, even if an episode raises or the cell is
    # interrupted while rendering.
    env.close()