In [1]:
import gym
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
from itertools import count
from gymenv.PlaytimeEnv import PlaytimeEnv
from gymenv.action_manager import ActionModel
from objects.Maneuver import Mission_Maneuver
from objects.AllManeuvers import LIST_MAN
from objects.Plane import ULM
from function.tools import *
from function.j_methods import * 
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import namedtuple, deque

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

Cet avion peut voler pendant  52438  s, soit  873  min, soit 2039  km à une vitesse moyenne de  140  km/h et sa vitesse max sera de  200  km/h


  if LooseVersion(mpl.__version__) >= "3.0":
  other = LooseVersion(other)


<matplotlib.pyplot._IonContext at 0x21c0b3bda30>

In [2]:
param_list = {
    'Plane': [ULM],
    'GoalDistance': [0, 10, 15, 20, 30],
    'RtBDistance': [0, 10, 15, 20, 30],
    'Fuel': [1, 2, 3, 3, 4, 10],
    # Start with just 2 states : good (sunny, no clouds) and bad (cloudy or rainy)
    'Meteo': ["Sunny", "Cloudy", "Misty"],
    'MissionType': [Mission_Maneuver.SCAR],  # , Mission_Maneuver.CAS
    # Add ennemies number afterwards, weaponry
    'Strength': ['Weak', 'Equal', 'Strong'],
    'TimeMin': [0, 300],
    'SynchroTime': [0, 1000],
}
combinations = get_all_combinations(param_list)
first_c = list(filter(lambda c : c['TimeMin'] == 300 and c['Fuel'] == 3, combinations))[0]
print(first_c)

{'Plane': ULM, 'GoalDistance': 0, 'RtBDistance': 0, 'Fuel': 3, 'Meteo': 'Sunny', 'MissionType': SCAR, 'Strength': 'Weak', 'TimeMin': 300, 'SynchroTime': 0}


In [3]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [8]:
env = PlaytimeEnv(combinations)
env.reset()
a = env.action_space.sample()
print(a)
print([value.sample() for value in env.action_space.values()])
nb_param_out = max(man._nb_param_() for man in LIST_MAN) + 1

BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10

policy_net = ActionModel(len(env.state), nb_param_out)
target_net  = ActionModel(len(env.state), nb_param_out)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)
steps_done = 0

OrderedDict([('altitude', 6299), ('distance', 42), ('gap', 2), ('length', 32), ('maneuver', 3), ('speed', 181), ('width', 31)])
[12033, 29, 2, 24, 3, 139, 28]
4
4


In [5]:
def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # t.max(1) will return largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
            return policy_net(state).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[value.sample() for 
                            value in env.action_space.values()]], 
                            device=device, dtype=torch.long)


episode_durations = []


def plot_durations():
    plt.figure(2)
    plt.clf()
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    plt.pause(0.001)  # pause a bit so that plots are updated
    if is_ipython:
        display.clear_output(wait=True)
        display.display(plt.gcf())

In [6]:
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

In [7]:
num_episodes = 50
for i_episode in range(num_episodes):
    # Initialize the environment and state
    state = env.reset()
    for t in count():
        # Select and perform an action
        action = select_action(state)
        print("Action selected :", action)
        last_state = state
        obs, reward, done, _ = env.step(action.tolist())
        reward = torch.tensor([reward], device=device)

        # Observe new state
        
        current_state = obs
        if not done:
            next_state = current_state - last_state
        else:
            next_state = None

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()
        if done:
            episode_durations.append(t + 1)
            plot_durations()
            break
    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

print('Complete')
env.render()
env.close()
plt.ioff()
plt.show()

Action selected : tensor([[3587,   20,    2,   38,    3,  184,   38]])
Action in step : [3587, 20, 2, 38, 3, 184, 38]


KeyError: 3587

In [None]:
env = PlaytimeEnv(first_c)

num_steps = 150
obs = env.reset()
for step in range(num_steps):
    # take random action, but you can also do something more intelligent
    # action = my_intelligent_agent_fn(obs) 
    # action = policy(obs)
    action = env.action_choose(obs)
    # apply the action
    obs, reward, done, info = env.step(action)
    print(obs, reward, done, info)
    # Render the env
    # env.render()

    # Wait a bit before the next frame unless you want to see a crazy fast video
    time.sleep(0.001)
    
    # If the episode is up, then start another one
    if done:
        print("Reset now")
        env.reset()
env.close()


KeyError: 5

# Let's try to add an agent with a deep learning neural network

In [None]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

In [None]:
print(env.observation_space)
states = (len(env.observation_space.spaces.keys()),)
actions = env.action_space.n 
num_hidden = 24

Box(2,)


AttributeError: 'Box' object has no attribute 'spaces'

In [None]:
print(states, actions)

(2,) 3


In [None]:
del model

In [None]:
# Entrée : paramètres de wheel
# Ajouter aussi les choix de plusieurs manoeuvres

inputs = Input(shape=states)
first = Dense(num_hidden, activation='relu')(inputs)
second = Dense(num_hidden, activation='relu')(first)
out_speed = Dense(env.action_space.spaces['speed'].n, activation='linear')(second)
out_altitude = Dense(env.action_space.spaces['altitude'].n, activation='linear')(second)
out_distance = Dense(env.action_space.spaces['distance'].n, activation='linear')(second)

model = Model(inputs=inputs, outputs=[out_speed, out_altitude, out_distance])


In [None]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 2)]          0           []                               
                                                                                                  
 dense_8 (Dense)                (None, 24)           72          ['input_1[0][0]']                
                                                                                                  
 dense_9 (Dense)                (None, 24)           600         ['dense_8[0][0]']                
                                                                                                  
 dense_10 (Dense)               (None, 10)           250         ['dense_9[0][0]']                
                                                                                            

In [None]:
from rl.agents import DDPGAgent, DQNAgent
from rl.policy import BoltzmannQPolicy 
from rl.memory import SequentialMemory

In [None]:
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                   nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
    return dqn

In [None]:
dqn = build_agent(states, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

AttributeError: 'tuple' object has no attribute 'output'