In [5]:
import numpy as np
import torch
from torch import nn
import import_ipynb
import ActorCriticModel
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from FiftyStations_OpenAIGym_wrapper import Torino_environment

ModuleNotFoundError: No module named 'FiftyStations_OpenAIGym_wrapper'

In [None]:
# --- Network and optimiser hyper-parameters ---
hidden_size = 64
learning_rate_actor = 1e-3
learning_rate_critic = 1e-3

# --- Training schedule ---
max_episodes = 200
gamma = 0.9

# --- Action / state space ---
# Each action is added to the current CPU count before it is passed to
# env.evolveState (see train / val / test).
actions = [-1, 0, 1]
action_size = len(actions)
state_size = 2  # the state is the pair (W, N) built in train / val / test

# --- Environments ---
# NOTE(review): the import cell brings in `Torino_environment`, but this cell
# instantiates `TorinoEnvironment` -- confirm which name the wrapper exports.
train_env = TorinoEnvironment()
test_env = TorinoEnvironment()

# Presumably restricts each environment to a portion of the trace for a
# train/test split; the exact meaning of the 0.8 argument is not visible
# here -- TODO confirm against the wrapper.
train_env.head(0.8)
test_env.tail(0.8)

In [None]:
# env_work = TorinoEnvironment().getWork()
# largest_diff = 0
# index = 0
# for i in range(len(env_work) - 1):
#     if abs(env_work[i] - env_work[i+1]) > largest_diff:
#         largest_diff = abs(env_work[i] - env_work[i+1])
#         index = i
# print("Largest difference in work at index", i, "with a difference of ", largest_diff[0])

In [None]:
def entropy(x):
    """Return ``sum(x * log(x))`` over all elements of ``x``.

    Note the sign: for a probability vector this is the *negative* Shannon
    entropy (a value <= 0). ``train`` adds it to the loss, so minimising the
    loss increases the entropy of the action distribution.

    Entries that are exactly zero contribute 0 (the limit of ``p * log(p)``
    as ``p -> 0``) instead of ``0 * -inf = NaN``, so a saturated distribution
    no longer poisons the loss or its gradients.

    Args:
        x: Tensor of probabilities.

    Returns:
        A 0-dim tensor holding ``sum(x * log(x))``.
    """
    # Swap exact zeros for 1 before the log: log(1) == 0, so those entries add
    # nothing to the sum and their gradient stays finite. Non-zero entries
    # (including invalid negative / NaN ones) are passed through unchanged.
    safe_x = torch.where(x == 0, torch.ones_like(x), x)
    return torch.sum(x * torch.log(safe_x))

In [None]:
def train(env, learning_rate_actor, learning_rate_critic, state_size, action_size, hidden_size, max_episodes, gamma, load_models=False, steps_per_episode=500, entropy_weight=1):
    """Train an actor and a critic on ``env`` with one update per episode.

    Every episode resets the environment, rolls out up to
    ``steps_per_episode`` steps while sampling actions from the actor's
    output distribution, and then performs a single optimiser step on
    ``actor_loss + critic_loss + entropy_weight * mean(entropy term)``.

    Side effects:
        * Both state dicts are written to disk after every episode.
        * Every 50th episode (starting with episode 0) ``val`` is run on the
          notebook-level ``test_env``.
        * After training, the actor and critic loss curves are plotted on one
          figure, saved as ``loss_critic_EUCNC_comparison`` and shown.

    Args:
        env: Environment exposing ``resetState``, ``monitorState``,
            ``evolveState`` and ``getReward``.
        learning_rate_actor: Adam learning rate for the actor.
        learning_rate_critic: Adam learning rate for the critic.
        state_size: Passed to the Actor / Critic constructors.
        action_size: Passed to the Actor / Critic constructors.
        hidden_size: Passed to the Actor / Critic constructors.
        max_episodes: Number of training episodes.
        gamma: Discount factor forwarded to ``ActorCriticModel.calc_loss``.
        load_models: If True, start from the state dicts saved on disk.
        steps_per_episode: Maximum number of environment steps per episode
            (previously hard-coded to 500; ``env.duration`` is the alternative
            the original code had commented out).
        entropy_weight: Multiplier of the entropy term in the total loss
            (previously the hard-coded local ``k = 1``).

    Returns:
        Three lists with the average, minimum and maximum reward of each
        completed episode.
    """
    actor = ActorCriticModel.Actor(state_size, action_size, hidden_size, temperature=0.9)
    critic = ActorCriticModel.Critic(state_size, action_size, hidden_size)
    print(actor)
    print(critic)

    if load_models:
        actor.load_state_dict(torch.load('actor_state_dict_EUCNC_compare_results'))
        critic.load_state_dict(torch.load('critic_state_dict_EUCNC_compare_results'))

    optimizer_actor = torch.optim.Adam(actor.parameters(), lr=learning_rate_actor) # Consider if Adam is best optimizer here
    optimizer_critic = torch.optim.Adam(critic.parameters(), lr=learning_rate_critic)

    rewards_episodes_avg = []
    rewards_episodes_min = []
    rewards_episodes_max = []

    losses_actor = []
    losses_critic = []

    for episode in range(max_episodes):
        print("#################")
        print("## Episode %03d ##" % episode)
        print("#################")
        print("[TRAINING]")
        env.resetState()

        # Initial state: current work W and a CPU count of W + 1.
        W = int(env.monitorState('instant_work'))
        N = W + 1
        state = np.array((W, N))

        rewards = []
        values = []
        action_probs = []
        entropies = []

        for step in tqdm(range(steps_per_episode)):
            state = torch.from_numpy(state).float()
            action_dist = actor(state)

            # A NaN distribution cannot be sampled from; end the rollout here.
            # Nothing has been appended for this step yet, so the per-step
            # lists stay aligned.
            if action_dist.isnan().any():
                break

            value = critic(state)
            values.append(value)

            action = np.random.choice(actions, p=np.squeeze(action_dist.detach().cpu().numpy()))
            action_probs.append(action_dist[actions.index(action)])

            entropies.append(entropy(action_dist))

            # The chosen action is a delta on the current number of CPUs.
            env.evolveState(nrOfCPUs=int(state.cpu().numpy()[1]) + action)

            reward = env.getReward()
            rewards.append(reward)

            W = int(env.monitorState('instant_work'))
            N = len(env.monitorState('CPUload'))
            state = np.array((W, N))

        if not rewards:
            # The actor returned NaN on the very first step. There is nothing
            # to reduce or stack, and the statistics / loss code below would
            # fail on the empty lists, so stop training explicitly.
            print("Actor produced NaN action probabilities on the first step; stopping training.")
            break

        print("min reward episode: ", np.amin(rewards))
        print("max reward episode: ", np.amax(rewards))
        print("avg reward episode: ", np.mean(rewards))

        rewards_episodes_avg.append(np.mean(rewards))
        rewards_episodes_min.append(np.amin(rewards))
        rewards_episodes_max.append(np.amax(rewards))

        optimizer_actor.zero_grad()
        optimizer_critic.zero_grad()
        loss_actor, loss_critic = ActorCriticModel.calc_loss(torch.stack(action_probs),
                                          torch.cat(values),
                                          torch.tensor(rewards).float(),
                                          gamma)

        loss_entropy = torch.mean(torch.stack(entropies))
        loss = loss_actor + loss_critic + loss_entropy * entropy_weight
        print("total_loss ", loss.item(), "| actor_loss ", loss_actor.item(), "| critic_loss ", loss_critic.item())
        losses_actor.append(loss_actor.item())
        losses_critic.append(loss_critic.item())
        loss.backward()
        optimizer_actor.step()
        optimizer_critic.step()

        # Save before validating: val() reloads the networks from these files.
        torch.save(actor.state_dict(), 'actor_state_dict_EUCNC_compare_results')
        torch.save(critic.state_dict(), 'critic_state_dict_EUCNC_compare_results')

        if episode % 50 == 0:
            # Uses the notebook-level test_env, not the env argument.
            val(test_env)

    # Both curves are drawn on the same axes and saved as one figure.
    plt.plot(losses_actor, label="loss actor")
    plt.legend()
    plt.plot(losses_critic, label="loss critic")
    plt.legend()
    plt.savefig("loss_critic_EUCNC_comparison")
    plt.show()

    return rewards_episodes_avg, rewards_episodes_min, rewards_episodes_max

In [None]:
def val(env):
    """Run one greedy validation rollout on ``env`` and print its statistics.

    The actor and critic are rebuilt from the notebook-level ``state_size``,
    ``action_size`` and ``hidden_size`` and loaded from the state dicts saved
    on disk. The rollout lasts ``env.duration`` steps and always takes the
    action with the highest probability. Min / max / mean reward and the
    actor / critic losses (computed with the notebook-level ``gamma``) are
    printed; nothing is returned and no parameters are updated.

    Args:
        env: Environment exposing ``resetState``, ``monitorState``,
            ``evolveState``, ``getReward`` and ``duration``.
    """
    print("[VALIDATION]")

    # Load models
    # NOTE(review): train() builds its Actor with temperature=0.9, whereas this
    # one relies on the constructor default -- confirm the two agree, otherwise
    # the probabilities (and the reported actor loss) differ from training.
    actor = ActorCriticModel.Actor(state_size, action_size, hidden_size)
    actor.load_state_dict(torch.load('actor_state_dict_EUCNC_compare_results'))
    actor.eval()

    critic = ActorCriticModel.Critic(state_size, action_size, hidden_size)
    critic.load_state_dict(torch.load('critic_state_dict_EUCNC_compare_results'))
    critic.eval()

    env.resetState()

    # Initial state: current work W and a CPU count of W + 1.
    W = int(env.monitorState('instant_work'))
    N = W + 1
    state = np.array((W, N))

    rewards = []
    values = []
    action_probs = []

    with torch.no_grad():
        for step in tqdm(range(env.duration)):
            state = torch.from_numpy(state).float()
            action_dist = actor(state)

            # Stop the rollout if the policy output is unusable.
            if action_dist.isnan().any():
                break

            value = critic(state)
            values.append(value)

            # Greedy action selection (training samples instead).
            action = actions[np.argmax(action_dist.detach().cpu().numpy())]
            action_probs.append(action_dist[actions.index(action)])

            # The chosen action is a delta on the current number of CPUs.
            env.evolveState(nrOfCPUs=int(state.cpu().numpy()[1]) + action)

            reward = env.getReward()
            rewards.append(reward)

            W = int(env.monitorState('instant_work'))
            N = len(env.monitorState('CPUload'))
            state = np.array((W, N))

        if not rewards:
            # NaN on the very first step: the reductions and the loss below
            # would fail on empty lists, so report it and return instead.
            print("Actor produced NaN action probabilities on the first step; validation skipped.")
            return

        print("min reward episode: ", np.amin(rewards))
        print("max reward episode: ", np.amax(rewards))
        print("avg reward episode: ", np.mean(rewards))

        loss_actor, loss_critic = ActorCriticModel.calc_loss(torch.stack(action_probs),
                                          torch.cat(values),
                                          torch.tensor(rewards).float(),
                                          gamma)

        loss = loss_actor + loss_critic
        print("total_loss ", loss.item(), "| actor_loss ", loss_actor.item(), "| critic_loss ", loss_critic.item())

In [None]:
def test(env, save=True):
    """Run the saved actor greedily on ``env`` and record per-step traces.

    The actor is rebuilt from the notebook-level ``state_size``,
    ``action_size`` and ``hidden_size`` and loaded from the state dict saved
    on disk. Only the actor is needed: the greedy rollout never uses a value
    estimate, so the critic is not loaded. For each of the ``env.duration``
    steps the number of CPUs, the maximum CPU load, the summed backlog and
    the reward are recorded.

    Args:
        env: Environment exposing ``resetState``, ``monitorState``,
            ``evolveState``, ``getReward`` and ``duration``.
        save: If True, write the traces to ``A2C.csv``.
    """
    # Load model
    # NOTE(review): train() builds its Actor with temperature=0.9, whereas this
    # one relies on the constructor default -- confirm the two agree.
    actor = ActorCriticModel.Actor(state_size, action_size, hidden_size)
    actor.load_state_dict(torch.load('actor_state_dict_EUCNC_compare_results'))
    actor.eval()

    # Run test environment
    env.resetState()
    traceNrOfCPUs = np.zeros((env.duration,))
    traceMaxCPUload = np.zeros((env.duration,))
    traceSumBacklog = np.zeros((env.duration,))
    traceReward = np.zeros((env.duration,))

    # Initial state: current work W and a CPU count of W + 1.
    W = int(env.monitorState('instant_work'))
    N = W + 1
    state = np.array((W, N))

    # Pure inference: no autograd graph is needed (same as val()).
    with torch.no_grad():
        for step in tqdm(range(env.duration)):
            state = torch.from_numpy(state).float()
            action_dist = actor(state)

            # Greedy action selection.
            action = actions[np.argmax(action_dist.detach().cpu().numpy())]

            # The chosen action is a delta on the current number of CPUs.
            env.evolveState(nrOfCPUs=int(state.cpu().numpy()[1]) + action)

            # Read the reward once per step, as train() and val() do.
            reward = env.getReward()

            W = int(env.monitorState('instant_work'))
            N = len(env.monitorState('CPUload'))
            state = np.array((W, N))

            traceNrOfCPUs[step] = state[1]
            traceMaxCPUload[step] = np.max(env.monitorState('CPUload'))
            traceSumBacklog[step] = np.sum(env.monitorState('backlog'))
            traceReward[step] = reward

    if save:
        # add in all other plots to compare
        pd.DataFrame({
            'nrOfCPUs': traceNrOfCPUs,
            'maxCPUload': traceMaxCPUload,
            'sumBacklog': traceSumBacklog,
            'reward': traceReward,
        }).to_csv('A2C.csv')

In [None]:
def visualize(rewards_episodes_avg, rewards_episodes_min, rewards_episodes_max):
    """Plot the per-episode average, minimum and maximum reward curves.

    The three curves share one figure, which is saved as
    ``rewards_episodes_EUCNCcomparison`` and then shown.

    Args:
        rewards_episodes_avg: Average reward of each episode.
        rewards_episodes_min: Minimum reward of each episode.
        rewards_episodes_max: Maximum reward of each episode.
    """
    # A rolling-mean ("smoothed") curve was sketched here previously but never
    # enabled; it would need all per-step rewards to be passed in again.
    labelled_curves = (
        (rewards_episodes_avg, "Average reward per episode"),
        (rewards_episodes_min, "Minimum reward per episode"),
        (rewards_episodes_max, "Maximum reward per episode"),
    )
    for curve, caption in labelled_curves:
        plt.plot(curve, label=caption)

    plt.legend()
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.savefig("rewards_episodes_EUCNCcomparison")
    plt.show()

In [None]:
# Train from scratch on the training environment; train() returns the
# per-episode average / minimum / maximum reward curves.
rewards_episodes_avg, rewards_episodes_min, rewards_episodes_max = train(
    env=train_env,
    learning_rate_actor=learning_rate_actor,
    learning_rate_critic=learning_rate_critic,
    state_size=state_size,
    action_size=action_size,
    hidden_size=hidden_size,
    max_episodes=max_episodes,
    gamma=gamma,
    load_models=False,
)

# Evaluate the saved networks on the held-out environment: first a validation
# pass that prints statistics, then the test rollout that records the traces.
val(test_env)
test(test_env)

# Plot the training reward curves.
visualize(rewards_episodes_avg, rewards_episodes_min, rewards_episodes_max)
#env.close()