In [1]:
import numpy as np
import torch
from torch import nn
import operator
from TorinoEnvironment import TorinoEnvironment
import import_ipynb
import ActorCriticModel
import os
import pandas as pd

importing Jupyter notebook from ActorCriticModel.ipynb


In [2]:
# Training hyperparameters (TODO: still to be tuned -- these are starting values).
gamma = 0.99            # forwarded to train() and from there to ActorCriticModel.calc_loss
max_episodes = 200      # number of training episodes run by train()
learning_rate = 0.001   # step size handed to the Adam optimizer in train()
hidden_size = 32        # hidden-layer size passed to the Actor and Critic constructors

In [3]:
def train(env, learning_rate, state_size, action_size, hidden_size, max_episodes, gamma):
    """Train an actor-critic agent on ``env`` and plot the per-episode losses.

    For every episode the environment is reset and rolled out until
    ``env.stop`` becomes true. At each step the actor's output is used as a
    probability vector to sample an action, which is handed to
    ``env.evolveState(nrOfCPUs=action)``. After the rollout a single optimizer
    step is taken on the sum of the actor and critic losses returned by
    ``ActorCriticModel.calc_loss``.

    Args:
        env: environment exposing ``resetState()``, ``stop``,
            ``minimumNrOfCPUs``, ``evolveState(nrOfCPUs=...)``, ``getReward()``
            and ``duration``.
        learning_rate: learning rate of the shared Adam optimizer.
        state_size: forwarded to the ``Actor`` / ``Critic`` constructors.
        action_size: number of discrete actions to sample from; also forwarded
            to the ``Actor`` / ``Critic`` constructors.
        hidden_size: forwarded to the ``Actor`` / ``Critic`` constructors.
        max_episodes: number of episodes to run.
        gamma: forwarded to ``ActorCriticModel.calc_loss`` (presumably the
            discount factor -- confirm against that module).

    Returns:
        Three lists with one entry per episode: the summed reward divided by
        ``env.duration``, the minimum reward and the maximum reward.

    Side effects:
        Prints progress for every episode and writes the loss curves to
        ``loss_actor_torino`` and ``loss_critic_torino`` via ``plt.savefig``.
    """
    # pyplot is only imported in a later notebook cell; importing it here keeps
    # the function usable regardless of which cells have already been run.
    import matplotlib.pyplot as plt

    actor = ActorCriticModel.Actor(state_size, action_size, hidden_size).double()
    critic = ActorCriticModel.Critic(state_size, action_size, hidden_size).double()

    # One optimizer updates both networks from the combined loss.
    params = list(actor.parameters()) + list(critic.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate)

    rewards_episodes_avg = []
    rewards_episodes_min = []
    rewards_episodes_max = []

    losses_actor = []
    losses_critic = []

    for episode in range(max_episodes):
        print("Episode #", episode)
        env.resetState()
        state = np.array([env.minimumNrOfCPUs])

        rewards = []
        values = []
        action_probs = []

        while not env.stop:
            state = torch.from_numpy(state).double()
            # Call the modules directly instead of .forward() so that any
            # registered hooks are honoured.
            action_dist = actor(state)
            values.append(critic(state))

            action = np.random.choice(action_size, p=np.squeeze(action_dist.detach().numpy()))
            action_probs.append(action_dist[action])
            # NOTE(review): the sampled index (0 .. action_size - 1) is used
            # directly as the CPU count, while the initial state starts at
            # env.minimumNrOfCPUs -- confirm whether an offset is intended.
            state = np.array([env.evolveState(nrOfCPUs=action)])
            rewards.append(env.getReward())

        # Take the extremes from the collected rewards; seeding a running
        # min/max with 0.0 misreports episodes whose rewards never cross zero.
        episode_min = np.amin(rewards)
        episode_max = np.amax(rewards)
        print("min reward episode: ", episode_min)
        print("max reward episode: ", episode_max)

        # NOTE(review): normalised by env.duration rather than len(rewards);
        # the two only agree if every episode runs for exactly `duration` steps.
        rewards_episodes_avg.append(np.sum(rewards)/env.duration)
        rewards_episodes_min.append(episode_min)
        rewards_episodes_max.append(episode_max)

        optimizer.zero_grad()
        loss_actor, loss_critic = ActorCriticModel.calc_loss(torch.stack(action_probs),
                                          torch.cat(values),
                                          torch.tensor(rewards).double(),
                                          gamma)

        loss = loss_actor + loss_critic
        loss.backward()
        optimizer.step()

        print("total loss ",loss, "actor loss ", loss_actor, "critic loss ", loss_critic)
        # Record each loss exactly once per episode so the x-axis of the loss
        # plots corresponds to the episode index.
        losses_actor.append(loss_actor.item())
        losses_critic.append(loss_critic.item())

    #Uncomment this if you want to save your progress!
    #torch.save(actor.state_dict(), 'actor_state_dict')
    #torch.save(critic.state_dict(), 'critic_state_dict')

    plt.plot(losses_actor, label="loss actor")
    plt.legend()
    plt.savefig("loss_actor_torino")
    plt.show()
    plt.plot(losses_critic, label="loss critic")
    plt.legend()
    plt.savefig("loss_critic_torino")
    plt.show()
    return rewards_episodes_avg, rewards_episodes_min, rewards_episodes_max

In [4]:
import pandas as pd
import matplotlib.pyplot as plt

def visualize(rewards_episodes_avg, rewards_episodes_min, rewards_episodes_max):
    """Draw the per-episode average, minimum and maximum reward in one figure.

    Each argument is a sequence with one value per episode; the three curves
    share the same axes and are told apart by the legend.
    """
    # Disabled idea kept from the original (comment translated from Dutch):
    # a 10-episode rolling mean of the rewards could be overlaid here, but it
    # would have to take all rewards into account again.
    curves = (
        ("Average reward per episode", rewards_episodes_avg),
        ("Minimum reward per episode", rewards_episodes_min),
        ("Maximum reward per episode", rewards_episodes_max),
    )
    for caption, series in curves:
        plt.plot(series, label=caption)

    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.legend()
    plt.show()


In [5]:
# Two separate environment instances, one for training and one for evaluation.
train_env = TorinoEnvironment()
test_env = TorinoEnvironment()

# NOTE(review): head/tail appear to restrict each environment to a part of the
# data at the 0.8 mark (the recorded output 55659 / 13915 is an 80/20 split) --
# confirm against TorinoEnvironment.
train_env.head(0.8)
test_env.tail(0.8)

55659
13915


In [6]:
# Open question from the original author: the state is the dimension of the
# amount of incoming cars, so 1?
state_size = 1

# NOTE(review): if both CPU bounds are meant to be selectable this is one
# action short (max - min + 1) -- confirm against TorinoEnvironment.
action_size = train_env.maximumNrOfCPUs - train_env.minimumNrOfCPUs

# Run the full training loop with the notebook-level hyperparameters.
(rewards_episodes_avg,
 rewards_episodes_min,
 rewards_episodes_max) = train(
    env=train_env,
    learning_rate=learning_rate,
    state_size=state_size,
    action_size=action_size,
    hidden_size=hidden_size,
    max_episodes=max_episodes,
    gamma=gamma,
)

visualize(rewards_episodes_avg, rewards_episodes_min, rewards_episodes_max)
#env.close()

Episode # 0
min reward episode:  -0.9094967544864451
max reward episode:  0.4236930567491761
total loss  tensor(4278795.9478, dtype=torch.float64, grad_fn=<AddBackward0>) actor loss  tensor(4278231.3055, dtype=torch.float64, grad_fn=<NegBackward>) critic loss  tensor(564.6423, dtype=torch.float64, grad_fn=<MseLossBackward>)
Episode # 1
min reward episode:  -1.2395311187476135
max reward episode:  0.4319746173196366
total loss  tensor(4345359.3126, dtype=torch.float64, grad_fn=<AddBackward0>) actor loss  tensor(4344807.7800, dtype=torch.float64, grad_fn=<NegBackward>) critic loss  tensor(551.5326, dtype=torch.float64, grad_fn=<MseLossBackward>)
Episode # 2
min reward episode:  -1.07818098510882
max reward episode:  0.4143442450116413
total loss  tensor(4353763.9552, dtype=torch.float64, grad_fn=<AddBackward0>) actor loss  tensor(4353225.0665, dtype=torch.float64, grad_fn=<NegBackward>) critic loss  tensor(538.8887, dtype=torch.float64, grad_fn=<MseLossBackward>)
Episode # 3
min reward e

KeyboardInterrupt: 

In [None]:
# env = TorinoEnvironment()
# env.plotWork()

In [None]:
# Evaluate the trained actor on the test environment and compare its rewards
# against a constant-CPU benchmark.

def _rolling_mean(values, window=200):
    """Return the rolling mean of ``values`` as a plain list.

    The first ``window - 1`` entries are NaN because pandas only emits a value
    once a full window is available.
    """
    return pd.Series(values).rolling(window).mean().tolist()


def _plot_trace(values, ylabel):
    """Plot ``values`` against the step index in a new wide figure."""
    plt.figure(figsize=(16, 4))
    plt.xlabel('time [5m interval]')
    plt.ylabel(ylabel)
    plt.plot(values)


# The checkpoint files must already exist in the working directory; they are
# read with torch.load and applied to freshly constructed networks.
actor = ActorCriticModel.Actor(state_size, action_size, hidden_size).double()
actor.load_state_dict(torch.load('actor_state_dict'))
actor.eval()

critic = ActorCriticModel.Critic(state_size, action_size, hidden_size).double()
critic.load_state_dict(torch.load('critic_state_dict'))
critic.eval()

test_env.resetState()
traceNrOfCPUs = np.zeros((test_env.duration,))
traceMaxCPUload = np.zeros((test_env.duration,))

state = np.array([test_env.minimumNrOfCPUs])
rewards = []
delays = []

# --- Roll out the learned policy -------------------------------------------
time = 0
while not test_env.stop:
    state = torch.from_numpy(state).double()
    action_dist = actor(state)
    # The critic's value estimate is not needed to act, so it is not computed.
    action = np.random.choice(action_size, p=np.squeeze(action_dist.detach().numpy()))
    next_nr_of_cpus = test_env.evolveState(nrOfCPUs=action)
    state = np.array([next_nr_of_cpus])

    print(action)
    print(action_dist)

    reward = test_env.getReward()
    rewards.append(reward)
    # Store the value itself rather than the 1-element wrapper array: NumPy
    # deprecated implicit conversion of ndim > 0 arrays to scalars.
    traceNrOfCPUs[time] = next_nr_of_cpus
    traceMaxCPUload[time] = np.max(test_env.monitorState('CPUload'))

    delays.append(test_env.delay)

    time += 1

# --- Constant-CPU benchmark -------------------------------------------------
rewards_benchmark = []
test_env.resetState()
nr_of_CPUs = 21  # best setup according to TorinoScalingConstant.ipynb
while not test_env.stop:
    # Use the fixed benchmark setting here, not the last action sampled by the
    # policy above.
    test_env.evolveState(nrOfCPUs=nr_of_CPUs)
    rewards_benchmark.append(test_env.getReward())

# --- Figures -----------------------------------------------------------------
# NOTE(review): smoothed series are computed for every trace, but only the
# delays figure plots the smoothed version; the others plot the raw data.
# Confirm which one is intended.
smoothed_traceNrOfCPUs = _rolling_mean(traceNrOfCPUs)
_plot_trace(traceNrOfCPUs, 'nr of CPUs')
print(f'average number of CPUs {np.mean(traceNrOfCPUs):.1f}')

smoothed_traceMaxCPUload = _rolling_mean(traceMaxCPUload)
_plot_trace(traceMaxCPUload, 'maximum CPU load')

smoothed_delays = _rolling_mean(delays)
_plot_trace(smoothed_delays, 'Delays')

smoothed_rewards = _rolling_mean(rewards)
_plot_trace(rewards, 'Rewards')

smoothed_rewards_benchmark = _rolling_mean(rewards_benchmark)
_plot_trace(rewards_benchmark, 'Rewards benchmark')