# Welcome to Colab!

In [4]:

from google.colab import files
import os

CSV_PATH = "/content/load_balancing_playground.csv"

if not os.path.exists(CSV_PATH):
    print("Upload load_balancing_playground.csv")
    uploaded = files.upload()
    uploaded_filename = list(uploaded.keys())[0]
    os.rename(uploaded_filename, CSV_PATH)

import numpy as np
import pandas as pd

class LoadBalancingEnv:
    """
    MDP Environment for RL-based Load Balancing
    Each step = assign one task to one CPU
    """

    def __init__(
        self,
        csv_path,
        num_cpus=4,
        lambda_imbalance=0.5,
        mu_sla=2.0,
        episode_length=1000,
    ):
        self.df = pd.read_csv(csv_path)
        self.num_cpus = num_cpus

        self.lambda_imbalance = lambda_imbalance
        self.mu_sla = mu_sla
        self.episode_length = episode_length

        self.current_step = 0
        self.start_index = 0

    def reset(self):
        self.start_index = np.random.randint(
            0, len(self.df) - self.episode_length
        )
        self.current_step = 0
        return self._get_state()

    def step(self, action):
        """
        action: integer in [0, num_cpus - 1]
        """
        row = self.df.iloc[self.start_index + self.current_step]

        completion_time = (
            row[f"cpu_{action}_wait_time"]
            + row["task_size"] / row[f"cpu_{action}_speed"]
        )

        load_variance = row["load_variance"]
        sla_violation = row["sla_violation"]

        reward = (
            -completion_time
            - self.lambda_imbalance * load_variance
            - self.mu_sla * sla_violation
        )

        self.current_step += 1
        done = self.current_step >= self.episode_length

        next_state = self._get_state() if not done else None

        info = {
            "completion_time": completion_time,
            "sla_violation": sla_violation,
            "oracle_cpu": row["chosen_cpu_oracle"],
        }

        return next_state, reward, done, info

    def _get_state(self):
        row = self.df.iloc[self.start_index + self.current_step]

        state = [
            row["task_size"],
            row["priority"],
            row["system_load_avg"],
            row["load_variance"],
        ]

        for i in range(self.num_cpus):
            state.extend([
                row[f"cpu_{i}_queue_len"],
                row[f"cpu_{i}_load"],
                row[f"cpu_{i}_wait_time"],
                row[f"cpu_{i}_speed"],
            ])

        return np.array(state, dtype=np.float32)

    def action_space(self):
        return self.num_cpus

    def state_dim(self):
        return len(self._get_state())




env = LoadBalancingEnv(csv_path=CSV_PATH)

state = env.reset()
print("State dimension:", env.state_dim())
print("Action space:", env.action_space())

total_reward = 0

for step in range(5):
    action = np.random.randint(0, env.action_space())
    next_state, reward, done, info = env.step(action)
    total_reward += reward

    print(
        f"Step {step} | Action: CPU-{action} | "
        f"Reward: {reward:.3f} | "
        f"Oracle CPU: {info['oracle_cpu']}"
    )

print("Total reward (sample):", total_reward)


Upload load_balancing_playground.csv


Saving load_balancing_playground.csv to load_balancing_playground.csv
State dimension: 20
Action space: 4
Step 0 | Action: CPU-2 | Reward: -65.908 | Oracle CPU: 0
Step 1 | Action: CPU-2 | Reward: -21.430 | Oracle CPU: 1
Step 2 | Action: CPU-2 | Reward: -41.351 | Oracle CPU: 2
Step 3 | Action: CPU-3 | Reward: -50.906 | Oracle CPU: 2
Step 4 | Action: CPU-0 | Reward: -8.125 | Oracle CPU: 2
Total reward (sample): -187.71853351143304


In [5]:


import numpy as np


def round_robin_policy(step, num_cpus):
    return step % num_cpus

def lwr_policy(state, num_cpus):
    """
    Choose CPU with minimum estimated work remaining
    """
    idx = 4
    scores = []
    for i in range(num_cpus):
        queue_len = state[idx]
        load = state[idx + 1]
        wait = state[idx + 2]
        speed = state[idx + 3]
        scores.append(wait / speed)
        idx += 4
    return int(np.argmin(scores))

def oracle_policy(info):
    return info["oracle_cpu"]


def evaluate_policy(env, policy_name, episodes=5):
    total_rewards = []
    oracle_matches = []
    sla_violations = []

    for ep in range(episodes):
        state = env.reset()
        done = False
        step = 0

        ep_reward = 0
        ep_oracle_match = 0
        ep_sla = 0
        total_steps = 0

        while not done:
            if policy_name == "RR":
                action = round_robin_policy(step, env.action_space())
            elif policy_name == "LWR":
                action = lwr_policy(state, env.action_space())
            else:
                raise ValueError("Unknown policy")

            next_state, reward, done, info = env.step(action)

            ep_reward += reward
            ep_oracle_match += int(action == info["oracle_cpu"])
            ep_sla += info["sla_violation"]
            total_steps += 1

            state = next_state
            step += 1

        total_rewards.append(ep_reward / total_steps)
        oracle_matches.append(ep_oracle_match / total_steps)
        sla_violations.append(ep_sla / total_steps)

    print(f"\n=== {policy_name} POLICY RESULTS ===")
    print(f"Avg Reward      : {np.mean(total_rewards):.3f}")
    print(f"Oracle Accuracy : {np.mean(oracle_matches)*100:.2f}%")
    print(f"SLA Violation % : {np.mean(sla_violations)*100:.2f}%")


env = LoadBalancingEnv(
    csv_path=CSV_PATH,
    episode_length=1000
)

evaluate_policy(env, "RR")
evaluate_policy(env, "LWR")



=== RR POLICY RESULTS ===
Avg Reward      : -38.513
Oracle Accuracy : 24.78%
SLA Violation % : 33.18%

=== LWR POLICY RESULTS ===
Avg Reward      : -14.916
Oracle Accuracy : 86.00%
SLA Violation % : 33.06%


In [6]:


import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim)
        )

    def forward(self, x):
        return self.net(x)


class ReplayBuffer:
    def __init__(self, capacity=50000):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            torch.tensor(states, dtype=torch.float32).to(device),
            torch.tensor(actions).to(device),
            torch.tensor(rewards, dtype=torch.float32).to(device),
            torch.tensor(next_states, dtype=torch.float32).to(device),
            torch.tensor(dones, dtype=torch.float32).to(device),
        )

    def __len__(self):
        return len(self.buffer)


state_dim = env.state_dim()
action_dim = env.action_space()

policy_net = DQN(state_dim, action_dim).to(device)
target_net = DQN(state_dim, action_dim).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=1e-3)
buffer = ReplayBuffer()

gamma = 0.99
batch_size = 64
epsilon = 1.0
epsilon_min = 0.05
epsilon_decay = 0.995
target_update = 200

episodes = 50
max_steps = 1000

for ep in range(episodes):
    state = env.reset()
    total_reward = 0
    sla_violations = 0
    oracle_matches = 0

    for step in range(max_steps):

        if random.random() < epsilon:
            action = random.randint(0, action_dim - 1)
        else:
            with torch.no_grad():
                q_vals = policy_net(
                    torch.tensor(state, dtype=torch.float32).to(device)
                )
                action = int(torch.argmax(q_vals).item())

        next_state, reward, done, info = env.step(action)

        buffer.push(state, action, reward, next_state, done)

        total_reward += reward
        sla_violations += info["sla_violation"]
        oracle_matches += int(action == info["oracle_cpu"])

        state = next_state
        if done:
            break

        # Learn
        if len(buffer) >= batch_size:
            states, actions, rewards, next_states, dones = buffer.sample(batch_size)

            q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze()
            next_q = target_net(next_states).max(1)[0]
            target = rewards + gamma * next_q * (1 - dones)

            loss = nn.MSELoss()(q_values, target.detach())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if step % target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())

    epsilon = max(epsilon * epsilon_decay, epsilon_min)

    print(
        f"Episode {ep+1:02d} | "
        f"Avg Reward: {total_reward/max_steps:.3f} | "
        f"Oracle Acc: {oracle_matches/max_steps*100:.2f}% | "
        f"SLA %: {sla_violations/max_steps*100:.2f}% | "
        f"Epsilon: {epsilon:.3f}"
    )


  torch.tensor(states, dtype=torch.float32).to(device),


Episode 01 | Avg Reward: -39.165 | Oracle Acc: 24.70% | SLA %: 35.70% | Epsilon: 0.995


TypeError: not a sequence

In [8]:
import numpy as np
import pandas as pd

class LoadBalancingEnv:
    ...



In [9]:
import torch
...


Ellipsis

In [10]:
env = LoadBalancingEnv(CSV_PATH)
...


TypeError: LoadBalancingEnv() takes no arguments

In [11]:

from google.colab import files
import os

CSV_PATH = "/content/load_balancing_playground.csv"

if not os.path.exists(CSV_PATH):
    print("Upload load_balancing_playground.csv")
    uploaded = files.upload()
    uploaded_filename = list(uploaded.keys())[0]
    os.rename(uploaded_filename, CSV_PATH)

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



class LoadBalancingEnv:
    def __init__(
        self,
        csv_path,
        num_cpus=4,
        lambda_imbalance=0.5,
        mu_sla=2.0,
        episode_length=1000,
    ):
        self.df = pd.read_csv(csv_path)
        self.num_cpus = num_cpus
        self.lambda_imbalance = lambda_imbalance
        self.mu_sla = mu_sla
        self.episode_length = episode_length
        self.current_step = 0
        self.start_index = 0

    def reset(self):
        self.start_index = np.random.randint(
            0, len(self.df) - self.episode_length
        )
        self.current_step = 0
        return self._get_state()

    def step(self, action):
        row = self.df.iloc[self.start_index + self.current_step]

        completion_time = (
            row[f"cpu_{action}_wait_time"]
            + row["task_size"] / row[f"cpu_{action}_speed"]
        )

        reward = (
            -completion_time
            - self.lambda_imbalance * row["load_variance"]
            - self.mu_sla * row["sla_violation"]
        )

        self.current_step += 1
        done = self.current_step >= self.episode_length

        next_state = self._get_state() if not done else None

        info = {
            "oracle_cpu": row["chosen_cpu_oracle"],
            "sla_violation": row["sla_violation"],
        }

        return next_state, reward, done, info

    def _get_state(self):
        row = self.df.iloc[self.start_index + self.current_step]
        state = [
            row["task_size"],
            row["priority"],
            row["system_load_avg"],
            row["load_variance"],
        ]
        for i in range(self.num_cpus):
            state.extend([
                row[f"cpu_{i}_queue_len"],
                row[f"cpu_{i}_load"],
                row[f"cpu_{i}_wait_time"],
                row[f"cpu_{i}_speed"],
            ])
        return np.array(state, dtype=np.float32)

    def action_space(self):
        return self.num_cpus

    def state_dim(self):
        return len(self._get_state())


class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim),
        )

    def forward(self, x):
        return self.net(x)

class ReplayBuffer:
    def __init__(self, state_dim, capacity=50000):
        self.buffer = deque(maxlen=capacity)
        self.state_dim = state_dim

    def push(self, state, action, reward, next_state, done):
        if next_state is None:
            next_state = np.zeros(self.state_dim, dtype=np.float32)
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            torch.from_numpy(np.array(states)).float().to(device),
            torch.tensor(actions).long().to(device),
            torch.tensor(rewards).float().to(device),
            torch.from_numpy(np.array(next_states)).float().to(device),
            torch.tensor(dones).float().to(device),
        )

    def __len__(self):
        return len(self.buffer)


env = LoadBalancingEnv(CSV_PATH)
state_dim = env.state_dim()
action_dim = env.action_space()

policy_net = DQN(state_dim, action_dim).to(device)
target_net = DQN(state_dim, action_dim).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=1e-3)
buffer = ReplayBuffer(state_dim)

gamma = 0.99
batch_size = 64
epsilon = 1.0
epsilon_min = 0.05
epsilon_decay = 0.995
target_update = 200

episodes = 50
max_steps = 1000

for ep in range(episodes):
    state = env.reset()
    total_reward = 0
    oracle_hits = 0
    sla = 0

    for step in range(max_steps):
        if random.random() < epsilon:
            action = random.randint(0, action_dim - 1)
        else:
            with torch.no_grad():
                q_vals = policy_net(
                    torch.tensor(state).float().to(device)
                )
                action = int(torch.argmax(q_vals).item())

        next_state, reward, done, info = env.step(action)

        buffer.push(state, action, reward, next_state, done)

        total_reward += reward
        oracle_hits += int(action == info["oracle_cpu"])
        sla += info["sla_violation"]

        state = next_state
        if done:
            break

        if len(buffer) >= batch_size:
            states, actions, rewards, next_states, dones = buffer.sample(batch_size)

            q = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze()
            next_q = target_net(next_states).max(1)[0]
            target = rewards + gamma * next_q * (1 - dones)

            loss = nn.MSELoss()(q, target.detach())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if step % target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())

    epsilon = max(epsilon * epsilon_decay, epsilon_min)

    print(
        f"Episode {ep+1:02d} | "
        f"Avg Reward: {total_reward/max_steps:.3f} | "
        f"Oracle Acc: {oracle_hits/max_steps*100:.2f}% | "
        f"SLA %: {sla/max_steps*100:.2f}% | "
        f"Epsilon: {epsilon:.3f}"
    )


Episode 01 | Avg Reward: -39.366 | Oracle Acc: 24.40% | SLA %: 35.30% | Epsilon: 0.995
Episode 02 | Avg Reward: -37.686 | Oracle Acc: 24.20% | SLA %: 31.30% | Epsilon: 0.990
Episode 03 | Avg Reward: -37.723 | Oracle Acc: 25.80% | SLA %: 33.30% | Epsilon: 0.985
Episode 04 | Avg Reward: -38.565 | Oracle Acc: 26.40% | SLA %: 33.30% | Epsilon: 0.980
Episode 05 | Avg Reward: -39.722 | Oracle Acc: 27.00% | SLA %: 33.10% | Epsilon: 0.975
Episode 06 | Avg Reward: -38.485 | Oracle Acc: 26.40% | SLA %: 35.90% | Epsilon: 0.970
Episode 07 | Avg Reward: -36.832 | Oracle Acc: 28.40% | SLA %: 33.10% | Epsilon: 0.966
Episode 08 | Avg Reward: -37.731 | Oracle Acc: 26.20% | SLA %: 31.80% | Epsilon: 0.961
Episode 09 | Avg Reward: -37.725 | Oracle Acc: 25.90% | SLA %: 34.90% | Epsilon: 0.956
Episode 10 | Avg Reward: -34.406 | Oracle Acc: 31.50% | SLA %: 32.70% | Epsilon: 0.951
Episode 11 | Avg Reward: -39.081 | Oracle Acc: 26.00% | SLA %: 33.20% | Epsilon: 0.946
Episode 12 | Avg Reward: -37.081 | Oracle A

In [13]:

from google.colab import files
import os

CSV_PATH = "/content/load_balancing_playground.csv"
if not os.path.exists(CSV_PATH):
    print("Upload load_balancing_playground.csv")
    uploaded = files.upload()
    os.rename(list(uploaded.keys())[0], CSV_PATH)

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class LoadBalancingEnv:
    def __init__(self, csv_path, num_cpus=4, episode_length=1200):
        self.df = pd.read_csv(csv_path)
        self.num_cpus = num_cpus
        self.episode_length = episode_length

    def reset(self):
        self.start = np.random.randint(0, len(self.df) - self.episode_length)
        self.step_id = 0
        return self._state()

    def step(self, action):
        row = self.df.iloc[self.start + self.step_id]

        completion = row[f"cpu_{action}_wait_time"] + row["task_size"] / row[f"cpu_{action}_speed"]

        reward = (
            -completion
            - 0.2 * row["load_variance"]
            - 12.0 * row["sla_violation"]
        )

        self.step_id += 1
        done = self.step_id >= self.episode_length
        next_state = None if done else self._state()

        info = {
            "oracle_cpu": row["chosen_cpu_oracle"],
            "sla_violation": row["sla_violation"]
        }

        return next_state, reward, done, info

    def _state(self):
        r = self.df.iloc[self.start + self.step_id]
        s = [r["task_size"], r["priority"], r["system_load_avg"], r["load_variance"]]
        for i in range(self.num_cpus):
            s.extend([
                r[f"cpu_{i}_queue_len"],
                r[f"cpu_{i}_wait_time"],
                r[f"cpu_{i}_speed"]
            ])
        return np.array(s, dtype=np.float32)

    def action_space(self):
        return self.num_cpus

    def state_dim(self):
        return len(self._state())


class DQN(nn.Module):
    def __init__(self, s_dim, a_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(s_dim, 96),
            nn.ReLU(),
            nn.Linear(96, 96),
            nn.ReLU(),
            nn.Linear(96, a_dim)
        )
    def forward(self, x):
        return self.net(x)

class ReplayBuffer:
    def __init__(self, s_dim, cap=30000):
        self.buf = deque(maxlen=cap)
        self.s_dim = s_dim

    def push(self, s, a, r, ns, d):
        if ns is None:
            ns = np.zeros(self.s_dim, dtype=np.float32)
        self.buf.append((s, a, r, ns, d))

    def sample(self, n):
        b = random.sample(self.buf, n)
        s,a,r,ns,d = zip(*b)
        return (
            torch.tensor(s).float().to(device),
            torch.tensor(a).long().to(device),
            torch.tensor(r).float().to(device),
            torch.tensor(ns).float().to(device),
            torch.tensor(d).float().to(device)
        )

    def __len__(self):
        return len(self.buf)


env = LoadBalancingEnv(CSV_PATH)
_ = env.reset()                 
sd, ad = env.state_dim(), env.action_space()

policy = DQN(sd, ad).to(device)
target = DQN(sd, ad).to(device)
target.load_state_dict(policy.state_dict())

opt = optim.Adam(policy.parameters(), lr=1e-3)
buf = ReplayBuffer(sd)

gamma = 0.99
eps, eps_min, eps_decay = 1.0, 0.05, 0.97
batch = 64
episodes = 60
target_update = 100

for ep in range(episodes):
    s = env.reset()
    R, sla, oracle = 0, 0, 0

    for t in range(env.episode_length):
        if random.random() < eps:
            a = random.randint(0, ad-1)
        else:
            with torch.no_grad():
                a = policy(torch.tensor(s).to(device)).argmax().item()

        ns, r, d, info = env.step(a)
        buf.push(s, a, r, ns, d)

        R += r
        sla += info["sla_violation"]
        oracle += int(a == info["oracle_cpu"])
        s = ns

        if len(buf) > batch:
            S,A,Rw,NS,D = buf.sample(batch)
            q = policy(S).gather(1, A.unsqueeze(1)).squeeze()
            tq = target(NS).max(1)[0]
            y = Rw + gamma * tq * (1-D)
            loss = nn.MSELoss()(q, y.detach())
            opt.zero_grad()
            loss.backward()
            opt.step()

        if t % target_update == 0:
            target.load_state_dict(policy.state_dict())

        if d:
            break

    eps = max(eps*eps_decay, eps_min)

    print(
        f"Ep {ep+1:02d} | "
        f"Reward: {R/env.episode_length:.2f} | "
        f"Oracle: {oracle/env.episode_length*100:.1f}% | "
        f"SLA: {sla/env.episode_length*100:.1f}% | "
        f"Eps: {eps:.2f}"
    )


Ep 01 | Reward: -41.55 | Oracle: 25.8% | SLA: 34.1% | Eps: 0.97
Ep 02 | Reward: -41.21 | Oracle: 26.1% | SLA: 34.1% | Eps: 0.94
Ep 03 | Reward: -39.28 | Oracle: 25.3% | SLA: 31.4% | Eps: 0.91
Ep 04 | Reward: -39.41 | Oracle: 30.1% | SLA: 33.2% | Eps: 0.89
Ep 05 | Reward: -38.50 | Oracle: 31.5% | SLA: 33.8% | Eps: 0.86
Ep 06 | Reward: -38.95 | Oracle: 33.5% | SLA: 33.9% | Eps: 0.83
Ep 07 | Reward: -37.50 | Oracle: 35.8% | SLA: 33.7% | Eps: 0.81
Ep 08 | Reward: -39.21 | Oracle: 33.5% | SLA: 33.4% | Eps: 0.78
Ep 09 | Reward: -36.44 | Oracle: 36.2% | SLA: 33.5% | Eps: 0.76
Ep 10 | Reward: -36.84 | Oracle: 36.9% | SLA: 34.8% | Eps: 0.74
Ep 11 | Reward: -35.53 | Oracle: 39.6% | SLA: 32.8% | Eps: 0.72
Ep 12 | Reward: -34.66 | Oracle: 41.2% | SLA: 33.1% | Eps: 0.69
Ep 13 | Reward: -33.88 | Oracle: 41.8% | SLA: 32.8% | Eps: 0.67
Ep 14 | Reward: -33.44 | Oracle: 41.5% | SLA: 31.8% | Eps: 0.65
Ep 15 | Reward: -33.40 | Oracle: 44.8% | SLA: 33.4% | Eps: 0.63
Ep 16 | Reward: -33.26 | Oracle: 44.8% |