# Volt-VAR Optimization with Reinforcement Learning

From the [Sisyphean Gridworks ML Playground](https://sgridworks.com/ml-playground/guides/06-volt-var-optimization.html)

## Setup

Clone the repository and install dependencies. Run this cell first.

In [None]:
# Fetch the repository that provides the demo datasets. stderr is discarded and
# ANY clone failure prints 'Already cloned' -- so a network error looks the same
# as re-running the cell with the directory already present.
!git clone https://github.com/SGridworks/Dynamic-Network-Model.git 2>/dev/null || echo 'Already cloned'
# Work from the repository root; the later `demo_data` import is resolved from here.
%cd Dynamic-Network-Model
# Quiet install (-q) of the packages this notebook lists as dependencies.
!pip install -q pandas numpy matplotlib seaborn scikit-learn xgboost lightgbm pyarrow

## Analyze the Voltage Profile

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load SP&L datasets
from demo_data.load_demo_data import (
    load_load_profiles, load_network_edges, load_network_nodes
)

# Pull the three SP&L demo tables into memory
load_profiles = load_load_profiles()
edges = load_network_edges()
nodes = load_network_nodes()

# Keep only the rows belonging to feeder FDR-0001 (as an independent copy)
is_target_feeder = load_profiles["feeder_id"].eq("FDR-0001")
feeder_load = load_profiles.loc[is_target_feeder].copy()

# voltage_pu is already a column of the load profiles, so it can be plotted
# directly. The first 96 rows are treated as one day (96 x 15 min = 24 h).
one_day = feeder_load.iloc[:96]

fig, ax = plt.subplots(figsize=(12, 5))
interval_index = range(len(one_day))
ax.plot(interval_index, one_day["voltage_pu"], "o-", color="#5FCCDB", markersize=4)

# Dashed red lines at both limits, with the allowed band shaded between them
for limit, limit_label in ((1.05, "Upper limit"), (0.95, "Lower limit")):
    ax.axhline(y=limit, color="red", linestyle="--", alpha=0.7, label=limit_label)
ax.axhspan(0.95, 1.05, alpha=0.1, color="green", label="ANSI range")

ax.set(
    xlabel="15-Minute Interval",
    ylabel="Voltage (p.u.)",
    title="Voltage Profile — Feeder FDR-0001 (One Day)",
)
ax.legend()
fig.tight_layout()
plt.show()

## Build a Rule-Based Controller

Before using ML, build a simple rule: "If the voltage at the end of the feeder drops below 0.97 p.u., switch on a capacitor bank. If it rises above 1.03 p.u., switch it off."

In [None]:
def rule_based_vvo(voltage_pu, cap_on):
    """Simple rule-based capacitor control.

    Switches the capacitor bank ON when the voltage is below 0.97 p.u. and
    OFF when it is above 1.03 p.u.; anywhere in between the current state is
    kept, so the bank does not toggle on small voltage changes.

    Args:
        voltage_pu: Voltage in per-unit seen by the controller.
        cap_on: True if the capacitor bank is currently switched on.

    Returns:
        The capacitor state to apply next (True = on, False = off).
    """
    if voltage_pu < 0.97 and not cap_on:
        return True   # switch ON to boost voltage
    if voltage_pu > 1.03 and cap_on:
        return False  # switch OFF to reduce voltage
    return cap_on     # inside the band (or already in the right state): hold

# Replay the first 96 rows (24 hours at 15-minute resolution) of the feeder's
# voltage_pu column and let the rule pick the capacitor state at every step.
day_data = feeder_load.head(96).copy()

cap_on = False
rule_results = []

for step, measured in enumerate(day_data["voltage_pu"]):
    # A switched-in capacitor is modelled as a flat +0.02 p.u. boost
    seen_voltage = measured + 0.02 if cap_on else measured

    # The rule reacts to the boosted voltage; its decision is logged alongside it
    cap_on = rule_based_vvo(seen_voltage, cap_on)
    rule_results.append(
        {"interval": step, "voltage_pu": seen_voltage, "cap_on": cap_on}
    )

rule_df = pd.DataFrame(rule_results)
print(rule_df.head(24))

## What is Reinforcement Learning?

The rule-based controller works, but it requires you to hand-pick the voltage thresholds and logic. What if the agent could learn the best strategy by itself? That's what reinforcement learning (RL) does.

In [None]:
class VoltVAREnv:
    """Simple Volt-VAR environment for Q-learning.

    The environment replays the ``voltage_pu`` column of ``load_data`` one row
    per step. The only control is a single capacitor bank, modelled as a flat
    +0.02 p.u. boost while it is on.

    State: ``(voltage_bucket, cap_on)`` where ``voltage_bucket`` is 0-4 and
    ``cap_on`` is 0 or 1.
    Action: 0 = capacitor off, 1 = capacitor on.
    """

    def __init__(self, load_data):
        """Store the data to replay.

        Args:
            load_data: DataFrame with a ``voltage_pu`` column. Its index is
                reset so rows can be addressed by step number.
        """
        self.load_data = load_data.reset_index(drop=True)
        self.step_idx = 0
        self.cap_on = False

    def reset(self):
        """Rewind to the first row with the capacitor off; return the state."""
        self.step_idx = 0
        self.cap_on = False
        return self._get_state()

    def _get_voltage(self):
        """Return the voltage at the current step, including the capacitor boost."""
        # The modulo wraps the index so the state returned after the final
        # step (step_idx == len(load_data)) is read from the first row
        # instead of raising IndexError.
        row = self.load_data.iloc[self.step_idx % len(self.load_data)]
        v = row["voltage_pu"]
        if self.cap_on:
            v += 0.02  # capacitor boost
        return v

    def _get_state(self):
        """Discretize the current voltage into one of five buckets.

        Buckets: 0 = below 0.95, 1 = [0.95, 0.97), 2 = [0.97, 1.00),
        3 = [1.00, 1.03), 4 = 1.03 and above.

        Returns:
            Tuple ``(bucket, cap_on)`` usable directly as Q-table indices.
        """
        v = self._get_voltage()
        if v < 0.95:
            bucket = 0
        elif v < 0.97:
            bucket = 1
        elif v < 1.00:
            bucket = 2
        elif v < 1.03:
            bucket = 3
        else:
            bucket = 4
        return (bucket, int(self.cap_on))

    def step(self, action):
        """Apply ``action`` at the current row and advance one step.

        Args:
            action: 0 (capacitor off) or 1 (capacitor on); any truthy value
                switches the capacitor on.

        Returns:
            Tuple ``(next_state, reward, done, info)``. ``reward`` is +1.0 when
            the resulting voltage is within [0.95, 1.05] and -10.0 otherwise,
            plus a bonus of ``max(0, 1 - 20 * |v - 1.0|)`` that is largest at
            exactly 1.0 p.u. ``done`` is True once every row has been used.
            ``info["voltage"]`` is the voltage the reward was computed from.
        """
        self.cap_on = bool(action)
        voltage = self._get_voltage()

        if 0.95 <= voltage <= 1.05:
            reward = 1.0
        else:
            reward = -10.0
        # Shaping term: rewards staying close to nominal, fades to 0 at +/-0.05
        reward += max(0, 1 - abs(voltage - 1.0) * 20)

        self.step_idx += 1
        done = self.step_idx >= len(self.load_data)
        return self._get_state(), reward, done, {"voltage": voltage}

## Define the RL Environment

Now let's set up a reinforcement learning environment. The agent observes the current voltage and decides whether to switch the capacitor on or off. It gets a positive reward when voltage is within the ANSI range and a negative reward (penalty) when it's outside.

In [None]:
# Q-table: a lookup table with one entry per (state, action) pair
# Dimensions: 5 voltage buckets x 2 cap states x 2 possible actions
# Initialized to zeros — the agent starts with no knowledge
q_table = np.zeros((5, 2, 2))

# Hyperparameters — these control how the agent learns
alpha = 0.1      # learning rate: how much to update Q-values each step
gamma = 0.95     # discount factor: how much to value future vs. immediate rewards
epsilon = 1.0    # exploration rate: start with 100% random actions
epsilon_min = 0.05   # never stop exploring completely (5% random)
epsilon_decay = 0.995 # multiply epsilon by this after each episode
n_episodes = 100  # number of training runs through the day's data
# NOTE: with these values epsilon only falls to about 0.995**100 ≈ 0.61 by the
# last episode, so the epsilon_min floor is never reached and most training
# actions remain random. Raise n_episodes or lower epsilon_decay to exploit more.

env = VoltVAREnv(day_data)
episode_rewards = []

for ep in range(n_episodes):
    state = env.reset()  # start a new episode (back to the first interval)
    total_reward = 0

    while True:
        # Epsilon-greedy: explore randomly or exploit best-known action
        if np.random.random() < epsilon:
            action = np.random.randint(2)  # random: 0=cap off, 1=cap on
        else:
            action = np.argmax(q_table[state[0], state[1]])  # pick best action from Q-table

        # Take the action and observe what happens
        next_state, reward, done, info = env.step(action)

        # THE Q-LEARNING UPDATE EQUATION:
        # Q(s,a) = Q(s,a) + α * [reward + γ * max_a' Q(s',a') - Q(s,a)]
        #
        # In plain English: adjust the old estimate toward the
        # actual reward received PLUS the discounted value of the
        # best action in the next state.
        old_q = q_table[state[0], state[1], action]         # current Q-value estimate
        best_next = np.max(q_table[next_state[0], next_state[1]]) # best future value
        q_table[state[0], state[1], action] = old_q + alpha * (
            reward + gamma * best_next - old_q  # temporal difference error
        )

        total_reward += reward
        state = next_state

        if done:
            break

    # Decay epsilon: explore less as the agent learns more
    epsilon = max(epsilon_min, epsilon * epsilon_decay)
    episode_rewards.append(total_reward)

    # Progress report every 20 episodes
    if (ep + 1) % 20 == 0:
        print(f"Episode {ep+1:>3}/{n_episodes}  "
              f"Reward: {total_reward:.1f}  Epsilon: {epsilon:.3f}")

## Train the Q-Learning Agent

In [None]:
# Training curve: raw per-episode reward plus a 10-episode moving average
fig, ax = plt.subplots(figsize=(10, 4))
smoothed_rewards = pd.Series(episode_rewards).rolling(10).mean()
ax.plot(episode_rewards, color="#5FCCDB", alpha=0.6)
ax.plot(smoothed_rewards, color="#1C4855", linewidth=2, label="10-episode avg")
ax.set(xlabel="Episode", ylabel="Total Reward", title="Q-Learning Training Progress")
ax.legend()
fig.tight_layout()
plt.show()

# Greedy rollout: replay the same day, always taking the highest-valued action
env = VoltVAREnv(day_data)
state = env.reset()
rl_results = []

done = False
while not done:
    action = np.argmax(q_table[state[0], state[1]])
    state, reward, done, info = env.step(action)
    # step() has already advanced the counter, so the row just used is step_idx - 1
    rl_results.append(
        {
            "interval": env.step_idx - 1,
            "voltage_pu": info["voltage"],
            "cap_on": bool(action),
        }
    )

rl_df = pd.DataFrame(rl_results)

# Two panels sharing a y-axis: rule-based on the left, Q-learning on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5), sharey=True)

panels = (
    (ax1, rule_df, "Rule-Based Controller", "#2D6A7A"),
    (ax2, rl_df, "Q-Learning Controller", "#5FCCDB"),
)
for axis, frame, panel_title, line_color in panels:
    axis.plot(frame["interval"], frame["voltage_pu"], "o-", color=line_color)
    axis.axhspan(0.95, 1.05, alpha=0.1, color="green")
    axis.set_title(panel_title)
    axis.set_xlabel("15-Minute Interval")
# Only the left panel carries a y-label; the y-axis is shared
ax1.set_ylabel("Voltage (p.u.)")

fig.suptitle("VVO Controller Comparison")
fig.tight_layout()
plt.show()

## Test and Compare Both Approaches

## What You Built and Next Steps