## Purpose: 
Demonstration of how an RL policy could learn intraday heuristics. 
This is a toy Q-learning demo with discrete actions {charge, idle, discharge} and a simple reward. 
I have used this to explain the RL pipeline â€” environment, reward shaping, domain randomization, shadow-trading & safety.

In [1]:
# model_d_qlearn.py
# importing libraries
import numpy as np
import random
from collections import defaultdict

In [2]:
class SimpleBatteryEnv:
    def __init__(self, price_series, P=6.0, E_max=24.0, eta=0.95):
        self.price = price_series
        self.T = len(price_series)
        self.P = P
        self.E_max = E_max
        self.eta = eta
        self.reset()
    def reset(self):
        self.t = 0
        self.soc = 0.5 * self.E_max
        return (self.t, self.soc)
    def step(self, action):
        # action: 0=charge,1=idle,2=discharge
        price = self.price[self.t]
        if action == 0:
            # charge P for one timestep (1h)
            charged = min(self.P, self.E_max - self.soc)
            self.soc += charged * self.eta
            reward = - price * charged  # pay to charge
        elif action == 2:
            discharged = min(self.P, self.soc)
            self.soc -= discharged / self.eta
            reward = price * discharged
        else:
            reward = 0.0
        self.t += 1
        done = (self.t >= self.T)
        obs = (self.t, self.soc)
        return obs, reward, done, {}

In [3]:
# Q-learning
def train(env, episodes=2000, alpha=0.1, gamma=0.99, eps=0.2):
    Q = defaultdict(lambda: np.zeros(3))
    for ep in range(episodes):
        obs = env.reset()
        done = False
        while not done:
            state_key = (int(obs[0]), int(obs[1]//1.0))  # discretize for demo
            if random.random() < eps:
                action = random.randint(0,2)
            else:
                action = int(np.argmax(Q[state_key]))
            next_obs, reward, done, _ = env.step(action)
            next_key = (int(next_obs[0]), int(next_obs[1]//1.0))
            Q[state_key][action] += alpha*(reward + gamma*np.max(Q[next_key]) - Q[state_key][action])
            obs = next_obs
    return Q

In [4]:
# Demo usage
if __name__ == "__main__":
    price = np.concatenate([np.linspace(10,5,12), np.linspace(5,20,12)])  # low then peak
    env = SimpleBatteryEnv(price_series=price)
    Q = train(env, episodes=1000)
    print("Trained Q-sample:", list(Q.items())[:5])


Trained Q-sample: [((0, 12), array([ 86.23950522, 112.60279751, 125.86817184])), ((1, 17), array([-15.52090909,   9.09221411, 153.29883459])), ((2, 23), array([-0.54545455,  0.        ,  0.        ])), ((3, 23), array([-0.51818182,  0.01260193,  0.        ])), ((4, 23), array([-0.02454545,  0.12729227,  9.39158462]))]
