# 🧠 RL Index Optimization Agent

## RL State Definition

In [1]:
import pandas as pd
import numpy as np

# Columns that together make up the observation vector given to the RL agent.
# The order here is the order of the entries in every state array.
state_features = [
    "rows_examined", "joins", "tables_count", "query_length",
    "cpu_usage", "memory_usage", "has_where",
]

# Read the ML-ready dataset from disk (path is relative to the notebook).
df = pd.read_csv("../data/ml_features.csv")

# Preview only the state columns; the last expression is the cell output.
df.loc[:, state_features].head()


Unnamed: 0,rows_examined,joins,tables_count,query_length,cpu_usage,memory_usage,has_where
0,250188,1,2,42,11.3,190.453125,0
1,250188,1,2,42,0.0,190.453125,0
2,250188,1,2,42,0.0,190.453125,0
3,15,2,3,32,0.0,190.453125,1
4,15,2,3,32,0.0,190.453125,1


In [2]:
# Discrete action ids understood by the environment:
#   0 -> leave the indexes untouched
#   1 -> add an index
#   2 -> remove an index
ACTION_DO_NOTHING, ACTION_ADD_INDEX, ACTION_REMOVE_INDEX = range(3)

# Size of the discrete action space (one slot per action id above).
NUM_ACTIONS = len((ACTION_DO_NOTHING, ACTION_ADD_INDEX, ACTION_REMOVE_INDEX))


In [3]:
def compute_reward(query_time, action):
    """Return the reward for running a query after taking ``action``.

    The reward is the negated query time (faster queries -> higher reward),
    minus a fixed penalty that depends on the action:

    * ``ACTION_ADD_INDEX``    -> 0.05, to discourage excessive indexes
    * ``ACTION_REMOVE_INDEX`` -> 0.02, to discourage blind removal
    * any other action        -> no penalty

    NOTE(review): the penalties are absolute, while the simulated gain in
    ``IndexOptimizationEnv.step`` is proportional to the query time (at most
    30% of it). For small query times adding an index cannot pay off;
    confirm that this is intended.
    """
    if action == ACTION_ADD_INDEX:
        penalty = 0.05
    elif action == ACTION_REMOVE_INDEX:
        penalty = 0.02
    else:
        # No index change: the reward is just the negated latency.
        return -query_time

    return -query_time - penalty


In [8]:
import gymnasium as gym
from gymnasium import spaces

class IndexOptimizationEnv(gym.Env):
    """Single-step Gymnasium environment for picking an index action per query.

    Each episode samples one row of ``df`` at random. The observation is that
    row's ``state_features`` as a float32 vector, the agent picks one of
    ``NUM_ACTIONS`` discrete actions, and the episode terminates immediately
    with the reward computed by ``compute_reward``.

    All randomness goes through ``self.np_random`` so that
    ``env.reset(seed=...)`` makes the episode sequence reproducible.
    """

    def __init__(self, df):
        """Store the query dataset and declare observation/action spaces.

        Args:
            df: DataFrame holding every column in ``state_features`` plus a
                ``query_time`` column.

        Raises:
            ValueError: if ``df`` is empty or lacks a required column.
        """
        super().__init__()

        # Fail early with a clear message instead of a KeyError mid-training.
        required_columns = list(state_features) + ["query_time"]
        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            raise ValueError(f"df is missing required columns: {missing}")
        if len(df) == 0:
            raise ValueError("df must contain at least one row")

        # A clean 0..n-1 index lets ``current_step`` be used as a row label.
        self.df = df.reset_index(drop=True)
        self.current_step = 0

        # Observation space: one unbounded float32 entry per state feature.
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(len(state_features),),
            dtype=np.float32
        )

        # Action space: do nothing / add index / remove index.
        self.action_space = spaces.Discrete(NUM_ACTIONS)

    def _get_obs(self):
        """Return the current row's state features as a float32 array."""
        return self.df.loc[self.current_step, state_features].to_numpy(
            dtype=np.float32
        )

    def reset(self, seed=None, options=None):
        """Sample a random query and return ``(observation, info)``.

        Args:
            seed: optional seed for the environment's random generator.
            options: accepted for Gymnasium API compatibility; unused.
        """
        # Seeds ``self.np_random`` when a seed is supplied.
        super().reset(seed=seed)

        # Upper bound is exclusive, so every row label 0..n-1 can be drawn.
        self.current_step = int(self.np_random.integers(0, len(self.df)))
        return self._get_obs(), {}

    def step(self, action):
        """Apply ``action`` to the current query and end the episode.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``terminated`` is always True (one decision per episode) and the
            observation is the same row that ``reset`` returned.
        """
        base_time = self.df.loc[self.current_step, "query_time"]

        # Simulate index effect: adding an index speeds the query up by
        # 10-30%, removing one slows it down by 0-20%.
        if action == ACTION_ADD_INDEX:
            query_time = base_time * self.np_random.uniform(0.7, 0.9)
        elif action == ACTION_REMOVE_INDEX:
            query_time = base_time * self.np_random.uniform(1.0, 1.2)
        else:
            query_time = base_time

        # Hand back a plain Python float rather than a NumPy scalar.
        reward = float(compute_reward(query_time, action))

        terminated = True
        truncated = False

        # Cast to float32 so the observation matches ``observation_space``
        # and what ``reset`` returns.
        return self._get_obs(), reward, terminated, truncated, {}


In [15]:
from stable_baselines3 import PPO

# Training configuration, named so the run is reproducible and easy to tune.
SEED = 42
TOTAL_TIMESTEPS = 1_000_000

env = IndexOptimizationEnv(df)

# PPO with the default MLP policy; ``seed`` fixes the policy initialisation
# and the sampling performed during rollouts.
model = PPO(
    policy="MlpPolicy",
    env=env,
    seed=SEED,
    verbose=1,
)

# The trailing semicolon keeps the returned model's repr out of the output.
model.learn(total_timesteps=TOTAL_TIMESTEPS);


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -0.0592  |
| time/              |          |
|    fps             | 508      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | -0.0689     |
| time/                   |             |
|    fps                  | 421         |
|    iterations           | 2           |
|    time_elapsed         | 9           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.015242183 |
|    clip_fraction        | 0.121       |
|    clip_range           | 0.2         |
|    entropy_loss   

<stable_baselines3.ppo.ppo.PPO at 0x26d31850080>

In [16]:
# Sanity check: ask the trained policy for its action on one sampled query.
state, _ = env.reset()

# deterministic=True returns the policy's preferred action instead of a
# random sample, matching the evaluation loop in the next cell.
action, _ = model.predict(state, deterministic=True)

print("Chosen action:", action)

Chosen action: 0


In [17]:
# Number of independently sampled queries to evaluate the policy on.
N_EVAL_EPISODES = 200

# One counter per action id, derived from NUM_ACTIONS so the tally stays
# correct if the action set changes.
action_counts = {action_id: 0 for action_id in range(NUM_ACTIONS)}

for _episode in range(N_EVAL_EPISODES):
    state, _info = env.reset()
    # Deterministic prediction: tally the policy's preferred action only.
    action, _hidden_state = model.predict(state, deterministic=True)
    action_counts[int(action)] += 1

action_counts


{0: 200, 1: 0, 2: 0}