In [None]:
pip install stable-baselines3[extra] gym shimmy numpy

Collecting shimmy
  Downloading Shimmy-2.0.0-py3-none-any.whl.metadata (3.5 kB)
Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.6.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3

In [3]:
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO

# Define RL Environment for Eigenvalue Problem
class EigenvalueEnv(gym.Env):
    def __init__(self, A):
        super(EigenvalueEnv, self).__init__()
        self.A = A
        self.dim = A.shape[0]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.dim,), dtype=np.float32)
        self.action_space = spaces.Box(low=-1, high=1, shape=(self.dim,), dtype=np.float32)
        self.state = None

    def reset(self):
        self.state = np.random.randn(self.dim)
        self.state /= np.linalg.norm(self.state)
        return self.state

    def step(self, action):
        action = np.array(action)
        action /= np.linalg.norm(action) + 1e-8
        reward = -self.rayleigh_quotient(action)
        self.state = action
        done = False
        return self.state, reward, done, {}

    def rayleigh_quotient(self, x):
        return np.dot(x, self.A @ x) / (np.dot(x, x) + 1e-8)

# Test Matrix (Symmetric)
np.random.seed(42)
A = np.random.randn(6, 6)
A = (A + A.T) / 2

# Initialize and Train the RL agent
env = EigenvalueEnv(A)
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

# Evaluate Trained Model
def evaluate_model(model, env, num_steps=100):
    obs = env.reset()
    best_val = np.inf
    best_vec = None

    for _ in range(num_steps):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        val = -reward
        if val < best_val:
            best_val = val
            best_vec = obs

    print("Approximated smallest eigenvalue:", best_val)
    print("Eigenvector:", best_vec)

# Run Evaluation
evaluate_model(model, env)

# Verify using numpy built-in solver for comparison
eigvals, eigvecs = np.linalg.eigh(A)
print("True smallest eigenvalue (numpy):", eigvals[0])


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




-----------------------------
| time/              |      |
|    fps             | 1341 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 801         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011145031 |
|    clip_fraction        | 0.0822      |
|    clip_range           | 0.2         |
|    entropy_loss         | -8.5        |
|    explained_variance   | 0.0019      |
|    learning_rate        | 0.0003      |
|    loss                 | 4.63        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0125     |
|    std                  | 0.992       |
|    value_loss           | 31.8        |
----------------------------------

Code to compute largest eigenvalue of a matrix

In [4]:
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO

# Define RL Environment for Eigenvalue Problem (largest eigenvalue)
class EigenvalueEnv(gym.Env):
    def __init__(self, A):
        super(EigenvalueEnv, self).__init__()
        self.A = A
        self.dim = A.shape[0]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.dim,), dtype=np.float32)
        self.action_space = spaces.Box(low=-1, high=1, shape=(self.dim,), dtype=np.float32)
        self.state = None

    def reset(self):
        self.state = np.random.randn(self.dim)
        self.state /= np.linalg.norm(self.state)
        return self.state

    def step(self, action):
        action = np.array(action)
        action /= np.linalg.norm(action) + 1e-8
        reward = self.rayleigh_quotient(action)
        self.state = action
        done = False
        return self.state, reward, done, {}

    def rayleigh_quotient(self, x):
        return np.dot(x, self.A @ x) / (np.dot(x, x) + 1e-8)

# Test Matrix (Symmetric)
np.random.seed(42)
A = np.random.randn(5, 5)
A = (A + A.T) / 2

# Initialize and Train the RL agent
env = EigenvalueEnv(A)
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

# Evaluate Trained Model
def evaluate_model(model, env, num_steps=100):
    obs = env.reset()
    best_val = -np.inf
    best_vec = None

    for _ in range(num_steps):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        val = reward
        if val > best_val:
            best_val = val
            best_vec = obs

    print("Approximated largest eigenvalue:", best_val)
    print("Eigenvector:", best_vec)

# Run Evaluation
evaluate_model(model, env)

# Verify using numpy built-in solver for comparison
eigvals, eigvecs = np.linalg.eigh(A)
print("True largest eigenvalue (numpy):", eigvals[-1])


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




-----------------------------
| time/              |      |
|    fps             | 1433 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1019        |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011128096 |
|    clip_fraction        | 0.0821      |
|    clip_range           | 0.2         |
|    entropy_loss         | -7.09       |
|    explained_variance   | -0.00538    |
|    learning_rate        | 0.0003      |
|    loss                 | 2.86        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0119     |
|    std                  | 1           |
|    value_loss           | 8.15        |
----------------------------------

Now we will extend this for large sparse matrix.

In [None]:
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from scipy.sparse import rand
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import eigsh

# Define RL Environment for Sparse Eigenvalue Problem (smallest eigenvalue)
class SparseEigenvalueEnv(gym.Env):
    def __init__(self, A):
        self.A = A
        self.dim = A.shape[0]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.dim,), dtype=np.float32)
        self.action_space = spaces.Box(low=-1, high=1, shape=(self.dim,), dtype=np.float32)
        self.state = None

    def reset(self):
        self.state = np.random.randn(self.dim)
        self.state /= np.linalg.norm(self.state)
        return self.state

    def step(self, action):
        action = np.array(action)
        action /= np.linalg.norm(action)
        self.state = action
        reward = -self.rayleigh_quotient(action)
        done = False
        return self.state, reward, done, {}

    def rayleigh_quotient(self, x):
        return (x @ self.A.dot(x)) / (np.dot(x, x) + 1e-8)

# Create a large sparse symmetric matrix
from scipy.sparse import random as sparse_random
from scipy.sparse import csr_matrix
np.random.seed(42)
size = 1000  # Large sparse matrix
A_sparse = sparse_random(size, size, density=0.01, format='csr')
A_sparse = (A_sparse + A_sparse.T) * 0.5  # make symmetric

# Create environment
env = SparseEigenvalueEnv(A_sparse)

# Train RL model
model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=5000)

# Evaluate Trained Model
def evaluate_model(model, env, num_steps=100):
    obs = env.reset()
    best_val = np.inf
    best_vec = None

    for _ in range(num_steps):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        val = -reward
        if val < best_val:
            best_val, best_vec = val, obs

    print("Computed smallest eigenvalue (RL):", best_val)

# Run Evaluation
env = SparseEigenvalueEnv(A_sparse)
model = PPO('MlpPolicy', env, verbose=0)
model.learn(total_timesteps=20000)

evaluate_model(model, env)

# Verify using scipy eigsh solver for sparse matrices
from scipy.sparse.linalg import eigsh
true_val, true_vec = eigsh(A_sparse, k=1, which='SA')
print("True smallest eigenvalue (scipy):", true_val[0])


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 325  |
|    iterations      | 1    |
|    time_elapsed    | 6    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 347        |
|    iterations           | 2          |
|    time_elapsed         | 11         |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.40022254 |
|    clip_fraction        | 0.768      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.42e+03  |
|    explained_variance   | -0.182     |
|    learning_rate        | 0.0003     |
|    loss                 | -0.135     |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.135     |
|    std                  | 1          |