In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.distributions import MultivariateNormal
from collections import deque
import random
import matplotlib.pyplot as plt

# class Policy(nn.Module):
#     def __init__(self, dim):
#         super().__init__()
#         # Deep policy network
#         self.shared_net = nn.Sequential(
#             nn.Linear(dim, 256),
#             nn.ReLU(),
#             nn.Linear(256, 128),
#             nn.ReLU()
#         )
#         self.fc_mean = nn.Linear(128, dim)

#         # Simpler exploration - single learnable std
#         self.log_std = nn.Parameter(torch.zeros(1))

#         # Value function baseline
#         self.value_net = nn.Sequential(
#             nn.Linear(128, 64),
#             nn.ReLU(),
#             nn.Linear(64, 1)
#         )

#     def forward(self, x):
#         features = self.shared_net(x)
#         mean = self.fc_mean(features)
#         # Scalar covariance for all dimensions
#         cov = torch.eye(x.size(-1)) * torch.exp(self.log_std)
#         return MultivariateNormal(mean, cov), self.value_net(features)

class Policy(nn.Module):
    """Gaussian policy with a shared feature trunk and a value head.

    ``forward`` maps an input of trailing size ``dim`` to:
      * a ``MultivariateNormal`` over ``dim``-sized actions whose mean comes
        from ``fc_mean`` and whose covariance is ``exp(log_std) * I``;
      * a value estimate of trailing size 1 from ``value_net``.

    Note: despite its name, ``exp(log_std)`` multiplies the *covariance*
    (not the standard deviation), so it acts as a shared log-variance.
    """

    def __init__(self, dim):
        super().__init__()
        # Shared feature extractor feeding both the mean head and the value head.
        self.shared_net = nn.Sequential(
            nn.Linear(dim, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU()
        )

        # Mean of the action distribution.
        self.fc_mean = nn.Linear(128, dim)

        # Single scalar shared by every action dimension (see class docstring).
        self.log_std = nn.Parameter(torch.zeros(1))

        # Value head operating on the shared features.
        self.value_net = nn.Sequential(
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        """Return ``(action_distribution, value_estimate)`` for input ``x``.

        ``x`` may be a single vector of size ``dim`` or a batch ``(..., dim)``;
        the same isotropic covariance is broadcast over any batch dimensions.
        """
        features = self.shared_net(x)
        mean = self.fc_mean(features)
        # Build the identity on the same device/dtype as the mean so the module
        # keeps working after ``.to(device)`` / dtype changes.
        identity = torch.eye(x.size(-1), device=mean.device, dtype=mean.dtype)
        cov = identity * torch.exp(self.log_std)
        return MultivariateNormal(mean, cov), self.value_net(features)


class ExperienceReplay:
    """Bounded FIFO store of experiences with uniform random sampling.

    Once ``capacity`` items are held, every new ``push`` evicts the oldest one.
    """

    def __init__(self, capacity):
        # deque(maxlen=...) handles eviction of the oldest entry automatically.
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.buffer)

    def push(self, experience):
        """Append one experience (any object) to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return up to ``batch_size`` distinct stored experiences, chosen at random."""
        stored = len(self.buffer)
        draw_count = batch_size if batch_size < stored else stored
        return random.sample(self.buffer, draw_count)

def compute_reward(v_new, v_prev, A, lambda_):
    """Score a candidate vector ``v_new`` against the target eigenvalue ``lambda_``.

    The reward is the sum of three terms:
      1. ``-|R(v_new) - lambda_|`` where ``R`` is the Rayleigh quotient
         ``(v^T A v) / (v^T v)``;
      2. ``0.3 * cos(v_new, v_prev)`` (direction consistency);
      3. ``0.2 * (|R(v_prev) - lambda_| - |R(v_new) - lambda_|)`` (improvement
         over the previous vector).

    Args:
        v_new: 1-D tensor, the candidate vector.
        v_prev: 1-D tensor, the vector the candidate was derived from.
        A: square 2-D tensor.
        lambda_: target eigenvalue (Python/NumPy scalar or 0-dim tensor).

    Returns:
        ``(reward, residual, cosine_sim)`` as Python floats, where ``residual``
        is ``||A v_new - lambda_ v_new||`` and ``cosine_sim`` is the cosine
        similarity between ``v_new`` and ``v_prev``.
    """
    # Nothing here is differentiated through, so keep autograd out entirely.
    with torch.no_grad():
        # A @ v_new is needed for both the Rayleigh quotient and the residual.
        Av_new = A @ v_new
        R = (v_new @ Av_new) / (v_new @ v_new)
        prev_R = (v_prev @ A @ v_prev) / (v_prev @ v_prev)

        new_error = torch.abs(R - lambda_).item()
        prev_error = torch.abs(prev_R - lambda_).item()

        cosine_sim = torch.nn.functional.cosine_similarity(
            v_new.unsqueeze(0), v_prev.unsqueeze(0)
        ).item()

        # (A - lambda*I) v == A v - lambda v; avoids allocating a dim x dim
        # identity matrix on every call.
        residual = torch.linalg.norm(Av_new - v_new * lambda_).item()

    # 1. Rayleigh quotient component (closer to the target eigenvalue is better)
    rayleigh_component = -new_error
    # 2. Direction consistency component
    direction_component = 0.3 * cosine_sim
    # 3. Improvement component (relative to the previous Rayleigh quotient)
    improvement_component = 0.2 * (prev_error - new_error)

    reward = rayleigh_component + direction_component + improvement_component
    return reward, residual, cosine_sim

def train(A, lambda_, policy, epochs=500, batch_size=32, replay_size=1000, dominant_v=None):
    """Train ``policy`` to propose updates that drive a unit vector toward an
    eigenvector of ``A`` associated with ``lambda_``.

    Per epoch:
      1. ``policy.log_std`` is overwritten with an annealed value that starts at
         ``initial_log_std`` and decays geometrically toward ``final_log_std``.
      2. ``batch_size`` candidate steps are sampled from the current start
         vector, scored with ``compute_reward``, and pushed to a replay buffer.
         The candidate with the smallest residual seen so far is kept as
         ``best_v``.
      3. Up to ``batch_size // 2`` replayed transitions are mixed in.
      4. Three clipped-surrogate (PPO-style) optimisation steps are taken.
      5. The next epoch starts from ``best_v``.

    Args:
        A: square 2-D tensor.
        lambda_: target eigenvalue passed through to ``compute_reward``.
        policy: module returning ``(distribution, value)`` and exposing a
            ``log_std`` parameter.
        epochs: number of training epochs.
        batch_size: fresh samples collected per epoch.
        replay_size: capacity of the replay buffer.
        dominant_v: optional reference eigenvector, used only for the logged
            cosine similarity.

    Returns:
        ``(best_v, stats)``. ``stats`` maps ``'epochs'``, ``'residuals'``,
        ``'rewards'`` (mean raw reward of the fresh samples),
        ``'cosine_similarities'`` and ``'exploration'`` to lists with one entry
        per logged epoch (every 50th epoch and the last one).
    """
    optimizer = optim.Adam(policy.parameters(), lr=1e-3)
    replay_buffer = ExperienceReplay(replay_size)
    v = torch.randn(A.shape[0])
    v = v / torch.norm(v)

    # Exploration schedule for policy.log_std
    initial_log_std = 0.0      # Starting exploration level
    final_log_std = -2.0       # Minimum exploration level
    exploration_decay = 0.995  # Per-epoch decay of the remaining gap

    # Dimension-aware hyperparameters
    dim = A.shape[0]
    entropy_coef = max(0.05, 0.01 * (dim / 20))  # Grows with dimension above dim=100
    policy.log_std.data.fill_(initial_log_std)

    # Tracking variables
    best_v = v.clone()
    best_residual = float('inf')
    stats = {
        'residuals': [],
        'rewards': [],
        'cosine_similarities': [],
        'exploration': [],
        'epochs': []
    }

    for epoch in range(epochs):
        # Anneal exploration. Interpolating between the two levels is required:
        # multiplying initial_log_std (0.0) by the decay factor would stay at 0
        # forever and never reduce exploration.
        decay_weight = exploration_decay ** epoch
        current_log_std = final_log_std + (initial_log_std - final_log_std) * decay_weight
        policy.log_std.data.fill_(current_log_std)

        # Batch collection: every sample starts from the same vector `v`.
        states, actions, rewards, values = [], [], [], []
        for _ in range(batch_size):
            v_prev = v.clone()

            # Rollouts are never back-propagated through; running them under
            # no_grad also keeps autograd graphs out of the replay buffer.
            with torch.no_grad():
                dist, value_est = policy(v_prev)
                delta_v = dist.sample()
                v_new = v_prev + delta_v
                v_new = v_new / torch.norm(v_new)
                reward, residual, _step_cos = compute_reward(v_new, v_prev, A, lambda_)

            if residual < best_residual:
                best_residual = residual
                best_v = v_new.clone()

            replay_buffer.push((v_prev, delta_v, reward, value_est))
            states.append(v_prev)
            actions.append(delta_v)
            rewards.append(reward)
            values.append(value_est)

        # Mean raw reward of this epoch's fresh samples (for stats only).
        mean_fresh_reward = sum(rewards) / len(rewards) if rewards else 0.0

        # Mix in replayed transitions
        if len(replay_buffer) > batch_size // 2:
            for r_state, r_action, r_reward, r_value in replay_buffer.sample(batch_size // 2):
                states.append(r_state)
                actions.append(r_action)
                rewards.append(r_reward)
                values.append(r_value)

        # Convert to tensors
        states = torch.stack(states)
        actions = torch.stack(actions)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        old_values = torch.cat(values)  # each stored value has shape (1,)

        # Normalize rewards (std of a single element is undefined, so skip then)
        if rewards.numel() > 1:
            rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8)

        # Reference log-probs are frozen once, before any optimisation step, so
        # the ratio below actually measures how far the policy has moved.
        # Replayed samples are treated as if drawn from this same policy.
        with torch.no_grad():
            old_dists, _old_value_ests = policy(states)
            old_log_probs = old_dists.log_prob(actions)
        advantages = rewards - old_values

        # PPO update
        for _ in range(3):  # Multiple optimization steps
            dists, value_ests = policy(states)
            log_probs = dists.log_prob(actions)
            ratios = torch.exp(log_probs - old_log_probs)

            # Clipped surrogate objective
            clipped_ratios = torch.clamp(ratios, 0.8, 1.2)
            policy_loss = -torch.min(ratios * advantages, clipped_ratios * advantages).mean()

            value_loss = 0.5 * (value_ests.squeeze(-1) - rewards).pow(2).mean()
            entropy = dists.entropy().mean()

            loss = policy_loss + 0.5 * value_loss - entropy_coef * entropy

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(policy.parameters(), 0.5)
            optimizer.step()

        # Restart the next epoch from the best vector found so far.
        v = best_v.clone()

        # Logging
        if epoch % 50 == 0 or epoch == epochs-1:
            cos_sim = torch.nn.functional.cosine_similarity(
                best_v.unsqueeze(0), dominant_v.unsqueeze(0)
            ).item() if dominant_v is not None else 0.0

            stats['epochs'].append(epoch)
            stats['residuals'].append(best_residual)
            stats['rewards'].append(mean_fresh_reward)
            stats['cosine_similarities'].append(cos_sim)
            stats['exploration'].append(torch.exp(policy.log_std).item())

            print(
                f'Epoch {epoch}: '
                f'Residual={best_residual:.4f}, '
                f'CosSim={cos_sim:.4f}, '
                f'Exploration={torch.exp(policy.log_std).item():.4f}, '
                f'Entropy={entropy.item():.4f}'
            )

    return best_v, stats

def test_improved_policy(dim=5, epochs=1000):
    """Run one end-to-end experiment on a random symmetric ``dim`` x ``dim`` matrix.

    Builds ``A = M + M^T`` from a random normal ``M``, computes the eigenpair
    with the largest absolute eigenvalue as ground truth, trains a fresh
    ``Policy`` with ``train`` and prints a comparison.

    Args:
        dim: matrix size.
        epochs: number of training epochs passed to ``train``.

    Returns:
        ``(cosine_sim, stats)`` where ``cosine_sim`` is the signed cosine
        similarity between the predicted and true eigenvectors (eigenvectors
        are only defined up to sign, so compare ``abs(cosine_sim)``) and
        ``stats`` is the dictionary returned by ``train``; e.g.
        ``stats['epochs']`` vs ``np.abs(stats['cosine_similarities'])`` can be
        plotted to inspect convergence.
    """
    # Generate random symmetric matrix
    A_np = np.random.randn(dim, dim)
    A_np = A_np + A_np.T
    A = torch.tensor(A_np, dtype=torch.float32)

    # Ground truth. A_np is symmetric, so use the symmetric solver: it always
    # returns real eigenvalues/eigenvectors, unlike the general `eig`.
    eigenvalues, eigenvectors = np.linalg.eigh(A_np)
    dominant_idx = np.argmax(np.abs(eigenvalues))
    # Plain Python float keeps NumPy scalars out of the torch arithmetic.
    dominant_lambda = float(eigenvalues[dominant_idx])
    dominant_v = torch.tensor(eigenvectors[:, dominant_idx], dtype=torch.float32)

    print(f"True dominant eigenvalue: {dominant_lambda}")

    # Train
    policy = Policy(dim)
    predicted_v, stats = train(A, dominant_lambda, policy, epochs=epochs, dominant_v=dominant_v)

    # Compare
    cosine_sim = torch.nn.functional.cosine_similarity(
        predicted_v.unsqueeze(0), dominant_v.unsqueeze(0)
    ).item()

    print(f"\nPredicted eigenvector: {predicted_v.detach().numpy()}")
    print(f"True eigenvector: {dominant_v.numpy()}")
    print(f"Cosine similarity: {cosine_sim:.4f}")

    return cosine_sim, stats

In [None]:
from google.colab import drive

# Colab-only: mount Google Drive at the standard mountpoint. The mountpoint is
# where the Drive root gets attached, not a folder inside Drive; the saved
# output of this cell ("Mounted at /content/drive") matches this path.
# NOTE(review): the project folder is then presumably reachable at
# /content/drive/MyDrive/SMAI_Project — confirm before relying on it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Run the experiment on a 5x5 random symmetric matrix for 1000 epochs; the
# returned (cosine_sim, stats) tuple is displayed as this cell's output.
test_improved_policy(dim=5, epochs=1000)

In [None]:
# Same experiment on a 100x100 matrix with 2000 epochs.
# NOTE(review): the saved output under this cell ends in an AttributeError that
# the current definitions do not appear to raise — presumably stale; re-run to refresh.
test_improved_policy(dim=100, epochs=2000)

True dominant eigenvalue: 28.205543865572157
Epoch 0: Residual=27.1610, CosSim=-0.2835, Exploration=1.0030, Entropy=141.9939
Epoch 50: Residual=21.9557, CosSim=-0.3118, Exploration=0.9995, Entropy=141.8701
Epoch 100: Residual=21.5926, CosSim=-0.2386, Exploration=0.9966, Entropy=141.7802
Epoch 150: Residual=21.5926, CosSim=-0.2386, Exploration=0.9966, Entropy=141.7814
Epoch 200: Residual=18.7773, CosSim=-0.2592, Exploration=1.0017, Entropy=141.9479
Epoch 250: Residual=18.7773, CosSim=-0.2592, Exploration=0.9985, Entropy=141.8483
Epoch 300: Residual=17.7337, CosSim=-0.3090, Exploration=0.9972, Entropy=141.8017
Epoch 350: Residual=13.7667, CosSim=-0.5200, Exploration=1.0015, Entropy=141.9405
Epoch 400: Residual=13.7667, CosSim=-0.5200, Exploration=0.9999, Entropy=141.8959
Epoch 450: Residual=13.7667, CosSim=-0.5200, Exploration=0.9969, Entropy=141.7906
Epoch 500: Residual=13.7667, CosSim=-0.5200, Exploration=1.0004, Entropy=141.9105
Epoch 550: Residual=13.7667, CosSim=-0.5200, Exploration

AttributeError: 'list' object has no attribute 'numpy'

<Figure size 1000x600 with 0 Axes>

# Evaluating on different sizes

In [None]:
# Matrix sizes to evaluate and number of independent runs per size.
choices = [10, 20, 50, 75, 100, 200, 1000]
N_RUNS = 3

# One cosine similarity per run, in (size, run) order.
results = []
for dim in choices:
    print(f"\nTesting with dimension: {dim}")
    for i in range(N_RUNS):
        print(f"Run {i + 1}")
        x = test_improved_policy(dim=dim, epochs=2001)
        # test_improved_policy returns (cosine_sim, stats); only the scalar
        # belongs in the numeric results table.
        results.append(x[0])
    print(f"\n\n\n\n\n!!!______________________________________________________Testing for {dim}x{dim} matrices done ______________________________________________________!!!\n\n\n\n\n")

# Rows = matrix sizes, columns = runs. The column count must equal the number
# of runs actually performed, otherwise the reshape fails.
results = np.array(results, dtype=float)
results = results.reshape(len(choices), N_RUNS)



Testing with dimension: 10
Run 1
True dominant eigenvalue: -7.628652785210479
Epoch 0: Residual=5.7799, CosSim=-0.7239, Exploration=1.0030, Entropy=14.1993
Epoch 50: Residual=1.5430, CosSim=-0.9183, Exploration=0.9959, Entropy=14.1757
Epoch 100: Residual=1.5430, CosSim=-0.9183, Exploration=0.9966, Entropy=14.1781
Epoch 150: Residual=1.5430, CosSim=-0.9183, Exploration=0.9966, Entropy=14.1779
Epoch 200: Residual=1.5430, CosSim=-0.9183, Exploration=0.9972, Entropy=14.1802
Epoch 250: Residual=1.5430, CosSim=-0.9183, Exploration=0.9996, Entropy=14.1881
Epoch 300: Residual=1.5430, CosSim=-0.9183, Exploration=0.9967, Entropy=14.1783
Epoch 350: Residual=1.5430, CosSim=-0.9183, Exploration=0.9969, Entropy=14.1792
Epoch 400: Residual=1.5430, CosSim=-0.9183, Exploration=0.9969, Entropy=14.1789
Epoch 450: Residual=1.5430, CosSim=-0.9183, Exploration=0.9973, Entropy=14.1806
Epoch 500: Residual=1.5430, CosSim=-0.9183, Exploration=0.9976, Entropy=14.1814
Epoch 550: Residual=1.5430, CosSim=-0.9183, 

In [None]:
# Mean absolute cosine similarity per tested size (sign is ignored because an
# eigenvector and its negation are equivalent).
perf = [np.abs(results[row_idx]).mean() for row_idx in range(len(choices))]

In [None]:
# Print one "<size> :  <score>" line per tested matrix size.
for idx, size in enumerate(choices):
    print(f"{size} :  {perf[idx]}")