In [18]:
# Dump the model-definition module so its source is visible in the notebook.
# NOTE(review): the training cell imports DQNetwork/ReplayBuffer from
# `netmodelss`, while this file is `modelss.py` — confirm they are the same code.
with open('/scratch1/srajasek/dl/modelss.py', 'r') as model_source:
    source_text = model_source.read()
print(source_text)

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class DQNetwork(nn.Module):
    """
    Deep Q-Network that outputs both action values and communication signals.
    """
    def __init__(self, input_size=10, hidden_size=64, output_size=5):
        super(DQNetwork, self).__init__()
        
        # Shared feature extraction
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        
        # Action output
        self.action_head = nn.Linear(hidden_size, output_size)
        
        # Communication output
        self.comm_head = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        """
        Forward pass through the network.
        """
        # Feature extraction
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        
        # Action values
        action_values = self.action_head(x)
        
        # Communication signal
        comm_signal = torch.tan

In [22]:
import torch
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from multi_agent_env import MultiAgentEnv
from netmodelss import DQNetwork, ReplayBuffer

# Seed both RNG sources used in this cell (NumPy drives the epsilon-greedy
# draws, PyTorch is seeded before the networks are constructed).
np.random.seed(42)
torch.manual_seed(42)

# --- Hyperparameters -------------------------------------------------------
# Replay and optimisation
BUFFER_SIZE = 10_000     # capacity handed to each ReplayBuffer
BATCH_SIZE = 64          # transitions sampled per optimisation step
LEARNING_RATE = 1e-3     # Adam learning rate
GAMMA = 0.99             # discount applied to the bootstrapped next-state value
HIDDEN_SIZE = 64         # hidden width passed to DQNetwork

# Exploration schedule: epsilon = max(END, START * DECAY ** episode)
EPSILON_START = 1.0
EPSILON_END = 0.1
EPSILON_DECAY = 0.995

# Run length and target-network sync period (both counted in episodes)
NUM_EPISODES = 1000
TARGET_UPDATE = 10

# Environment shared by both agents
env = MultiAgentEnv()

# Prefer the GPU when one is visible, otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _make_policy_and_target():
    """Build one agent's policy network and a frozen copy used as its target.

    Both networks are created on `device`; the target starts with the policy's
    weights and is switched to eval mode.
    """
    policy = DQNetwork(input_size=10, hidden_size=HIDDEN_SIZE).to(device)
    target = DQNetwork(input_size=10, hidden_size=HIDDEN_SIZE).to(device)
    target.load_state_dict(policy.state_dict())
    target.eval()
    return policy, target


# Networks are built in the order A-policy, A-target, B-policy, B-target
policy_net_A, target_net_A = _make_policy_and_target()
policy_net_B, target_net_B = _make_policy_and_target()

# One Adam optimizer per policy network
optimizer_A, optimizer_B = (
    optim.Adam(net.parameters(), lr=LEARNING_RATE)
    for net in (policy_net_A, policy_net_B)
)

# Each agent keeps its own replay buffer
memory_A, memory_B = ReplayBuffer(BUFFER_SIZE), ReplayBuffer(BUFFER_SIZE)

# Per-episode metrics filled in by the training loop
episode_rewards, success_rates = [], []
epsilon = EPSILON_START

def select_action(state, policy_net, epsilon):
    """Choose an action with an epsilon-greedy policy and read the comm signal.

    The network is evaluated once (without gradients). With probability
    ``epsilon`` a uniformly random action index is returned; otherwise the
    index of the largest Q-value. The communication signal always comes from
    the network's forward pass.

    Args:
        state: Observation for a single agent (a list, NumPy array, or anything
            ``torch.as_tensor`` accepts); a batch dimension is added here.
        policy_net: Module whose forward pass returns ``(q_values, comm)`` with
            ``q_values`` indexed as (batch, action).
        epsilon: Probability of picking a random action.

    Returns:
        ``(action, comm)``: the action index as an ``int`` and the
        communication signal as a ``float``.
    """
    # Feed the network on the device its own weights live on, rather than
    # relying on a notebook-level global.
    first_param = next(policy_net.parameters(), None)
    net_device = first_param.device if first_param is not None else torch.device("cpu")

    state_tensor = torch.as_tensor(state, dtype=torch.float32, device=net_device).unsqueeze(0)
    with torch.no_grad():
        q_values, comm = policy_net(state_tensor)

    if np.random.random() < epsilon:
        # Exploration: sample over the actions the network actually exposes,
        # instead of assuming there are exactly five.
        action = int(np.random.randint(0, q_values.shape[1]))
    else:
        # Exploitation: greedy action
        action = q_values.max(1)[1].item()

    return action, comm.item()

def optimize_model(policy_net, target_net, optimizer, memory, batch_size=None, gamma=None):
    """Run one DQN optimisation step on a batch sampled from ``memory``.

    Args:
        policy_net: Network being trained; its forward pass returns
            ``(q_values, comm)``.
        target_net: Network used to bootstrap the next-state value; same
            forward contract as ``policy_net``.
        optimizer: Optimizer stepping ``policy_net``'s parameters.
        memory: Replay buffer supporting ``len()`` and ``sample(n)``; ``sample``
            returns six items (states, actions, rewards, next_states, dones and
            a sixth value that is ignored here).
        batch_size: Transitions per step. Defaults to the module-level
            ``BATCH_SIZE``.
        gamma: Discount factor. Defaults to the module-level ``GAMMA``.

    Returns:
        The scalar loss as a ``float``, or ``0.0`` when the buffer holds fewer
        than ``batch_size`` transitions and no update was performed.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if gamma is None:
        gamma = GAMMA

    # Not enough experience yet for a full batch
    if len(memory) < batch_size:
        return 0.0

    states, actions, rewards, next_states, dones, _ = memory.sample(batch_size)

    # Move the batch to the device the policy network lives on
    net_device = next(policy_net.parameters()).device
    states = states.to(net_device)
    actions = actions.to(net_device)
    rewards = rewards.to(net_device)
    next_states = next_states.to(net_device)
    dones = dones.to(net_device)

    # Q(s_t, a_t) for the actions that were actually taken.
    # NOTE(review): assumes actions is an int64 (batch, 1) tensor and that
    # rewards/dones are (batch, 1) so they line up with the keepdim'd max
    # below; confirm against ReplayBuffer.sample.
    q_all, _ = policy_net(states)
    q_taken = q_all.gather(1, actions)

    # Bootstrapped target: r + gamma * max_a' Q_target(s', a'), zeroed at terminals
    with torch.no_grad():
        next_q_all, _ = target_net(next_states)
        next_q_max = next_q_all.max(1, keepdim=True)[0]
        # Works for bool done flags as well as 0/1 numeric ones (``~`` would
        # raise on a float tensor).
        not_done = 1.0 - dones.to(next_q_max.dtype)
        expected_q = rewards + gamma * next_q_max * not_done

    loss = F.smooth_l1_loss(q_taken, expected_q)

    optimizer.zero_grad()
    loss.backward()
    # Clamp each gradient element to [-1, 1]; parameters without a gradient
    # (e.g. heads that do not feed the loss) are skipped by the utility.
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), clip_value=1.0)
    optimizer.step()

    return loss.item()

# Training loop: both agents act in the shared environment every step, store
# their own transitions, and each takes one optimisation step per env step.
print("Starting training...")
successes = 0  # number of episodes in which a positive reward was observed

for episode in range(NUM_EPISODES):
    # Reset environment
    obs_A, obs_B = env.reset()

    # Reset episode metrics
    episode_reward = 0
    episode_success = False
    # Most recent signals emitted by each agent; updated every step but not
    # read inside this loop (kept available for inspection afterwards).
    comm_A, comm_B = 0.0, 0.0

    # Exponential epsilon decay per episode, floored at EPSILON_END
    epsilon = max(EPSILON_END, EPSILON_START * (EPSILON_DECAY ** episode))

    # Episode loop: runs until the environment reports `done`
    while True:
        # Select actions
        action_A, new_comm_A = select_action(obs_A, policy_net_A, epsilon)
        action_B, new_comm_B = select_action(obs_B, policy_net_B, epsilon)

        # Take action in environment (a single shared reward is returned)
        (next_obs_A, next_obs_B), reward, done = env.step(action_A, action_B, new_comm_A, new_comm_B)

        # Store transitions
        memory_A.push(obs_A, action_A, reward, next_obs_A, done, new_comm_A)
        memory_B.push(obs_B, action_B, reward, next_obs_B, done, new_comm_B)

        # Update observations and communication signals
        obs_A, obs_B = next_obs_A, next_obs_B
        comm_A, comm_B = new_comm_A, new_comm_B

        # Perform optimization
        optimize_model(policy_net_A, target_net_A, optimizer_A, memory_A)
        optimize_model(policy_net_B, target_net_B, optimizer_B, memory_B)

        # Track reward and success
        episode_reward += reward
        if reward > 0:
            episode_success = True

        if done:
            break

    # Count the episode once, however many positive-reward steps it had, so
    # `successes` stays consistent with `success_rates`.
    if episode_success:
        successes += 1

    # Update target networks
    if episode % TARGET_UPDATE == 0:
        target_net_A.load_state_dict(policy_net_A.state_dict())
        target_net_B.load_state_dict(policy_net_B.state_dict())

    # Record metrics
    episode_rewards.append(episode_reward)
    success_rates.append(1 if episode_success else 0)

    # Success rate over the last (up to) 100 episodes; the slice already
    # handles histories shorter than 100.
    success_rate = np.mean(success_rates[-100:])

    # Print progress
    if episode % 50 == 0 or episode == NUM_EPISODES - 1:
        print(f"Episode {episode}/{NUM_EPISODES} | "
              f"Reward: {episode_reward:.2f} | "
              f"Success: {episode_success} | "
              f"Success Rate (100): {success_rate:.2f} | "
              f"Epsilon: {epsilon:.2f}")

    # Early stopping once a full 100-episode window reaches 95% success
    if len(success_rates) >= 100 and success_rate >= 0.95:
        print(f"Early stopping at episode {episode} with success rate {success_rate:.2f}")
        break

# Export Agent A's policy network as a TorchScript file
scripted_model = torch.jit.script(policy_net_A)
scripted_model.save("dqn_net.pt")
print("Model saved as dqn_net.pt")

# Two side-by-side panels: raw episode rewards and smoothed success rate
fig, (ax_reward, ax_success) = plt.subplots(1, 2, figsize=(12, 4))

ax_reward.plot(episode_rewards)
ax_reward.set_title('Episode Rewards')
ax_reward.set_xlabel('Episode')
ax_reward.set_ylabel('Reward')

# Trailing moving average of the success flags over at most 100 episodes
window_size = min(100, len(success_rates))
moving_avg = []
for idx in range(len(success_rates)):
    window_start = max(0, idx - window_size + 1)
    moving_avg.append(np.mean(success_rates[window_start:idx + 1]))

ax_success.plot(moving_avg)
ax_success.set_title('Success Rate (Moving Average)')
ax_success.set_xlabel('Episode')
ax_success.set_ylabel('Success Rate')
ax_success.set_ylim(0, 1)

# Write the figure to disk and release it instead of displaying it inline
fig.tight_layout()
fig.savefig('training_results.png')
plt.close(fig)

print("Training results saved as training_results.png")

Starting training...
Episode 0/1000 | Reward: 0.00 | Success: False | Success Rate (100): 0.00 | Epsilon: 1.00
Episode 50/1000 | Reward: 0.00 | Success: False | Success Rate (100): 0.02 | Epsilon: 0.78
Episode 100/1000 | Reward: 0.00 | Success: False | Success Rate (100): 0.06 | Epsilon: 0.61
Episode 150/1000 | Reward: 10.00 | Success: True | Success Rate (100): 0.24 | Epsilon: 0.47
Episode 200/1000 | Reward: 10.00 | Success: True | Success Rate (100): 0.42 | Epsilon: 0.37
Episode 250/1000 | Reward: 10.00 | Success: True | Success Rate (100): 0.54 | Epsilon: 0.29
Episode 300/1000 | Reward: 10.00 | Success: True | Success Rate (100): 0.50 | Epsilon: 0.22
Episode 350/1000 | Reward: 0.00 | Success: False | Success Rate (100): 0.46 | Epsilon: 0.17
Episode 400/1000 | Reward: 0.00 | Success: False | Success Rate (100): 0.57 | Epsilon: 0.13
Episode 450/1000 | Reward: 10.00 | Success: True | Success Rate (100): 0.58 | Epsilon: 0.10
Episode 500/1000 | Reward: 0.00 | Success: False | Success Rat