<a href="https://colab.research.google.com/github/Sidy3143/Reinforcement-Learning/blob/main/Cartpole_RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Define the policy network
class PolicyNetwork(nn.Module):
    """Fully connected policy that maps an observation to action probabilities.

    The network is two ReLU-activated hidden layers of equal width followed by
    a linear output layer whose result is passed through a softmax.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        hidden_dim: Number of units in each of the two hidden layers.
        output_dim: Number of entries in the returned probability vector.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names and creation order determine the state_dict keys and
        # the order in which the parameters are randomly initialised.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return probabilities over the last dimension (each row sums to 1)."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        logits = self.fc3(hidden)
        return logits.softmax(dim=-1)

In [None]:
# Choose action based on the policy
def select_action(policy_net, state):
    """Sample an action index from the distribution produced by the policy.

    Args:
        policy_net: Callable that takes a float tensor with a leading batch
            dimension of 1 and returns a 2-D tensor of probabilities.
        state: NumPy array holding the current observation.

    Returns:
        An element of ``np.arange(num_actions)`` drawn with NumPy's global
        random generator according to the returned probabilities.
    """
    # Add the batch dimension the network expects.
    batch = torch.from_numpy(state).float()[None]
    action_probs = policy_net(batch)

    num_actions = action_probs.shape[1]
    # Detach so the sampling step works on a plain NumPy vector.
    weights = action_probs.detach().numpy().ravel()
    return np.random.choice(np.arange(num_actions), p=weights)

In [None]:
# Function to compute discounted rewards
def compute_returns(rewards, gamma=0.99):
    """Compute normalised discounted returns for one reward sequence.

    The return at step ``t`` is ``r_t + gamma * r_{t+1} + gamma**2 * r_{t+2} ...``
    accumulated to the end of ``rewards``. When at least two rewards are given,
    the returns are shifted to zero mean and divided by their sample standard
    deviation.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        gamma: Discount factor applied to each later reward.

    Returns:
        A 1-D ``torch.float32`` tensor with one entry per reward. For fewer
        than two rewards the un-normalised returns are returned as-is (an
        empty tensor for empty input).
    """
    running = 0.0
    discounted = []
    # Walk backwards so each return reuses the one after it; append + reverse
    # avoids the quadratic cost of inserting at the front of the list.
    for reward in reversed(rewards):
        running = reward + gamma * running
        discounted.append(running)
    discounted.reverse()

    # An explicit float dtype keeps integer rewards from producing an integer
    # tensor, on which mean()/std() are not defined.
    returns = torch.tensor(discounted, dtype=torch.float32)

    # The sample standard deviation of a single value is NaN (and the mean of
    # an empty tensor is NaN too); normalising would poison the loss.
    if returns.numel() < 2:
        return returns

    return (returns - returns.mean()) / (returns.std() + 1e-9)

In [None]:
# Hyperparameters and schedule for the training run.
hidden_dim = 64              # width of both hidden layers of the policy network
learning_rate = 0.01         # step size handed to the Adam optimizer
gamma = 0.99                 # reward discount factor
batch_size = 20              # a policy update is attempted when the step count is a multiple of this
max_time_steps = 1_000_000   # training stops once this many environment steps were taken
save_interval = 400          # step-count spacing used for checkpointing and evaluation
eval_steps = 1000            # upper bound on steps played in one evaluation episode

In [None]:
# Create the CartPole environment, size the policy network from its
# observation and action spaces, and attach an Adam optimizer to the network.
env = gym.make('CartPole-v1')
policy_net = PolicyNetwork(
    input_dim=env.observation_space.shape[0],
    hidden_dim=hidden_dim,
    output_dim=env.action_space.n,
)
optimizer = optim.Adam(params=policy_net.parameters(), lr=learning_rate)

In [None]:
# Training loop
def _reset_env(environment):
    """Reset the environment and return only the initial observation.

    Gym releases before 0.26 return the observation alone; later releases
    return an ``(observation, info)`` tuple. Both forms are accepted.
    """
    result = environment.reset()
    if isinstance(result, tuple):
        return result[0]
    return result


def _step_env(environment, action):
    """Advance one step and return ``(observation, reward, done)``.

    Gym releases before 0.26 return a 4-tuple with a single ``done`` flag;
    later releases return a 5-tuple with ``terminated`` and ``truncated``,
    which are merged into one flag here.
    """
    result = environment.step(action)
    if len(result) == 5:
        observation, reward, terminated, truncated, _info = result
        return observation, reward, terminated or truncated
    observation, reward, done, _info = result
    return observation, reward, done


def _update_policy(log_probs, rewards):
    """Take one policy-gradient step on the buffered transitions."""
    returns = compute_returns(rewards, gamma=gamma)
    # Each log-prob is a 0-dim tensor, so they must be stacked: torch.cat
    # refuses zero-dimensional inputs.
    loss = -(torch.stack(log_probs) * returns).sum()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


def _run_eval_episode():
    """Play one episode of at most ``eval_steps`` steps; return its total reward."""
    state = _reset_env(env)
    total_reward = 0
    for _step in range(eval_steps):
        action = select_action(policy_net, state)
        state, reward, done = _step_env(env, action)
        total_reward += reward
        if done:
            break
    return total_reward


max_episode_steps = 100  # cap on the number of training steps per episode
total_time_steps = 0
episode_rewards = []
# total_time_steps is only inspected between episodes, so an exact-multiple
# test would almost never fire; track the next threshold to cross instead.
next_save_step = save_interval

try:
    while total_time_steps < max_time_steps:
        state = _reset_env(env)
        log_probs = []
        rewards = []
        episode_reward = 0.0

        for _ in range(max_episode_steps):
            action = select_action(policy_net, state)

            # Second forward pass, this time kept in the autograd graph, to
            # obtain a differentiable log-probability of the sampled action.
            state_tensor = torch.from_numpy(state).float().unsqueeze(0)
            log_probs.append(torch.log(policy_net(state_tensor)[0, action]))

            state, reward, done = _step_env(env, action)
            rewards.append(reward)
            episode_reward += reward
            total_time_steps += 1

            if done:
                # NOTE(review): transitions buffered since the last update are
                # dropped here, as before; consider a final update on them.
                break

            # Every batch_size time steps, perform a policy update. At least
            # two transitions are required because the return normalisation
            # divides by a sample standard deviation.
            if total_time_steps % batch_size == 0 and len(rewards) > 1:
                _update_policy(log_probs, rewards)
                # Start a fresh buffer: the old graph was freed by backward()
                # and the old log-probs belong to the pre-update policy.
                log_probs = []
                rewards = []

        episode_rewards.append(episode_reward)

        # Save and evaluate the model roughly every save_interval time steps
        if total_time_steps >= next_save_step:
            torch.save(policy_net.state_dict(), f'policy_net_{total_time_steps}.pth')
            print(f"Model saved at time step {total_time_steps}")

            # Evaluation without training
            eval_rewards = [_run_eval_episode() for _episode in range(10)]
            print(f"Evaluation after {total_time_steps} time steps: Average reward = {np.mean(eval_rewards)}")

            next_save_step = (total_time_steps // save_interval + 1) * save_interval
finally:
    # Release the environment even when training is interrupted or fails.
    env.close()
