## Import required dependencies and check device

In [13]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 17 14:48:31 2023

@author: dinglin
"""

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from scipy.signal import savgol_filter
from torch.distributions import Categorical

In [2]:
import os
!nvidia-smi

Thu Oct 19 06:12:15 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:1A:00.0 Off |                  N/A |
| 22%   39C    P2    62W / 250W |  10489MiB / 11264MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:1B:00.0 Off |                  N/A |
| 22%   23C    P8    21W / 250W |    341MiB / 11264MiB |      0%      Default |
|       

In [3]:
# Preferred GPU ordinal. The notebook originally hard-coded 'cuda:6', which
# raises an "invalid device ordinal" error on hosts exposing fewer than seven
# GPUs even though torch.cuda.is_available() is True.
GPU_INDEX = 6

if torch.cuda.is_available():
    if GPU_INDEX < torch.cuda.device_count():
        device = torch.device(f'cuda:{GPU_INDEX}')
    else:
        # Requested ordinal does not exist here: use the default CUDA device.
        device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [4]:
# Preprocess game frames
def preprocess(frame):
    """Convert a raw game frame into a flat binary float32 vector of length 6400.

    Steps: keep rows 35..194, take every second row and column of channel 0,
    set pixels equal to 144 or 109 to 0 and every other non-zero pixel to 1.

    The input array is left untouched. (The previous implementation wrote
    through NumPy views and therefore overwrote the caller's frame.)

    Args:
        frame: array-like indexed as [row, column, channel]; the cropped and
            downsampled plane must contain exactly 6400 elements
            (e.g. a 210x160x3 frame).

    Returns:
        np.ndarray of shape (6400,) and dtype float32 containing only 0.0/1.0.

    Raises:
        ValueError: if the cropped, downsampled plane does not have 6400 elements.
    """
    # astype() always copies, so the masking below never touches the input.
    plane = np.asarray(frame)[35:195:2, ::2, 0].astype(np.float32)
    plane[plane == 144] = 0
    plane[plane == 109] = 0
    plane[plane != 0] = 1
    return plane.reshape(6400)

## Define Policy Gradient Algorithm

In [5]:
class Policy(nn.Module):
    """Two-layer softmax policy trained with a Monte-Carlo policy-gradient update.

    The module keeps per-episode buffers (observations, actions, rewards and
    log-probabilities of the sampled actions). `learn()` consumes the buffers,
    performs one Adam step and clears them.

    Attributes:
        gamma: discount factor used by `discount_reward`.
        lr: learning rate used when the optimizer is created.
        optimizer: Adam optimizer, created on the first `learn()` call and
            reused afterwards so its moment estimates persist across episodes.
    """

    def __init__(self, n_inputs, n_outputs):
        """Build the network.

        Args:
            n_inputs: size of the flat observation vector.
            n_outputs: number of discrete actions.
        """
        super(Policy, self).__init__()
        # memories
        self.ep_obs, self.ep_as, self.ep_rs, self.log_p = [], [], [], []
        # parameters
        self.gamma = 0.99
        self.lr = 1e-3
        # neural networks
        self.layer1 = nn.Linear(n_inputs, 200)
        self.layer2 = nn.Linear(200, n_outputs)
        self.softmax = nn.Softmax(dim=-1)
        # Created lazily in learn(): by then the module has typically been
        # moved to its final device with .to(device).
        self.optimizer = None

    def forward(self, x):
        """Return action probabilities (softmax over the last dimension)."""
        x = torch.relu(self.layer1(x))
        x = self.layer2(x)
        x = self.softmax(x)
        return x

    def choose_action(self, p):
        """Sample an action from probabilities `p` and record its log-probability.

        Args:
            p: tensor of action probabilities as returned by `forward`.

        Returns:
            The sampled action as a tensor.
        """
        dist = Categorical(p)
        action = dist.sample()
        # log_prob clamps probabilities away from 0, avoiding -inf from log(0).
        self.log_p.append(dist.log_prob(action))
        return action

    def store_transition(self, s, a, r):
        """Append one (observation, action, reward) triple to the episode buffers."""
        self.ep_obs.append(np.array([s], np.float32))
        self.ep_as.append(a)
        self.ep_rs.append(r)

    def discount_reward(self):
        """Return the discounted, standardised returns for the buffered rewards.

        Returns:
            float64 np.ndarray with one entry per buffered reward, shifted to
            zero mean and divided by (std + 1e-9). Empty if no rewards are
            buffered.
        """
        # Force a float dtype: with integer rewards zeros_like() would yield an
        # integer array and the in-place normalisation below would fail.
        rewards = np.asarray(self.ep_rs, dtype=np.float64)
        discounted_ep_rs = np.zeros_like(rewards)
        if rewards.size == 0:
            return discounted_ep_rs

        running_add = 0.0
        for t in reversed(range(rewards.size)):
            running_add = running_add * self.gamma + rewards[t]
            discounted_ep_rs[t] = running_add

        discounted_ep_rs -= np.mean(discounted_ep_rs)
        discounted_ep_rs /= (np.std(discounted_ep_rs) + 1e-9)
        return discounted_ep_rs

    def _reset_memory(self):
        """Clear all per-episode buffers."""
        self.ep_obs, self.ep_as, self.ep_rs, self.log_p = [], [], [], []

    def learn(self):
        """Run one policy-gradient step on the buffered episode, then clear it.

        Does nothing except clearing the buffers when no log-probabilities or
        rewards have been recorded.
        """
        if not self.log_p or not self.ep_rs:
            self._reset_memory()
            return

        # Reuse one optimizer: re-creating Adam on every call would reset its
        # running moment estimates after each episode.
        if self.optimizer is None:
            self.optimizer = optim.Adam(self.parameters(), lr=self.lr)

        discounted_ep_rs_norm = self.discount_reward()
        loss = torch.stack([
            -log_prob * float(ret)
            for ret, log_prob in zip(discounted_ep_rs_norm, self.log_p)
        ]).sum()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self._reset_memory()

def smoothing_plot(rewards, window=99, save=True):
    """Plot per-episode rewards together with a Savitzky-Golay smoothed curve.

    Args:
        rewards: sequence of per-episode rewards.
        window: filter window length. When there are fewer rewards than
            `window`, the largest odd length that fits is used instead; if
            even that is not larger than the polynomial order (3), only the
            raw curve is drawn.
        save: whether to write the figure to "rewards.pdf". Booleans are used
            as-is; for backward compatibility strings are accepted too, with
            "", "false", "0" and "no" (case-insensitive) meaning False.
    """
    poly_order = 3

    # The old default was the string "True"; any non-empty string (including
    # "False") was truthy. Interpret strings explicitly.
    if isinstance(save, str):
        save = save.strip().lower() not in ("", "false", "0", "no")

    rewards = list(rewards)
    n_episodes = len(rewards)
    episodes = list(range(1, n_episodes + 1))

    # savgol_filter requires window_length <= len(x) (default mode) and
    # window_length > polyorder; shrink the window for short reward histories.
    effective_window = window
    if effective_window > n_episodes:
        effective_window = n_episodes if n_episodes % 2 == 1 else n_episodes - 1

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(episodes, rewards, label='Original Rewards', color='lightgray', alpha=0.7)
    if effective_window > poly_order:
        smoothed_rewards = savgol_filter(rewards, effective_window, poly_order)
        ax.plot(episodes, smoothed_rewards, label='Smoothed Rewards', color='blue')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Reward')
    ax.set_title('Rewards per Episode with Smoothing')
    ax.legend()
    ax.grid(True)
    if save:
        fig.savefig("rewards.pdf")
    plt.show()

## Cart-pole Simulation

In [14]:
# key functions of Gym
env = gym.make('CartPole-v0')

# Initialize RL agent
RL_cart = Policy(4, 2).to(device)

rewards = []
# Exponential moving average of the episode return. Initialised here instead of
# probing globals(), so a value left over from another cell or an earlier run
# of this cell can never leak into this experiment.
running_reward = None

try:
    # Roll out 1000 episodes
    for episode in range(1000):

        # Initiate one episode
        observation, info = env.reset()

        terminated = False
        truncated = False

        # Roll out one episode
        while not (terminated or truncated):
            # Keep the state the action is chosen in, so the stored transition
            # pairs (state, action, reward) correctly.
            state = observation
            p = RL_cart(torch.from_numpy(state).to(device))
            action = int(RL_cart.choose_action(p))
            observation, reward, terminated, truncated, info = env.step(action)

            RL_cart.store_transition(state, action, reward)

        ep_rs_sum = sum(RL_cart.ep_rs)

        if running_reward is None:
            running_reward = ep_rs_sum
        else:
            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
        rewards.append(ep_rs_sum)
        if episode % 10 == 0:
            print(f"CartPole-v0, episode {episode}, rewards {ep_rs_sum}")
        RL_cart.learn()
finally:
    # Release the environment even if training is interrupted.
    env.close()

CartPole-v0, episode 0, rewards 15.0
CartPole-v0, episode 10, rewards 30.0
CartPole-v0, episode 20, rewards 17.0
CartPole-v0, episode 30, rewards 44.0
CartPole-v0, episode 40, rewards 20.0
CartPole-v0, episode 50, rewards 58.0
CartPole-v0, episode 60, rewards 19.0
CartPole-v0, episode 70, rewards 43.0
CartPole-v0, episode 80, rewards 74.0
CartPole-v0, episode 90, rewards 48.0
CartPole-v0, episode 100, rewards 21.0
CartPole-v0, episode 110, rewards 33.0
CartPole-v0, episode 120, rewards 47.0
CartPole-v0, episode 130, rewards 101.0
CartPole-v0, episode 140, rewards 20.0
CartPole-v0, episode 150, rewards 20.0
CartPole-v0, episode 160, rewards 78.0
CartPole-v0, episode 170, rewards 33.0
CartPole-v0, episode 180, rewards 19.0
CartPole-v0, episode 190, rewards 25.0
CartPole-v0, episode 200, rewards 99.0
CartPole-v0, episode 210, rewards 200.0
CartPole-v0, episode 220, rewards 107.0
CartPole-v0, episode 230, rewards 37.0
CartPole-v0, episode 240, rewards 96.0
CartPole-v0, episode 250, rewards

## Pong game Simulation

In [7]:
env = gym.make("Pong-v0")

RL_pong = Policy(6400, 2).to(device)
rewards = []
# Exponential moving average of the episode return. Initialised here instead of
# probing globals(): otherwise the value left behind by the CartPole cell would
# silently seed this run.
running_reward = None

try:
    for episode in range(2000):
        # Initiate one episode
        observation, info = env.reset()

        # Raw per-step records of the current episode. Nothing in this cell
        # reads them; after the loop they hold the most recent episode only.
        obs_history = []
        reward_history = []
        action_history = []

        terminated = False
        truncated = False

        # Roll out one episode
        while not (terminated or truncated):
            state = preprocess(observation)
            p = RL_pong(torch.from_numpy(state).to(device))
            # The policy outputs an index in {0, 1}; the environment is driven
            # with action ids {2, 3}. Convert to a plain int before stepping,
            # as the CartPole cell does.
            choice = int(RL_pong.choose_action(p))
            action = choice + 2
            observation, reward, terminated, truncated, info = env.step(action)

            # Store the 6400-element preprocessed state the action was chosen
            # in, not the raw next frame: the raw frame converted to float32
            # is far larger and is never used by learn().
            RL_pong.store_transition(state, choice, reward)
            obs_history.append(observation)
            reward_history.append(reward)
            action_history.append(action)

        ep_rs_sum = sum(RL_pong.ep_rs)

        if running_reward is None:
            running_reward = ep_rs_sum
        else:
            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01

        rewards.append(ep_rs_sum)
        RL_pong.learn()
        if episode % 10 == 0:
            print(f"Pong-v0, episode {episode}, rewards {ep_rs_sum}")
        if episode % 100 == 0:
            np.save('./rewards.npy', rewards)
        if episode % 500 == 0:
            model_file_name = f"model_episode_{episode}.pth"
            torch.save(RL_pong.state_dict(), model_file_name)
finally:
    # Release the environment even if training is interrupted.
    env.close()

Pong-v0, episode 0, rewards -21.0
Pong-v0, episode 10, rewards -20.0
Pong-v0, episode 20, rewards -20.0
Pong-v0, episode 30, rewards -20.0
Pong-v0, episode 40, rewards -20.0
Pong-v0, episode 50, rewards -19.0
Pong-v0, episode 60, rewards -21.0
Pong-v0, episode 70, rewards -19.0
Pong-v0, episode 80, rewards -19.0
Pong-v0, episode 90, rewards -20.0
Pong-v0, episode 100, rewards -18.0
Pong-v0, episode 110, rewards -20.0
Pong-v0, episode 120, rewards -18.0
Pong-v0, episode 130, rewards -17.0
Pong-v0, episode 140, rewards -19.0
Pong-v0, episode 150, rewards -18.0
Pong-v0, episode 160, rewards -19.0
Pong-v0, episode 170, rewards -18.0
Pong-v0, episode 180, rewards -17.0
Pong-v0, episode 190, rewards -15.0
Pong-v0, episode 200, rewards -18.0
Pong-v0, episode 210, rewards -18.0
Pong-v0, episode 220, rewards -19.0
Pong-v0, episode 230, rewards -10.0
Pong-v0, episode 240, rewards -17.0
Pong-v0, episode 250, rewards -19.0
Pong-v0, episode 260, rewards -14.0
Pong-v0, episode 270, rewards -13.0
Pon