In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 21 16:34:31 2023

@author: dinglin
"""

import numpy as np
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import matplotlib.pyplot as plt

In [2]:
import os
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [2]:
device = torch.device('cuda:6' if torch.cuda.is_available() else 'cpu')

In [3]:
# Preprocess game frames
def preprocess(frame):
    frame = frame[35:195]
    frame = frame[::2,::2,0]
    frame[frame == 144] = 0
    frame[frame == 109] = 0
    frame[frame != 0] = 1
    return np.reshape(frame.astype(np.float32), 6400)

## Policy Network and Value Network

In [4]:
class PolicyNetwork(nn.Module):
    def __init__(self, n_inputs, n_outputs):
        super(PolicyNetwork, self).__init__()
        # memories
        self.ep_obs, self.ep_as, self.ep_rs, self.log_p = [], [], [], []
        # parameters
        self.gamma = 0.99
        # neural networks
        self.layer1 = nn.Linear(n_inputs, 200)
        self.layer2 = nn.Linear(200, n_outputs)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = self.layer2(x)
        x = self.softmax(x)
        return x
    
class ValueNetwork(nn.Module):
    def __init__(self, n_inputs):
        super(ValueNetwork, self).__init__()
        # neural networks
        self.layer1 = nn.Linear(n_inputs, 200)
        self.layer2 = nn.Linear(200, 200)
        self.output = nn.Linear(200, 1)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        return self.output(x)

## Policy Gradient with Baseline

In [5]:
class PolicyGradient():
    def __init__(self, n_inputs, n_outputs, lr_p, lr_v, gamma):
        self.action_size = n_outputs
        self.p_net = PolicyNetwork(n_inputs, n_outputs).to(device)
        self.v_net = ValueNetwork(n_inputs).to(device)
        self.a_optimizer = optim.Adam(self.p_net.parameters(), lr=lr_p)
        self.v_optimizer = optim.Adam(self.v_net.parameters(), lr=lr_v)
        self.gamma = gamma
        
    def select_action(self, state):
        #get action probs then randomly sample from the probabilities
        with torch.no_grad():
            input_state = torch.FloatTensor(state).to(device)
            action_probs = self.p_net(input_state)
            #detach and turn to numpy to use with np.random.choice()
            action_probs = action_probs.detach().cpu().numpy()
            action = np.random.choice(np.arange(self.action_size), p=action_probs)
        return action

    def train(self, state_list, action_list, reward_list):
        
        #turn rewards into return
        trajectory_len = len(reward_list)
        return_array = np.zeros((trajectory_len,))
        g_return = 0.
        for i in range(trajectory_len-1,-1,-1):
            g_return = reward_list[i] + self.gamma*g_return
            return_array[i] = g_return
            
        # create tensors
        state_t = torch.FloatTensor(state_list).to(device)
        action_t = torch.LongTensor(action_list).to(device).view(-1,1)
        return_t = torch.FloatTensor(return_array).to(device).view(-1,1)
        
        # get value function estimates
        vf_t = self.v_net(state_t).to(device)
        with torch.no_grad():
            advantage_t = return_t - vf_t
        
        # calculate actor loss
        selected_action_prob = self.p_net(state_t).gather(1, action_t)
        # REINFORCE loss:
        #actor_loss = torch.mean(-torch.log(selected_action_prob) * return_t)
        # REINFORCE Baseline loss:
        actor_loss = torch.mean(-torch.log(selected_action_prob) * advantage_t)
        self.a_optimizer.zero_grad()
        actor_loss.backward()
        self.a_optimizer.step() 

        # calculate vf loss
        loss_fn = nn.MSELoss()
        vf_loss = loss_fn(vf_t, return_t)
        self.v_optimizer.zero_grad()
        vf_loss.backward()
        self.v_optimizer.step() 
        
        return actor_loss.detach().cpu().numpy(), vf_loss.detach().cpu().numpy()

## Cart-Pole Simulation

In [33]:
# key functions of Gym
env = gym.make('CartPole-v0')

# Initialize RL agent
RL_cart = PolicyGradient(4,2,1e-3,1e-3,0.95)

rewards = []
stats_rewards_list = []
# Roll out 1000 episodes
for episode in range(1000):    
    # Initiate one episode
    observation, info = env.reset()
    episode_length = 0
    stats_actor_loss, stats_vf_loss = 0., 0.
    
    state_list, action_list, reward_list = [], [], []

    terminated = False
    truncated = False

    # Roll out one episode
    while (not terminated) and (not truncated):
        #action = env.action_space.sample() # Use your policy here
        #observation = torch.from_numpy(observation).to(device)
        #p = RL_cart(observation)
        action = RL_cart.select_action(observation)
        next_observation, reward, terminated, truncated, _  = env.step(int(action))

        # store agent's trajectory
        state_list.append(observation)
        action_list.append(action)
        reward_list.append(reward)
        
        observation = next_observation
    
    ep_rs_sum = sum(reward_list)
    actor_loss, vf_loss = RL_cart.train(state_list, action_list, reward_list)
    stats_rewards_list.append((episode, ep_rs_sum, episode_length))
    stats_actor_loss += actor_loss
    stats_vf_loss += vf_loss
    total_reward = 0
    episode_length = 0  
    if episode % 10 == 0:
        print('Episode: {}'.format(episode),
            'Total reward: {:.1f}'.format(np.mean(stats_rewards_list[-10:],axis=0)[1]),
            'Episode length: {:.1f}'.format(np.mean(stats_rewards_list[-10:],axis=0)[2]),
            'Actor Loss: {:.4f}'.format(stats_actor_loss/10), 
            'VF Loss: {:.4f}'.format(stats_vf_loss/10))
        stats_actor_loss, stats_vf_loss = 0., 0.

    if 'running_reward' not in globals():
        running_reward = ep_rs_sum
    else:
        running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
    rewards.append(ep_rs_sum)
    
env.close()

Episode: 0 Total reward: 20.0 Episode length: 0.0 Actor Loss: 0.5372 VF Loss: 7.5632
Episode: 10 Total reward: 17.6 Episode length: 0.0 Actor Loss: 0.1848 VF Loss: 1.9552
Episode: 20 Total reward: 15.7 Episode length: 0.0 Actor Loss: 0.2932 VF Loss: 3.8830
Episode: 30 Total reward: 24.8 Episode length: 0.0 Actor Loss: 0.2076 VF Loss: 3.6657
Episode: 40 Total reward: 13.4 Episode length: 0.0 Actor Loss: 0.0956 VF Loss: 1.5322
Episode: 50 Total reward: 16.2 Episode length: 0.0 Actor Loss: 0.0236 VF Loss: 0.9113
Episode: 60 Total reward: 21.3 Episode length: 0.0 Actor Loss: -0.1005 VF Loss: 0.5674
Episode: 70 Total reward: 22.8 Episode length: 0.0 Actor Loss: -0.3741 VF Loss: 6.4060
Episode: 80 Total reward: 20.8 Episode length: 0.0 Actor Loss: -0.2268 VF Loss: 3.6022
Episode: 90 Total reward: 29.3 Episode length: 0.0 Actor Loss: 0.1115 VF Loss: 2.8257
Episode: 100 Total reward: 36.1 Episode length: 0.0 Actor Loss: 0.1259 VF Loss: 1.9261
Episode: 110 Total reward: 53.6 Episode length: 0.0

In [7]:
env = gym.make("Pong-v0")

RL_pong = PolicyGradient(6400,2,1e-3,1e-3,0.99)
rewards = []
stats_rewards_list = []
for episode in range(2000):
    # Initiate one episode
    observation, info = env.reset()
    episode_length = 0
    stats_actor_loss, stats_vf_loss = 0., 0.
    
    state_list, action_list, reward_list = [], [], []

    terminated = False
    truncated = False

    # Roll out one episode
    while (not terminated) and (not truncated):
        #action = env.action_space.sample() # Use your policy here
        observation = preprocess(observation)
        action = RL_pong.select_action(observation)
        next_observation, reward, terminated, truncated, info = env.step(action+2)

        # store agent's trajectory
        state_list.append(observation)
        action_list.append(action)
        reward_list.append(reward)
        
        observation = next_observation

    ep_rs_sum = sum(reward_list)

    if 'running_reward' not in globals():
        running_reward = ep_rs_sum
    else:
        running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
        
    actor_loss, vf_loss = RL_pong.train(state_list, action_list, reward_list)
    stats_rewards_list.append((episode, ep_rs_sum, episode_length))
    stats_actor_loss += actor_loss
    stats_vf_loss += vf_loss
    total_reward = 0
    episode_length = 0
    
    rewards.append(ep_rs_sum)
    if episode % 10 == 0:
        print('Episode: {}'.format(episode),
            'Total reward: {:.1f}'.format(np.mean(stats_rewards_list[-10:],axis=0)[1]),
            'Episode length: {:.1f}'.format(np.mean(stats_rewards_list[-10:],axis=0)[2]),
            'Actor Loss: {:.4f}'.format(stats_actor_loss/10), 
            'VF Loss: {:.4f}'.format(stats_vf_loss/10))
        stats_actor_loss, stats_vf_loss = 0., 0.
    if episode % 100 == 0:
        np.save('./rewards_base.npy', rewards)

  logger.warn(


Episode: 0 Total reward: -21.0 Episode length: 0.0 Actor Loss: -0.1362 VF Loss: 0.4062
Episode: 10 Total reward: -20.1 Episode length: 0.0 Actor Loss: -0.0131 VF Loss: 0.0469
Episode: 20 Total reward: -20.2 Episode length: 0.0 Actor Loss: 0.0017 VF Loss: 0.0621
Episode: 30 Total reward: -20.6 Episode length: 0.0 Actor Loss: 0.0035 VF Loss: 0.0407
Episode: 40 Total reward: -20.6 Episode length: 0.0 Actor Loss: -0.0244 VF Loss: 0.0443
Episode: 50 Total reward: -19.8 Episode length: 0.0 Actor Loss: -0.0075 VF Loss: 0.0354
Episode: 60 Total reward: -20.0 Episode length: 0.0 Actor Loss: 0.0122 VF Loss: 0.0307
Episode: 70 Total reward: -19.3 Episode length: 0.0 Actor Loss: -0.0038 VF Loss: 0.0303
Episode: 80 Total reward: -20.3 Episode length: 0.0 Actor Loss: -0.0002 VF Loss: 0.0247
Episode: 90 Total reward: -19.8 Episode length: 0.0 Actor Loss: 0.0040 VF Loss: 0.0429
Episode: 100 Total reward: -19.9 Episode length: 0.0 Actor Loss: -0.0022 VF Loss: 0.0191
Episode: 110 Total reward: -19.9 Epi