In [6]:
import os
import numpy as np

#torch imports
import torch
from torchvision import transforms
from torch import nn
from torch.distributions import Categorical

# Gym is an OpenAI toolkit for RL
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack

# NES Emulator for OpenAI Gym
from nes_py.wrappers import JoypadSpace

# Super Mario environment for OpenAI Gym
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY

# For Plotting
import matplotlib.pyplot as plt

# For storing lists and reusing
import pickle

## Preprocessing the environment

In [7]:
class SkipFrame(gym.Wrapper):
    """Environment wrapper that repeats each action for several consecutive frames.

    The rewards of the repeated frames are summed and only the final
    observation, done flag and info dict are handed back to the caller.
    """

    def __init__(self, env, skip):
        """Wrap ``env`` so that every ``step`` call advances it up to ``skip`` frames."""
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Apply ``action`` up to ``self._skip`` times, stopping early on episode end.

        Returns:
            ``(obs, total_reward, done, info)`` where ``total_reward`` is the sum
            of the rewards of all frames actually played and the other three
            values come from the last frame played.
        """
        accumulated_reward = 0.0
        done = False
        for _ in range(self._skip):
            obs, frame_reward, done, info = self.env.step(action)
            accumulated_reward = accumulated_reward + frame_reward
            if done:
                # The episode is over; playing further frames is not possible.
                break
        return obs, accumulated_reward, done, info


class GrayScaleObservation(gym.ObservationWrapper):
    """Observation wrapper that converts colour frames to grayscale tensors.

    The declared ``observation_space`` keeps only the first two dimensions of
    the wrapped space's shape and is typed ``uint8``.
    NOTE(review): ``observation`` actually returns a ``torch.float`` tensor that
    still carries a leading channel axis, so the declared space does not describe
    the returned value exactly — confirm no downstream code relies on it.
    """

    def __init__(self, env):
        """Wrap ``env`` and prepare the (stateless) grayscale transform once."""
        super().__init__(env)
        self.observation_space = Box(low=0, high=255, shape=self.observation_space.shape[:2], dtype=np.uint8)
        # Built once here instead of on every frame: the transform has no
        # per-call state, so re-creating it in `observation` was wasted work.
        self._to_grayscale = transforms.Grayscale()

    def observation(self, observation):
        """Return the grayscale version of ``observation`` as a float tensor.

        Args:
            observation: array whose last axis is moved to the front before the
                transform is applied (assumes an H x W x C frame — TODO confirm).
        """
        # Move the last axis first (channels-first layout expected by the
        # torchvision transform); `.copy()` materialises the transposed view.
        channels_first = np.transpose(observation, (2, 0, 1)).copy()
        return self._to_grayscale(torch.tensor(channels_first, dtype=torch.float))


class ResizeObservation(gym.ObservationWrapper):
    """Observation wrapper that resizes frames to a square and rescales them by 1/255.

    NOTE(review): the declared ``observation_space`` is ``uint8`` in ``[0, 255]``
    while ``observation`` returns values divided by 255 — confirm that nothing
    downstream depends on the declared bounds.
    """

    def __init__(self, env, shape):
        """Wrap ``env`` so that observations are resized to ``(shape, shape)``.

        Args:
            env: environment to wrap.
            shape: side length (in pixels) of the square output frame.
        """
        super().__init__(env)
        self.shape = (shape, shape)
        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)
        # Built once here instead of on every frame: the pipeline only depends
        # on `self.shape`, which is fixed at construction time.
        # Normalize(0, 255) computes (x - 0) / 255.
        self._pipeline = transforms.Compose([transforms.Resize(self.shape), transforms.Normalize(0, 255)])

    def observation(self, observation):
        """Resize and rescale ``observation``, then drop a leading size-1 axis if present."""
        return self._pipeline(observation).squeeze(0)

def _make_mario_env():
    """Create the Super Mario Bros 1-1 environment with all preprocessing wrappers.

    Wrappers are applied innermost to outermost: frame skipping (4 frames),
    grayscale conversion, resize to 84x84, stacking of 4 frames, and finally
    restriction of the action set to ``RIGHT_ONLY``.
    """
    base_env = gym_super_mario_bros.make('SuperMarioBros-1-1-v3')
    skipped = SkipFrame(base_env, skip=4)
    gray = GrayScaleObservation(skipped)
    resized = ResizeObservation(gray, shape=84)
    stacked = FrameStack(resized, num_stack=4)
    return JoypadSpace(stacked, RIGHT_ONLY)


# `env2` is a second name for the very same environment object, not a copy.
env = env2 = _make_mario_env()

# Seeding is currently disabled; uncomment to make runs repeatable.
# env.seed(42)
# env.action_space.seed(42)
# torch.manual_seed(42)
# torch.random.manual_seed(42)
# np.random.seed(42)

## Defining a Convolutional Network for Actor and Critic

In [8]:
class CNNet(nn.Module):
    """Actor-critic network made of two independent convolutional stacks.

    Both stacks take a 4-channel image batch; the actor ends in one logit per
    action of the global ``env`` and the critic ends in a single scalar.
    """

    def __init__(self):
        super().__init__()
        # The actor is built first, then the critic (same order as the layers
        # are registered, so parameter initialisation order is unchanged).
        self.actor = self._conv_stack(env.action_space.n)
        self.critic = self._conv_stack(1)

    @staticmethod
    def _conv_stack(n_outputs):
        """Return the conv -> flatten -> linear stack ending in ``n_outputs`` units."""
        layers = [
            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, n_outputs),
        ]
        return nn.Sequential(*layers)

    def forward(self, obs):
        """Return ``(action distribution, state values)`` for a batch of observations.

        The distribution is a ``Categorical`` over the actor logits; the critic
        output is flattened to one value per batch element.
        """
        action_logits = self.actor(obs)
        state_values = self.critic(obs).reshape(-1)
        return Categorical(logits=action_logits), state_values

## PPO Agent

In [9]:
class PPOAgent:
    """PPO agent (clipped surrogate objective + GAE) acting in the global ``env``.

    The agent keeps two copies of the network: ``policy`` is the one being
    optimised, ``policy_old`` is the one used to collect experience and is
    re-synchronised with ``policy`` during ``train``.

    Expected keys of ``params``: ``gamma``, ``lamda``, ``n_mini_batch``,
    ``epochs``, ``save_directory``, ``batch_size``, ``visual``, ``episodes``.
    """

    def __init__(self,params):
        """Read hyper-parameters, reset the environment and build networks/optimizer."""
        # Prefer the GPU but fall back to the CPU so the agent still works on
        # machines without CUDA (previously this was hard-coded to "cuda").
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        #params
        self.params = params
        self.rewards = []  # rewards of the episode currently in progress
        self.gamma = params['gamma']
        self.lamda = params['lamda']
        self.n_mini_batch = params['n_mini_batch']
        self.epochs = params['epochs']
        self.save_directory = params['save_directory']
        self.batch_size = params['batch_size']
        self.mini_batch_size = self.batch_size // self.n_mini_batch
        self.state = env.reset().__array__()
        self.policy = CNNet().to(self.device)
        self.mse_loss = nn.MSELoss()
        # Separate learning rates for the actor and the critic stacks.
        self.optimizer = torch.optim.Adam([
            {'params': self.policy.actor.parameters(), 'lr': 0.00025},
            {'params': self.policy.critic.parameters(), 'lr': 0.001}
        ], eps=1e-4)
        self.policy_old = CNNet().to(self.device)
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.undiscounted_rewards = []  # one summed reward per finished episode
        self.all_mean_rewards = []      # 10-episode means, appended every 10 episodes
        self.episode = 0
        self.visual = params['visual']
        self.max_episodes = params['episodes']

    def sample(self):
        """Collect up to ``batch_size`` transitions with ``policy_old``.

        Collection stops early once ``self.episode`` reaches ``max_episodes``;
        in that case only the transitions actually played are returned (the
        unused, zero-filled tail of the buffers is dropped so it cannot be
        mistaken for real experience).

        Returns:
            dict of tensors on ``self.device`` with keys ``states``, ``actions``,
            ``values``, ``log_pis``, ``advantages`` and ``returns``; all share the
            same leading dimension (number of collected steps).
        """
        # Initiating environment variables
        rewards = np.zeros(self.batch_size, dtype=np.float32)
        actions = np.zeros(self.batch_size, dtype=np.int32)
        done = np.zeros(self.batch_size, dtype=bool)
        state = np.zeros((self.batch_size, 4, 84, 84), dtype=np.float32)
        log_pis = np.zeros(self.batch_size, dtype=np.float32)
        values = np.zeros(self.batch_size, dtype=np.float32)

        steps = 0  # number of transitions actually collected
        for t in range(self.batch_size):
            # No gradients are needed while collecting experience.
            with torch.no_grad():
                state[t] = self.state
                obs = torch.tensor(self.state, dtype=torch.float32, device=self.device).unsqueeze(0)
                actionProb, v = self.policy_old(obs)
                # `.item()` extracts the Python scalar; assigning a size-1 array
                # to a scalar slot is deprecated in recent NumPy versions.
                values[t] = v.item()
                #taking action from distribution
                action = actionProb.sample()
                actions[t] = action.item()
                log_pis[t] = actionProb.log_prob(action).item()
            self.state, rewards[t], done[t], _ = env.step(actions[t])
            self.state = self.state.__array__()
            steps = t + 1
            if self.visual:
                env.render()
            self.rewards.append(rewards[t])
            if done[t]:
                self._finish_episode()

            # episode limiter
            if self.episode >= self.max_episodes:
                break

        # Keep only the part of the buffers that holds real transitions.
        rewards, actions, done = rewards[:steps], actions[:steps], done[:steps]
        state, log_pis, values = state[:steps], log_pis[:steps], values[:steps]

        returns, advantages = self.calculate_advantages(done, rewards, values)

        return {
            'states': torch.tensor(state, dtype=torch.float32, device=self.device),
            'actions': torch.tensor(actions, device=self.device),
            'values': torch.tensor(values, device=self.device),
            'log_pis': torch.tensor(log_pis, device=self.device),
            'advantages': torch.tensor(advantages, device=self.device, dtype=torch.float32),
            'returns': torch.tensor(returns, device=self.device, dtype=torch.float32)
        }

    def _finish_episode(self):
        """Book-keep a finished episode, reset the environment and log every 10 episodes."""
        self.episode += 1
        self.undiscounted_rewards.append(np.sum(self.rewards))
        self.rewards = []
        # The next action must be chosen from the fresh initial observation,
        # not from the terminal frame of the episode that just ended.
        self.state = env.reset().__array__()
        if self.episode % 10 == 0:
            print('Episode: {}, average reward: {}'.format(self.episode, np.mean(self.undiscounted_rewards[-10:])))
            self.all_mean_rewards.append(np.mean(self.undiscounted_rewards[-10:]))
            # Make sure the output directory exists before writing into it.
            os.makedirs(self.save_directory, exist_ok=True)
            plt.plot(self.all_mean_rewards)
            plt.savefig("{}/Unidiscounted_Reward_ep_{}.png".format(self.save_directory, self.episode))
            plt.clf()
            self.save_model()

    def calculate_advantages(self, done, rewards, values):
        """Compute GAE-based returns and normalised advantages for one rollout.

        Args:
            done: per-step episode-termination flags.
            rewards: per-step rewards.
            values: per-step value estimates (same length as ``rewards``).

        Returns:
            ``(returns, advantages)`` where ``returns`` is a list with one entry
            per step (GAE + value estimate) and ``advantages`` is a NumPy array
            of ``returns - values`` standardised to zero mean / unit std.
        """
        # Bootstrap from the value of the state reached after the last step;
        # no autograd graph is needed for this forward pass.
        with torch.no_grad():
            _, last_value = self.policy_old(torch.tensor(self.state, dtype=torch.float32, device=self.device).unsqueeze(0))
        last_value = last_value.detach().cpu().numpy()
        values = np.append(values, last_value)
        returns = []
        gae = 0
        for i in reversed(range(len(rewards))):
            # mask is 0 on terminal steps so nothing leaks across episode boundaries.
            mask = 1.0 - done[i]
            delta = rewards[i] + self.gamma * values[i + 1] * mask - values[i]
            gae = delta + self.gamma * self.lamda * mask * gae
            returns.append(gae + values[i])
        # Entries were produced last-step-first; restore chronological order
        # (append + reverse avoids the quadratic cost of insert(0, ...)).
        returns.reverse()
        adv = np.array(returns) - values[:-1]
        return returns, (adv - np.mean(adv)) / (np.std(adv) + 1e-8)

    def calculate_loss(self, samples, clip_range):
        """Return the scalar PPO loss for a (mini-)batch.

        The loss combines the negated clipped surrogate objective, 0.5 times the
        value MSE, and an entropy bonus weighted by 0.01.
        """
        sampled_returns = samples['returns']
        sampled_advantages = samples['advantages']
        actionProb, value = self.policy(samples['states'])
        # Probability ratio between the current policy and the one that sampled the data.
        ratio = torch.exp(actionProb.log_prob(samples['actions']) - samples['log_pis'])
        clipped_ratio = ratio.clamp(min=1.0 - clip_range, max=1.0 + clip_range)
        policy_reward = torch.min(ratio * sampled_advantages, clipped_ratio * sampled_advantages)
        entropy_bonus = actionProb.entropy()
        vf_loss = self.mse_loss(value, sampled_returns)
        # vf_loss is already a scalar; it broadcasts over the per-sample terms.
        loss = -policy_reward + 0.5 * vf_loss - 0.01 * entropy_bonus
        return loss.mean()

    def train(self, samples, clip_range):
        """Optimise ``policy`` on ``samples`` and sync ``policy_old`` afterwards.

        The batch is shuffled once and cut into chunks of ``mini_batch_size``;
        each chunk is optimised for ``epochs`` gradient steps, after which
        ``policy_old`` receives the updated weights.
        """
        # Use the real batch length: `sample` may return fewer than
        # `batch_size` transitions when the episode limit is hit.
        n_samples = samples['states'].shape[0]
        indexes = torch.randperm(n_samples)
        for start in range(0, n_samples, self.mini_batch_size):
            end = start + self.mini_batch_size
            mini_batch_indexes = indexes[start: end]
            mini_batch = {}
            for k, v in samples.items():
                mini_batch[k] = v[mini_batch_indexes]
            for _ in range(self.epochs):
                loss = self.calculate_loss(clip_range=clip_range, samples=mini_batch)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
            self.policy_old.load_state_dict(self.policy.state_dict())

    def save_model(self):
        """Save ``policy_old`` weights to ``<save_directory>/PPOmodel_ep_<episode>.pth``."""
        # Create the target directory on demand so saving never fails on a fresh checkout.
        os.makedirs(self.save_directory, exist_ok=True)
        filename = os.path.join(self.save_directory, 'PPOmodel_ep_{}.pth'.format(self.episode))
        torch.save(self.policy_old.state_dict(), f=filename)
        #print('Model saved to \'{}\''.format(filename))

    def load_model(self, filename):
        """Load weights from ``<save_directory>/<filename>`` into both networks."""
        # Read the checkpoint once and map it onto the agent's device so that
        # checkpoints written on a GPU machine also load on a CPU-only one.
        state_dict = torch.load(os.path.join(self.save_directory, filename), map_location=self.device)
        self.policy.load_state_dict(state_dict)
        self.policy_old.load_state_dict(state_dict)
        #print('Resuming training from checkpoint \'{}\'.'.format(filename))

In [10]:
# Hyper-parameters consumed by PPOAgent.
train_parameters = {
    'gamma' : 0.99,
    'lamda' : 0.95,
    'n_mini_batch' : 4,
    'epochs' : 30,
    'save_directory' : "./Models",
    'batch_size' : 4096,
    'visual' : True,
    'episodes' : 10,
}

# The eps value for 1-eps : 1+eps clipping
PPO_clip = 0.2

# Plots and checkpoints are written here; create it up front so the first
# save does not fail on a missing directory.
os.makedirs(train_parameters['save_directory'], exist_ok=True)

solver = PPOAgent(train_parameters)
# Stop once the configured number of episodes has been played; the previous
# `while True` never terminated and had to be interrupted by hand.
while solver.episode < solver.max_episodes:
    solver.train(solver.sample(), PPO_clip)



Episode: 10, average reward: 696.9000244140625


In [None]:
if __name__ == '__main__':
    # Evaluate a saved policy for 100 episodes and record statistics for the
    # episodes in which the flag was reached.

    # set the random seed
    # np.random.seed(1234)
    # random.seed(1234)
    # torch.manual_seed(1234)

    # create training parameters
    train_parameters = {
        'gamma' : 0.99,
        'lamda' : 0.95,
        'n_mini_batch' : 4,
        'epochs' : 30,
        'save_directory' : "./mario_ppo",
        'batch_size' : 4096,
        'visual' : False,
        'episodes' : 10,
    }

    agent = PPOAgent(train_parameters)
    agent.load_model("marioppo.pth")

    gamma = 0.99
    step_list = []       # steps taken in each successful episode
    undisc_returns = []  # plain sum of rewards of each successful episode
    returns = []         # gamma-discounted return of each successful episode
    for i in range(100):
        state = env2.reset().__array__()
        step_count = 0
        done = False
        ep_reward = []
        R = 0
        G = 0
        while not done:
            # Pure inference: skip autograd bookkeeping, and run on the same
            # device the agent's networks live on instead of assuming CUDA.
            with torch.no_grad():
                obs = torch.tensor(state, dtype=torch.float32, device=agent.device).unsqueeze(0)
                actionProb, _ = agent.policy_old(obs)
                action = actionProb.sample()
            next_obs, reward, done, info = env2.step(action.item())
            flag = info['flag_get']
            ep_reward.append(reward)
            step_count += 1
            state = next_obs.__array__()
        # Only episodes whose final step reports the flag are recorded.
        if flag:
            # Walk the rewards backwards so G accumulates the discounted return.
            for r in reversed(ep_reward):
                R += r
                G = r + gamma*G
            undisc_returns.append(R)
            returns.append(G)
            step_list.append(step_count)


In [None]:
import pickle
def _dump_pickle(payload, filename):
    """Write ``payload`` to ``filename`` in binary pickle format."""
    with open(filename, 'wb') as file:
        pickle.dump(payload, file)


# Persist the evaluation statistics, one file per list.
# NOTE(review): the extension is '.pk1' (digit one), possibly intended as
# '.pkl' — confirm with whatever reads these files before renaming.
_dump_pickle(returns, 'trained_returns_ppo.pk1')
_dump_pickle(undisc_returns, 'undisc_trained_returns_ppo.pk1')
_dump_pickle(step_list, 'step_count_ppo.pk1')

In [None]:
# Close the environment once evaluation is finished. `env2` is the same
# object as `env`, so neither name can be used for stepping after this call.
env2.close()