## Watch a Smart Agent!

### 1. Start the Environment for the Trained Agent

In [1]:
import numpy as np
import torch
import gym
import os
import time

from agent import Agent

# Select the CUDA device when torch reports one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def rgb2gray(rgb, norm=True):
    """Convert an RGB (or RGBA) image to a single-channel grayscale image.

    The first three channels of the last axis are combined with the weights
    0.299 / 0.587 / 0.114. Any further channel (for example alpha) is ignored.

    Args:
        rgb: array-like whose last axis holds the colour channels.
        norm: when True, rescale the weighted sum with ``gray / 128 - 1``
            (for 0..255 inputs this gives values in roughly [-1, 1));
            when False, return the weighted sum unchanged.

    Returns:
        numpy.ndarray: the input with its channel axis reduced away.
    """
    # Keep only R, G, B so frames carrying an extra channel also work.
    gray = np.dot(np.asarray(rgb)[..., :3], [0.299, 0.587, 0.114])
    if norm:
        # normalize around zero
        gray = gray / 128. - 1.
    return gray

# Evaluation settings shared by the cells below.
seed = 0            # RNG seed applied to the environment, torch and numpy
img_stack = 4       # number of grayscale frames stacked into one state
action_repeat = 10  # maximum environment steps taken per chosen action

env = gym.make('CarRacing-v0', verbose=0)

# Apply the seed so that a fresh kernel run is repeatable. Without these
# calls `seed` is defined but has no effect.
# NOTE(review): assumes the agent draws its randomness from torch/numpy.
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

state = env.reset()
reward_threshold = env.spec.reward_threshold

In [2]:
class Wrapper():
    """
    Environment wrapper for CarRacing.

    Adds grayscale frame stacking, action repetition and reward shaping on
    top of the wrapped environment. The stack depth and the repeat count are
    read from the notebook-level ``img_stack`` and ``action_repeat`` settings,
    and frames are converted with the notebook-level ``rgb2gray``.
    """

    def __init__(self, env):
        """Remember the environment that ``reset`` and ``step`` operate on."""
        self.env = env

    def reset(self):
        """Start a new episode on the wrapped environment.

        Returns:
            np.ndarray: the first grayscale frame repeated ``img_stack``
            times along a new leading axis.
        """
        self.counter = 0
        # Fresh running-average tracker for every episode.
        self.av_r = self.reward_memory()

        self.die = False
        # Drive the environment handed to __init__, not a global one.
        img_rgb = self.env.reset()
        img_gray = rgb2gray(img_rgb)
        self.stack = [img_gray] * img_stack  # img_stack frames per decision
        return np.array(self.stack)

    def step(self, action):
        """Repeat ``action`` for up to ``action_repeat`` environment steps.

        Args:
            action: action forwarded unchanged to the wrapped environment.

        Returns:
            tuple: ``(state, total_reward, done, die)`` where ``state`` is the
            updated frame stack, ``total_reward`` is the shaped reward summed
            over the steps taken, ``done`` is True when the running mean of
            the last 100 shaped rewards is <= -0.1, and ``die`` is the done
            flag returned by the wrapped environment on the last step taken.
        """
        total_reward = 0
        for _ in range(action_repeat):
            img_rgb, reward, die, _info = self.env.step(action)
            # don't penalize "die state"
            if die:
                reward += 100
            # green penalty: mean of the green channel above 185
            if np.mean(img_rgb[:, :, 1]) > 185.0:
                reward -= 0.05
            total_reward += reward
            # if no reward recently, end the episode
            done = bool(self.av_r(reward) <= -0.1)
            if done or die:
                break
        # Slide the window: drop the oldest frame, append the newest one.
        img_gray = rgb2gray(img_rgb)
        self.stack.pop(0)
        self.stack.append(img_gray)
        assert len(self.stack) == img_stack
        return np.array(self.stack), total_reward, done, die


    @staticmethod
    def reward_memory():
        """Build a closure that tracks the mean of the last 100 rewards.

        The buffer starts as all zeros, so early means are diluted by the
        unfilled slots.

        Returns:
            callable: ``memory(reward)`` stores ``reward`` in a circular
            buffer and returns the mean of the whole buffer.
        """
        # record reward for last 100 steps
        count = 0
        length = 100
        history = np.zeros(length)

        def memory(reward):
            nonlocal count
            history[count] = reward
            # Wrap around so the oldest entry is overwritten next.
            count = (count + 1) % length
            return np.mean(history)

        return memory
    
# Agent from the project-local `agent` module, constructed with the selected device.
agent = Agent(device)

# Wrapped environment; `play` uses this instance for every reset and step.
env_wrap = Wrapper(env)    

### 2. Prepare Load

In [3]:
def load(agent, directory, filename):
    """Load saved network weights into ``agent.net`` in place.

    Args:
        agent: object exposing a ``net`` module with ``load_state_dict``.
        directory: folder that contains the checkpoint file.
        filename: name of the checkpoint file inside ``directory``.
    """
    checkpoint_path = os.path.join(directory, filename)
    # Deserialize onto the CPU so a checkpoint saved on a GPU machine also
    # loads on a CPU-only host. load_state_dict then copies each tensor into
    # the network's existing parameters, whatever device they live on.
    # NOTE: torch.load unpickles the file - only load trusted checkpoints.
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    agent.net.load_state_dict(state_dict)

### 3. Prepare Player

In [4]:
from collections import deque
import os

def play(env, agent, n_episodes):
    """Run ``n_episodes`` rendered episodes with ``agent`` and print each score.

    Resetting and stepping go through the notebook-level ``env_wrap``;
    ``env`` is used only to render a frame before every decision. The
    environment is reset exactly once per episode.

    Args:
        env: environment whose ``render()`` is called once per decision.
        agent: object whose ``select_action(state)`` returns a pair; the
            first item is the action, the second is ignored here.
        n_episodes: number of episodes to play.
    """
    # Rolling window used for the printed average (at most 100 scores).
    scores_deque = deque(maxlen=100)

    for i_episode in range(1, n_episodes+1):
        state = env_wrap.reset()
        score = 0

        time_start = time.time()

        while True:
            action, _ = agent.select_action(state)
            env.render()
            # Rescale the action before stepping: the first component
            # becomes 2*a - 1, the other two are passed through unchanged.
            next_state, reward, done, die = env_wrap.step(
                action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))

            state = next_state
            score += reward

            if done or die:
                break

        # Whole seconds elapsed in this episode, printed as HH:MM:SS.
        s = int(time.time() - time_start)

        scores_deque.append(score)

        print('Episode {}\tAverage Score: {:.2f},\tScore: {:.2f} \tTime: {:02}:{:02}:{:02}'\
                  .format(i_episode, np.mean(scores_deque), score, s//3600, s%3600//60, s%60))


### 3. Load and Play: Score = 350-550

In [11]:
# Load the 'model_weights_350-550.pth' checkpoint from 'dir_chk' into the agent,
# then play five rendered episodes with it.
load(agent, 'dir_chk', 'model_weights_350-550.pth')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 63.53,	Score: 63.53 	Time: 00:00:04
Episode 2	Average Score: 305.90,	Score: 548.28 	Time: 00:00:10
Episode 3	Average Score: 370.60,	Score: 500.00 	Time: 00:00:11
Episode 4	Average Score: 366.48,	Score: 354.09 	Time: 00:00:07
Episode 5	Average Score: 304.39,	Score: 56.03 	Time: 00:00:05


### 4. Load and Play: Score = 580-660

In [6]:
# Load the 'model_weights_480-660.pth' checkpoint from 'dir_chk' into the agent,
# then play five rendered episodes with it.
# NOTE(review): the section heading says 580-660 but the file name says 480-660;
# confirm which range is intended.
load(agent, 'dir_chk', 'model_weights_480-660.pth')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 603.72,	Score: 603.72 	Time: 00:00:12
Episode 2	Average Score: 593.94,	Score: 584.16 	Time: 00:00:11
Episode 3	Average Score: 432.31,	Score: 109.06 	Time: 00:00:08
Episode 4	Average Score: 480.99,	Score: 627.01 	Time: 00:00:11
Episode 5	Average Score: 517.67,	Score: 664.38 	Time: 00:00:11


### 5. Load and Play: Score = 820-980

In [7]:
# Load the 'model_weights_820-980.pth' checkpoint from 'dir_chk' into the agent,
# then play five rendered episodes with it.
load(agent, 'dir_chk', 'model_weights_820-980.pth')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 1003.80,	Score: 1003.80 	Time: 00:00:10
Episode 2	Average Score: 958.42,	Score: 913.04 	Time: 00:00:11
Episode 3	Average Score: 943.30,	Score: 913.04 	Time: 00:00:11
Episode 4	Average Score: 943.02,	Score: 942.18 	Time: 00:00:11
Episode 5	Average Score: 938.26,	Score: 919.25 	Time: 00:00:11


In [12]:
# Close the environment now that all playback cells have run.
env.close()