# Imports and setup

In [None]:
import sys
from typing import Optional, Tuple

import gym
import imageio
import numpy as np
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack, VecNormalize, VecVideoRecorder
from stable_baselines3 import PPO, A2C, TD3
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.base_class import BaseAlgorithm

In [None]:
sys.path.append("..")

In [None]:
from src.env.observation_wrapper import ImageWrapper
from src.callbacks.episodic_callback import EpisodicCallback
from src.evaluation import evaluate

In [None]:
def train_test(
    model: BaseAlgorithm,
    model_path: str,
    env: gym.Env,
    eval_env: gym.Env,
    train_steps: int = 50000,
    n_test_episodes: int = 10,
    **train_kwargs: object,
) -> Tuple[float, float]:
    """Train ``model``, save it to disk, and evaluate it.

    Args:
        model: Algorithm instance to train.
        model_path: Destination passed to ``model.save``.
        env: Training environment; it is reset before training and closed
            when the function exits.
        eval_env: Environment handed to ``evaluate``; closed when the
            function exits.
        train_steps: Number of timesteps passed to ``model.learn``.
        n_test_episodes: Forwarded to ``evaluate`` as its third argument.
        **train_kwargs: Extra keyword arguments forwarded unchanged to
            ``model.learn`` (for example ``tb_log_name``).

    Returns:
        The ``(mean_reward, std_reward)`` pair produced by ``evaluate``.
    """
    try:
        env.reset()
        model.learn(train_steps, callback=EpisodicCallback(), **train_kwargs)
        model.save(model_path)
        mean_reward, std_reward = evaluate(model, eval_env, n_test_episodes)
    finally:
        # Release both environments even when training, saving or evaluation
        # fails, so an aborted run does not leave them open.
        env.close()
        eval_env.close()

    return mean_reward, std_reward

# Image processing experiments

## Simple environment

In [None]:
# Baseline experiment: unprocessed CarRacing observations, with separate
# single-environment vectorized wrappers for training and evaluation.
env = DummyVecEnv(env_fns=[lambda: gym.make("CarRacing-v0")])
eval_env = DummyVecEnv(env_fns=[lambda: gym.make("CarRacing-v0")])
model_ppo = PPO(
    policy='CnnPolicy',
    env=env,
    tensorboard_log='../logs/',
    verbose=1,
)

In [None]:
%%time
# Train the baseline agent with train_test's default step budget, save it,
# and collect the evaluation statistics.
mean_reward, std_reward = train_test(
    model=model_ppo,
    model_path="../models/PPO_base",
    env=env,
    eval_env=eval_env,
    tb_log_name="PPO_base",
)

In [None]:
mean_reward, std_reward

## Frame stacking

In [None]:
# Stack size handed to VecFrameStack for both the training and the
# evaluation environment in this experiment.
FRAME_STACK_SIZE = 4

In [None]:
# Frame-stacking experiment: both environments are wrapped in VecFrameStack
# with the same stack size so training and evaluation observations match.
env = VecFrameStack(
    venv=DummyVecEnv(env_fns=[lambda: gym.make("CarRacing-v0")]),
    n_stack=FRAME_STACK_SIZE,
)
eval_env = VecFrameStack(
    venv=DummyVecEnv(env_fns=[lambda: gym.make("CarRacing-v0")]),
    n_stack=FRAME_STACK_SIZE,
)

model_ppo = PPO(
    policy='CnnPolicy',
    env=env,
    tensorboard_log='../logs/',
    verbose=1,
)

In [None]:
%%time
# Train, save and evaluate the frame-stacked PPO agent (default step budget).
mean_reward, std_reward = train_test(
    model=model_ppo,
    model_path="../models/PPO_stack",
    env=env,
    eval_env=eval_env,
    tb_log_name="PPO_stack",
)

In [None]:
mean_reward, std_reward

## Image normalization

In [None]:
# Image-normalization experiment: colour observations passed through
# ImageWrapper with normalize=True, for both training and evaluation.
env = DummyVecEnv(
    env_fns=[lambda: ImageWrapper(gym.make("CarRacing-v0"), grayscale=False, normalize=True)],
)
eval_env = DummyVecEnv(
    env_fns=[lambda: ImageWrapper(gym.make("CarRacing-v0"), grayscale=False, normalize=True)],
)
model_ppo = PPO(
    policy='CnnPolicy',
    env=env,
    tensorboard_log='../logs/',
    verbose=1,
)

In [None]:
%%time
# Train for 100k steps on normalized images, then save and evaluate.
mean_reward, std_reward = train_test(
    model=model_ppo,
    model_path="../models/PPO_normalized",
    env=env,
    eval_env=eval_env,
    train_steps=100000,
    tb_log_name="PPO_normalized",
)

In [None]:
mean_reward, std_reward

## Image processing - grayscale

In [None]:
# Grayscale experiment: observations go through ImageWrapper with
# grayscale=True in both the training and the evaluation environment.
env = DummyVecEnv(
    env_fns=[lambda: ImageWrapper(gym.make("CarRacing-v0"), grayscale=True)],
)
eval_env = DummyVecEnv(
    env_fns=[lambda: ImageWrapper(gym.make("CarRacing-v0"), grayscale=True)],
)
model_ppo = PPO(
    policy='CnnPolicy',
    env=env,
    tensorboard_log='../logs/',
    verbose=1,
)

In [None]:
%%time
# Train for 100k steps on grayscale images, then save and evaluate.
mean_reward, std_reward = train_test(
    model=model_ppo,
    model_path="../models/PPO_grayscale",
    env=env,
    eval_env=eval_env,
    train_steps=100000,
    tb_log_name="PPO_grayscale",
)

In [None]:
mean_reward, std_reward

## Reward normalization

In [None]:
# Reward-normalization experiment: only the training env is wrapped in
# VecNormalize (rewards only, observations untouched). The evaluation env is
# left unwrapped, so its rewards do not pass through VecNormalize.
env = VecNormalize(
    venv=DummyVecEnv(
        env_fns=[lambda: ImageWrapper(gym.make("CarRacing-v0"), grayscale=False)],
    ),
    norm_obs=False,
    norm_reward=True,
)

eval_env = DummyVecEnv(
    env_fns=[lambda: ImageWrapper(gym.make("CarRacing-v0"), grayscale=False)],
)
model_ppo = PPO(
    policy='CnnPolicy',
    env=env,
    tensorboard_log='../logs/',
    verbose=1,
)

In [None]:
%%time
# Train for 100k steps with normalized rewards, then save and evaluate.
# reset_num_timesteps is not passed here, so model.learn uses its default.
mean_reward, std_reward = train_test(
    model=model_ppo,
    model_path="../models/PPO_reward_norm",
    env=env,
    eval_env=eval_env,
    train_steps=100000,
    tb_log_name="PPO_reward_norm",
)

In [None]:
# Re-run the evaluation of the reward-normalized agent on eval_env, passing 10
# as the third argument (the slot train_test fills with n_test_episodes).
# NOTE(review): train_test() closes eval_env before returning — confirm the
# environment is still usable after close(), or recreate it before this call.
mean_reward, std_reward = evaluate(model_ppo, eval_env, 10)

In [None]:
mean_reward, std_reward

# A2C grayscale

In [None]:
# A2C comparison run on the same grayscale observations used for PPO.
env = DummyVecEnv(
    env_fns=[lambda: ImageWrapper(gym.make("CarRacing-v0"), grayscale=True)],
)
eval_env = DummyVecEnv(
    env_fns=[lambda: ImageWrapper(gym.make("CarRacing-v0"), grayscale=True)],
)
model_a2c = A2C(
    policy='CnnPolicy',
    env=env,
    tensorboard_log='../logs/',
    verbose=1,
)

In [None]:
%%time
# Train the A2C agent for 100k steps on grayscale images, then save and evaluate.
mean_reward, std_reward = train_test(
    model=model_a2c,
    model_path="../models/A2C_grayscale",
    env=env,
    eval_env=eval_env,
    train_steps=100000,
    tb_log_name="A2C_grayscale",
)

In [None]:
mean_reward, std_reward

### **Summary**: The best result was achieved by using grayscale image processing. This transformation will be used in the next experiments.

# Train the best model for longer

In [None]:
# Recreate the grayscale environments and reload the saved grayscale PPO
# checkpoint, attaching it to the fresh training env.
env = DummyVecEnv(
    env_fns=[lambda: ImageWrapper(gym.make("CarRacing-v0"), grayscale=True)],
)
eval_env = DummyVecEnv(
    env_fns=[lambda: ImageWrapper(gym.make("CarRacing-v0"), grayscale=True)],
)

model = PPO.load(path="../models/PPO_grayscale", env=env)

In [None]:
# Continue training the reloaded model for another 300k steps.
# reset_num_timesteps=False is forwarded to model.learn, and the result is
# saved back to the same path the checkpoint was loaded from.
mean_reward, std_reward = train_test(
    model=model,
    model_path="../models/PPO_grayscale",
    env=env,
    eval_env=eval_env,
    train_steps=300000,
    tb_log_name="PPO_grayscale",
    reset_num_timesteps=False,
)

In [None]:
# Re-run the evaluation of the longer-trained model on eval_env, passing 10 as
# the third argument, and display the result as the cell output.
# NOTE(review): train_test() closes eval_env before returning — confirm the
# environment is still usable after close(), or recreate it before this call.
mean_reward, std_reward = evaluate(model, eval_env, 10)
mean_reward, std_reward

In [None]:
# Generate gameplay video
# Roll out one episode with the trained model in a fresh grayscale environment
# and export the rendered frames as a GIF.
test_env = DummyVecEnv([lambda: ImageWrapper(gym.make("CarRacing-v0"), grayscale=True)])

images = []
try:
    obs = test_env.reset()
    done = False
    while not done:
        # Capture the frame for the current state before acting on it.
        images.append(test_env.render(mode='rgb_array'))
        # predict() returns an (action, state) pair; only the action is used.
        action, _ = model.predict(obs)
        obs, _, dones, _ = test_env.step(action)
        # The vectorized env holds a single environment, so index 0 is its done flag.
        done = dones[0]

    # Export only every second captured frame.
    imageio.mimsave('../resources/gameplay.gif', [np.asarray(frame) for frame in images[::2]], fps=29)
finally:
    # Release the environment even if the rollout or the GIF export fails.
    test_env.close()