In [None]:
%matplotlib inline
import torch
import torch.nn as nn
import torch.nn.functional as F

import optuna

from DQN_parametrized import DQN_parametrized 

import gym
import matplotlib.pyplot as plt
from itertools import count

import torch
import torch.optim as optim
import torch.nn.functional as F

from memory import ReplayMemory
from DQN import DQN
from preprocessing import get_screen
from utils import select_action, plot_scores, reset_steps
from training import optimize_model

In [None]:
# Single Gym environment instance shared by `objective` and `run_best` below.
env = gym.make('LunarLander-v2')

In [None]:
# Number of discrete actions; passed to the network constructor and to
# `select_action` in the training functions below.
n_actions = env.action_space.n
# Grab one frame up front only to learn the input resolution.
# NOTE(review): no env.reset() is visible before this call — confirm that
# get_screen can render a freshly created environment.
init_screen = get_screen(env)
# The unpacking requires a 4-D shape; presumably
# (batch, channels, height, width) — TODO confirm against get_screen.
_, _, screen_height, screen_width = init_screen.shape

In [None]:
# Turn on matplotlib's interactive mode for the plotting done during training.
plt.ion()

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Module-level slots shared between `objective` and `callback`:
# `objective` rebinds `target_net` on every trial, and `callback` copies that
# reference into `best_booster` whenever the trial that just finished is the
# study's current best.
best_booster = None
target_net = None


def callback(study, trial):
    """Keep a reference to the target network of the best trial so far.

    Meant to be passed as ``study.optimize(..., callbacks=[callback])``.

    Args:
        study: The study being optimized; only ``study.best_trial`` is read.
        trial: The trial that has just finished.

    Side effects:
        Rebinds the module-level ``best_booster`` to the current module-level
        ``target_net`` when ``trial`` is the study's best trial.
    """
    global best_booster
    try:
        best_trial = study.best_trial
    except ValueError:
        # Optuna raises ValueError while no trial has completed yet (for
        # example when every trial so far was pruned); nothing to record.
        return
    # Compare trial numbers rather than whole trial objects: the number
    # identifies a trial within its study, so the match does not depend on
    # every other field of the two snapshot objects comparing equal.
    if best_trial.number == trial.number:
        best_booster = target_net

In [None]:
def objective(trial):
    """Optuna objective: train a DQN for a short run and score the trial.

    Builds two ``DQN_parametrized`` networks from ``trial`` (the network that
    is trained, plus a second one loaded with the same weights and put in eval
    mode to act as the target network), trains on the module-level ``env``
    for 80 episodes and returns the mean episode reward.

    Args:
        trial: Optuna trial handed to ``DQN_parametrized``.

    Returns:
        Sum of the per-episode rewards divided by the number of episodes.

    Side effects:
        Rebinds the module-level ``target_net`` (read by ``callback``), steps
        the shared ``env`` and prints one line per episode.
    """
    global target_net

    n_episodes = 80
    replay_capacity = 1000
    sync_every = 1000  # environment steps between target-network refreshes
    eps_decay = 1000

    # Per the original note: resets the epsilon of the epsilon-greedy policy.
    reset_steps()

    def build_net():
        return DQN_parametrized(screen_height, screen_width, n_actions, trial).to(device)

    pred_net = build_net()

    # The target network starts as a copy of the trained network's weights.
    target_net = build_net()
    target_net.load_state_dict(pred_net.state_dict())
    target_net.eval()

    memory = ReplayMemory(replay_capacity)
    # The optimizer only holds pred_net's parameters, so target_net changes
    # solely through the periodic weight copy further down.
    optimizer = optim.RMSprop(pred_net.parameters())

    episode_rewards = []
    total_steps = 0
    for episode in range(n_episodes):
        # Fresh episode: reset the environment and take the first frame.
        env.reset()
        state = get_screen(env).to(device)
        episode_return = 0

        while True:
            # Choose an action with the trained network and apply it. The
            # environment's own observation is discarded; states come from
            # get_screen instead.
            action = select_action(pred_net, state, n_actions, eps_decay=eps_decay).to(device)
            _, step_reward, done, _ = env.step(action.item())
            episode_return += step_reward

            reward_tensor = torch.tensor([step_reward], device=device)

            # A terminal transition is stored with next_state = None.
            next_state = None if done else get_screen(env).to(device)
            memory.push(state, action, next_state, reward_tensor)
            state = next_state

            # One optimization step per environment step.
            optimize_model(device, pred_net, target_net, optimizer, memory)

            total_steps += 1
            if total_steps % sync_every == 0:
                # Refresh the target network from the trained network.
                target_net.load_state_dict(pred_net.state_dict())

            if done:
                break

        episode_rewards.append(episode_return)
        print(episode, 'reward:', episode_return)

    return sum(episode_rewards) / n_episodes

In [None]:
# `objective` returns a mean episode reward, so higher is better.
study = optuna.create_study(direction='maximize')

In [None]:
%%time
# Run 20 trials of `objective`; `callback` is invoked after each trial and
# keeps a reference to the best trial's target network in `best_booster`.
study.optimize(objective, n_trials=20, callbacks=[callback])

In [None]:
# Display the best trial found by the search (value and sampled parameters).
study.best_trial

In [None]:
def run_best(trial):
    """Train a DQN with the given trial's hyperparameters for a long run.

    Same training procedure as ``objective`` but for 1000 episodes, and the
    running list of episode rewards is plotted after every episode instead of
    being printed. New networks are built from ``trial``, so training starts
    from freshly constructed weights.

    Args:
        trial: Trial object handed to ``DQN_parametrized`` (the notebook
            passes ``study.best_trial``).

    Returns:
        Sum of the per-episode rewards divided by the number of episodes.

    Side effects:
        Rebinds the module-level ``target_net``, steps the shared ``env`` and
        calls ``plot_scores`` once per episode.
    """
    global target_net

    n_episodes = 1000
    replay_capacity = 1000
    sync_every = 1000  # environment steps between target-network refreshes
    eps_decay = 1000

    # Per the original note: resets the epsilon of the epsilon-greedy policy.
    reset_steps()

    def build_net():
        return DQN_parametrized(screen_height, screen_width, n_actions, trial).to(device)

    pred_net = build_net()

    # The target network starts as a copy of the trained network's weights.
    target_net = build_net()
    target_net.load_state_dict(pred_net.state_dict())
    target_net.eval()

    memory = ReplayMemory(replay_capacity)
    # The optimizer only holds pred_net's parameters, so target_net changes
    # solely through the periodic weight copy further down.
    optimizer = optim.RMSprop(pred_net.parameters())

    episode_rewards = []
    total_steps = 0
    for _episode in range(n_episodes):
        # Fresh episode: reset the environment and take the first frame.
        env.reset()
        state = get_screen(env).to(device)
        episode_return = 0

        while True:
            # Choose an action with the trained network and apply it. The
            # environment's own observation is discarded; states come from
            # get_screen instead.
            action = select_action(pred_net, state, n_actions, eps_decay=eps_decay).to(device)
            _, step_reward, done, _ = env.step(action.item())
            episode_return += step_reward

            reward_tensor = torch.tensor([step_reward], device=device)

            # A terminal transition is stored with next_state = None.
            next_state = None if done else get_screen(env).to(device)
            memory.push(state, action, next_state, reward_tensor)
            state = next_state

            # One optimization step per environment step.
            optimize_model(device, pred_net, target_net, optimizer, memory)

            total_steps += 1
            if total_steps % sync_every == 0:
                # Refresh the target network from the trained network.
                target_net.load_state_dict(pred_net.state_dict())

            if done:
                break

        episode_rewards.append(episode_return)
        plot_scores(episode_rewards)

    return sum(episode_rewards) / n_episodes

In [None]:
# Long training run using the hyperparameters of the best trial. `run_best`
# builds new networks, so this does not reuse the network kept in `best_booster`.
run_best(study.best_trial)