# 00 Init

## Mount

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Setting to use py files

In [2]:
import os

In [3]:
os.chdir('/content/drive/MyDrive/Minesweeper [RL]')

In [4]:
# check that os is in right directory
os.getcwd()

'/content/drive/MyDrive/Minesweeper [RL]'

In [5]:
! pip install codes

Collecting codes
  Downloading codes-0.1.5-py3-none-any.whl (5.5 kB)
Installing collected packages: codes
Successfully installed codes-0.1.5


## Import py files

In [11]:
# baseline : Env, Agent
# from codes.environment.reward5 import *
from codes.environment.env5reward import *
from codes.agent.vectorDQN import *
from codes.net.basic import *
from codes.trainer.validShutDown import *
# import codes.trainer.trainerWithValidShutDown as Trainer


## Import Libraries

# 01 Info

## level dictionary

In [7]:
level = {'easy' : {'map_size':(9,9), 'n_mines' : 10},
         'medium' : {'map_size':(16,16), 'n_mines':40},
         'expert' : {'map_size':(16,30), 'n_mines':99}}

## HYPER PARAMETERS

In [8]:
# Environment settings
MEM_SIZE = 50000
MEM_SIZE_MIN = 1000

# Learning settings
BATCH_SIZE = 64
LEARNING_RATE = 0.01
LEARN_DECAY = 0.99975
LEARN_MIN = 0.001
DISCOUNT = 0.1

# Exploration settings
EPSILON = 0.95
EPSILON_DECAY = 0.99975
EPSILON_MIN = 0.01

# DQN settings
CONV_UNITS = 64
UPDATE_TARGET_EVERY = 5

# 02 Train, Valid

In [6]:
import matplotlib.pyplot as plt

class Net(nn.Module):
    def __init__(self, input_dims, n_actions, conv_units):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=conv_units, kernel_size=(3,3), bias=False, padding=2)
        self.conv2 = nn.Conv2d(in_channels=conv_units, out_channels=conv_units, kernel_size=(3,3), bias=False, padding=1)
        self.conv3 = nn.Conv2d(in_channels=conv_units, out_channels=conv_units, kernel_size=(3,3), bias=False, padding=1)
        self.conv4 = nn.Conv2d(in_channels=conv_units, out_channels=conv_units, kernel_size=(3,3), bias=False, padding=1)

        self.flatten = nn.Flatten()

        fc_size = conv_units * (input_dims[-1]+2) * (input_dims[-2]+2)

        self.fc = nn.Linear(fc_size, n_actions)

    def forward(self, x):
        # conv area
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))

        x = self.flatten(x)
        # flatten area
        x = self.fc(x)

        return x

In [3]:
import pandas as pd
import numpy as np
import copy
from collections import deque
from IPython.display import display
import pickle
import random

import torch
import torch.nn as nn
import torch.optim as optim

import random
import numpy as np
from collections import deque

import torch.nn as nn
import torch.nn.functional as F

class MinesweeperEnv:
    '''
    This env has 5 rewards : win, lose, progress, guess, and no_progress.
    '''
    def __init__(self,
                 map_size,
                 n_mines,
                 rewards={'win':1, 'lose':-1, 'progress':0.3, 'guess':-0.3, 'no_progress' : -0.3},
                 dones={'win':True, 'lose':True, 'progress':False, 'guess':False, 'no_progress' : False}):

        # 지뢰찾기 맵에 대한 기본 정보
        self.map_size = map_size
        self.nrows, self.ncols = map_size
        self.total_tiles = self.nrows*self.ncols # n_tiles에서 변경함
        self.total_mines = n_mines

        # 학습을 위한 정보
        self.rewards = rewards
        self.dones = dones

        # 지뢰찾기 판 생성
        self.board = self.make_init_board()

        # state 생성
        self.state = self.create_state(self.board)

        # 상황 판단을 위한 파라미터
        self.unrevealed = -1.0 / 8.0

    def seed_mines(self):
        actual_board = np.zeros(shape=self.total_tiles, dtype='object')

        # 지뢰 생성
        mine_indices = np.random.choice(self.total_tiles, self.total_mines, replace=False)
        actual_board[mine_indices] = "M"

        # actual board map_size로 복구
        actual_board = actual_board.reshape(self.map_size)

        return actual_board

    def complete_actual_board(self, actual_board):
        padded_actual_board = np.pad(actual_board, pad_width=1, mode='constant', constant_values=0)
        completed_actual_board = actual_board

        for x in range(0, self.nrows):
            for y in range(0, self.ncols):
                if actual_board[x, y] == "M":
                    continue
                else:
                    kernel = padded_actual_board[x:x+3, y:y+3] # padded_actual_board에서의 x,y값은 기존의 +1이라서
                    # kernel[1,1] = 0 _ 논리 상으로는 있는게 맞지만 없어도 문제는 안된다. 중앙이 지뢰일 경우가 없기 때문에
                    completed_actual_board[x, y] = np.sum(kernel == 'M')

        return completed_actual_board

    def make_init_board(self):
        board = np.ones(shape=(2,self.nrows, self.ncols),dtype='object') # (revealed_or_not, game_board)
        actual_board = self.seed_mines()
        actual_board = self.complete_actual_board(actual_board)
        board[1] = actual_board

        return board

    def create_state(self, board):
        revealed_mask = board[0]
        actual_board = copy.deepcopy(board[1])

        # trainable한 형태로 변환
        actual_board[actual_board == "M"] = -2

        masked_state = np.ma.masked_array(actual_board,revealed_mask)
        masked_state = masked_state.filled(-1) # -1은 unrevealed를 의미한다.

        scaled_state = masked_state / 8
        scaled_state = scaled_state.astype(np.float16)

        return scaled_state

    def get_coord(self, action_idx):
        # 선택한 action을 더 가시적이게 나타내기 위해

        x = action_idx // self.ncols
        y = action_idx % self.ncols

        return (x, y)

    def click(self, action_idx):
        # click한 타일을 reveal
        clicked_coord = self.get_coord(action_idx)
        self.board[0][clicked_coord] = 0
        value = self.board[1][clicked_coord]

        unrevealed_mask = self.board[0] # revealed : 0, unrevealed : 1
        actual_board = self.board[1].reshape(1,self.total_tiles)

        # 첫 번째로 선택한 타일은 지뢰가 아니어야 함.
        if (value == 'M') & (np.sum(unrevealed_mask == 0) == 1):
            safe_tile_indices = np.nonzero(actual_board!='M')[1]
            another_move_idx = np.random.choice(safe_tile_indices)
            another_move_coord = self.get_coord(another_move_idx)

            # 지뢰를 이전한다.
            self.board[1][another_move_coord] = 'M'
            self.board[1][clicked_coord] = 0 # 초기화 용

            # 갱신한 내용을 바탕으로 다시 판을 계산한다.
            self.board[1] = self.complete_actual_board(self.board[1])
            value = self.board[1][clicked_coord]

        # 선택한 타일이 0이라면 주변의 타일이 깨진다.
        if value == 0.0:
            self.reveal_neighbors(clicked_coord)

    def reveal_neighbors(self, coord):
        queue = deque([coord])
        seen = set([coord])
        while queue:
            current_coord = queue.popleft()
            x,y = current_coord

            if self.board[1][x,y] == 0:
                for col in range(max(0,y-1), min(y+2, self.ncols)):
                    for row in range(max(0,x-1), min(x+2,self.nrows)):
                        if (row, col) not in seen:
                            seen.add((row, col))
                            queue.append((row, col))

                            self.board[0][row, col] = 0

    def reset(self):
        # 지뢰찾기 판 생성
        self.board = self.make_init_board()
        # state 생성
        self.state = self.create_state(self.board)

    def step(self, action_idx):
        done = False
        coord = self.get_coord(action_idx)

        current_mask = copy.deepcopy(self.board[0])

        # action에 따라 행동을 수행
        self.click(action_idx)

        # update state
        next_state = self.create_state(self.board)
        self.state = next_state

        # About Reward
        if self.board[1][coord] == 'M':
            reward = self.rewards['lose']
            done = self.dones['lose']

        elif np.sum(self.board[0] == 1) == self.total_mines:
            reward = self.rewards['win']
            done = self.dones['win']

        elif current_mask[coord] == 0: # 이미 깐 타일을 눌렀을 때
            reward = self.rewards['no_progress']
            done = self.dones['no_progress']

        else:
            padded_unrevealed = np.pad(current_mask, pad_width=1, mode='constant', constant_values=1)

            if np.sum(padded_unrevealed[coord[0]:coord[0]+3, coord[1]:coord[1]+3] == 1) == 9: # 아무 정보 없이 누른 타일
                reward = self.rewards['guess']
                done = self.dones['guess']

            else:
                reward = self.rewards['progress']
                done = self.dones['progress']

        return self.state, reward, done

    def render(self, state):
        # 원래 값으로 복구한 뒤 시각화한다.
        state = (state * 8.0).astype(np.int8)
        state = state.astype(object)
        state[state == -1] = '.'
        state[state == -2] = 'M'
        state_df = pd.DataFrame(state.reshape((self.map_size)))

        display(state_df.style.applymap(self.color_state))
        print(" ")

    def color_state(self, value):
        if value == '.':
            color = 'white'
        elif value == 0:
            color = 'slategrey'
        elif value == 1:
            color = 'blue'
        elif value == 2:
            color = 'green'
        elif value == 3:
            color = 'red'
        elif value == 4:
            color = 'midnightblue'
        elif value == 5:
            color = 'brown'
        elif value == 6:
            color = 'aquamarine'
        elif value == 7:
            color = 'black'
        elif value == 8:
            color = 'silver'
        else:
            color = 'magenta'

        return f'color: {color}'

class LimitedMinesweeperEnv(MinesweeperEnv):
    def __init__(self, map_size, n_mines, total_boards=None, train=True):
        super().__init__(map_size, n_mines)

        self.train = train

        if total_boards is None:
            with open("/content/drive/MyDrive/Minesweeper [RL]/dataset/easy1000boards.pkl","rb") as f:
                self.total_boards = pickle.load(f)
        else:
            self.total_boards = total_boards

        self.n_boards = len(self.total_boards)

        if train:
            self.board = self.total_boards[0]
        else:
            self.board_iteration = iter(total_boards)
            self.board = next(self.board_iteration)

    def reset(self):
        self.n_clicks = 0
        self.n_progress = 0

        if self.train:
            self.board = random.choice(self.total_boards)
            self.board[0] = np.ones(shape=self.map_size) # board가 수정되기 때문에 초기화해줘야 한다.

        else:
            self.board = next(self.board_iteration)

        self.state = self.create_state(self.board)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import time

In [18]:
class Agent:
    def __init__(self, env, net, **kwargs):
        self.env = env

        # Environment Settings
        self.mem_size = kwargs.get("MEM_SIZE")
        self.mem_size_min = kwargs.get("MEM_SIZE_MIN")
        print(self.mem_size_min)

        # Learning Settings
        self.batch_size = kwargs.get("BATCH_SIZE")
        self.learning_rate = kwargs.get("LEARNING_RATE")
        self.learn_decay = kwargs.get("LEARN_DECAY")
        self.learn_min = kwargs.get("LEARN_MIN")
        self.discount = kwargs.get("DISCOUNT")

        # Exploration Settings
        self.epsilon = kwargs.get("EPSILON")
        self.epsilon_decay = kwargs.get("EPSILON_DECAY")
        self.epsilon_min = kwargs.get("EPSILON_MIN")

        self.update_target_baseline = kwargs.get("UPDATE_TARGET_EVERY")

        self.model = net
        self.target_model = net

        self.target_model.load_state_dict(self.model.state_dict())

        self.replay_memory = deque(maxlen=self.mem_size)

        self.loss_fn = nn.MSELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate, eps=1e-4)

        self.model.to(device)
        self.target_model.to(device)

        self.target_update_counter = 0

        self.losses = []

    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def update_replay_memory(self, transition):
        self.replay_memory.append(transition)

    def get_action(self, state):
        '''
        get_action은 하나의 state_img만을 받는다.
        '''
        if np.random.random() < self.epsilon:
            # take random action
            action = np.random.choice(range(self.env.total_tiles))

        else:
            self.model.eval()

            with torch.no_grad():
                state = torch.tensor(state.reshape(1,1,self.env.nrows,self.env.ncols),
                                     dtype=torch.float32).to(device)
                total_action = self.model(state).view(-1)
                total_action = total_action.cpu()

                self.total_action = total_action

                action = torch.argmax(total_action).item()

        return action

    def train(self, done):
        if len(self.replay_memory) < self.mem_size_min:
            print(len(self.replay_memory))
            print(self.mem_size_min)
            return
        print('train')
        # 리플레이 메모리에서 배치 사이즈만큼 데이터를 꺼낸다.
        # batch[i] = (current_state, action, reward, new_current_state, done)
        batch = random.sample(self.replay_memory, self.batch_size)

        # 배치 안에 저장되어 있는 정보 꺼내기
        current_states, _, _, next_states, _ = zip(*batch)


        current_states =  torch.tensor(np.array(current_states), dtype=torch.float32).reshape(-1,1,self.env.nrows,self.env.ncols).to(device)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32).reshape(-1,1,self.env.nrows,self.env.ncols).to(device)

        self.model.eval()
        self.target_model.eval()

        with torch.no_grad():
            current_q_values = self.model(current_states).reshape(-1,self.env.total_tiles).cpu().detach().tolist()
            next_q_values = self.target_model(next_states).cpu().detach().numpy()

        #  current_q_values를 target value가 되도록 업데이트하는 코드
        for index, (_, action, reward, _, epi_done) in enumerate(batch):
            if not epi_done:
                max_future_q = np.max(next_q_values[index])
                new_q = reward + self.discount * max_future_q
            else:
                new_q = reward

            current_q_values[index][action] = new_q

        # train model
        self.model.train()

        x = current_states.to(device)
        y = torch.tensor(np.array(current_q_values), dtype=torch.float32).to(device)

        y_est = self.model(x)

        cost = self.loss_fn(y_est, y)

        running_loss = cost.item()
        self.losses.append(round(running_loss,6))

        self.optimizer.zero_grad()
        cost.backward()
        self.optimizer.step()

        if done:
            self.target_update_counter += 1

        if self.target_update_counter > self.update_target_baseline:
            self.target_model.load_state_dict(self.model.state_dict())
            self.target_update_counter = 0

        # decay learning rate
        self.learning_rate = max(self.learn_min, self.learning_rate*self.learn_decay)

        # decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon*self.epsilon_decay)

        print(self.epsilon)


In [29]:
class Trainer:
    def __init__(self, env, agent, tester_agent, name, train_start=True, **kwargs):
        self.env = env
        self.agent = agent

        self.progress_list = []
        self.wins_list = []
        self.ep_rewards_list = []

        self.name = name
        self.tester_agent = tester_agent

        self.best_model_train = None
        self.best_model_valid = None

        self.baseline_train = 0
        self.baseline_valid = 0

        self.simple_valid = 0

        # Parameters
        self.episodes = kwargs.get("EPISODES")

        self.print_interval = kwargs.get("PRINT_INTERVAL")
        self.train_render = kwargs.get("TRAIN_RENDER")
        self.train_timestep = kwargs.get("TRAIN_TIMESTEP")

        self.valid_sample = kwargs.get("VALID_SAMPLE")
        self.valid_interval = kwargs.get("VALID_INTERVAL")

        self.visual_interval = kwargs.get("VIUSAL_INTERVAL")
        self.interval = 500

        if train_start:
            self.train()
            self.visualize_train()
            self.save_model()

    def train(self):
        TRAIN_TIMESTEPS = ['every timestep', 'every episodes']
        print(TRAIN_TIMESTEPS)
        print(self.train_timestep)
        start = time.time()

        win_rate = 0
        valid_win_rate = 0

        for episode in range(self.episodes):
            # print(len(self.agent.replay_memory))
            self.env.reset()

            n_clicks = 0
            done = False
            episode_reward = 0

            while not done:
                current_state = self.env.state

                action = self.agent.get_action(current_state)

                next_state, reward, done = self.env.step(action)

                episode_reward += reward

                self.agent.update_replay_memory((current_state, action, reward, next_state, done))

                if self.train_timestep == TRAIN_TIMESTEPS[0]: # every timestep
                    self.agent.train(done)

                n_clicks += 1

            if self.train_timestep == TRAIN_TIMESTEPS[1]: # every episodes
                self.agent.train(done)

            if self.train_render:
                self.env.render(self.env.state)
                print(episode_reward)

            if len(self.agent.replay_memory) < self.agent.mem_size_min:
                continue

            self.progress_list.append(n_clicks)
            self.ep_rewards_list.append(episode_reward)
            self.wins_list.append(reward == self.env.rewards['win'])

            if (episode+1) % self.print_interval == 0:
                med_progress = np.median(self.progress_list[-self.print_interval:])
                win_rate = np.sum(self.wins_list[-self.print_interval:]) / self.print_interval
                med_reward = np.median(self.ep_rewards_list[-self.print_interval:])

                print(f"Episode: [{self.episodes}/{episode+1}], Median progress: {med_progress:.2f}, Median reward: {med_reward:.2f}, Win rate : {win_rate:.2f}, Epsilon: {self.agent.epsilon:.2f}")

                if win_rate > self.baseline_train:
                    self.baseline_train = win_rate
                    self.best_model_train = self.agent.model.state_dict()

                    self.simple_valid = 10

            if self.simple_valid > 0:
                    valid_state = self.agent.model.state_dict()
                    valid_win_rate = self.valid_model(self.env, self.tester_agent, episode, self.valid_sample, valid_state)
                    self.simple_valid -= 1

            if win_rate > self.baseline_valid:
                self.baseline_valid = valid_win_rate
                self.best_model_valid = self.agent.model.state_dict()

        print(round(time.time() - start, 2))

    def valid_model(self, env, agent, episode, epoch, model_state):

        progress_list, wins_list, ep_rewards = [], [], []

        agent.epsilon = 0.0 # valid에서는 탐험을 꺼준다.

        agent.model.load_state_dict(model_state)
        agent.target_model.load_state_dict(model_state)

        for i in range(epoch):

            env.reset()

            done = False
            n_clicks = 0
            episode_reward = 0

            while not done:
                current_state = env.state

                action = agent.get_action(current_state)

                next_state, reward, done = env.step(action)

                if (current_state == next_state).all(): # 같은 곳을 계속 누르는 상황을 탈출시키는 ShutDown Code
                    done = True

                episode_reward += reward
                n_clicks += 1

            progress_list.append(n_clicks)
            ep_rewards.append(episode_reward)
            wins_list.append(reward == env.rewards['win'])

        print(f"Valid n:{epoch}, Median progress: {np.median(progress_list):.2f}, Median reward: {np.median(ep_rewards):.2f}, Win rate : {np.sum(wins_list)/len(wins_list)}")

        return np.sum(wins_list)/len(wins_list) # 승률을 반환한다.

    def visualize_train(self, progress=True, win_rates=True, rewards=True, losses=True):
        progresses = []
        win_rates = []
        rewards = []
        losses = []

        for start in range(0, len(self.progress_list)-self.visual_interval, self.visual_interval):
            progresses.append(sum(self.progress_list[start:start+self.visual_interval]) / self.visual_interval)
            win_rates.append(sum(self.wins_list[start:start+self.visual_interval]) / self.visual_interval)
            rewards.append(sum(self.ep_rewards_list[start:start+self.visual_interval]) / self.visual_interval)
            losses.append(sum(self.agent.losses[start:start+self.visual_interval]) / self.visual_interval)

        xticks = np.arange(0, len(self.progress_list), self.interval)

        if progress:
            if len(progresses) > 50:
                plt.xticks(xticks, [str(x) + 'K' for x in xticks // 10])
            plt.axhline(y=(sum(self.progress_list)/len(self.progress_list)), color='b', linestyle='-')
            plt.scatter(range(len(progresses)), progresses, marker='.',alpha=0.3,
                        color=['red' if x == max(progresses) else 'black' for x in progresses])
            plt.annotate(max(progresses), (progresses.index(max(progresses))+5, max(progresses)))
            plt.title(f"Median Progress per {self.visual_interval} episodes")
            plt.show()

        if win_rates:
            if len(progresses) > 50:
                plt.xticks(xticks, [str(x) + 'K' for x in xticks // 10])
                plt.axhline(y=(sum(self.wins_list)/len(self.wins_list)), color='b', linestyle='-')
                plt.axhline(y=(sum(self.wins_list[-100:])/len(self.wins_list[-100:])), color='b', linestyle='--')
            plt.fill_between(range(len(win_rates)), min(win_rates), win_rates, alpha=0.7)
            plt.scatter(win_rates.index(max(win_rates)), max(win_rates), marker='.', color='r')
            plt.annotate(max(win_rates), (win_rates.index(max(win_rates))+5, max(win_rates)))
            plt.title(f"Median Win rate per {self.visual_interval} episodes")
            plt.show()

        if rewards:
            if len(progresses) > 50:
                plt.xticks(xticks, [str(x) + 'K' for x in xticks // 10])
                plt.axhline(y=(sum(self.ep_rewards_list)/len(self.ep_rewards_list)), color='b', linestyle='-')
            plt.scatter(range(len(rewards)), rewards,
                        marker='.', alpha=0.3, color=['red' if x == max(rewards) else 'black' for x in rewards])
            plt.annotate(round(max(rewards),2), (rewards.index(max(rewards))+5, max(rewards)))
            plt.title(f"Median Episode Reward per {self.visual_interval} episodes")
            plt.show()

        if losses:
            if len(progresses) > 50:
                plt.xticks(xticks, [str(x) + 'K' for x in xticks // 10])
            plt.plot(losses)
            plt.title(f"Median Loss per {self.visual_interval} episodes")
            plt.show()

    def save_model(self):

        def save_file(direction, fname, file):
           with open(os.path.join(direction, f'{fname}.pkl'), 'wb') as f:
                pickle.dump(file,f)

        def create_file(path, name):
            file_path = path + '/' + name
            # 파일이 이미 존재하는지 확인
            if not os.path.exists(file_path):
                os.makedirs(file_path)
                print(f"파일 '{file_path}'가 생성되었습니다.")
            else:
                print(f"파일 '{file_path}'는 이미 존재합니다.")

        save_point = {}
        save_point['n_mines'] = self.env.total_mines
        save_point['total_episodes'] = len(self.progress_list)
        save_point['final_model'] = self.agent.model.state_dict()
        save_point['best_model_train'] = self.best_model_train
        save_point['best_model_valid'] = self.best_model_valid

        self.save_point = save_point

        f_path = '/content/drive/MyDrive/Minesweeper [RL]/models'
        self.total_path = f_path + '/' + self.name

        create_file(f_path, self.name)
        save_file(self.total_path, f'{len(self.progress_list)}epi_max_train{self.baseline_train}_valid{self.baseline_valid}',save_point)
        print('모델이 저장되었습니다.')

In [13]:
env = MinesweeperEnv(map_size=level['easy']['map_size'],
                     n_mines=level['easy']['n_mines'])

net = Net(input_dims=env.state.shape,
          n_actions=env.total_tiles,
          conv_units=CONV_UNITS)

agent = Agent(env=env,
              net=net,
              MEM_SIZE=MEM_SIZE,
              MEM_SIZE_MIN=MEM_SIZE_MIN,
              BATCH_SIZE=BATCH_SIZE,
              LEARNING_RATE=LEARNING_RATE,
              LEARN_DECAY=LEARN_DECAY,
              LEARN_MIN=LEARN_MIN,
              DISCOUNT=DISCOUNT,
              EPSILON=EPSILON,
              EPSILON_DECAY=EPSILON_DECAY,
              EPSILON_MIN=EPSILON_MIN,
              UPDATE_TARGET_EVERY=UPDATE_TARGET_EVERY)

AttributeError: 'Agent' object has no attribute 'learning_rate'

## TRAIN_PARAMETERS

In [31]:
EPISODES = 200000
PRINT_INTERVAL = 100
TRAIN_RENDER = False

TRAIN_TIMESTEPS = ['every timestep', 'every episodes']
TRAIN_TIMESTEP = TRAIN_TIMESTEPS[0]
VIUSAL_INTERVAL = 100

VALID_SAMPLE = 1000
VALID_INTERVAL = 10

In [32]:
tester_agent = Agent(env=env,
                    net=net,
                    MEM_SIZE=MEM_SIZE,
                    MEM_SIZE_MIN=MEM_SIZE_MIN,
                    BATCH_SIZE=BATCH_SIZE,
                    LEARNING_RATE=LEARNING_RATE,
                    LEARN_DECAY=LEARN_DECAY,
                    LEARN_MIN=LEARN_MIN,
                    DISCOUNT=DISCOUNT,
                    EPSILON=EPSILON,
                    EPSILON_DECAY=EPSILON_DECAY,
                    EPSILON_MIN=EPSILON_MIN,
                    UPDATE_TARGET_EVERY=UPDATE_TARGET_EVERY)

1000


In [34]:
trainer = Trainer(env=env,
                    agent=agent,
                    tester_agent=tester_agent,
                    name='episodeInterval',
                    train_start=True,
                    EPISODES = EPISODES,
                    PRINT_INTERVAL = PRINT_INTERVAL,
                    TRAIN_RENDER = TRAIN_RENDER,
                    TRAIN_TIMESTEP = TRAIN_TIMESTEPS[0],
                    VIUSAL_INTERVAL = VIUSAL_INTERVAL,
                    VALID_SAMPLE = VALID_SAMPLE,
                    VALID_INTERVAL = VALID_INTERVAL)

['every timestep', 'every episodes']
every timestep
train
0.9497625
train
0.949525059375
train
0.9492876781101562
train
0.9490503561906287
train
0.9488130936015811
train
0.9485758903281807
train
0.9483387463555987
train
0.9481016616690098
train
0.9478646362535925
train
0.9476276700945292
train
0.9473907631770055
train
0.9471539154862113
train
0.9469171270073398
train
0.946680397725588
train
0.9464437276261566
train
0.9462071166942501
train
0.9459705649150765
train


KeyboardInterrupt: 

# 01