# 00 Init

## Mount

In [1]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Setting to use py files

In [2]:
import os

In [3]:
# Switch the working directory to the project folder on the mounted Drive.
# NOTE(review): presumably this is what makes the `codes.*` imports below resolve — confirm.
os.chdir('/content/drive/MyDrive/Minesweeper [RL]')

In [4]:
# Confirm that the working directory is now the project folder.
os.getcwd()

'/content/drive/MyDrive/Minesweeper [RL]'

In [5]:
# NOTE(review): this installs the PyPI distribution named `codes`. The `codes.*`
# imports used by this notebook look like a project-local package in the working
# directory — confirm whether this install is needed at all, and pin the version
# if it is.
! pip install codes

Collecting codes
  Downloading codes-0.1.5-py3-none-any.whl (5.5 kB)
Installing collected packages: codes
Successfully installed codes-0.1.5


## Import py files

In [6]:
# baseline : Env, Agent
# from codes.environment.reward5 import *
from codes.environment.reward5 import *
from codes.agent.vectorDQN import *
from codes.agent.vectorDQN import Agent as VectorDQNAgent
from codes.agent.scalarDQN import Agent as ScalarDQNAgent
from codes.net.basic import *
from codes.trainer.validShutDown import *
from codes.tester.basic import *
# import codes.trainer.trainerWithValidShutDown as Trainer


# 01 Info

## level dictionary

In [7]:
# Board presets per difficulty: board shape as `map_size` and mine count as `n_mines`.
level = {
    'easy': dict(map_size=(9, 9), n_mines=10),
    'medium': dict(map_size=(16, 16), n_mines=40),
    'expert': dict(map_size=(16, 30), n_mines=99),
}

## HYPER PARAMETERS

In [8]:
# Replay memory settings
MEM_SIZE = 50000  # maximum number of transitions kept in an agent's replay deque
MEM_SIZE_MIN = 1000  # train() returns immediately until the replay memory holds this many transitions

# Learning settings
BATCH_SIZE = 64  # number of transitions sampled for each train() call
LEARNING_RATE = 0.01  # initial learning rate handed to Adam
LEARN_DECAY = 0.9999975  # learning rate is multiplied by this after every train() call
LEARN_MIN = 0.001  # lower bound for the decayed learning rate
DISCOUNT = 0.1  # factor applied to the max next-state Q-value in the target

# Exploration settings
EPSILON = 0.95  # initial probability of picking a random action
EPSILON_DECAY = 0.99975  # epsilon is multiplied by this after every train() call
EPSILON_MIN = 0.01  # lower bound for the decayed epsilon

# DQN settings
CONV_UNITS = 64  # passed to Net as `conv_units`
UPDATE_TARGET_EVERY = 5  # target network is synced after this many train() calls made with done=True

# 02 Agents

- 세 에이전트의 타깃/손실 계산 로직을 비교하는 것이 목적이므로, 역전파(가중치 업데이트)는 비활성화한다.
- 세 에이전트 모두 동일한 신경망(같은 초기 가중치의 복사본)을 사용한다.

In [75]:
class NewVectorAgent:
    """DQN agent that builds its full-vector training target with tensor indexing.

    For a batch, the target is the detached prediction of the online network in
    which only the entry of the action actually taken is replaced by
    ``reward + (1 - done) * discount * max_a' Q_target(s', a')``.

    Back-propagation is intentionally left disabled in ``train``: this notebook
    only compares how the different agents compute targets and losses.

    Args:
        env: Environment object; this class reads its ``nrows``, ``ncols`` and
            ``total_tiles`` attributes.
        net: Network that is deep-copied into the online and the target model.
        **kwargs: Hyper-parameters keyed by upper-case name (``MEM_SIZE``,
            ``MEM_SIZE_MIN``, ``BATCH_SIZE``, ``LEARNING_RATE``, ``LEARN_DECAY``,
            ``LEARN_MIN``, ``DISCOUNT``, ``EPSILON``, ``EPSILON_DECAY``,
            ``EPSILON_MIN``, ``UPDATE_TARGET_EVERY``). A missing key yields ``None``.
    """

    def __init__(self, env, net, **kwargs):
        self.env = env

        # Replay memory settings
        self.mem_size = kwargs.get("MEM_SIZE")
        self.mem_size_min = kwargs.get("MEM_SIZE_MIN")

        # Learning settings
        self.batch_size = kwargs.get("BATCH_SIZE")
        self.learning_rate = kwargs.get("LEARNING_RATE")
        self.learn_decay = kwargs.get("LEARN_DECAY")
        self.learn_min = kwargs.get("LEARN_MIN")
        self.discount = kwargs.get("DISCOUNT")

        # Exploration settings
        self.epsilon = kwargs.get("EPSILON")
        self.epsilon_decay = kwargs.get("EPSILON_DECAY")
        self.epsilon_min = kwargs.get("EPSILON_MIN")

        # Loss function and the history of rounded loss values
        self.loss_fn = nn.MSELoss()
        self.losses = []

        # Target network synchronisation
        self.target_update_counter = 0
        self.update_target_baseline = kwargs.get("UPDATE_TARGET_EVERY")

        # Online and target models are independent copies of `net`, so the
        # network object passed in by the caller is never modified.
        self.model = copy.deepcopy(net)
        self.target_model = copy.deepcopy(net)

        self.target_model.load_state_dict(self.model.state_dict())

        self.model.to(device)
        self.target_model.to(device)

        # Replay memory (oldest transitions are dropped once `mem_size` is reached)
        self.replay_memory = deque(maxlen=self.mem_size)

    def update_target_model(self):
        """Copy the online model's weights into the target model."""
        self.target_model.load_state_dict(self.model.state_dict())

    def update_replay_memory(self, transition):
        """Append one transition to the replay memory."""
        self.replay_memory.append(transition)

    def get_action(self, state):
        """Pick an action for a single state with an epsilon-greedy policy.

        Args:
            state: A single state image; it is reshaped to
                ``(1, 1, env.nrows, env.ncols)`` before being fed to the model.

        Returns:
            The index of the chosen action.
        """
        if np.random.random() < self.epsilon:
            # Explore: uniformly random tile.
            action = np.random.choice(range(self.env.total_tiles))

        else:
            self.model.eval()

            with torch.no_grad():
                state = torch.tensor(state.reshape(1, 1, self.env.nrows, self.env.ncols),
                                     dtype=torch.float32).to(device)
                total_action = self.model(state).view(-1)
                total_action = total_action.cpu()

                # Kept on the instance for later inspection.
                self.total_action = total_action

                action = torch.argmax(total_action).item()

        return action

    def train(self, done, batch):
        """Compute predictions, targets and the loss for one sampled batch.

        The optimizer step is intentionally disabled, so the model weights are
        not changed by this method. The results of the call are stored on the
        instance (``action``, ``dones``, ``target_q_values``, ``pred_q_values``,
        ``current_loss``) and the rounded loss is appended to ``losses``.

        Args:
            done: Whether the current episode ended on this step; each call with
                a truthy value advances the target-update counter.
            batch: Iterable of ``(current_state, action, reward, next_state,
                done)`` transitions. Its length does not have to match the
                configured batch size.

        Returns:
            None. Returns early, without any effect, while the replay memory
            holds fewer than ``mem_size_min`` transitions.
        """
        if len(self.replay_memory) < self.mem_size_min:
            return

        # Built on every call with the current (decayed) learning rate; it is
        # not stepped while the update below stays disabled.
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate, eps=1e-4)

        # Unpack the transitions stored in the batch.
        current_states, actions, rewards, next_states, epi_dones = zip(*batch)

        current_states = torch.tensor(np.array(current_states), dtype=torch.float32).reshape(-1, 1, self.env.nrows, self.env.ncols).to(device)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32).reshape(-1, 1, self.env.nrows, self.env.ncols).to(device)

        # int64 is the index dtype accepted by every PyTorch version.
        actions = torch.tensor(np.array(actions), dtype=torch.long).to(device)
        rewards = torch.tensor(np.array(rewards), dtype=torch.float).reshape(-1, 1).to(device)
        epi_dones = torch.tensor(np.array(epi_dones), dtype=torch.float).reshape(-1, 1).to(device)

        self.model.train()
        self.target_model.eval()

        current_q_values = self.model(current_states)

        with torch.no_grad():
            next_q_values = self.target_model(next_states)

        # reward + gamma * max_a' Q_target(s', a'); the bootstrap term is zeroed for terminal steps.
        target_value = rewards + (1 - epi_dones) * self.discount * torch.max(next_q_values, dim=1)[0].reshape(-1, 1)
        target_value = target_value.flatten()

        # Start from the prediction so that only the taken action contributes to the loss.
        target_q_values = current_q_values.detach().clone()
        # Row indices come from the batch actually received, not from a notebook-level constant.
        rows = torch.arange(target_q_values.shape[0], device=target_q_values.device)
        target_q_values[rows, actions] = target_value

        # Kept on the instance so the notebook can compare agents afterwards.
        self.action = actions
        self.dones = epi_dones
        self.target_q_values = target_q_values
        self.pred_q_values = current_q_values

        cost = self.loss_fn(current_q_values, target_q_values)

        running_loss = cost.item()

        self.current_loss = running_loss

        self.losses.append(round(running_loss, 6))

        # Weight update intentionally disabled (logic comparison only):
        # self.optimizer.zero_grad()
        # cost.backward()
        # self.optimizer.step()

        if done:
            self.target_update_counter += 1

        if self.target_update_counter == self.update_target_baseline:
            self.update_target_model()
            self.target_update_counter = 0

        # Decay the learning rate.
        self.learning_rate = max(self.learn_min, self.learning_rate * self.learn_decay)

        # Decay epsilon.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [42]:
class ScalarAgent(ScalarDQNAgent):
    """Scalar-target DQN agent used for the target/loss comparison.

    Only the Q-value of the action actually taken is compared with
    ``reward + (1 - done) * discount * max_a' Q_target(s', a')``.
    Back-propagation is intentionally left disabled in ``train``.

    NOTE(review): relies on attributes such as ``replay_memory``, ``loss_fn``,
    ``losses`` and ``update_target_model`` that are presumably provided by
    ``ScalarDQNAgent`` — confirm against that class.
    """

    def __init__(self, env, net, **kwargs):
        super().__init__(env, net, **kwargs)

    def train(self, done, batch):
        """Compute predictions, targets and the loss for one sampled batch.

        The optimizer step is intentionally disabled. Results are stored on the
        instance (``target_q_values``, ``pred_q_values``, ``current_loss``) and
        the rounded loss is appended to ``losses``.

        Args:
            done: Whether the current episode ended on this step; each call with
                a truthy value advances the target-update counter.
            batch: Iterable of ``(current_state, action, reward, next_state,
                done)`` transitions.

        Returns:
            None. Returns early, without any effect, while the replay memory
            holds fewer than ``mem_size_min`` transitions.
        """
        if len(self.replay_memory) < self.mem_size_min:
            return

        # Built on every call with the current (decayed) learning rate; it is
        # not stepped while the update below stays disabled.
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate, eps=1e-4)

        self.model.train()
        self.target_model.eval()

        # Unpack the transitions stored in the batch.
        current_states, batched_actions, batched_rewards, next_states, batched_dones = zip(*batch)

        # States as (batch, 1, nrows, ncols) float tensors.
        current_states = torch.tensor(np.array(current_states), dtype=torch.float32, device=device).reshape(-1, 1, self.env.nrows, self.env.ncols)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32, device=device).reshape(-1, 1, self.env.nrows, self.env.ncols)

        # Explicit dtypes: `gather` needs an int64 index, and the target must be
        # float32 like the prediction regardless of how rewards were stored.
        # The (batch, 1) shape is what allows `action_batch` to be used as a gather index.
        action_batch = torch.tensor(np.array(batched_actions), dtype=torch.long, device=device).reshape(-1, 1)
        reward_batch = torch.tensor(np.array(batched_rewards), dtype=torch.float32, device=device).reshape(-1, 1)
        done_batch = torch.tensor(np.array(batched_dones), dtype=torch.float32, device=device).reshape(-1, 1)  # bool -> 0/1

        # Prediction: Q(s, a) of the action that was taken, shape (batch, 1).
        pred_q_values = self.model(current_states).gather(1, action_batch)

        # Target: reward + gamma * max_a' Q_target(s', a'); no bootstrap term for terminal steps.
        with torch.no_grad():
            next_q_values = self.target_model(next_states).max(1).values.reshape(-1, 1)
            target_q_values = reward_batch + (1 - done_batch) * self.discount * next_q_values

        # Kept on the instance so the notebook can compare agents afterwards.
        self.target_q_values = target_q_values
        self.pred_q_values = pred_q_values

        loss = self.loss_fn(pred_q_values, target_q_values)

        running_loss = loss.item()

        self.current_loss = running_loss

        self.losses.append(round(running_loss, 6))

        # Weight update intentionally disabled (logic comparison only):
        # self.optimizer.zero_grad()
        # loss.backward()
        # self.optimizer.step()

        if done:
            self.target_update_counter += 1

        if self.target_update_counter == self.update_target_baseline:
            self.update_target_model()
            self.target_update_counter = 0

        # Decay the learning rate.
        self.learning_rate = max(self.learn_min, self.learning_rate * self.learn_decay)

        # Decay epsilon.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [43]:
class VectorAgent(VectorDQNAgent):
    """Loop-based full-vector DQN agent used as the reference in the comparison.

    The target is the online network's prediction (computed in eval mode) with
    the entry of the taken action overwritten, one transition at a time, in
    plain Python. Back-propagation is intentionally left disabled in ``train``.
    """

    def __init__(self, env, net, **kwargs):
        super().__init__(env, net, **kwargs)

    def train(self, done, batch):
        """Compute predictions, targets and the loss for one sampled batch.

        Stores ``target_q_values``, ``pred_q_values`` and ``current_loss`` on the
        instance and appends the rounded loss to ``losses``. Does nothing while
        the replay memory holds fewer than ``mem_size_min`` transitions.
        """
        if len(self.replay_memory) < self.mem_size_min:
            return

        # Optimizer is rebuilt with the current learning rate (not stepped below).
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate, eps=1e-4)

        # Each batch element is (current_state, action, reward, next_state, done);
        # only the states are needed as tensors.
        state_seq, _, _, next_state_seq, _ = zip(*batch)

        board_shape = (-1, 1, self.env.nrows, self.env.ncols)
        state_batch = torch.tensor(np.array(state_seq), dtype=torch.float32).reshape(*board_shape).to(device)
        next_state_batch = torch.tensor(np.array(next_state_seq), dtype=torch.float32).reshape(*board_shape).to(device)

        self.model.eval()
        self.target_model.eval()

        with torch.no_grad():
            q_rows = self.model(state_batch).reshape(-1, self.env.total_tiles).cpu().detach().tolist()
            future_q = self.target_model(next_state_batch).cpu().detach().numpy()

        # Turn the predicted rows into targets: overwrite the taken action only.
        for row, (_, action, reward, _, epi_done) in enumerate(batch):
            if epi_done:
                q_rows[row][action] = reward
                continue
            q_rows[row][action] = reward + self.discount * np.max(future_q[row])

        # Second forward pass, this time in train mode, gives the prediction.
        self.model.train()

        target = torch.tensor(np.array(q_rows), dtype=torch.float32).to(device)
        estimate = self.model(state_batch)

        self.target_q_values = target
        self.pred_q_values = estimate

        batch_loss = self.loss_fn(estimate, target).item()
        self.current_loss = batch_loss
        self.losses.append(round(batch_loss, 6))

        # Weight update intentionally disabled (logic comparison only):
        # self.optimizer.zero_grad()
        # cost.backward()
        # self.optimizer.step()

        if done:
            self.target_update_counter += 1

        if self.target_update_counter == self.update_target_baseline:
            self.update_target_model()
            self.target_update_counter = 0

        # Learning-rate and epsilon decay, each with a floor.
        self.learning_rate = max(self.learn_min, self.learning_rate * self.learn_decay)
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

# 03 Train, Valid

In [44]:
# Easy preset for the environment; the network is sized from the environment itself.
easy_preset = level['easy']
env = MinesweeperEnv(map_size=easy_preset['map_size'], n_mines=easy_preset['n_mines'])

net = Net(
    input_dims=env.state.shape,
    n_actions=env.total_tiles,
    conv_units=CONV_UNITS,
)

In [76]:
# All three agents get the same environment, the same network and the same
# hyper-parameters, so only their target/loss logic differs.
agent_kwargs = dict(
    MEM_SIZE=MEM_SIZE,
    MEM_SIZE_MIN=MEM_SIZE_MIN,
    BATCH_SIZE=BATCH_SIZE,
    LEARNING_RATE=LEARNING_RATE,
    LEARN_DECAY=LEARN_DECAY,
    LEARN_MIN=LEARN_MIN,
    DISCOUNT=DISCOUNT,
    EPSILON=EPSILON,
    EPSILON_DECAY=EPSILON_DECAY,
    EPSILON_MIN=EPSILON_MIN,
    UPDATE_TARGET_EVERY=UPDATE_TARGET_EVERY,
)

new_vector_agent = NewVectorAgent(env=env, net=net, **agent_kwargs)

vector_agent = VectorAgent(env=env, net=net, **agent_kwargs)

scalar_agent = ScalarAgent(env=env, net=net, **agent_kwargs)

## TRAIN_PARAMETERS

In [77]:
EPISODES = 300  # number of episodes played by the training loop

PRINT_INTERVAL = 100  # not used by any active code in this notebook

TRAIN_RENDER = False  # when True, env.render() is called after every step
reward_list = []  # not used by any active code in this notebook
mean_rewards = []  # not used by any active code in this notebook

In [78]:
# Feed the same experience and the same sampled batch to all three agents so
# that their targets and losses are computed on identical inputs.
agents = (new_vector_agent, vector_agent, scalar_agent)

start = time.time()

for episode in range(EPISODES):
    env.reset()
    done = False

    while not done:
        # NOTE(review): the stored transition keeps a reference to `env.state`;
        # this assumes the environment does not mutate that array in place — confirm.
        current_state = env.state

        # Only `new_vector_agent` chooses actions; the other agents just observe.
        action = new_vector_agent.get_action(current_state)
        next_state, reward, done = env.step(action)

        transition = (current_state, action, reward, next_state, done)
        for agent in agents:
            agent.update_replay_memory(transition)

        # One batch is drawn from a single replay memory and shared by every agent.
        if len(new_vector_agent.replay_memory) >= new_vector_agent.batch_size:
            batch = random.sample(new_vector_agent.replay_memory, new_vector_agent.batch_size)

            for agent in agents:
                agent.train(done, batch)

        if TRAIN_RENDER:
            env.render()

# Elapsed wall-clock time in seconds.
print(round(time.time() - start, 2))

23.88


In [79]:
def _rows(tensor):
    """Detach a tensor, move it to the CPU and split it along its first axis into NumPy rows."""
    return list(tensor.cpu().detach().numpy())


# One row per sample of the last trained batch; the loss columns hold each
# agent's single batch loss repeated on every row.
comparison_columns = {}
comparison_columns['action'] = new_vector_agent.action.cpu().detach().numpy()
comparison_columns['done'] = _rows(new_vector_agent.dones)

comparison_columns['new_vector_agent_pred'] = _rows(new_vector_agent.pred_q_values)
comparison_columns['new_vector_agent_trg'] = _rows(new_vector_agent.target_q_values)
comparison_columns['new_vector_agent_loss'] = new_vector_agent.current_loss

comparison_columns['vector_agent_pred'] = _rows(vector_agent.pred_q_values)
comparison_columns['vector_agent_trg'] = _rows(vector_agent.target_q_values)
comparison_columns['vector_agent_loss'] = vector_agent.current_loss

comparison_columns['scalar_agent_pred'] = _rows(scalar_agent.pred_q_values)
comparison_columns['scalar_agent_trg'] = _rows(scalar_agent.target_q_values)
comparison_columns['scalar_agent_loss'] = scalar_agent.current_loss

df = pd.DataFrame(comparison_columns)

df.head()

Unnamed: 0,action,done,new_vector_agent_pred,new_vector_agent_trg,new_vector_agent_loss,vector_agent_pred,vector_agent_trg,vector_agent_loss,scalar_agent_pred,scalar_agent_trg,scalar_agent_loss
0,32,[0.0],"[-0.012740556, 0.0032430163, 0.0045295134, -0....","[-0.012740556, 0.0032430163, 0.0045295134, -0....",0.002002,"[-0.012740556, 0.0032430163, 0.0045295134, -0....","[-0.012740556, 0.0032430163, 0.0045295134, -0....",0.002002,[-0.0068920436],[0.30115378],0.162187
1,12,[1.0],"[-0.011348023, 0.00420004, 0.0035170526, 0.000...","[-0.011348023, 0.00420004, 0.0035170526, 0.000...",0.002002,"[-0.011348023, 0.00420004, 0.0035170526, 0.000...","[-0.011348023, 0.00420004, 0.0035170526, 0.000...",0.002002,[0.008917493],[-1.0],0.162187
2,48,[0.0],"[-0.011057192, 0.003720354, 0.0061896453, 0.00...","[-0.011057192, 0.003720354, 0.0061896453, 0.00...",0.002002,"[-0.011057192, 0.003720354, 0.0061896453, 0.00...","[-0.011057192, 0.003720354, 0.0061896453, 0.00...",0.002002,[-0.010494717],[0.30094016],0.162187
3,31,[0.0],"[-0.012024426, 0.003561283, 0.0042827404, 0.00...","[-0.012024426, 0.003561283, 0.0042827404, 0.00...",0.002002,"[-0.012024426, 0.003561283, 0.0042827404, 0.00...","[-0.012024426, 0.003561283, 0.0042827404, 0.00...",0.002002,[0.010939166],[-0.2989061],0.162187
4,58,[0.0],"[-0.012048509, 0.0038842675, 0.0047334423, -0....","[-0.012048509, 0.0038842675, 0.0047334423, -0....",0.002002,"[-0.012048509, 0.0038842675, 0.0047334423, -0....","[-0.012048509, 0.0038842675, 0.0047334423, -0....",0.002002,[-0.0056102937],[-0.2988317],0.162187


In [48]:
# Spot-check a single predicted Q-value: sample 2 of the batch, action index 5.
df['new_vector_agent_pred'][2][5]

-0.0061555402

In [80]:
# Sample of the batch inspected by this cell and the cells that follow.
idx = 1
# Element-wise target vs. prediction for NewVectorAgent; the target is a copy of
# the prediction with only the taken action's entry overwritten.
df['new_vector_agent_trg'][idx] == df['new_vector_agent_pred'][idx]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

In [81]:
idx = 1
# Same element-wise target vs. prediction check for the loop-based VectorAgent.
df['vector_agent_trg'][idx] == df['vector_agent_pred'][idx]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

In [82]:

# Do both vector agents produce the same target vector? (`idx` comes from an earlier cell.)
df['new_vector_agent_trg'][idx] == df['vector_agent_trg'][idx]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

In [83]:
# Target value at the taken action for each agent; the scalar agent stores only that one value.
print(df['new_vector_agent_trg'][idx][df['action'][idx]])
print(df['vector_agent_trg'][idx][df['action'][idx]])
df['scalar_agent_trg'][idx].tolist()

-1.0
-1.0


[-1.0]

In [84]:
# Predicted Q-value at the taken action for each agent; the scalar agent stores only that one value.
print(df['new_vector_agent_pred'][idx][df['action'][idx]])
print(df['vector_agent_pred'][idx][df['action'][idx]])
df['scalar_agent_pred'][idx].tolist()

0.008917493
0.008917493


[0.008917492814362049]

[-0.2987838]