In [None]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import time


class DeepQNetwork(nn.Module):
    """Fully connected network producing one output value per action.

    The network is ``fc1 -> ReLU -> fc2 -> ReLU -> fc3``. It also owns the
    Adam optimizer, the MSE loss object and the device it lives on, so
    callers reach them through ``net.optimizer``, ``net.loss`` and
    ``net.device``.

    Args:
        lr: learning rate handed to Adam.
        input_dims: sequence unpacked into the leading arguments of the
            first ``nn.Linear``.
        fc1_dims: output width of the first linear layer.
        fc2_dims: output width of the second linear layer.
        n_actions: output width of the final linear layer.
    """

    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
        super().__init__()
        self.input_dims, self.fc1_dims = input_dims, fc1_dims
        self.fc2_dims, self.n_actions = fc2_dims, n_actions

        self.fc1 = nn.Linear(*input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.fc3 = nn.Linear(fc2_dims, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()

        # Use the first CUDA device when one is available, otherwise the CPU.
        device_name = "cuda:0" if T.cuda.is_available() else "cpu"
        self.device = T.device(device_name)
        self.to(self.device)

    def forward(self, state):
        """Return the raw (un-activated) output of ``fc3`` for ``state``."""
        hidden = F.relu(self.fc1(state))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)




class Agent:
    """Epsilon-greedy Q-learning agent with a fixed-size NumPy replay buffer.

    A single ``DeepQNetwork`` (``Q_eval``) is used both to pick actions and
    to compute the bootstrap target; ``replace_target`` is stored but never
    read by this class.

    Observations are indexable objects: ``observation[0]``,
    ``observation[1]`` and the concatenation of ``observation[2]`` are joined
    into one flat vector and right-padded with zeros to the width of the
    replay buffer (``input_dims[0]``).

    Args:
        gamma: discount factor applied to the bootstrap value.
        epsilon: initial exploration probability.
        lr: learning rate forwarded to the network's optimizer.
        input_dims: shape of one encoded state, e.g. ``[14]``.
        batch_size: number of transitions sampled per ``learn`` call.
        n_actions: number of discrete actions (network output width).
        max_mem_size: capacity of the replay buffer.
        eps_end: lower bound for ``epsilon``.
        eps_dec: amount subtracted from ``epsilon`` after every update.
    """

    def __init__(
        self,
        gamma,
        epsilon,
        lr,
        input_dims,
        batch_size,
        n_actions,
        max_mem_size=100000,
        eps_end=0.05,
        eps_dec=5e-6,
    ):
        self.na = 0  # number of choose_action calls so far
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        # One index per network output; [0, 1, 2, 3] (N, S, E, W) for n_actions=4.
        self.action_space = list(range(n_actions))
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0
        self.iter_cntr = 0
        self.replace_target = 100

        self.Q_eval = DeepQNetwork(
            lr, n_actions=n_actions, input_dims=input_dims, fc1_dims=256, fc2_dims=256
        )
        self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        # Actions are used as indices into the Q-value tensor, so keep them integral.
        self.action_memory = np.zeros(self.mem_size, dtype=np.int64)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def _encode_state(self, observation):
        """Flatten ``observation`` and zero-pad it to the replay-buffer width.

        Raises:
            ValueError: from ``np.pad`` if the flattened observation is longer
                than the buffer width.
        """
        flat = np.concatenate(
            (
                np.array(observation[0]),
                observation[1],
                np.concatenate(observation[2]),
            )
        )
        width = self.state_memory.shape[1]
        return np.pad(flat, (0, width - len(flat)), mode="constant")

    def store_transition(self, state, action, reward, state_, terminal):
        """Write one transition into the ring buffer, overwriting the oldest slot."""
        index = self.mem_cntr % self.mem_size

        self.state_memory[index] = self._encode_state(state)
        self.new_state_memory[index] = self._encode_state(state_)
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = terminal

        self.mem_cntr += 1

    def choose_action(self, observation):
        """Pick an action epsilon-greedily.

        Returns:
            ``(action, elapsed)`` where ``action`` is an ``int`` and
            ``elapsed`` is the wall-clock time in seconds spent choosing it.
        """
        self.na += 1
        start = time.time()
        if np.random.random() > self.epsilon:
            features = self._encode_state(observation)
            state = T.tensor(features, dtype=self.Q_eval.fc1.weight.dtype).to(
                self.Q_eval.device
            )
            # Pure inference: no autograd graph is needed to take an argmax.
            with T.no_grad():
                action = T.argmax(self.Q_eval(state)).item()
        else:
            action = int(np.random.choice(self.action_space))
        elapsed = time.time() - start
        return (action, elapsed)

    def learn(self):
        """Run one gradient step on a random minibatch, then decay epsilon.

        Does nothing until at least ``batch_size`` transitions are stored.
        """
        if self.mem_cntr < self.batch_size:
            return

        device = self.Q_eval.device
        filled = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(filled, self.batch_size, replace=False)

        state_batch = T.tensor(self.state_memory[batch]).to(device)
        new_state_batch = T.tensor(self.new_state_memory[batch]).to(device)
        reward_batch = T.tensor(self.reward_memory[batch]).to(device)
        terminal_batch = T.tensor(self.terminal_memory[batch]).to(device)
        action_batch = T.tensor(self.action_memory[batch], dtype=T.long).to(device)
        batch_index = T.arange(self.batch_size, device=device)

        self.Q_eval.optimizer.zero_grad()

        # Q-value of the action that was actually taken in each sampled state.
        q_eval = self.Q_eval(state_batch)[batch_index, action_batch]

        # The bootstrap target is treated as a constant: building it under
        # no_grad keeps the loss from back-propagating through the target.
        with T.no_grad():
            q_next = self.Q_eval(new_state_batch)
            q_next[terminal_batch] = 0.0  # no future value after a terminal step
            q_target = reward_batch + self.gamma * q_next.max(dim=1)[0]

        loss = self.Q_eval.loss(q_eval, q_target)
        loss.backward()
        self.Q_eval.optimizer.step()

        self.iter_cntr += 1
        # Linear decay, clamped so epsilon never drops below eps_min.
        self.epsilon = max(self.eps_min, self.epsilon - self.eps_dec)



In [None]:
import sys
import numpy as np
import math
import random
import json
import requests
import random
import numpy as np
import gym
import time

#sys.path.append('c://Users/omara/Desktop/VSCODE/DigitalSquad/gym-maze/')
import gym_maze
from gym_maze.envs.maze_manager import MazeManager
from gym_maze.envs.maze_view_2d import MazeView2D
#from gym_maze.envs.maze_env import MazeENV
from riddle_solvers import *
import math

In [None]:
# Load the maze layout from disk and register it with a manager under agent_id.
sample_maze = np.load("sample_maze2.npy")
agent_id = "9" # add your agent id here

manager = MazeManager()
manager.init_maze(agent_id, maze_cells=sample_maze)
env = manager.maze_map[agent_id]

# Riddle type -> solver callable (the solvers come from the riddle_solvers star import).
riddle_solvers = dict(
    cipher=cipher_solver,
    captcha=captcha_solver,
    pcap=pcap_solver,
    server=server_solver,
)
states = {}

# Plain-Python snapshot of the maze cells and of the rescue-item keys.
maze = {
    'maze': env.maze_view.maze.maze_cells.tolist(),
    'rescue_items': list(manager.rescue_items_dict.keys()),
}
# `x` is the rescue-item list that later cells pass to get_reward.
x = maze['rescue_items']

In [None]:

def manhattan_distance(start, end):
    """Return the L1 (taxicab) distance between two points.

    Only components ``0`` and ``1`` of ``start`` and ``end`` are read.
    """
    gap_first = abs(start[0] - end[0])
    gap_second = abs(start[1] - end[1])
    return gap_first + gap_second

def get_reward(obv,obv_,x,info,gamma,
        epsilon,
        lr,
        input_dims,
        batch_size,
        n_actions,
        num_steps=1,
        action_time=0.0,
        goal=(9, 9)):
    """Compute the shaped reward for the position ``obv[0]``.

    Branches are checked in this order:

    1. ``obv[0]`` equals ``obv_[0]`` (no movement): ``-10000``.
    2. ``obv[0]`` is a member of ``x``: ``750``.
    3. ``obv[0]`` equals ``goal``: ``0`` (see the review note in the body).
    4. otherwise: ``-10 * manhattan_distance(obv[0], goal)``.

    Every branch additionally subtracts ``num_steps`` and
    ``action_time * 0.001``.

    Earlier versions built a brand-new ``Agent`` (network plus replay
    buffers) on every call only to read its step counter and to time one
    throw-away action. That counter was always ``1`` and the measured time
    was negligible, so the defaults ``num_steps=1`` and ``action_time=0.0``
    reproduce those values without the allocation. Callers that track the
    real figures can pass them in.

    Args:
        obv: observation whose first element is the position being rewarded.
        obv_: observation whose first element is compared with ``obv[0]`` to
            detect that the agent did not move.
        x: container of positions that earn the rescue reward.
        info, gamma, epsilon, lr, input_dims, batch_size, n_actions:
            accepted so existing calls keep working; not used.
        num_steps: step-count penalty subtracted from the reward.
        action_time: action-selection time in seconds; scaled by ``0.001``
            and subtracted from the reward.
        goal: position of the goal cell.

    Returns:
        The reward as a number.
    """
    position = obv[0]
    previous = obv_[0]
    penalty = num_steps + action_time * 0.001

    if position[0] == previous[0] and position[1] == previous[1]:
        return -10000 - penalty

    if position in x:
        return 750 - penalty

    if position[0] == goal[0] and position[1] == goal[1]:
        # NOTE(review): the original scaled the goal bonus by a counter that
        # was reset to 0 on every call, so the bonus always evaluated to 0.
        # That value is preserved here; confirm the intended goal reward.
        goal_bonus = 0
        return goal_bonus - penalty

    return 0 - manhattan_distance(position, goal) * 10 - penalty


In [None]:
# NOTE(review): MAX_T and RENDER_MAZE are defined but not consulted by the loop below.
MAX_T = 5000
RENDER_MAZE = True

agent = Agent(gamma=0.99, epsilon=1, batch_size=64, n_actions=4, eps_end=0.001,
                input_dims=[14,], lr=0.001)
scores, eps_history = [], []
n_games = 500
timeout = 15  # wall-clock budget per episode, in seconds
# Action index -> move character passed to manager.step (loop-invariant).
mapping = ['N', 'S', 'E', 'W']

for i in range(n_games):
    score = 0
    done = False
    observation = env.reset()
    manager.render(agent_id)
    num_steps = 0
    start_time = time.time()

    # The episode index `i` must not be reassigned in here: it is printed in
    # the per-episode log line below.
    passed_time = 0
    while not done and passed_time < timeout:
        passed_time = time.time() - start_time
        (action, _) = agent.choose_action(observation)
        actionChar = mapping[action]

        observation_, _, done, _, info = manager.step(agent_id, actionChar)
        manager.render(agent_id)

        # The reward is computed for the new observation relative to the previous one.
        reward = get_reward(observation_, observation, x, info, gamma=0.99, epsilon=1,
                            batch_size=64, n_actions=4, input_dims=[14,], lr=0.001)
        score += reward

        num_steps += 1
        # Reaching cell (9, 9) ends the episode.
        if np.array_equal(observation_[0], (9, 9)):
            manager.set_done(agent_id)
            done = True
        agent.store_transition(observation, action, reward,
                               observation_, done)
        agent.learn()

        observation = observation_

    scores.append(score)
    eps_history.append(agent.epsilon)

    # Running mean over (at most) the last 100 episodes.
    avg_score = np.mean(scores[-100:])

    print('episode ', i, 'score %.2f' % score,
            'average score %.2f' % avg_score,
            'epsilon %.2f' % agent.epsilon, 'num_steps' , num_steps)
    # Restart the per-episode action counter.
    agent.na = 0
   
    

In [None]:
# List "name==version" for every global object that exposes a truthy __version__.
print(
    *(
        f'{module.__name__}=={module.__version__}'
        for module in globals().values()
        if getattr(module, '__version__', None)
    ),
    sep='\n',
)

## dependencies

- torch==1.13.1+cpu
- numpy==1.21.5
- json==2.0.9
- requests==2.27.1
- gym==0.26.2