In [30]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np


class DeepQNetwork(nn.Module):
    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
        super(DeepQNetwork, self).__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device("cuda:0" if T.cuda.is_available() else "cpu")
        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        actions = self.fc3(x)

        return actions


class Agent:
    def __init__(
        self,
        gamma,
        epsilon,
        lr,
        input_dims,
        batch_size,
        n_actions,
        max_mem_size=100000,
        eps_end=0.05,
        eps_dec=5e-8,
    ):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = [0, 1, 2, 3]  # ['N', 'S', 'E', 'W']
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0
        self.iter_cntr = 0
        self.replace_target = 100

        self.Q_eval = DeepQNetwork(
            lr, n_actions=n_actions, input_dims=input_dims, fc1_dims=256, fc2_dims=256
        )
        self.state_memory = np.zeros(
            (self.mem_size, *input_dims), dtype=np.float32
        )  
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, state_, terminal):
        index = self.mem_cntr % self.mem_size

        arr = np.concatenate(
            (np.array(state[0]), state[1], np.concatenate(state[2]),)
            )
        arr = np.pad(arr, (0, 14 - len(arr)), mode="constant")

        self.state_memory[index] = arr

        arr = np.concatenate(
            (np.array(state_[0]), state_[1], np.concatenate(state_[2]),)
        )
        arr = np.pad(arr, (0, 14 - len(arr)), mode="constant")

        self.new_state_memory[index] = arr
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = terminal

        self.mem_cntr += 1

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            arr = np.concatenate(
                (
                    np.array(observation[0]),
                    observation[1],
                    np.concatenate(observation[2]),
                )
            )
            arr = np.pad(arr, (0, 14 - len(arr)), mode="constant")
            state = T.tensor(np.array(arr), dtype=self.Q_eval.fc1.weight.dtype,).to(
                self.Q_eval.device
            )
            actions = self.Q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def learn(self):
        if self.mem_cntr < self.batch_size:
            return

        self.Q_eval.optimizer.zero_grad()

        max_mem = min(self.mem_cntr, self.mem_size)

        batch = np.random.choice(max_mem, self.batch_size, replace=False)
        batch_index = np.arange(self.batch_size, dtype=np.int32)

        state_batch = T.tensor(self.state_memory[batch]).to(self.Q_eval.device)
        new_state_batch = T.tensor(self.new_state_memory[batch]).to(self.Q_eval.device)
        action_batch = self.action_memory[batch]
        reward_batch = T.tensor(self.reward_memory[batch]).to(self.Q_eval.device)
        terminal_batch = T.tensor(self.terminal_memory[batch]).to(self.Q_eval.device)

        q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch]

        q_next = self.Q_eval.forward(new_state_batch)
        q_next[terminal_batch] = 0.0

        q_target = reward_batch + self.gamma * T.max(q_next, dim=1)[0]

        loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)
        loss.backward()
        self.Q_eval.optimizer.step()

        self.iter_cntr += 1
        self.epsilon = (
            self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min
        )



0 rescue items


In [31]:
import sys
import numpy as np
import math
import random
import json
import requests
import random
import numpy as np
import gym
import time

sys.path.append('c://Users/omara/Desktop/VSCODE/DigitalSquad/gym-maze/')
import gym_maze
from gym_maze.envs.maze_manager import MazeManager
from riddle_solvers import *

In [32]:
def manhattan_distance(start, end):
    return abs(start[0] - end[0]) + abs(start[1] - end[1])

def get_reward(obv,obv_,info):
    if obv[0][0] == obv_[0][0] and obv[0][1] == obv_[0][1]:
        return -1000
    elif obv[0][0] == 9 and obv[0][1] == 9:
        return 10000    
    # {'rescued_items': 0, 'riddle_type': None, 'riddle_question': None}
    elif info[riddle_type]  != None:
        return 1000
    else:
        return (0 - manhattan_distance(obv[0], np.array((9, 9))))* 100


In [33]:
sample_maze = np.load("sample_maze2.npy")
agent_id = "9" # add your agent id here
    
manager = MazeManager()
manager.init_maze(agent_id, maze_cells=sample_maze)
env = manager.maze_map[agent_id]

riddle_solvers = {'cipher': cipher_solver, 'captcha': captcha_solver, 'pcap': pcap_solver, 'server': server_solver}
maze = {}
states = {}


maze['maze'] = env.maze_view.maze.maze_cells.tolist()
maze['rescue_items'] = list(manager.rescue_items_dict.keys())

MAX_T = 5000
RENDER_MAZE = True

agent = Agent(gamma=0.99, epsilon=100.0, batch_size=64, n_actions=4, eps_end=0.1,
                input_dims=[14,], lr=0.001)
scores, eps_history = [], []
n_games = 500
timeout = 15
for i in range(n_games):
    score = 0
    done = False
    observation = env.reset()
    manager.render(agent_id)
    num_steps = 0
    start_time = time.time()

    
    passed_time = 0
    while not done and passed_time < timeout:
        passed_time = time.time() - start_time
        action = agent.choose_action(observation)
        mapping = ['N', 'S', 'E', 'W']

        actionChar = mapping[action]

        observation_, _, done,_, info = manager.step(agent_id, actionChar)
        # print(info)
        manager.render(agent_id)
        reward = get_reward(observation_, observation, info)
        score += reward

        num_steps += 1
        if np.array_equal(observation_[0], (9,9)):
            manager.set_done(agent_id)
            done = True
        agent.store_transition(observation, action, reward, 
                                observation_, done)
        agent.learn()
        observation = observation_
    scores.append(score)
    eps_history.append(agent.epsilon)

    avg_score = np.mean(scores[-100:])

    print('episode ', i, 'score %.2f' % score,
            'average score %.2f' % avg_score,
            'epsilon %.2f' % agent.epsilon, 'num_steps' , num_steps)

episode  0 score -522000.00 average score -522000.00 epsilon 100.00 num_steps 522
episode  1 score -614000.00 average score -568000.00 epsilon 100.00 num_steps 614
episode  2 score -539000.00 average score -558333.33 epsilon 100.00 num_steps 539


KeyboardInterrupt: 

In [None]:
print('\n'.join(f'{m.__name__}=={m.__version__}' for m in globals().values() if getattr(m, '__version__', None)))

torch==1.13.1+cpu
numpy==1.21.5
json==2.0.9
requests==2.27.1
gym==0.26.2


## dependencies

- torch==1.13.1+cpu
- numpy==1.21.5
- json==2.0.9
- requests==2.27.1
- gym==0.26.2

# Kol El B3d kda maloo4 lazma /\/?

In [None]:
test =  [np.array((1,3)), [3,4,4,3], [[1,3], [2,4], [3,4], [4,4]]]

In [None]:
a = np.concatenate([[1,3], [2,4], [3,4], [4,4]])
input_data = [np.array((1,3)), [3,4,4,3],a]
output_vector = np.concatenate(input_data)

print(output_vector)


In [None]:
a = np.array([1, 3, 3, 4, 4, 3,1,3, 2, 4, 3, 4, 4, 4])
print(a)

In [None]:
np.concatenate([1 0],
[9, 7, 10, 3],
np.c[[1, 1], [0, 1], [1, 1], [1, 0]])

In [None]:
observation

In [None]:
print(observation[0])
print(observation[1])
print(observation[2])

In [None]:
np.concatenate([np.array([1, 0]), [8, 4, 10, 7], np.concatenate([[1, 1], [1, 1], [1, 1], [1, 1]])])

In [None]:
observation = np.concatenate([np.array([observation[0]]),np.array( observation[1]), observation[2]])
observation


In [None]:
observation 

In [None]:
print(observation[0].shape)
print(np.array(observation[1]).shape)
print(np.array(observation[2]).flatten().shape)


In [None]:
observation = np.concatenate([np.array([[observation[0][0], observation[0][1]]]), observation[1], observation[2]])
observation

In [None]:
np.load("hackathon_sample.npy")