In [51]:
import pygame
import sys
import random
import numpy as np

# Параметры карты
GRID_SIZE = 8
CELL_SIZE = 80
SCREEN_SIZE = CELL_SIZE * GRID_SIZE

TILE_EMPTY = 0
TILE_WALL = 1

# Цвета
COLOR_EMPTY = (150, 150, 150)
COLOR_WALL = (150, 150, 150)
COLOR_TANK = (100, 150, 50)
COLOR_TANK_GUN = (100, 200, 50)
COLOR_TARGET = (255, 165, 0)
COLOR_BULLET = (255, 0, 0)

pygame.init()
screen = pygame.display.set_mode((SCREEN_SIZE, SCREEN_SIZE))
pygame.display.set_caption("Танки - Q-Learning")
clock = pygame.time.Clock()

# Генерация карты
grid = [[0 for _ in range(GRID_SIZE)] for _ in range(GRID_SIZE)]
if GRID_SIZE == 8:
    walls = [(4,0), (1,1), (1,3), (1,4), (2,5), (2,6), (3,1), (3,3), (4,7), (6,2), (7,3), (5,5), (4,5), (3,5)]
    for y, x in walls:
        grid[y][x] = TILE_WALL

# Начальные позиции
TANK_START_POS = [0, 0]
TARGET_START_POS = [6, 7]

# Направления и действия
DIRECTIONS = ['up', 'down', 'left', 'right']
ACTION_MOVE = 0
ACTION_SHOOT = 1
ACTION_TURN_LEFT = 2
ACTION_TURN_RIGHT = 3
ACTIONS = [ACTION_MOVE, ACTION_SHOOT, ACTION_TURN_LEFT, ACTION_TURN_RIGHT]

class TankEnv:
    def __init__(self):
        self.grid = np.array(grid)
        self.grid_size = GRID_SIZE
        self.tank_pos = TANK_START_POS.copy()
        self.target_pos = TARGET_START_POS.copy()
        self.tank_dir = random.choice(DIRECTIONS)
        self.bullets = []
        self.shot_cooldown = 0
        self.action_history = []
        self.history_length = 5

    def reset(self):
        self.__init__()
        return self.get_state()

    def get_state(self):
        dx = self.target_pos[1] - self.tank_pos[1]
        dy = self.target_pos[0] - self.tank_pos[0]
        return (self.tank_pos[0], self.tank_pos[1], self.tank_dir, dx, dy)

    def step(self, action):
        prev_dist = abs(self.tank_pos[0] - self.target_pos[0]) + abs(self.tank_pos[1] - self.target_pos[1])
        reward = -1
        done = False
        hit = False

        self.action_history.append(action)
        if len(self.action_history) > self.history_length:
            self.action_history.pop(0)
        if len(self.action_history) == self.history_length and len(set(self.action_history)) == 1:
            reward -= 2

        if action == ACTION_TURN_LEFT:
            dir_idx = DIRECTIONS.index(self.tank_dir)
            self.tank_dir = DIRECTIONS[(dir_idx - 1) % len(DIRECTIONS)]
        elif action == ACTION_TURN_RIGHT:
            dir_idx = DIRECTIONS.index(self.tank_dir)
            self.tank_dir = DIRECTIONS[(dir_idx + 1) % len(DIRECTIONS)]
        elif action == ACTION_MOVE:
            new_pos = self.tank_pos.copy()
            if self.tank_dir == 'up':
                new_pos[0] -= 1
            elif self.tank_dir == 'down':
                new_pos[0] += 1
            elif self.tank_dir == 'left':
                new_pos[1] -= 1
            elif self.tank_dir == 'right':
                new_pos[1] += 1

            if 0 <= new_pos[0] < self.grid_size and 0 <= new_pos[1] < self.grid_size:
                if self.grid[new_pos[0]][new_pos[1]] != TILE_WALL and new_pos != self.target_pos:
                    self.tank_pos = new_pos
                else:
                    reward -= 10
            else:
                reward -= 10
        elif action == ACTION_SHOOT:
            if self.shot_cooldown == 0:
                self.bullets.append([self.tank_pos[0], self.tank_pos[1], self.tank_dir])
                self.shot_cooldown = 5
                dx = self.target_pos[1] - self.tank_pos[1]
                dy = self.target_pos[0] - self.tank_pos[0]
                if (self.tank_dir == 'right' and dx > 0) or \
                   (self.tank_dir == 'left' and dx < 0) or \
                   (self.tank_dir == 'up' and dy < 0) or \
                   (self.tank_dir == 'down' and dy > 0):
                    reward += 5

        new_bullets = []
        for bullet in self.bullets:
            y, x, direction = bullet
            if direction == 'up':
                y -= 1
            elif direction == 'down':
                y += 1
            elif direction == 'left':
                x -= 1
            elif direction == 'right':
                x += 1

            if 0 <= y < self.grid_size and 0 <= x < self.grid_size:
                if self.grid[y][x] == TILE_WALL:
                    continue
                if [y, x] == self.target_pos:
                    hit = True
                    reward += 100
                    done = True
                new_bullets.append([y, x, direction])
        self.bullets = new_bullets

        if self.shot_cooldown > 0:
            self.shot_cooldown -= 1

        new_dist = abs(self.tank_pos[0] - self.target_pos[0]) + abs(self.tank_pos[1] - self.target_pos[1])
        if new_dist < prev_dist and not done:
            reward += 10
        elif new_dist > prev_dist and not done:
            reward -= 5

        next_state = self.get_state()
        return next_state, reward, done, hit

    def render(self):
        screen.fill((0, 0, 0))
        for y in range(self.grid_size):
            for x in range(self.grid_size):
                rect = pygame.Rect(x * CELL_SIZE, y * CELL_SIZE, CELL_SIZE, CELL_SIZE)
                if self.grid[y][x] == TILE_WALL:
                    pygame.draw.rect(screen, COLOR_WALL, rect)
                else:
                    pygame.draw.rect(screen, COLOR_EMPTY, rect, 1)

        ty, tx = self.target_pos
        target_rect = pygame.Rect(tx * CELL_SIZE + 10, ty * CELL_SIZE + 10, CELL_SIZE - 20, CELL_SIZE - 20)
        pygame.draw.rect(screen, COLOR_TARGET, target_rect)

        y, x = self.tank_pos
        tank_rect = pygame.Rect(x * CELL_SIZE + 10, y * CELL_SIZE + 10, CELL_SIZE - 20, CELL_SIZE - 20)
        pygame.draw.rect(screen, COLOR_TANK, tank_rect)

        gun_size = CELL_SIZE // 4
        gun_x = x * CELL_SIZE + CELL_SIZE // 2
        gun_y = y * CELL_SIZE + CELL_SIZE // 2
        if self.tank_dir == 'up':
            gun_rect = pygame.Rect(gun_x - gun_size // 2, gun_y - CELL_SIZE // 2, gun_size, CELL_SIZE // 2)
        elif self.tank_dir == 'down':
            gun_rect = pygame.Rect(gun_x - gun_size // 2, gun_y, gun_size, CELL_SIZE // 2)
        elif self.tank_dir == 'left':
            gun_rect = pygame.Rect(gun_x - CELL_SIZE // 2, gun_y - gun_size // 2, CELL_SIZE // 2, gun_size)
        elif self.tank_dir == 'right':
            gun_rect = pygame.Rect(gun_x, gun_y - gun_size // 2, CELL_SIZE // 2, gun_size)
        pygame.draw.rect(screen, COLOR_TANK_GUN, gun_rect)

        for bullet in self.bullets:
            y_b, x_b, _ = bullet
            b_rect = pygame.Rect(x_b * CELL_SIZE + 30, y_b * CELL_SIZE + 30, CELL_SIZE - 60, CELL_SIZE - 60)
            pygame.draw.rect(screen, COLOR_BULLET, b_rect)

        pygame.display.flip()

# Q-Learning параметры
alpha = 0.3
gamma = 0.95
epsilon = 1.0
epsilon_decay = 0.99
epsilon_min = 0.05
episodes = 500

# Пространство состояний
states = [(y, x, d, dx, dy) for y in range(GRID_SIZE) for x in range(GRID_SIZE) 
          for d in DIRECTIONS for dx in range(-GRID_SIZE, GRID_SIZE+1) for dy in range(-GRID_SIZE, GRID_SIZE+1)]
state_to_idx = {s: i for i, s in enumerate(states)}
num_states = len(state_to_idx)
num_actions = len(ACTIONS)
Q_table = np.zeros(shape=(num_states, num_actions))
env = TankEnv()

# Обучение
for episode in range(episodes):
    state = env.reset()
    state_idx = state_to_idx[state]
    total_reward = 0
    done = False
    step = 0

    while not done and step < 200:
        if np.random.rand() < epsilon:
            action = np.random.choice(ACTIONS)
        else:
            action = ACTIONS[np.argmax(Q_table[state_idx])]

        next_state, reward, done, _ = env.step(action)
        next_state_idx = state_to_idx[next_state]

        Q_table[state_idx, action] += alpha * (
            reward + gamma * np.max(Q_table[next_state_idx]) - Q_table[state_idx, action]
        )

        state_idx = next_state_idx
        total_reward += reward
        step += 1

    epsilon = max(epsilon_min, epsilon * epsilon_decay)

for action in ACTIONS:
    avg_q = np.mean(Q_table[:, action])

# Тестовый режим
test_episodes = 200
success_count = 0

for test in range(test_episodes):
    state = env.reset()
    state_idx = state_to_idx[state]
    done = False
    step = 0
    total_reward = 0
    hit = False

    while not done and step < 200:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                sys.exit()

        action = ACTIONS[np.argmax(Q_table[state_idx])]
        next_state, reward, done, hit_flag = env.step(action)
        hit = hit or hit_flag
        next_state_idx = state_to_idx[next_state]

        state_idx = next_state_idx
        total_reward += reward
        step += 1

        env.render()
        clock.tick(10)

    print(f"Test {test + 1} | Total Reward: {total_reward} | Hit: {hit}")
    if hit:
        success_count += 1
pygame.quit()

Test 1 | Total Reward: 203 | Hit: True
Test 2 | Total Reward: 202 | Hit: True
Test 3 | Total Reward: 202 | Hit: True
Test 4 | Total Reward: 203 | Hit: True
Test 5 | Total Reward: 202 | Hit: True
Test 6 | Total Reward: 202 | Hit: True
Test 7 | Total Reward: 202 | Hit: True
Test 8 | Total Reward: 203 | Hit: True
Test 9 | Total Reward: 202 | Hit: True
Test 10 | Total Reward: 201 | Hit: True
Test 11 | Total Reward: 202 | Hit: True
Test 12 | Total Reward: 203 | Hit: True
Test 13 | Total Reward: 202 | Hit: True


SystemExit: 