In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

# Neural network that approximates the Q-function
class QNetwork(nn.Module):
    """Fully connected network with one hidden layer that outputs Q-values.

    Args:
        state_size: Number of input features of a state.
        action_size: Number of actions; the size of the output vector.
        hidden_size: Width of the hidden layer. Defaults to 24, the value
            that used to be hard-coded, so existing callers are unaffected.
    """

    def __init__(self, state_size, action_size, hidden_size=24):
        super().__init__()
        # Simple network: input -> hidden (ReLU) -> one output per action
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return the Q-value estimates for ``x``.

        ``x`` must have ``state_size`` entries along its last dimension; the
        result has ``action_size`` entries along its last dimension. No
        activation is applied to the output layer.
        """
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

# Environment (simplified for the example)
class SimpleEnv:
    """Toy environment whose state is an integer that cycles 0, 1, 2, 3, 0, ...

    The chosen action never influences the transition; it only decides the
    reward (+1 for action 0, -1 for any other action).
    """

    def __init__(self):
        self.state_space_size = 4
        self.action_space_size = 2

    def reset(self):
        """Return the initial state, which is always 0."""
        return 0

    def step(self, state, action):
        """Advance one step from ``state`` and return ``(next_state, reward, done)``.

        The next state is ``state + 1`` wrapped around ``state_space_size``;
        the episode is done as soon as the state wraps back to 0.
        """
        successor = (state + 1) % self.state_space_size
        if action == 0:
            payoff = 1
        else:
            payoff = -1
        finished = successor == 0
        return successor, payoff, finished

def train(env, model, episodes=1000, learning_rate=0.01, gamma=0.95,
          epsilon=0.0):
    """Train ``model`` on ``env`` with online one-step Q-learning.

    For every transition the Q-value of the action that was actually taken
    is regressed (MSE, Adam) toward ``reward + gamma * max_a Q(next_state, a)``,
    or toward ``reward`` alone when the transition ends the episode.

    Args:
        env: Object with ``reset() -> state`` and
            ``step(state, action) -> (next_state, reward, done)``.
        model: ``nn.Module`` mapping a one-state tensor to a vector of
            Q-values, one entry per action.
        episodes: Number of episodes to run.
        learning_rate: Learning rate passed to the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Probability of taking a uniformly random action instead of
            the greedy one. The default 0.0 keeps the purely greedy policy.

    Returns:
        None. ``model`` is updated in place and a summary is printed.
    """
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.MSELoss()

    # No hyperparameter search happens here; this dict simply echoes the
    # values the run was started with so the summary below can report them.
    best_hyperparameters = {'learning_rate': learning_rate, 'gamma': gamma}

    for _ in range(episodes):
        state = env.reset()
        done = False

        while not done:
            state_tensor = torch.tensor([state], dtype=torch.float32)
            # Flatten so that indexing by action works whether the model
            # returns shape (A,) or (1, A) for a single state.
            q_values = model(state_tensor).reshape(-1)

            if epsilon > 0 and random.random() < epsilon:
                action = random.randrange(q_values.shape[0])
            else:
                action = int(torch.argmax(q_values).item())

            next_state, reward, done = env.step(state, action)

            # The target is a fixed regression label: build it without
            # tracking gradients so backprop only flows through Q(s, a).
            with torch.no_grad():
                target_q = torch.tensor(
                    float(reward), dtype=q_values.dtype, device=q_values.device
                )
                if not done:
                    # Bootstrap only from non-terminal next states.
                    next_state_tensor = torch.tensor(
                        [next_state], dtype=torch.float32
                    )
                    target_q = target_q + gamma * model(next_state_tensor).max()

            # Only the taken action's estimate is updated; comparing the whole
            # Q-vector against one target would drag the other actions along.
            loss = loss_fn(q_values[action], target_q)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            state = next_state

    print(f"Обучение завершено после {episodes} эпизодов")
    print(f"Лучшие гиперпараметры: {best_hyperparameters}")

# Build the environment and the Q-network, then run training
env = SimpleEnv()

state_size = 1  # Dimensionality of the state
action_size = 2  # Number of possible actions
q_network = QNetwork(state_size, action_size)

# Training hyperparameters
learning_rate = 0.01
gamma = 0.95

train(env, q_network, learning_rate=learning_rate, gamma=gamma)

Обучение завершено после 1000 эпизодов
Лучшие гиперпараметры: {'learning_rate': 0.01, 'gamma': 0.95}
