In [3]:
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import sys
sys.setrecursionlimit(1500)

import warnings
warnings.filterwarnings('ignore')

import gymnasium as gym
import numpy as np
import random
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

# Tworzenie środowiska
env = gym.make('CartPole-v1')

# Ustawienie losowości dla powtarzalności
np.random.seed(42)
env.action_space.seed(42)
env.observation_space.seed(42)

# Funkcja budująca model sieci neuronowej
def build_model(state_size, action_size):
    model = Sequential()
    model.add(Input(shape=(state_size,)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(action_size, activation='linear'))
    model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
    return model

state_size = env.observation_space.shape[0]
action_size = env.action_space.n
model = build_model(state_size, action_size)

# Parametry uczenia Q-Learning
epsilon = 1.0            # początkowy współczynnik eksploracji
epsilon_min = 0.01       # minimalny współczynnik eksploracji
epsilon_decay = 0.99     # szybkość zmniejszania eksploracji

# Pamięć powtórek doświadczeń
memory = deque(maxlen=2000)

def remember(state, action, reward, next_state, done):
    memory.append((state, action, reward, next_state, done))

def replay(batch_size=64):
    if len(memory) < batch_size:
        return

    minibatch = random.sample(memory, batch_size)

    states = np.vstack([x[0] for x in minibatch])
    actions = np.array([x[1] for x in minibatch])
    rewards = np.array([x[2] for x in minibatch])
    next_states = np.vstack([x[3] for x in minibatch])
    dones = np.array([x[4] for x in minibatch])

    q_next = model.predict(next_states)
    q_target = model.predict(states)

    for i in range(batch_size):
        target = rewards[i]
        if not dones[i]:
            target += 0.95 * np.amax(q_next[i])
        q_target[i][actions[i]] = target

    model.fit(states, q_target, epochs=1, verbose=0)

    global epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

def act(state):
    if np.random.rand() <= epsilon:
        return random.randrange(action_size)
    act_values = model.predict(state)
    return np.argmax(act_values[0])

# Liczba epizodów do uczenia
episodes = 10
train_frequency = 5

for e in range(episodes):
    state, _ = env.reset()
    state = np.reshape(state, [1, state_size])

    for time in range(200):  # maksymalnie 200 kroków na epizod
        action = act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])

        remember(state, action, reward, next_state, done)
        state = next_state

        if done:
            print(f"Episode: {e + 1}/{episodes}, score: {time}, epsilon: {epsilon:.2f}")
            break

        if time % train_frequency == 0:
            replay(batch_size=64)

env.close()


Episode: 1/10, score: 20, epsilon: 1.00
Episode: 2/10, score: 32, epsilon: 1.00
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Episode: 3/10, score: 23, epsilon: 0.97
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m2/2[0m [32