In [None]:
import numpy as np

In [None]:
# Environment dimensions and target
n_states, n_actions = 16, 4  # total number of states; actions available in each state
goal_state = 15              # index of the state the agent is trying to reach

In [None]:
# Training hyperparameters
epochs = 1000             # number of training iterations
exploration_prob = 0.2    # epsilon: probability of taking a random action
discount_factor = 0.95    # gamma: discount applied to future reward
learning_rate = 0.8       # alpha: step size used in each Q-value update

# Q-table: one row per state, one column per action, every entry starts at zero
Q_table = np.zeros(shape=(n_states, n_actions))

In [None]:
# Q-learning training loop (tabular, epsilon-greedy).
#
# Each epoch starts in a random state and steps until goal_state is reached.
# The transition is deterministic and ignores the chosen action (the state
# simply advances by one, wrapping at n_states); the action only influences
# the reward and which Q-table entry gets updated.
#
# Q_table is updated in place, so re-running this cell without re-creating
# Q_table continues training from the current values.

# Dedicated seeded generator: identical random draws on every run, so the
# learned table is reproducible after Restart Kernel -> Run All.
rng = np.random.default_rng(42)

# Reward shaping: arriving in penalty_state while taking penalty_action costs -1.
penalty_state = 10
penalty_action = 2

for epoch in range(epochs):
    # Start from a random initial state (upper bound is exclusive)
    current_state = int(rng.integers(0, n_states))

    while current_state != goal_state:
        # Choose an action (epsilon-greedy policy)
        if rng.random() < exploration_prob:
            action = int(rng.integers(0, n_actions))  # Explore: random action
        else:
            # Exploit: best known action (argmax returns the first index on ties)
            action = int(np.argmax(Q_table[current_state]))

        # Define next state (for simplicity, cycling through states sequentially)
        next_state = (current_state + 1) % n_states

        # Define reward function
        if next_state == goal_state:
            reward = 1  # Positive reward for reaching the goal
        elif next_state == penalty_state and action == penalty_action:
            reward = -1  # Negative reward for an undesired action
        else:
            reward = 0  # Default reward

        # Q-learning update rule:
        # Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        td_target = reward + discount_factor * np.max(Q_table[next_state])
        Q_table[current_state, action] += learning_rate * (
            td_target - Q_table[current_state, action]
        )

        # Move to the next state
        current_state = next_state

# Print the final learned Q-table
print("Learned Q-table:")
print(Q_table)


Learned Q-table:
[[ 0.48767498  0.46815244  0.46816798  0.46816798]
 [ 0.51252074  0.51252065  0.51317781  0.51334208]
 [ 0.54035893  0.54032548  0.54036009  0.54036003]
 [ 0.56880009  0.56880009  0.56880003  0.56880009]
 [ 0.59873694  0.59873663  0.59873694  0.59873694]
 [ 0.63024941  0.63024935  0.63024941  0.63024941]
 [ 0.66342043  0.66342043  0.66342043  0.66342043]
 [ 0.6983373   0.6983373   0.6983373   0.6983373 ]
 [ 0.73509189  0.73509189  0.73509189  0.73509189]
 [ 0.77378094  0.77378094 -0.22621906  0.77378094]
 [ 0.81450625  0.81450625  0.81450625  0.81450625]
 [ 0.857375    0.857375    0.857375    0.857375  ]
 [ 0.9025      0.9025      0.9025      0.9025    ]
 [ 0.95        0.95        0.95        0.95      ]
 [ 1.          1.          1.          1.        ]
 [ 0.          0.          0.          0.        ]]
