In [2]:
import torch
import numpy as np
import torch.nn as nn
import random

In [6]:
import numpy as np
import random

class Model:
    """Tabular SARSA agent on an ``n_rows`` x ``n_cols`` grid world.

    States are numbered in row-major order (``state = row * n_cols + col``).
    Four actions move the agent one cell; a move that would leave the grid
    keeps the agent on the border cell. Entering ``goal_state`` yields a
    reward of +1 and ends the episode; every other step yields -1.

    Attributes:
        n_rows, n_cols: Grid dimensions.
        n_states: Total number of cells (``n_rows * n_cols``).
        goal_state: Index of the terminal state.
        actions: Action names; the position in this list is the action index
            used for the columns of ``q``.
        n_actions: Number of actions.
        q: Q-table of shape ``(n_states, n_actions)``, initialised to zeros.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Exploration probability for the epsilon-greedy policy.
    """

    # Row/column offset applied by each action name.
    _MOVES = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
    }

    def __init__(self, n_rows, n_cols, goal_state, alpha=0.1, gamma=0.9, epsilon=0.1, seed=None):
        """Create the grid world and an all-zero Q-table.

        Args:
            n_rows: Number of grid rows (must be >= 1).
            n_cols: Number of grid columns (must be >= 1).
            goal_state: Row-major index of the terminal state; must lie in
                ``[0, n_rows * n_cols)``.
            alpha: Learning rate of the TD update.
            gamma: Discount factor.
            epsilon: Probability of picking a random action.
            seed: Optional seed for a private random generator. When ``None``
                (the default) the global ``random`` module is used, so the
                behaviour matches code that seeds ``random`` itself.

        Raises:
            ValueError: If the grid is empty or ``goal_state`` is not a valid
                state index (training could otherwise never terminate).
        """
        if n_rows < 1 or n_cols < 1:
            raise ValueError(
                f"grid must have at least one row and one column, got {n_rows}x{n_cols}"
            )
        n_states = n_rows * n_cols
        if not 0 <= goal_state < n_states:
            raise ValueError(
                f"goal_state must be in [0, {n_states}), got {goal_state!r}"
            )

        self.n_rows = n_rows
        self.n_cols = n_cols
        self.n_states = n_states
        self.goal_state = goal_state

        self.actions = ['up', 'down', 'left', 'right']
        self.n_actions = len(self.actions)

        # Q-table: states x actions
        self.q = np.zeros((self.n_states, self.n_actions))

        # Hyperparameters
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

        # Private generator only when a seed is requested; otherwise fall back
        # to the module-level functions of ``random``.
        self._seeded_rng = None if seed is None else random.Random(seed)

    @property
    def _rng(self):
        """Random source: the seeded generator if one was requested, else ``random``."""
        return random if self._seeded_rng is None else self._seeded_rng

    def transition_reward(self, state, action):
        """Apply ``action`` (a name from ``self.actions``) in ``state``.

        Args:
            state: Row-major index of the current cell.
            action: One of ``'up'``, ``'down'``, ``'left'``, ``'right'``.

        Returns:
            ``(next_state, reward)`` where ``reward`` is +1 if ``next_state``
            is the goal and -1 otherwise.

        Raises:
            ValueError: If ``action`` is not a known action name.
        """
        try:
            d_row, d_col = self._MOVES[action]
        except KeyError:
            raise ValueError(
                f"unknown action {action!r}; expected one of {self.actions}"
            ) from None

        row, col = divmod(state, self.n_cols)
        # Clamp to the grid so that bumping into a wall leaves the agent in place.
        next_row = min(max(row + d_row, 0), self.n_rows - 1)
        next_col = min(max(col + d_col, 0), self.n_cols - 1)
        next_state = next_row * self.n_cols + next_col

        # Reward: +1 if goal, -1 otherwise
        reward = 1 if next_state == self.goal_state else -1
        return next_state, reward

    def choose_action(self, state):
        """Pick an action index for ``state`` with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest Q-value in the row for
        ``state`` (ties resolve to the lowest index, as ``np.argmax`` does).

        Returns:
            A plain Python ``int`` in ``[0, n_actions)``.
        """
        rng = self._rng
        if rng.uniform(0, 1) < self.epsilon:
            return rng.randint(0, self.n_actions - 1)  # explore
        # Cast so both branches return the same type (argmax yields np.int64).
        return int(np.argmax(self.q[state]))  # exploit

    def TD_Learning(self, episodes=1000):
        """Train the Q-table in place with on-policy TD control (SARSA).

        Each episode starts in a uniformly random state and runs until the
        goal state is reached.

        Args:
            episodes: Number of episodes to run.

        Returns:
            The Q-table ``self.q`` (the same array object, not a copy).
        """
        for _ in range(episodes):
            state = self._rng.randint(0, self.n_states - 1)  # start randomly
            action = self.choose_action(state)

            while state != self.goal_state:
                next_state, reward = self.transition_reward(state, self.actions[action])
                next_action = self.choose_action(next_state)

                # SARSA update. The terminal state contributes no future value;
                # stating that explicitly avoids relying on the goal row of the
                # Q-table never having been written.
                if next_state == self.goal_state:
                    bootstrap = 0.0
                else:
                    bootstrap = self.q[next_state, next_action]
                td_target = reward + self.gamma * bootstrap
                td_error = td_target - self.q[state, action]
                self.q[state, action] += self.alpha * td_error

                state, action = next_state, next_action
        return self.q


In [11]:
# 3x3 grid whose goal is state index 3 (row 1, column 0 in row-major order).
game = Model(n_rows=3, n_cols=3, goal_state=3)

In [12]:
# Train with the default number of episodes; the returned array is game.q itself.
q = game.TD_Learning(episodes=1000)

In [13]:
# Rows are grid states in row-major order; columns are the actions in the
# order up, down, left, right. The goal state's row (index 3 here) stays all
# zeros because episodes end on reaching it, so it is never updated.
print(q)

[[-0.18123168  1.         -0.04947873 -0.69349188]
 [-0.59164326 -0.40336719 -0.32715104 -1.0337235 ]
 [-1.26382141 -1.23014436 -1.23674331 -1.43083328]
 [ 0.          0.          0.          0.        ]
 [-0.72769698 -0.87419885  1.         -0.80086835]
 [-0.88826107 -0.99766319 -0.38311643 -0.87205654]
 [ 1.         -0.13039122 -0.04894784 -0.1278613 ]
 [-0.13242414 -0.80986381 -0.35417476 -1.03801188]
 [-1.3001924  -1.59844828 -1.16275811 -1.48836168]]
