In [1]:
from datetime import datetime
from enum import Enum

from IPython.core.pylabtools import figsize
from ale_py import ALEInterface
import gymnasium as gym
import time
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from IPython.display import display, clear_output
import cv2

from Lab4.logWritter import LogWriter

In [2]:
class ResizeObservation(gym.ObservationWrapper):
    """Observation wrapper that shrinks frames and rescales pixel values.

    Each observation is resized to ``(h, w)`` with nearest-neighbour
    interpolation, cast to float32 and divided by 255. The advertised
    observation space is a float32 Box of shape ``(h, w, 3)`` with bounds 0 and 1.
    """

    def __init__(self, env, h=21, w=21):
        super().__init__(env)
        self.h = h
        self.w = w
        self.observation_space = gym.spaces.Box(
            low=0, high=1, shape=(h, w, 3), dtype=np.float32
        )

    def observation(self, obs):
        """Return ``obs`` resized to the configured size as float32 divided by 255."""
        # cv2.resize takes the target size as (width, height).
        target_size = (self.w, self.h)
        shrunk = cv2.resize(obs, target_size, interpolation=cv2.INTER_NEAREST)
        return shrunk.astype(np.float32) / 255.0


In [3]:
def prepare_image(frame, h=21, w=21):
    """Turn a frame into a flat float32 feature vector.

    Floating-point input is multiplied by 255 and clipped to [0, 255] before the
    uint8 cast; any other dtype is cast to uint8 directly. The image is then
    resized to ``(h, w)`` with nearest-neighbour interpolation, divided by 255
    and flattened to 1-D.
    """
    pixels = np.asarray(frame)
    if np.issubdtype(pixels.dtype, np.floating):
        pixels = (pixels * 255.0).clip(0, 255)
    pixels = pixels.astype(np.uint8)

    # cv2.resize takes the target size as (width, height).
    small = cv2.resize(pixels, (w, h), interpolation=cv2.INTER_NEAREST)
    normalized = small.astype(np.float32) / 255.0
    return normalized.flatten()


In [4]:
# Create an ALEInterface instance and hand it to gym.register_envs before
# building the environment.
ale = ALEInterface()
gym.register_envs(ale)

# Training environment: Berzerk with rgb_array rendering and frameskip=4,
# wrapped so every observation is a 21x21x3 float32 array divided by 255
# (see ResizeObservation).
env = gym.make("ALE/Berzerk-v5", render_mode="rgb_array", frameskip=4)
env = ResizeObservation(env, h=21, w=21)
# NOTE(review): reset() is called without a seed here.
observation, info = env.reset()


In [5]:
# Show the action space of the wrapped environment and how many discrete actions it has.
print("action_space:", env.action_space)
print("n actions:", env.action_space.n)

action_space: Discrete(18)
n actions: 18


In [6]:
# Best-effort lookup of human-readable action names: try the unwrapped env
# first, then the wrapped env, and fall back to None if both attempts raise.
try:
    meanings = env.unwrapped.get_action_meanings()
except Exception:
    try:
        meanings = env.get_action_meanings()
    except Exception:
        meanings = None

# Print one "index: name" line per action, or a hint when no names were found.
if meanings:
    print("Action index -> meaning:")
    for i, name in enumerate(meanings):
        print(f"{i}: {name}")
else:
    print("No action meanings available from the env. Use index numbers (0..n-1).")


Action index -> meaning:
0: NOOP
1: FIRE
2: UP
3: RIGHT
4: LEFT
5: DOWN
6: UPRIGHT
7: UPLEFT
8: DOWNRIGHT
9: DOWNLEFT
10: UPFIRE
11: RIGHTFIRE
12: LEFTFIRE
13: DOWNFIRE
14: UPRIGHTFIRE
15: UPLEFTFIRE
16: DOWNRIGHTFIRE
17: DOWNLEFTFIRE


In [7]:
# Action indices grouped by kind, following the "index -> meaning" listing printed above.
# Indices whose meaning contains FIRE (1 and 10-17).
FIRE_ACTIONS = [1, 10, 11, 12, 13, 14, 15, 16, 17]
# Movement-only indices (2-9). NOOP (0) belongs to neither list.
MOVE_ACTIONS = [2, 3, 4, 5, 6, 7, 8, 9]

In [8]:
def plot_frame(frame, fig_size=(6, 4)):
    """Display a single frame as an image with the axes hidden.

    Args:
        frame: Image data accepted by ``Axes.imshow``.
        fig_size: Figure size in inches as ``(width, height)``.
    """
    fig, ax = plt.subplots(figsize=fig_size)
    ax.imshow(frame)
    ax.axis('off')
    display(fig)
    # Close the figure once it has been displayed: otherwise every call leaves
    # another open figure behind, which accumulates memory when this is called
    # repeatedly.
    plt.close(fig)

In [9]:
# Seed NumPy's global RNG, which Sarsa.epsilon_greedy draws from
# (np.random.rand / np.random.randint).
# NOTE(review): the env.reset() calls in this notebook do not pass a seed, so
# this alone presumably does not make environment behaviour reproducible.
seed = 42
np.random.seed(seed)

In [10]:
class Sarsa:
    """Linear SARSA agent over flattened image features plus a one-hot action.

    The action value is ``Q(s, a) = w . concat(state_features, one_hot(a))``.
    The weight vector therefore has ``feature_h * feature_w * 3`` state weights
    followed by ``n_actions`` per-action weights.
    """

    alpha = 1e-4  # step size applied to the TD update by the trainer
    gamma = 0.99  # discount applied to the next state's value by the trainer
    epsilon = 1  # probability of picking a random action in epsilon_greedy
    feature_h, feature_w = 21, 21  # expected frame height/width (3 channels)
    use_traces = False  # not read anywhere in this class
    lmbda = 0.9  # not read anywhere in this class

    def __init__(self, n_actions):
        """Create an agent with all-zero weights for ``n_actions`` discrete actions."""
        self.n_actions = n_actions
        self.feature_dim = self.state_dim + n_actions
        self.w = np.zeros(self.feature_dim, dtype=np.float32)

    @property
    def state_dim(self):
        """Length of the flattened state-feature vector (without the action one-hot)."""
        return self.feature_h * self.feature_w * 3

    def phi_from_state_action(self, features, action):
        """Return ``features`` concatenated with the one-hot encoding of ``action``."""
        a_onehot = np.zeros(self.n_actions, dtype=np.float32)
        a_onehot[action] = 1.0
        return np.concatenate([features, a_onehot])

    def q_value(self, phi):
        """Return the action value ``w . phi`` for a full state-action feature vector."""
        return np.dot(self.w, phi)

    def _q_values_all_actions(self, state_features):
        """Return an array of length ``n_actions`` with Q(s, a) for every action.

        Because the action enters only through a one-hot suffix, Q(s, a) splits
        into a state term shared by all actions plus the weight of action ``a``,
        so there is no need to build one feature vector per action.
        """
        state_dim = self.state_dim
        q_base = np.dot(state_features, self.w[:state_dim])
        return q_base + self.w[state_dim:]

    def _dimension_guard(self, features):
        """Raise ValueError unless ``features`` has the expected leading dimension."""
        expected_dim = self.state_dim
        if features.shape[0] != expected_dim:
            raise ValueError(f"Expected features of dimension {expected_dim}, got {features.shape[0]}")

    def epsilon_greedy(self, features):
        """Pick an action index: random with probability epsilon, otherwise greedy.

        Raises:
            ValueError: if ``features`` does not have the expected dimension.
        """
        self._dimension_guard(features)

        # Clamp epsilon into [0, 1] so an out-of-range value cannot skew the draw.
        eps = float(getattr(self, "epsilon", 0.0))
        eps = max(0.0, min(1.0, eps))

        if np.random.rand() < eps:
            return np.random.randint(self.n_actions)

        q_vals = self._q_values_all_actions(features)
        return int(np.argmax(q_vals))

    def save(self, fileName="sarsa_weights.npz"):
        """Write the weight vector to ``fileName`` under the key ``w``."""
        np.savez(fileName, w=self.w)

    @staticmethod
    def load(file_name="sarsa_weights.npz"):
        """Build an agent from a weights file written by ``save``.

        The number of actions is inferred from the stored vector length.

        Raises:
            ValueError: if the stored weights are not a 1-D vector longer than
                the state-feature dimension.
        """
        # The context manager closes the archive's file handle once the array is read.
        with np.load(file_name) as data:
            weights = np.asarray(data['w'])

        state_dim = Sarsa.feature_h * Sarsa.feature_w * 3
        if weights.ndim != 1 or weights.shape[0] <= state_dim:
            raise ValueError(
                f"Stored weights of shape {weights.shape} do not match a state dimension of {state_dim}"
            )

        ag = Sarsa(n_actions=weights.shape[0] - state_dim)
        ag.w = weights

        print("Loaded SARSA agent with rules characteristics:")
        print("w shape:", ag.w.shape)
        print("w norm:", np.linalg.norm(ag.w))
        print("non-zero weights:", np.count_nonzero(ag.w))
        return ag

    def restrict_exploration(self):
        """Switch to purely greedy action selection by setting epsilon to 0."""
        self.epsilon = 0.0

In [11]:
def is_model_trained():
    """Return True if ``sarsa_weights.npz`` exists in the current working directory.

    Only existence is checked; the archive is not opened, so no file handle is
    left behind.
    """
    import os

    return os.path.isfile("sarsa_weights.npz")

In [12]:
def file_exist(file_name):
    """Return True if ``file_name`` refers to an existing file, otherwise False.

    Only existence is checked; the file is not opened or parsed, so no file
    handle is left behind.
    """
    import os

    return os.path.isfile(file_name)

In [13]:
class TrainPreferences(Enum):
    """Selects which reward-shaping rule ScoreModifier applies."""
    # Subtract a flat 0.1 from the reward, regardless of the action.
    DEFAULT = 0
    # Penalise NOOP (-2) and add bonuses for FIRE (+1) and MOVE (+0.2) actions.
    KAMIKAZE = 1

In [14]:
class ScoreModifier:
    """Applies the reward-shaping rule chosen by a TrainPreferences member."""

    def __init__(self, preferences=TrainPreferences.DEFAULT):
        self.preferences = preferences
        strategies = {
            TrainPreferences.DEFAULT: self._default_rescore,
            TrainPreferences.KAMIKAZE: self._kamikaze_rescore,
        }
        # A preference without a registered strategy fails here with KeyError.
        self.rescore_func = strategies[preferences]

    def rescore(self, score, action):
        """Return ``score`` adjusted by the selected rule for the given ``action``."""
        shaped = self.rescore_func(score, action)
        return shaped

    @staticmethod
    def _default_rescore(score, action):
        """Subtract a flat 0.1 from ``score``; ``action`` is ignored."""
        penalty = 0.1
        return score - penalty

    @staticmethod
    def _kamikaze_rescore(score, action):
        """Penalise NOOP by 2, add 1 for FIRE actions and 0.2 for MOVE actions."""
        adjusted = score
        if action == 0:
            adjusted -= 2
        if action in FIRE_ACTIONS:
            adjusted += 1
        if action in MOVE_ACTIONS:
            adjusted += 0.2
        return adjusted


In [15]:
class Trainer:
    """Runs on-policy SARSA training for a linear agent and saves its weights.

    Exploration is annealed once per episode:
    ``epsilon <- max(epsilon_min, epsilon * epsilon_decay)``.
    """

    def __init__(self, epsilon_min = 0.05, epsilon_decay = 0.995):
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

    @staticmethod
    def _file_name_for_class(class_name):
        """Return the weights file name for ``class_name`` (lower-cased)."""
        return f"sarsa-weights-{class_name.lower()}.npz"

    def train_if_needed(self, model, env, class_name, preferences = TrainPreferences.DEFAULT, n_episodes=1000):
        """Train ``model`` unless a weights file for ``class_name`` already exists.

        Returns:
            ``model`` after training when no weights file was found; otherwise a
            new agent loaded from the file (the passed ``model`` is not used).
        """
        file_name = Trainer._file_name_for_class(class_name)
        print(f'Checking for existing model file: {file_name}')
        if not file_exist(file_name):
            self.train(model, env, class_name, preferences, n_episodes)
            return model

        return Sarsa.load(file_name)

    def train(self, model, env, class_name, preferences = TrainPreferences.DEFAULT, n_episodes=1000):
        """Train ``model`` in place with SARSA for ``n_episodes`` and save its weights.

        Rewards are passed through ``ScoreModifier(preferences)`` before the
        update, and the per-episode reward that is logged is the shaped one.
        Observations are flattened and handed to ``model.epsilon_greedy``, which
        raises ValueError if they do not have the dimension the model expects.
        """
        print(f"Training {class_name} agent...")
        score_modifier = ScoreModifier(preferences)

        # Integer counters stay exact no matter how many steps are taken.
        action_counts = np.zeros(env.action_space.n, dtype=np.int64)

        for episode in range(n_episodes):
            state, _ = env.reset()
            features = np.array(state).flatten()
            action = model.epsilon_greedy(features)
            action_counts[action] += 1
            phi = model.phi_from_state_action(features, action)

            done = False
            ep_reward = 0

            while not done:
                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                reward = score_modifier.rescore(reward, action)

                q = model.q_value(phi)
                if terminated:
                    # A terminal state has no successor value, so the TD target
                    # is the reward alone and no further action is selected.
                    target = reward
                    next_action, next_phi = None, None
                else:
                    # Truncation only cuts the episode short, so the target
                    # still bootstraps from the next state-action value.
                    next_features = np.array(next_state).flatten()
                    next_action = model.epsilon_greedy(next_features)
                    action_counts[next_action] += 1
                    next_phi = model.phi_from_state_action(next_features, next_action)
                    target = reward + model.gamma * model.q_value(next_phi)

                delta = target - q
                model.w += model.alpha * delta * phi

                action = next_action
                phi = next_phi
                ep_reward += reward

            model.epsilon = max(self.epsilon_min, model.epsilon * self.epsilon_decay)
            if (episode + 1) % 10 == 0:
                print(f"Episode {episode+1}/{n_episodes}: Reward={ep_reward:.2f}, Eps={model.epsilon:.4f}")

        print(f'Action distribution during training: {action_counts}')
        model.save(self._file_name_for_class(class_name))


In [16]:
# Label passed to the trainer; it appears in the training log message and
# determines the weights file name (see Trainer._file_name_for_class).
CLASS_NAME = "Berzerk-Kamikaze"

agent = Sarsa(env.action_space.n)

# Exploration schedule handed to the Trainer.
epsilon_min = 0.1
epsilon_decay = 0.995

trainer = Trainer(epsilon_min, epsilon_decay)
# Trains only when no weights file for CLASS_NAME exists; otherwise the agent is
# loaded from that file.
# NOTE(review): CLASS_NAME says "Kamikaze" but preferences is
# TrainPreferences.DEFAULT - confirm which reward shaping is intended.
agent = trainer.train_if_needed(agent, env, class_name=CLASS_NAME, preferences=TrainPreferences.DEFAULT, n_episodes=1000)

env.close()


Checking for existing model file: sarsa-weights-berzerk-kamikaze.npz
Loaded SARSA agent with rules characteristics:
w shape: (1341,)
w norm: 26.532623
non-zero weights: 939


# Test

In [17]:
# Load the agent from the weights file whose name is derived from CLASS_NAME.
agent = Sarsa.load(Trainer._file_name_for_class(CLASS_NAME))

Loaded SARSA agent with rules characteristics:
w shape: (1341,)
w norm: 26.532623
non-zero weights: 939


In [18]:
# Create an ALEInterface instance and hand it to gym.register_envs, as in the
# training setup.
ale = ALEInterface()
gym.register_envs(ale)

# Evaluation environment with render_mode="human". It is not wrapped in
# ResizeObservation, so raw frames go through prepare_image before reaching
# the agent.
test_env = gym.make("ALE/Berzerk-v5", render_mode="human", frameskip=4)
# Set epsilon to 0 so the agent always picks the greedy action.
agent.restrict_exploration()

In [None]:
# Evaluate the loaded agent for a few episodes on the unwrapped test environment.
n_episodes = 5
total_rewards = []

try:
    for ep in range(n_episodes):
        state, _ = test_env.reset()
        done = False
        ep_reward = 0

        actions_count = np.zeros(test_env.action_space.n, dtype=np.int32)
        while not done:
            # prepare_image already returns a flat float32 vector; resize to the
            # dimensions the agent expects.
            features = prepare_image(state, h=agent.feature_h, w=agent.feature_w)
            action = agent.epsilon_greedy(features)
            actions_count[action] += 1
            state, reward, terminated, truncated, _ = test_env.step(action)
            done = terminated or truncated

            ep_reward += reward

        # NOTE(review): with render_mode="human" frames are presumably already
        # shown during step(); confirm whether this call is needed.
        test_env.render()
        print(f"Episode {ep + 1}: Total Reward = {ep_reward}")
        print(f'Action count during round: {actions_count}')
        print('---------------------------')
        total_rewards.append(ep_reward)
finally:
    # Close the environment even when the loop is interrupted.
    test_env.close()

print(f"\nAverage Test Reward over {n_episodes} episodes: {np.mean(total_rewards):.2f}")