From 4d0e840cf4143052a6bb0d20c3feb9f1fd07223f Mon Sep 17 00:00:00 2001 From: Yichang FENG Date: Sun, 7 Sep 2025 16:48:14 -0500 Subject: [PATCH 1/4] [0.0.0] Add environment configuration for macOS and IntelliJ IDEA gitignore --- .idea/.gitignore | 8 ++++ environment_mac_mod.yml | 98 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 environment_mac_mod.yml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/environment_mac_mod.yml b/environment_mac_mod.yml new file mode 100644 index 0000000..8eb01bc --- /dev/null +++ b/environment_mac_mod.yml @@ -0,0 +1,98 @@ +name: csce642 +channels: + - conda-forge + - defaults +dependencies: + - python=3.9.16 + - numpy=1.23.5 + - swig + - box2d-py + - pip + - pip: + - absl-py==1.4.0 + - ale-py==0.8.1 + - asttokens==2.2.1 + - backcall==0.2.0 + - backports.functools-lru-cache==1.6.5 + - certifi==2023.7.22 + - cffi==1.15.1 + - charset-normalizer==3.2.0 + - cloudpickle==2.2.1 + - cmake==3.27.2 + - contourpy==1.1.0 + - cycler==0.11.0 + - Cython==3.0.0 + - debugpy==1.6.7 + - decorator==4.4.2 + - entrypoints==0.4 + - executing==1.2.0 + - Farama-Notifications==0.0.4 + - fasteners==0.18 + - filelock==3.12.2 + - fonttools==4.42.0 + - glfw==2.6.2 + - gymnasium==0.29.0 + - idna==3.4 + - imageio==2.31.1 + - imageio-ffmpeg==0.4.8 + - importlib-metadata==6.8.0 + - importlib-resources==6.0.1 + - ipykernel==6.14.0 + - ipython==8.4.0 + - jedi==0.19.0 + - Jinja2==3.1.2 + - joblib==1.3.2 + - jupyter-client>=7.4.4 + - jupyter_core==5.3.1 + - kiwisolver==1.4.4 + - lit==16.0.6 + - lz4==4.3.2 + - MarkupSafe==2.1.3 + - matplotlib==3.7.2 + - matplotlib-inline==0.1.6 + - ml-dtypes>=0.3.1 + - moviepy==1.0.3 + - mpmath==1.3.0 + - nest-asyncio==1.5.6 + - networkx==3.1 + - opencv-python==4.8.0.76 + - opt-einsum==3.3.0 + - packaging==23.1 + - pandas==2.0.3 + - parso==0.8.3 + - pexpect==4.8.0 + - pickleshare==0.7.5 + - Pillow==10.0.0 + - platformdirs==3.10.0 + - proglog==0.1.10 + - prompt-toolkit==3.0.39 + - psutil==5.9.0 + - ptyprocess==0.7.0 + - pure-eval==0.2.2 + - pycparser==2.21 + - pygame==2.5.1 + - Pygments==2.16.1 + - pynput==1.7.6 + - PyOpenGL==3.1.7 + - pyparsing==3.0.9 + - python-dateutil==2.8.2 + - python-xlib==0.33 + - pytz==2023.3 + - requests==2.31.0 + - scikit-learn==1.3.0 + - setuptools==68.0.0 + - Shimmy==0.2.1 + - six==1.16.0 + - stack-data==0.6.2 + - sympy==1.12 + - threadpoolctl==3.2.0 + - torch==2.0.1 + - tornado>=6.2.0 + - tqdm==4.66.1 + - traitlets==5.9.0 + - typing_extensions==4.7.1 + - tzdata==2023.3 + - urllib3==2.0.4 + - wcwidth==0.2.6 + - wheel==0.38.4 + - zipp==3.16.2 \ No newline at end of file From 00841c6d33c2b1ae3d8bf90b614c68f71019930d Mon Sep 17 00:00:00 2001 From: Yichang FENG Date: Mon, 15 Sep 2025 20:46:39 -0500 Subject: [PATCH 2/4] [0.0.1] Implement value updates and action selection in Value Iteration --- Solvers/Value_Iteration.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/Solvers/Value_Iteration.py b/Solvers/Value_Iteration.py index 0054241..4007c87 100644 --- a/Solvers/Value_Iteration.py +++ b/Solvers/Value_Iteration.py @@ -71,6 +71,9 @@ def train_episode(self): ################################ # YOUR IMPLEMENTATION HERE # 
################################ + values = self.one_step_lookahead(each_state) + best_action_value = np.max(values) + self.V[each_state] = best_action_value # Dont worry about this part self.statistics[Statistics.Rewards.value] = np.sum(self.V) @@ -140,7 +143,9 @@ def policy_fn(state): ################################ # YOUR IMPLEMENTATION HERE # ################################ - + values = self.one_step_lookahead(state) + best_action = np.argmax(values) + return best_action return policy_fn @@ -192,6 +197,17 @@ def train_episode(self): # Do a one-step lookahead to find the best action # # Update the value function. Ref: Sutton book eq. 4.10. # ######################################################### + THRESHOLD = 1e-4 + while not self.pq.isEmpty(): + state = self.pq.pop() + values = self.one_step_lookahead(state) + best_action_value = np.max(values) + delta = abs(self.V[state] - best_action_value) + self.V[state] = best_action_value + if delta > THRESHOLD: + for s in self.pred[state]: + priority = -abs(self.V[s] - best_action_value) + self.pq.update(s, priority) # you can ignore this part self.statistics[Statistics.Rewards.value] = np.sum(self.V) From ad74447d96ddfb5c8dcc6266113a248cf085e2a2 Mon Sep 17 00:00:00 2001 From: Yichang FENG Date: Sun, 21 Sep 2025 18:45:49 -0500 Subject: [PATCH 3/4] [0.0.1a] Implement policy evaluation and update in Policy Iteration; optimize value updates in Value Iteration --- Solvers/Policy_Iteration.py | 18 +++++++++++++++++- Solvers/Value_Iteration.py | 19 ++++++++----------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/Solvers/Policy_Iteration.py b/Solvers/Policy_Iteration.py index 06f8f32..4a33c7a 100644 --- a/Solvers/Policy_Iteration.py +++ b/Solvers/Policy_Iteration.py @@ -57,7 +57,9 @@ def train_episode(self): ################################ # YOUR IMPLEMENTATION HERE # ################################ - + A = self.one_step_lookahead(s) + best_action = np.argmax(A) + self.policy[s, :] = np.eye(self.env.action_space.n)[best_action] # In DP methods we don't interact with the environment so we will set the reward to be the sum of state values # and the number of steps to -1 representing an invalid value @@ -103,6 +105,20 @@ def policy_eval(self): ################################ # YOUR IMPLEMENTATION HERE # ################################ + num_states = self.env.observation_space.n + num_actions = self.env.action_space.n + gamma = self.options.gamma + + # Build P_pi and R_pi + P_pi = np.zeros((num_states, num_states)) + R_pi = np.zeros(num_states) + for s in range(num_states): + a = np.argmax(self.policy[s]) + for prob, next_state, reward, done in self.env.P[s][a]: + P_pi[s, next_state] += prob + R_pi[s] += prob * reward + + self.V = np.linalg.solve(np.eye(num_states) - gamma * P_pi, R_pi) def create_greedy_policy(self): """ diff --git a/Solvers/Value_Iteration.py b/Solvers/Value_Iteration.py index 4007c87..aea5223 100644 --- a/Solvers/Value_Iteration.py +++ b/Solvers/Value_Iteration.py @@ -197,17 +197,14 @@ def train_episode(self): # Do a one-step lookahead to find the best action # # Update the value function. Ref: Sutton book eq. 4.10. 
# ######################################################### - THRESHOLD = 1e-4 - while not self.pq.isEmpty(): - state = self.pq.pop() - values = self.one_step_lookahead(state) - best_action_value = np.max(values) - delta = abs(self.V[state] - best_action_value) - self.V[state] = best_action_value - if delta > THRESHOLD: - for s in self.pred[state]: - priority = -abs(self.V[s] - best_action_value) - self.pq.update(s, priority) + for s in range(self.env.observation_space.n): + # Do a one-step lookahead to find the best action + A = self.one_step_lookahead(s) + best_action_value = np.max(A) + priority = -abs(self.V[s] - best_action_value) + self.pq.update(s, priority) + state = self.pq.pop() + self.V[state] = self.one_step_lookahead(state).max() # you can ignore this part self.statistics[Statistics.Rewards.value] = np.sum(self.V) From ec4d05d41af70f3d00c772fd43127db07924122e Mon Sep 17 00:00:00 2001 From: Yichang FENG Date: Sun, 28 Sep 2025 00:06:22 -0500 Subject: [PATCH 4/4] [0.0.2] 1. Implement Monte Carlo and Off-Policy Monte Carlo methods in Monte_Carlo.py; 2. Refine policy functions in Policy_Iteration.py and Value_Iteration.py. --- Solvers/Monte_Carlo.py | 52 ++++++++++++++++++++++++++++++++++++- Solvers/Policy_Iteration.py | 2 -- Solvers/Value_Iteration.py | 33 +++++++++++++++-------- 3 files changed, 73 insertions(+), 14 deletions(-) diff --git a/Solvers/Monte_Carlo.py b/Solvers/Monte_Carlo.py index f3d848a..6608f75 100644 --- a/Solvers/Monte_Carlo.py +++ b/Solvers/Monte_Carlo.py @@ -65,6 +65,26 @@ def train_episode(self): ################################ # YOUR IMPLEMENTATION HERE # ################################ + for t in range(self.options.steps): + action_probs = self.policy(state) + action = np.random.choice(np.arange(len(action_probs)), p=action_probs) + next_state, reward, done, _ = self.step(action) + episode.append((state, action, reward)) + if done: + break + state = next_state + + discount_factor = self.options.gamma + visited = set() + G = 0 + for t in reversed(range(len(episode))): + state, action, reward = episode[t] + G = reward + discount_factor * G + if (state, action) not in visited: + visited.add((state, action)) + self.returns_sum[(state, action)] += G + self.returns_count[(state, action)] += 1 + self.Q[state][action] = self.returns_sum[(state, action)] / self.returns_count[(state, action)] def __str__(self): return "Monte Carlo" @@ -90,6 +110,10 @@ def policy_fn(observation): ################################ # YOUR IMPLEMENTATION HERE # ################################ + A = np.ones(nA, dtype=float) * self.options.epsilon / nA + best_action = np.argmax(self.Q[observation]) + A[best_action] += 1.0 - self.options.epsilon + return A return policy_fn @@ -109,6 +133,7 @@ def policy_fn(state): ################################ # YOUR IMPLEMENTATION HERE # ################################ + return np.argmax(self.Q[state]) return policy_fn @@ -163,7 +188,32 @@ def train_episode(self): ################################ # YOUR IMPLEMENTATION HERE # ################################ - + for t in range(self.options.steps): + action_probs = self.behavior_policy(state) + action = np.random.choice(np.arange(len(action_probs)), p=action_probs) + next_state, reward, done, _ = self.step(action) + episode.append((state, action, reward)) + if done: + break + state = next_state + + discount_factor = self.options.gamma + visited = set() + G = 0 + W = 1.0 + for t in reversed(range(len(episode))): + state, action, reward = episode[t] + G = discount_factor * G + reward + + 
self.C[state][action] += W + self.Q[state][action] += (W / self.C[state][action]) * (G - self.Q[state][action]) + + if self.target_policy(state) != action: + break + + behavior_prob = self.behavior_policy(state)[action] + target_prob = 1.0 if self.target_policy(state) == action else 0.0 + W = W * (target_prob / behavior_prob) def create_random_policy(self): """ diff --git a/Solvers/Policy_Iteration.py b/Solvers/Policy_Iteration.py index 4a33c7a..3a94306 100644 --- a/Solvers/Policy_Iteration.py +++ b/Solvers/Policy_Iteration.py @@ -106,10 +106,8 @@ def policy_eval(self): # YOUR IMPLEMENTATION HERE # ################################ num_states = self.env.observation_space.n - num_actions = self.env.action_space.n gamma = self.options.gamma - # Build P_pi and R_pi P_pi = np.zeros((num_states, num_states)) R_pi = np.zeros(num_states) for s in range(num_states): diff --git a/Solvers/Value_Iteration.py b/Solvers/Value_Iteration.py index aea5223..323198d 100644 --- a/Solvers/Value_Iteration.py +++ b/Solvers/Value_Iteration.py @@ -72,8 +72,7 @@ def train_episode(self): # YOUR IMPLEMENTATION HERE # ################################ values = self.one_step_lookahead(each_state) - best_action_value = np.max(values) - self.V[each_state] = best_action_value + self.V[each_state] = np.max(values) # Dont worry about this part self.statistics[Statistics.Rewards.value] = np.sum(self.V) @@ -144,8 +143,7 @@ def policy_fn(state): # YOUR IMPLEMENTATION HERE # ################################ values = self.one_step_lookahead(state) - best_action = np.argmax(values) - return best_action + return np.argmax(values) return policy_fn @@ -197,14 +195,27 @@ def train_episode(self): # Do a one-step lookahead to find the best action # # Update the value function. Ref: Sutton book eq. 4.10. # ######################################################### - for s in range(self.env.observation_space.n): - # Do a one-step lookahead to find the best action - A = self.one_step_lookahead(s) + # for s in range(self.env.observation_space.n): + # # Do a one-step lookahead to find the best action + # A = self.one_step_lookahead(s) + # best_action_value = np.max(A) + # priority = -abs(self.V[s] - best_action_value) + # self.pq.update(s, priority) + # state = self.pq.pop() + # self.V[state] = self.one_step_lookahead(state).max() + + if not self.pq.isEmpty(): + state = self.pq.pop() + + A = self.one_step_lookahead(state) best_action_value = np.max(A) - priority = -abs(self.V[s] - best_action_value) - self.pq.update(s, priority) - state = self.pq.pop() - self.V[state] = self.one_step_lookahead(state).max() + self.V[state] = best_action_value + + for s in self.pred.get(state, []): + A = self.one_step_lookahead(s) + pred_best_value = np.max(A) + priority = -abs(self.V[s] - pred_best_value) + self.pq.update(s, priority) # you can ignore this part self.statistics[Statistics.Rewards.value] = np.sum(self.V)
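
Notes on the update rules implemented above. The value updates in Solvers/Value_Iteration.py are the Bellman optimality backup (Sutton & Barto eq. 4.10): V(s) <- max_a sum_{s',r} p(s',r | s,a) [r + gamma * V(s')], which is exactly what one_step_lookahead followed by np.max computes for each state. Below is a minimal self-contained sketch of that backup on a hypothetical 2-state MDP; the dict P mimics the Gymnasium env.P layout the solvers rely on, and the toy numbers are illustrative only.

import numpy as np

# Hypothetical 2-state, 2-action MDP in the Gymnasium env.P layout assumed above:
# P[state][action] = [(prob, next_state, reward, done), ...]
P = {
    0: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 1.0, False)]},
    1: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 1.0, False)]},
}
num_states, num_actions, gamma = 2, 2, 0.9
V = np.zeros(num_states)

def one_step_lookahead(state, V):
    # Expected return of each action from `state` under the current V.
    A = np.zeros(num_actions)
    for a in range(num_actions):
        for prob, next_state, reward, _ in P[state][a]:
            A[a] += prob * (reward + gamma * V[next_state])
    return A

for _ in range(200):  # synchronous sweeps; enough for this toy problem to converge
    for s in range(num_states):
        V[s] = np.max(one_step_lookahead(s, V))

print(V)  # both entries approach 1 / (1 - gamma) = 10.0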
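
policy_eval in Solvers/Policy_Iteration.py evaluates a fixed deterministic policy exactly instead of sweeping: V_pi satisfies V_pi = R_pi + gamma * P_pi * V_pi, so V_pi = (I - gamma * P_pi)^(-1) * R_pi, which np.linalg.solve computes without forming the inverse. A minimal sketch under the same hypothetical env.P layout; the 2-state MDP and the policy array are made up for illustration.

import numpy as np

# Same hypothetical toy MDP layout as in the sketch above.
P = {
    0: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 1.0, False)]},
    1: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 1.0, False)]},
}
num_states, gamma = 2, 0.9
policy = np.array([1, 0])  # made-up deterministic policy: action 1 in state 0, action 0 in state 1

# Expected transition matrix and one-step reward under the fixed policy.
P_pi = np.zeros((num_states, num_states))
R_pi = np.zeros(num_states)
for s in range(num_states):
    for prob, next_state, reward, _ in P[s][policy[s]]:
        P_pi[s, next_state] += prob
        R_pi[s] += prob * reward

# Solve (I - gamma * P_pi) V = R_pi rather than inverting the matrix explicitly.
V_pi = np.linalg.solve(np.eye(num_states) - gamma * P_pi, R_pi)
print(V_pi)  # about [5.26, 4.74] for this toy policy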
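
The off-policy Monte Carlo patch uses the incremental weighted importance-sampling update from Sutton & Barto Ch. 5: C(s,a) += W and Q(s,a) += (W / C(s,a)) * (G - Q(s,a)), walking the episode backwards and stopping once the behavior action disagrees with the greedy target policy. The short sketch below, with made-up returns and weights for a single state-action pair, checks that the incremental form reproduces the closed-form weighted average sum_i(W_i * G_i) / sum_i(W_i).

import numpy as np

# Made-up returns and importance weights observed for one (state, action) pair.
returns = np.array([2.0, 5.0, 3.0])
weights = np.array([1.0, 4.0, 2.0])

# Incremental update used in the off-policy Monte Carlo patch.
Q, C = 0.0, 0.0
for G, W in zip(returns, weights):
    C += W
    Q += (W / C) * (G - Q)

# The incremental form reproduces the closed-form weighted average.
print(Q, np.average(returns, weights=weights))  # both print 4.0

Because the target policy is greedy and therefore deterministic, W is only ever divided by the behavior probability of the greedy action, which is why the loop in the patch can break as soon as the action taken is not the greedy one.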