8 changes: 8 additions & 0 deletions .idea/.gitignore


52 changes: 51 additions & 1 deletion Solvers/Monte_Carlo.py
@@ -65,6 +65,26 @@ def train_episode(self):
################################
# YOUR IMPLEMENTATION HERE #
################################
# Generate one episode by following the current epsilon-greedy policy.
for t in range(self.options.steps):
    action_probs = self.policy(state)
    action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
    next_state, reward, done, _ = self.step(action)
    episode.append((state, action, reward))
    if done:
        break
    state = next_state

# Walk the episode backwards, accumulating the discounted return
# G_t = r_{t+1} + gamma * G_{t+1}, and update Q with the running average of
# sampled returns, once per (state, action) pair per episode.
discount_factor = self.options.gamma
visited = set()
G = 0
for t in reversed(range(len(episode))):
    state, action, reward = episode[t]
    G = reward + discount_factor * G
    if (state, action) not in visited:
        visited.add((state, action))
        self.returns_sum[(state, action)] += G
        self.returns_count[(state, action)] += 1
        self.Q[state][action] = self.returns_sum[(state, action)] / self.returns_count[(state, action)]
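As a sanity check on the backward return recursion above, here is a small standalone sketch (independent of the solver, with made-up rewards and gamma) that reproduces G_t = r_{t+1} + gamma * G_{t+1} on a toy episode:

import numpy as np

rewards = [0.0, 0.0, 1.0]          # toy episode rewards (illustrative only)
gamma = 0.9
returns = np.zeros(len(rewards))
G = 0.0
for t in reversed(range(len(rewards))):
    G = rewards[t] + gamma * G     # same recursion as in train_episode above
    returns[t] = G
print(returns)                     # [0.81 0.9  1.  ]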

def __str__(self):
    return "Monte Carlo"
@@ -90,6 +110,10 @@ def policy_fn(observation):
################################
# YOUR IMPLEMENTATION HERE #
################################
# Start from uniform exploration: every action gets probability epsilon / nA.
A = np.ones(nA, dtype=float) * self.options.epsilon / nA
# The greedy action w.r.t. Q receives the remaining (1 - epsilon) probability mass.
best_action = np.argmax(self.Q[observation])
A[best_action] += 1.0 - self.options.epsilon
return A

return policy_fn
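A quick standalone check of the epsilon-greedy distribution built by policy_fn above; the Q-values and epsilon here are toy numbers chosen for illustration:

import numpy as np

epsilon, nA = 0.1, 3                      # illustrative values
Q_s = np.array([1.0, 3.0, 2.0])           # toy action values for one state
A = np.ones(nA) * epsilon / nA            # epsilon / nA to every action
A[np.argmax(Q_s)] += 1.0 - epsilon        # remaining mass to the greedy action
print(A, A.sum())                         # [0.0333... 0.9333... 0.0333...] 1.0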

@@ -109,6 +133,7 @@ def policy_fn(state):
################################
# YOUR IMPLEMENTATION HERE #
################################
return np.argmax(self.Q[state])


return policy_fn
@@ -163,7 +188,32 @@ def train_episode(self):
################################
# YOUR IMPLEMENTATION HERE #
################################

# Generate an episode by following the soft behavior policy.
for t in range(self.options.steps):
    action_probs = self.behavior_policy(state)
    action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
    next_state, reward, done, _ = self.step(action)
    episode.append((state, action, reward))
    if done:
        break
    state = next_state

# Off-policy control with weighted importance sampling:
# walk the episode backwards, maintaining the cumulative importance weight W.
discount_factor = self.options.gamma
G = 0
W = 1.0
for t in reversed(range(len(episode))):
    state, action, reward = episode[t]
    G = discount_factor * G + reward

    # Accumulate the weight and move Q towards G with step size W / C.
    self.C[state][action] += W
    self.Q[state][action] += (W / self.C[state][action]) * (G - self.Q[state][action])

    # If the greedy target policy would not have chosen this action, the
    # importance weight for earlier steps is zero, so stop processing the episode.
    if self.target_policy(state) != action:
        break

    # The target policy is greedy, so its probability for this action is 1
    # (we would have broken out above otherwise); divide by the behavior probability.
    behavior_prob = self.behavior_policy(state)[action]
    target_prob = 1.0 if self.target_policy(state) == action else 0.0
    W = W * (target_prob / behavior_prob)
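The incremental rule Q <- Q + (W / C) * (G - Q) used above is a running weighted average of returns. A standalone sketch with made-up importance weights and returns shows the equivalence:

import numpy as np

weights = [2.0, 1.0, 4.0]          # made-up cumulative importance weights
returns = [1.0, 0.0, 0.5]          # made-up sampled returns

C, Q = 0.0, 0.0
for W, G in zip(weights, returns):
    C += W
    Q += (W / C) * (G - Q)         # incremental weighted average

print(Q)                                            # 0.5714...
print(np.dot(weights, returns) / np.sum(weights))   # same value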

def create_random_policy(self):
"""
16 changes: 15 additions & 1 deletion Solvers/Policy_Iteration.py
@@ -57,7 +57,9 @@ def train_episode(self):
################################
# YOUR IMPLEMENTATION HERE #
################################

# Greedy policy improvement: evaluate every action with a one-step lookahead
# and put all probability mass on the best one.
A = self.one_step_lookahead(s)
best_action = np.argmax(A)
self.policy[s, :] = np.eye(self.env.action_space.n)[best_action]

# In DP methods we don't interact with the environment, so we set the reward to the sum of state values
# and the number of steps to -1, representing an invalid value.
@@ -103,6 +105,18 @@ def policy_eval(self):
################################
# YOUR IMPLEMENTATION HERE #
################################
num_states = self.env.observation_space.n
gamma = self.options.gamma

# Build the on-policy transition matrix P_pi and expected reward vector R_pi
# induced by the current deterministic policy.
P_pi = np.zeros((num_states, num_states))
R_pi = np.zeros(num_states)
for s in range(num_states):
    a = np.argmax(self.policy[s])
    for prob, next_state, reward, done in self.env.P[s][a]:
        P_pi[s, next_state] += prob
        R_pi[s] += prob * reward

# Solve the Bellman expectation equation V = R_pi + gamma * P_pi * V directly,
# i.e. (I - gamma * P_pi) V = R_pi.
self.V = np.linalg.solve(np.eye(num_states) - gamma * P_pi, R_pi)
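The direct solve above computes V = (I - gamma * P_pi)^-1 R_pi. A tiny two-state example (transition matrix and rewards invented for illustration) confirms that the solution satisfies the Bellman expectation equation:

import numpy as np

gamma = 0.9
P_pi = np.array([[0.9, 0.1],       # toy on-policy transition matrix
                 [0.2, 0.8]])
R_pi = np.array([1.0, 0.0])        # toy expected one-step rewards

V = np.linalg.solve(np.eye(2) - gamma * P_pi, R_pi)
print(V)
print(np.allclose(V, R_pi + gamma * P_pi @ V))   # True: V = R_pi + gamma * P_pi V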

def create_greedy_policy(self):
"""
26 changes: 25 additions & 1 deletion Solvers/Value_Iteration.py
@@ -71,6 +71,8 @@ def train_episode(self):
################################
# YOUR IMPLEMENTATION HERE #
################################
# Bellman optimality backup: V(s) <- max_a sum_s' p(s'|s,a) * [r + gamma * V(s')].
values = self.one_step_lookahead(each_state)
self.V[each_state] = np.max(values)

# Don't worry about this part
self.statistics[Statistics.Rewards.value] = np.sum(self.V)
@@ -140,7 +142,8 @@ def policy_fn(state):
################################
# YOUR IMPLEMENTATION HERE #
################################

# Act greedily with respect to the current value function.
values = self.one_step_lookahead(state)
return np.argmax(values)

return policy_fn

@@ -192,6 +195,27 @@ def train_episode(self):
# Do a one-step lookahead to find the best action #
# Update the value function. Ref: Sutton book eq. 4.10. #
#########################################################
# for s in range(self.env.observation_space.n):
# # Do a one-step lookahead to find the best action
# A = self.one_step_lookahead(s)
# best_action_value = np.max(A)
# priority = -abs(self.V[s] - best_action_value)
# self.pq.update(s, priority)
# state = self.pq.pop()
# self.V[state] = self.one_step_lookahead(state).max()

# Pop the highest-priority state (largest absolute Bellman error) and
# back up its value with a one-step lookahead.
if not self.pq.isEmpty():
    state = self.pq.pop()

    A = self.one_step_lookahead(state)
    best_action_value = np.max(A)
    self.V[state] = best_action_value

    # Recompute the Bellman error of every predecessor of the updated state
    # and push it back into the priority queue (more negative = higher priority).
    for s in self.pred.get(state, []):
        A = self.one_step_lookahead(s)
        pred_best_value = np.max(A)
        priority = -abs(self.V[s] - pred_best_value)
        self.pq.update(s, priority)
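The priority used above is the negative absolute Bellman error, so a min-priority queue pops the state whose value is furthest from its one-step-lookahead backup first. A standalone sketch with Python's heapq and made-up errors (a simplified stand-in for the solver's self.pq):

import heapq

bellman_error = {0: 0.05, 1: 0.40, 2: 0.10}   # made-up |V(s) - max_a lookahead(s, a)|
pq = [(-err, s) for s, err in bellman_error.items()]
heapq.heapify(pq)
print(heapq.heappop(pq))   # (-0.4, 1): state 1 has the largest error, swept first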

# you can ignore this part
self.statistics[Statistics.Rewards.value] = np.sum(self.V)
98 changes: 98 additions & 0 deletions environment_mac_mod.yml
@@ -0,0 +1,98 @@
name: csce642
channels:
- conda-forge
- defaults
dependencies:
- python=3.9.16
- numpy=1.23.5
- swig
- box2d-py
- pip
- pip:
- absl-py==1.4.0
- ale-py==0.8.1
- asttokens==2.2.1
- backcall==0.2.0
- backports.functools-lru-cache==1.6.5
- certifi==2023.7.22
- cffi==1.15.1
- charset-normalizer==3.2.0
- cloudpickle==2.2.1
- cmake==3.27.2
- contourpy==1.1.0
- cycler==0.11.0
- Cython==3.0.0
- debugpy==1.6.7
- decorator==4.4.2
- entrypoints==0.4
- executing==1.2.0
- Farama-Notifications==0.0.4
- fasteners==0.18
- filelock==3.12.2
- fonttools==4.42.0
- glfw==2.6.2
- gymnasium==0.29.0
- idna==3.4
- imageio==2.31.1
- imageio-ffmpeg==0.4.8
- importlib-metadata==6.8.0
- importlib-resources==6.0.1
- ipykernel==6.14.0
- ipython==8.4.0
- jedi==0.19.0
- Jinja2==3.1.2
- joblib==1.3.2
- jupyter-client>=7.4.4
- jupyter_core==5.3.1
- kiwisolver==1.4.4
- lit==16.0.6
- lz4==4.3.2
- MarkupSafe==2.1.3
- matplotlib==3.7.2
- matplotlib-inline==0.1.6
- ml-dtypes>=0.3.1
- moviepy==1.0.3
- mpmath==1.3.0
- nest-asyncio==1.5.6
- networkx==3.1
- opencv-python==4.8.0.76
- opt-einsum==3.3.0
- packaging==23.1
- pandas==2.0.3
- parso==0.8.3
- pexpect==4.8.0
- pickleshare==0.7.5
- Pillow==10.0.0
- platformdirs==3.10.0
- proglog==0.1.10
- prompt-toolkit==3.0.39
- psutil==5.9.0
- ptyprocess==0.7.0
- pure-eval==0.2.2
- pycparser==2.21
- pygame==2.5.1
- Pygments==2.16.1
- pynput==1.7.6
- PyOpenGL==3.1.7
- pyparsing==3.0.9
- python-dateutil==2.8.2
- python-xlib==0.33
- pytz==2023.3
- requests==2.31.0
- scikit-learn==1.3.0
- setuptools==68.0.0
- Shimmy==0.2.1
- six==1.16.0
- stack-data==0.6.2
- sympy==1.12
- threadpoolctl==3.2.0
- torch==2.0.1
- tornado>=6.2.0
- tqdm==4.66.1
- traitlets==5.9.0
- typing_extensions==4.7.1
- tzdata==2023.3
- urllib3==2.0.4
- wcwidth==0.2.6
- wheel==0.38.4
- zipp==3.16.2