From 4d0e840cf4143052a6bb0d20c3feb9f1fd07223f Mon Sep 17 00:00:00 2001 From: Yichang FENG Date: Sun, 7 Sep 2025 16:48:14 -0500 Subject: [PATCH 1/4] [0.0.0] Add environment configuration for macOS and IntelliJ IDEA gitignore --- .idea/.gitignore | 8 ++++ environment_mac_mod.yml | 98 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 environment_mac_mod.yml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/environment_mac_mod.yml b/environment_mac_mod.yml new file mode 100644 index 0000000..8eb01bc --- /dev/null +++ b/environment_mac_mod.yml @@ -0,0 +1,98 @@ +name: csce642 +channels: + - conda-forge + - defaults +dependencies: + - python=3.9.16 + - numpy=1.23.5 + - swig + - box2d-py + - pip + - pip: + - absl-py==1.4.0 + - ale-py==0.8.1 + - asttokens==2.2.1 + - backcall==0.2.0 + - backports.functools-lru-cache==1.6.5 + - certifi==2023.7.22 + - cffi==1.15.1 + - charset-normalizer==3.2.0 + - cloudpickle==2.2.1 + - cmake==3.27.2 + - contourpy==1.1.0 + - cycler==0.11.0 + - Cython==3.0.0 + - debugpy==1.6.7 + - decorator==4.4.2 + - entrypoints==0.4 + - executing==1.2.0 + - Farama-Notifications==0.0.4 + - fasteners==0.18 + - filelock==3.12.2 + - fonttools==4.42.0 + - glfw==2.6.2 + - gymnasium==0.29.0 + - idna==3.4 + - imageio==2.31.1 + - imageio-ffmpeg==0.4.8 + - importlib-metadata==6.8.0 + - importlib-resources==6.0.1 + - ipykernel==6.14.0 + - ipython==8.4.0 + - jedi==0.19.0 + - Jinja2==3.1.2 + - joblib==1.3.2 + - jupyter-client>=7.4.4 + - jupyter_core==5.3.1 + - kiwisolver==1.4.4 + - lit==16.0.6 + - lz4==4.3.2 + - MarkupSafe==2.1.3 + - matplotlib==3.7.2 + - matplotlib-inline==0.1.6 + - ml-dtypes>=0.3.1 + - moviepy==1.0.3 + - mpmath==1.3.0 + - nest-asyncio==1.5.6 + - networkx==3.1 + - opencv-python==4.8.0.76 + - opt-einsum==3.3.0 + - packaging==23.1 + - pandas==2.0.3 + - parso==0.8.3 + - pexpect==4.8.0 + - pickleshare==0.7.5 + - Pillow==10.0.0 + - platformdirs==3.10.0 + - proglog==0.1.10 + - prompt-toolkit==3.0.39 + - psutil==5.9.0 + - ptyprocess==0.7.0 + - pure-eval==0.2.2 + - pycparser==2.21 + - pygame==2.5.1 + - Pygments==2.16.1 + - pynput==1.7.6 + - PyOpenGL==3.1.7 + - pyparsing==3.0.9 + - python-dateutil==2.8.2 + - python-xlib==0.33 + - pytz==2023.3 + - requests==2.31.0 + - scikit-learn==1.3.0 + - setuptools==68.0.0 + - Shimmy==0.2.1 + - six==1.16.0 + - stack-data==0.6.2 + - sympy==1.12 + - threadpoolctl==3.2.0 + - torch==2.0.1 + - tornado>=6.2.0 + - tqdm==4.66.1 + - traitlets==5.9.0 + - typing_extensions==4.7.1 + - tzdata==2023.3 + - urllib3==2.0.4 + - wcwidth==0.2.6 + - wheel==0.38.4 + - zipp==3.16.2 \ No newline at end of file From 00841c6d33c2b1ae3d8bf90b614c68f71019930d Mon Sep 17 00:00:00 2001 From: Yichang FENG Date: Mon, 15 Sep 2025 20:46:39 -0500 Subject: [PATCH 2/4] [0.0.1] Implement value updates and action selection in Value Iteration --- Solvers/Value_Iteration.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/Solvers/Value_Iteration.py b/Solvers/Value_Iteration.py index 0054241..4007c87 100644 --- a/Solvers/Value_Iteration.py +++ b/Solvers/Value_Iteration.py @@ -71,6 +71,9 @@ def train_episode(self): ################################ # YOUR IMPLEMENTATION HERE # 
################################ + values = self.one_step_lookahead(each_state) + best_action_value = np.max(values) + self.V[each_state] = best_action_value # Dont worry about this part self.statistics[Statistics.Rewards.value] = np.sum(self.V) @@ -140,7 +143,9 @@ def policy_fn(state): ################################ # YOUR IMPLEMENTATION HERE # ################################ - + values = self.one_step_lookahead(state) + best_action = np.argmax(values) + return best_action return policy_fn @@ -192,6 +197,17 @@ def train_episode(self): # Do a one-step lookahead to find the best action # # Update the value function. Ref: Sutton book eq. 4.10. # ######################################################### + THRESHOLD = 1e-4 + while not self.pq.isEmpty(): + state = self.pq.pop() + values = self.one_step_lookahead(state) + best_action_value = np.max(values) + delta = abs(self.V[state] - best_action_value) + self.V[state] = best_action_value + if delta > THRESHOLD: + for s in self.pred[state]: + priority = -abs(self.V[s] - best_action_value) + self.pq.update(s, priority) # you can ignore this part self.statistics[Statistics.Rewards.value] = np.sum(self.V) From ad74447d96ddfb5c8dcc6266113a248cf085e2a2 Mon Sep 17 00:00:00 2001 From: Yichang FENG Date: Sun, 21 Sep 2025 18:45:49 -0500 Subject: [PATCH 3/4] [0.0.1a] Implement policy evaluation and update in Policy Iteration; optimize value updates in Value Iteration --- Solvers/Policy_Iteration.py | 18 +++++++++++++++++- Solvers/Value_Iteration.py | 19 ++++++++----------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/Solvers/Policy_Iteration.py b/Solvers/Policy_Iteration.py index 06f8f32..4a33c7a 100644 --- a/Solvers/Policy_Iteration.py +++ b/Solvers/Policy_Iteration.py @@ -57,7 +57,9 @@ def train_episode(self): ################################ # YOUR IMPLEMENTATION HERE # ################################ - + A = self.one_step_lookahead(s) + best_action = np.argmax(A) + self.policy[s, :] = np.eye(self.env.action_space.n)[best_action] # In DP methods we don't interact with the environment so we will set the reward to be the sum of state values # and the number of steps to -1 representing an invalid value @@ -103,6 +105,20 @@ def policy_eval(self): ################################ # YOUR IMPLEMENTATION HERE # ################################ + num_states = self.env.observation_space.n + num_actions = self.env.action_space.n + gamma = self.options.gamma + + # Build P_pi and R_pi + P_pi = np.zeros((num_states, num_states)) + R_pi = np.zeros(num_states) + for s in range(num_states): + a = np.argmax(self.policy[s]) + for prob, next_state, reward, done in self.env.P[s][a]: + P_pi[s, next_state] += prob + R_pi[s] += prob * reward + + self.V = np.linalg.solve(np.eye(num_states) - gamma * P_pi, R_pi) def create_greedy_policy(self): """ diff --git a/Solvers/Value_Iteration.py b/Solvers/Value_Iteration.py index 4007c87..aea5223 100644 --- a/Solvers/Value_Iteration.py +++ b/Solvers/Value_Iteration.py @@ -197,17 +197,14 @@ def train_episode(self): # Do a one-step lookahead to find the best action # # Update the value function. Ref: Sutton book eq. 4.10. 
# ######################################################### - THRESHOLD = 1e-4 - while not self.pq.isEmpty(): - state = self.pq.pop() - values = self.one_step_lookahead(state) - best_action_value = np.max(values) - delta = abs(self.V[state] - best_action_value) - self.V[state] = best_action_value - if delta > THRESHOLD: - for s in self.pred[state]: - priority = -abs(self.V[s] - best_action_value) - self.pq.update(s, priority) + for s in range(self.env.observation_space.n): + # Do a one-step lookahead to find the best action + A = self.one_step_lookahead(s) + best_action_value = np.max(A) + priority = -abs(self.V[s] - best_action_value) + self.pq.update(s, priority) + state = self.pq.pop() + self.V[state] = self.one_step_lookahead(state).max() # you can ignore this part self.statistics[Statistics.Rewards.value] = np.sum(self.V) From ec4d05d41af70f3d00c772fd43127db07924122e Mon Sep 17 00:00:00 2001 From: Yichang FENG Date: Sun, 28 Sep 2025 00:06:22 -0500 Subject: [PATCH 4/4] [0.0.2] 1. Implement Monte Carlo and Off-Policy Monte Carlo methods in Monte_Carlo.py; 2. Refine policy functions in Policy_Iteration.py and Value_Iteration.py. --- Solvers/Monte_Carlo.py | 52 ++++++++++++++++++++++++++++++++++++- Solvers/Policy_Iteration.py | 2 -- Solvers/Value_Iteration.py | 33 +++++++++++++++-------- 3 files changed, 73 insertions(+), 14 deletions(-) diff --git a/Solvers/Monte_Carlo.py b/Solvers/Monte_Carlo.py index f3d848a..6608f75 100644 --- a/Solvers/Monte_Carlo.py +++ b/Solvers/Monte_Carlo.py @@ -65,6 +65,26 @@ def train_episode(self): ################################ # YOUR IMPLEMENTATION HERE # ################################ + for t in range(self.options.steps): + action_probs = self.policy(state) + action = np.random.choice(np.arange(len(action_probs)), p=action_probs) + next_state, reward, done, _ = self.step(action) + episode.append((state, action, reward)) + if done: + break + state = next_state + + discount_factor = self.options.gamma + visited = set() + G = 0 + for t in reversed(range(len(episode))): + state, action, reward = episode[t] + G = reward + discount_factor * G + if (state, action) not in visited: + visited.add((state, action)) + self.returns_sum[(state, action)] += G + self.returns_count[(state, action)] += 1 + self.Q[state][action] = self.returns_sum[(state, action)] / self.returns_count[(state, action)] def __str__(self): return "Monte Carlo" @@ -90,6 +110,10 @@ def policy_fn(observation): ################################ # YOUR IMPLEMENTATION HERE # ################################ + A = np.ones(nA, dtype=float) * self.options.epsilon / nA + best_action = np.argmax(self.Q[observation]) + A[best_action] += 1.0 - self.options.epsilon + return A return policy_fn @@ -109,6 +133,7 @@ def policy_fn(state): ################################ # YOUR IMPLEMENTATION HERE # ################################ + return np.argmax(self.Q[state]) return policy_fn @@ -163,7 +188,32 @@ def train_episode(self): ################################ # YOUR IMPLEMENTATION HERE # ################################ - + for t in range(self.options.steps): + action_probs = self.behavior_policy(state) + action = np.random.choice(np.arange(len(action_probs)), p=action_probs) + next_state, reward, done, _ = self.step(action) + episode.append((state, action, reward)) + if done: + break + state = next_state + + discount_factor = self.options.gamma + visited = set() + G = 0 + W = 1.0 + for t in reversed(range(len(episode))): + state, action, reward = episode[t] + G = discount_factor * G + reward + + 
self.C[state][action] += W + self.Q[state][action] += (W / self.C[state][action]) * (G - self.Q[state][action]) + + if self.target_policy(state) != action: + break + + behavior_prob = self.behavior_policy(state)[action] + target_prob = 1.0 if self.target_policy(state) == action else 0.0 + W = W * (target_prob / behavior_prob) def create_random_policy(self): """ diff --git a/Solvers/Policy_Iteration.py b/Solvers/Policy_Iteration.py index 4a33c7a..3a94306 100644 --- a/Solvers/Policy_Iteration.py +++ b/Solvers/Policy_Iteration.py @@ -106,10 +106,8 @@ def policy_eval(self): # YOUR IMPLEMENTATION HERE # ################################ num_states = self.env.observation_space.n - num_actions = self.env.action_space.n gamma = self.options.gamma - # Build P_pi and R_pi P_pi = np.zeros((num_states, num_states)) R_pi = np.zeros(num_states) for s in range(num_states): diff --git a/Solvers/Value_Iteration.py b/Solvers/Value_Iteration.py index aea5223..323198d 100644 --- a/Solvers/Value_Iteration.py +++ b/Solvers/Value_Iteration.py @@ -72,8 +72,7 @@ def train_episode(self): # YOUR IMPLEMENTATION HERE # ################################ values = self.one_step_lookahead(each_state) - best_action_value = np.max(values) - self.V[each_state] = best_action_value + self.V[each_state] = np.max(values) # Dont worry about this part self.statistics[Statistics.Rewards.value] = np.sum(self.V) @@ -144,8 +143,7 @@ def policy_fn(state): # YOUR IMPLEMENTATION HERE # ################################ values = self.one_step_lookahead(state) - best_action = np.argmax(values) - return best_action + return np.argmax(values) return policy_fn @@ -197,14 +195,27 @@ def train_episode(self): # Do a one-step lookahead to find the best action # # Update the value function. Ref: Sutton book eq. 4.10. # ######################################################### - for s in range(self.env.observation_space.n): - # Do a one-step lookahead to find the best action - A = self.one_step_lookahead(s) + # for s in range(self.env.observation_space.n): + # # Do a one-step lookahead to find the best action + # A = self.one_step_lookahead(s) + # best_action_value = np.max(A) + # priority = -abs(self.V[s] - best_action_value) + # self.pq.update(s, priority) + # state = self.pq.pop() + # self.V[state] = self.one_step_lookahead(state).max() + + if not self.pq.isEmpty(): + state = self.pq.pop() + + A = self.one_step_lookahead(state) best_action_value = np.max(A) - priority = -abs(self.V[s] - best_action_value) - self.pq.update(s, priority) - state = self.pq.pop() - self.V[state] = self.one_step_lookahead(state).max() + self.V[state] = best_action_value + + for s in self.pred.get(state, []): + A = self.one_step_lookahead(s) + pred_best_value = np.max(A) + priority = -abs(self.V[s] - pred_best_value) + self.pq.update(s, priority) # you can ignore this part self.statistics[Statistics.Rewards.value] = np.sum(self.V)
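
Notes on the update rules implemented above. The value updates in Solvers/Value_Iteration.py are the Bellman optimality backup (Sutton & Barto eq. 4.10): V(s) <- max_a sum_{s',r} p(s',r | s,a) [r + gamma * V(s')], which is exactly what one_step_lookahead followed by np.max computes for each state. Below is a minimal self-contained sketch of that backup on a hypothetical 2-state MDP; the dict P mimics the Gymnasium env.P layout the solvers rely on, and the toy numbers are illustrative only.

import numpy as np

# Hypothetical 2-state, 2-action MDP in the Gymnasium env.P layout assumed above:
# P[state][action] = [(prob, next_state, reward, done), ...]
P = {
    0: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 1.0, False)]},
    1: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 1.0, False)]},
}
num_states, num_actions, gamma = 2, 2, 0.9
V = np.zeros(num_states)

def one_step_lookahead(state, V):
    # Expected return of each action from `state` under the current V.
    A = np.zeros(num_actions)
    for a in range(num_actions):
        for prob, next_state, reward, _ in P[state][a]:
            A[a] += prob * (reward + gamma * V[next_state])
    return A

for _ in range(200):  # synchronous sweeps; enough for this toy problem to converge
    for s in range(num_states):
        V[s] = np.max(one_step_lookahead(s, V))

print(V)  # both entries approach 1 / (1 - gamma) = 10.0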
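
policy_eval in Solvers/Policy_Iteration.py evaluates a fixed deterministic policy exactly instead of sweeping: V_pi satisfies V_pi = R_pi + gamma * P_pi * V_pi, so V_pi = (I - gamma * P_pi)^(-1) * R_pi, which np.linalg.solve computes without forming the inverse. A minimal sketch under the same hypothetical env.P layout; the 2-state MDP and the policy array are made up for illustration.

import numpy as np

# Same hypothetical toy MDP layout as in the sketch above.
P = {
    0: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 1.0, False)]},
    1: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 1.0, False)]},
}
num_states, gamma = 2, 0.9
policy = np.array([1, 0])  # made-up deterministic policy: action 1 in state 0, action 0 in state 1

# Expected transition matrix and one-step reward under the fixed policy.
P_pi = np.zeros((num_states, num_states))
R_pi = np.zeros(num_states)
for s in range(num_states):
    for prob, next_state, reward, _ in P[s][policy[s]]:
        P_pi[s, next_state] += prob
        R_pi[s] += prob * reward

# Solve (I - gamma * P_pi) V = R_pi rather than inverting the matrix explicitly.
V_pi = np.linalg.solve(np.eye(num_states) - gamma * P_pi, R_pi)
print(V_pi)  # about [5.26, 4.74] for this toy policy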
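
The off-policy Monte Carlo patch uses the incremental weighted importance-sampling update from Sutton & Barto Ch. 5: C(s,a) += W and Q(s,a) += (W / C(s,a)) * (G - Q(s,a)), walking the episode backwards and stopping once the behavior action disagrees with the greedy target policy. The short sketch below, with made-up returns and weights for a single state-action pair, checks that the incremental form reproduces the closed-form weighted average sum_i(W_i * G_i) / sum_i(W_i).

import numpy as np

# Made-up returns and importance weights observed for one (state, action) pair.
returns = np.array([2.0, 5.0, 3.0])
weights = np.array([1.0, 4.0, 2.0])

# Incremental update used in the off-policy Monte Carlo patch.
Q, C = 0.0, 0.0
for G, W in zip(returns, weights):
    C += W
    Q += (W / C) * (G - Q)

# The incremental form reproduces the closed-form weighted average.
print(Q, np.average(returns, weights=weights))  # both print 4.0

Because the target policy is greedy and therefore deterministic, W is only ever divided by the behavior probability of the greedy action, which is why the loop in the patch can break as soon as the action taken is not the greedy one.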