8 changes: 8 additions & 0 deletions .idea/.gitignore


52 changes: 51 additions & 1 deletion Solvers/Monte_Carlo.py
@@ -65,6 +65,26 @@ def train_episode(self):
################################
# YOUR IMPLEMENTATION HERE #
################################
# Generate one episode by following the current epsilon-greedy policy.
for t in range(self.options.steps):
    action_probs = self.policy(state)
    action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
    next_state, reward, done, _ = self.step(action)
    episode.append((state, action, reward))
    if done:
        break
    state = next_state

# Walk the episode backwards, accumulating the discounted return
# G_t = r_{t+1} + gamma * G_{t+1}, and update Q with the running average of
# sampled returns, once per (state, action) pair per episode.
discount_factor = self.options.gamma
visited = set()
G = 0
for t in reversed(range(len(episode))):
    state, action, reward = episode[t]
    G = reward + discount_factor * G
    if (state, action) not in visited:
        visited.add((state, action))
        self.returns_sum[(state, action)] += G
        self.returns_count[(state, action)] += 1
        self.Q[state][action] = self.returns_sum[(state, action)] / self.returns_count[(state, action)]
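As a sanity check on the backward return recursion above, here is a small standalone sketch (independent of the solver, with made-up rewards and gamma) that reproduces G_t = r_{t+1} + gamma * G_{t+1} on a toy episode:

import numpy as np

rewards = [0.0, 0.0, 1.0]          # toy episode rewards (illustrative only)
gamma = 0.9
returns = np.zeros(len(rewards))
G = 0.0
for t in reversed(range(len(rewards))):
    G = rewards[t] + gamma * G     # same recursion as in train_episode above
    returns[t] = G
print(returns)                     # [0.81 0.9  1.  ]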

def __str__(self):
    return "Monte Carlo"
@@ -90,6 +110,10 @@ def policy_fn(observation):
################################
# YOUR IMPLEMENTATION HERE #
################################
# Start from uniform exploration: every action gets probability epsilon / nA.
A = np.ones(nA, dtype=float) * self.options.epsilon / nA
# The greedy action w.r.t. Q receives the remaining (1 - epsilon) probability mass.
best_action = np.argmax(self.Q[observation])
A[best_action] += 1.0 - self.options.epsilon
return A

return policy_fn
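A quick standalone check of the epsilon-greedy distribution built by policy_fn above; the Q-values and epsilon here are toy numbers chosen for illustration:

import numpy as np

epsilon, nA = 0.1, 3                      # illustrative values
Q_s = np.array([1.0, 3.0, 2.0])           # toy action values for one state
A = np.ones(nA) * epsilon / nA            # epsilon / nA to every action
A[np.argmax(Q_s)] += 1.0 - epsilon        # remaining mass to the greedy action
print(A, A.sum())                         # [0.0333... 0.9333... 0.0333...] 1.0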

@@ -109,6 +133,7 @@ def policy_fn(state):
################################
# YOUR IMPLEMENTATION HERE #
################################
return np.argmax(self.Q[state])


return policy_fn
@@ -163,7 +188,32 @@ def train_episode(self):
################################
# YOUR IMPLEMENTATION HERE #
################################

# Generate an episode by following the soft behavior policy.
for t in range(self.options.steps):
    action_probs = self.behavior_policy(state)
    action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
    next_state, reward, done, _ = self.step(action)
    episode.append((state, action, reward))
    if done:
        break
    state = next_state

# Off-policy control with weighted importance sampling:
# walk the episode backwards, maintaining the cumulative importance weight W.
discount_factor = self.options.gamma
G = 0
W = 1.0
for t in reversed(range(len(episode))):
    state, action, reward = episode[t]
    G = discount_factor * G + reward

    # Accumulate the weight and move Q towards G with step size W / C.
    self.C[state][action] += W
    self.Q[state][action] += (W / self.C[state][action]) * (G - self.Q[state][action])

    # If the greedy target policy would not have chosen this action, the
    # importance weight for earlier steps is zero, so stop processing the episode.
    if self.target_policy(state) != action:
        break

    # The target policy is greedy, so its probability for this action is 1
    # (we would have broken out above otherwise); divide by the behavior probability.
    behavior_prob = self.behavior_policy(state)[action]
    target_prob = 1.0 if self.target_policy(state) == action else 0.0
    W = W * (target_prob / behavior_prob)
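The incremental rule Q <- Q + (W / C) * (G - Q) used above is a running weighted average of returns. A standalone sketch with made-up importance weights and returns shows the equivalence:

import numpy as np

weights = [2.0, 1.0, 4.0]          # made-up cumulative importance weights
returns = [1.0, 0.0, 0.5]          # made-up sampled returns

C, Q = 0.0, 0.0
for W, G in zip(weights, returns):
    C += W
    Q += (W / C) * (G - Q)         # incremental weighted average

print(Q)                                            # 0.5714...
print(np.dot(weights, returns) / np.sum(weights))   # same value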

def create_random_policy(self):
"""
16 changes: 15 additions & 1 deletion Solvers/Policy_Iteration.py
@@ -57,7 +57,9 @@ def train_episode(self):
################################
# YOUR IMPLEMENTATION HERE #
################################

# Greedy policy improvement: evaluate every action with a one-step lookahead
# and put all probability mass on the best one.
A = self.one_step_lookahead(s)
best_action = np.argmax(A)
self.policy[s, :] = np.eye(self.env.action_space.n)[best_action]

# In DP methods we don't interact with the environment, so we set the reward to the sum of state values
# and the number of steps to -1, representing an invalid value.
@@ -103,6 +105,18 @@ def policy_eval(self):
################################
# YOUR IMPLEMENTATION HERE #
################################
num_states = self.env.observation_space.n
gamma = self.options.gamma

# Build the on-policy transition matrix P_pi and expected reward vector R_pi
# induced by the current deterministic policy.
P_pi = np.zeros((num_states, num_states))
R_pi = np.zeros(num_states)
for s in range(num_states):
    a = np.argmax(self.policy[s])
    for prob, next_state, reward, done in self.env.P[s][a]:
        P_pi[s, next_state] += prob
        R_pi[s] += prob * reward

# Solve the Bellman expectation equation V = R_pi + gamma * P_pi * V directly,
# i.e. (I - gamma * P_pi) V = R_pi.
self.V = np.linalg.solve(np.eye(num_states) - gamma * P_pi, R_pi)
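The direct solve above computes V = (I - gamma * P_pi)^-1 R_pi. A tiny two-state example (transition matrix and rewards invented for illustration) confirms that the solution satisfies the Bellman expectation equation:

import numpy as np

gamma = 0.9
P_pi = np.array([[0.9, 0.1],       # toy on-policy transition matrix
                 [0.2, 0.8]])
R_pi = np.array([1.0, 0.0])        # toy expected one-step rewards

V = np.linalg.solve(np.eye(2) - gamma * P_pi, R_pi)
print(V)
print(np.allclose(V, R_pi + gamma * P_pi @ V))   # True: V = R_pi + gamma * P_pi V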

def create_greedy_policy(self):
"""
26 changes: 25 additions & 1 deletion Solvers/Value_Iteration.py
@@ -71,6 +71,8 @@ def train_episode(self):
################################
# YOUR IMPLEMENTATION HERE #
################################
# Bellman optimality backup: V(s) <- max_a sum_s' p(s'|s,a) * [r + gamma * V(s')].
values = self.one_step_lookahead(each_state)
self.V[each_state] = np.max(values)

# Don't worry about this part
self.statistics[Statistics.Rewards.value] = np.sum(self.V)
@@ -140,7 +142,8 @@ def policy_fn(state):
################################
# YOUR IMPLEMENTATION HERE #
################################

# Act greedily with respect to the current value function.
values = self.one_step_lookahead(state)
return np.argmax(values)

return policy_fn

@@ -192,6 +195,27 @@ def train_episode(self):
# Do a one-step lookahead to find the best action #
# Update the value function. Ref: Sutton book eq. 4.10. #
#########################################################
# for s in range(self.env.observation_space.n):
# # Do a one-step lookahead to find the best action
# A = self.one_step_lookahead(s)
# best_action_value = np.max(A)
# priority = -abs(self.V[s] - best_action_value)
# self.pq.update(s, priority)
# state = self.pq.pop()
# self.V[state] = self.one_step_lookahead(state).max()

# Pop the highest-priority state (largest absolute Bellman error) and
# back up its value with a one-step lookahead.
if not self.pq.isEmpty():
    state = self.pq.pop()

    A = self.one_step_lookahead(state)
    best_action_value = np.max(A)
    self.V[state] = best_action_value

    # Recompute the Bellman error of every predecessor of the updated state
    # and push it back into the priority queue (more negative = higher priority).
    for s in self.pred.get(state, []):
        A = self.one_step_lookahead(s)
        pred_best_value = np.max(A)
        priority = -abs(self.V[s] - pred_best_value)
        self.pq.update(s, priority)
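The priority used above is the negative absolute Bellman error, so a min-priority queue pops the state whose value is furthest from its one-step-lookahead backup first. A standalone sketch with Python's heapq and made-up errors (a simplified stand-in for the solver's self.pq):

import heapq

bellman_error = {0: 0.05, 1: 0.40, 2: 0.10}   # made-up |V(s) - max_a lookahead(s, a)|
pq = [(-err, s) for s, err in bellman_error.items()]
heapq.heapify(pq)
print(heapq.heappop(pq))   # (-0.4, 1): state 1 has the largest error, swept first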

# you can ignore this part
self.statistics[Statistics.Rewards.value] = np.sum(self.V)
98 changes: 98 additions & 0 deletions environment_mac_mod.yml
@@ -0,0 +1,98 @@
name: csce642
channels:
- conda-forge
- defaults
dependencies:
- python=3.9.16
- numpy=1.23.5
- swig
- box2d-py
- pip
- pip:
- absl-py==1.4.0
- ale-py==0.8.1
- asttokens==2.2.1
- backcall==0.2.0
- backports.functools-lru-cache==1.6.5
- certifi==2023.7.22
- cffi==1.15.1
- charset-normalizer==3.2.0
- cloudpickle==2.2.1
- cmake==3.27.2
- contourpy==1.1.0
- cycler==0.11.0
- Cython==3.0.0
- debugpy==1.6.7
- decorator==4.4.2
- entrypoints==0.4
- executing==1.2.0
- Farama-Notifications==0.0.4
- fasteners==0.18
- filelock==3.12.2
- fonttools==4.42.0
- glfw==2.6.2
- gymnasium==0.29.0
- idna==3.4
- imageio==2.31.1
- imageio-ffmpeg==0.4.8
- importlib-metadata==6.8.0
- importlib-resources==6.0.1
- ipykernel==6.14.0
- ipython==8.4.0
- jedi==0.19.0
- Jinja2==3.1.2
- joblib==1.3.2
- jupyter-client>=7.4.4
- jupyter_core==5.3.1
- kiwisolver==1.4.4
- lit==16.0.6
- lz4==4.3.2
- MarkupSafe==2.1.3
- matplotlib==3.7.2
- matplotlib-inline==0.1.6
- ml-dtypes>=0.3.1
- moviepy==1.0.3
- mpmath==1.3.0
- nest-asyncio==1.5.6
- networkx==3.1
- opencv-python==4.8.0.76
- opt-einsum==3.3.0
- packaging==23.1
- pandas==2.0.3
- parso==0.8.3
- pexpect==4.8.0
- pickleshare==0.7.5
- Pillow==10.0.0
- platformdirs==3.10.0
- proglog==0.1.10
- prompt-toolkit==3.0.39
- psutil==5.9.0
- ptyprocess==0.7.0
- pure-eval==0.2.2
- pycparser==2.21
- pygame==2.5.1
- Pygments==2.16.1
- pynput==1.7.6
- PyOpenGL==3.1.7
- pyparsing==3.0.9
- python-dateutil==2.8.2
- python-xlib==0.33
- pytz==2023.3
- requests==2.31.0
- scikit-learn==1.3.0
- setuptools==68.0.0
- Shimmy==0.2.1
- six==1.16.0
- stack-data==0.6.2
- sympy==1.12
- threadpoolctl==3.2.0
- torch==2.0.1
- tornado>=6.2.0
- tqdm==4.66.1
- traitlets==5.9.0
- typing_extensions==4.7.1
- tzdata==2023.3
- urllib3==2.0.4
- wcwidth==0.2.6
- wheel==0.38.4
- zipp==3.16.2