In [1]:
import gymnasium as gym
import pandas as pd
import random

In [10]:
# Make small test maze
# maze=["SF", "FH", "FG"]
maze=["SFFFHFFHFH", "FHFHFFFFFH", "FFFHHHFFFF", "HFFFGFFHFF", "HHFFFFHHFF",
      "FFFHFFHFFF", "FFHFHFHFFF", "HHHHFFFFFF", "FHFHFHFHFF", "HFFFFFFFFG"]
env = gym.make('FrozenLake-v1', desc=maze, render_mode='human', is_slippery=False)
initial_state = env.reset()
env.render()

In [3]:
# Set up q-table
    # key is the state of cell
    # index of list is the action
        # Left: 0, Down: 1, 2: Right, 3: Up

''' q = {
    0: [0,0,0,0], 
    1: [0,0,0,0], 
    2: [0,0,0,0], 
    3: [0,0,0,0],
    4: [0,0,0,0],
    5: [0,0,0,0]
}
'''

' q = {\n    0: [0,0,0,0], \n    1: [0,0,0,0], \n    2: [0,0,0,0], \n    3: [0,0,0,0],\n    4: [0,0,0,0],\n    5: [0,0,0,0]\n}\n'

In [11]:
import numpy as np

# Define the environment
n_states = 100  # Number of states in the grid world
n_actions = 4  # Number of possible actions (up, down, left, right)
goal_state = 100  # Goal state

# Initialize Q-table with zeros
q = np.zeros((n_states, n_actions))

In [12]:
# Create my own reward system
cell_types ="SFFFHFFHFHFHFHFFFFFHFFFHHHFFFFHFFFGFFHFFHHFFFFHHFFFFFHFFHFFFFFHFHFHFFFHHHHFFFFFFFHFHFHFHFFHFFFFFFFFG"

def getReward(state):
    if cell_types[state] == "G":
        return 100
    elif cell_types[state] == "H":
        return -100
    else:
        return -1

In [13]:
# This function uses the Bellman Equation to update the q-table: 
    # new_q = (1-alpha) * q(s, a) + alpha * (R + gamma(max(q(s`, a`))))
def updateQTable(q, alpha, gamma, current_state, next_state, action):
    current_q = q[current_state][action]
    reward = getReward(next_state)
    next_state_row = [q[next_state][0], q[next_state][1], q[next_state][2], q[next_state][3]] 
    next_max_q = max(next_state_row)
    new_q = ((1-alpha) * current_q) + (alpha * (reward + (gamma * next_max_q)))
    q[current_state][action] = new_q

In [14]:
terminated = False

In [15]:
# Train Q-Model for 1000 episodes
# All actions are random
current_state = 0
alpha = 0.2
gamma = 0.8
for episode in range(10):
    while not terminated:
        action = random.randint(0, 3)
        new_state, reward, terminated, truncated, info = env.step(action)
        updateQTable(q, alpha, gamma, current_state, new_state, action)
        current_state = new_state
    inital = env.reset()
    terminated = False

In [17]:
df = pd.DataFrame(q)
#df = df.T
#df.columns = ["Left", "Down", "Right", "Up"]
df.head(100)

Unnamed: 0,0,1,2,3
0,-2.762687,-2.532771,-2.265827,-2.766977
1,-2.702138,-101.309566,-1.665632,-2.226086
2,-2.114421,-0.970706,-1.279586,-1.216710
3,-1.149142,-48.800000,-67.232000,-0.753267
4,-0.548217,-0.513735,-0.732264,-0.200000
...,...,...,...,...
95,0.000000,0.000000,0.000000,0.000000
96,0.000000,0.000000,0.000000,0.000000
97,0.000000,0.000000,0.000000,0.000000
98,0.000000,0.000000,0.000000,0.000000


In [None]:
env.close()
# Save final q table as a csv file
df.to_csv('final_q_values_2.csv', index=False)