In [2]:
import pandas as pd
import numpy as np
import random as rd

```
SFFF
FHFH
FFFH
HFFG
```



***Frozen lake involves crossing a frozen lake from Start(S) to Goal(G) without falling into any Holes(H) by walking over the Frozen(F) lake. The agent may not always move in the intended direction due to the slippery nature of the frozen lake.***

In [3]:
action_map = {0:'left',
          1:'down',
          2:'right',
          3:'up'}

# Frozen Lake Enviroment

![Frozen Lake](https://www.gymlibrary.dev/_images/frozen_lake.gif)

In [28]:
# The game starts at the S tile. The object of the 
# game is to get to the goal (G) without landing in a hole (H).

class Env:
    def __init__(self):
        self.env = np.array([[0,0,0,0],
                            [0,-1,0,-1],
                            [0,0,0,-1],
                            [-1,0,0,2]])

        self.win_reward = 1
        self.loss_reward = 0

        self.current_state = (0,0)
        self.shape = self.env.shape
        self.done = False
        self.reward = 0

    def terminate_state(self):
        row,col = self.current_state
        if self.env[row][col] == 2:
            self.done = True
            self.reward = self.win_reward

        elif self.env[row][col] == -1:
            self.done = True
            self.reward = self.loss_reward

        else:
            self.done = False
            self.reward = 0

    def step(self,action):
        row,col = self.current_state
        if action == 2:
            if col+1 < self.shape[1]:
                self.current_state = (row,col+1)
        if action == 0:
            if col-1 >= 0:
                self.current_state = (row,col-1)
        if action == 1:
            if row-1 >= 0:
                self.current_state = (row-1,col)
        if action == 3:
            if row+1 < self.shape[0]:
                self.current_state = (row+1,col)

        self.terminate_state()
        return self.current_state , self.done , self.reward


    def render(self):
        row,col = self.env.shape
        env_rend = ''
        for i in range(row):
            for j in range(col):
                state = self.env[i][j]
                if self.current_state == (i,j):
                    env_rend += 'S'
                elif state == 0:
                    env_rend += 'F'
                elif state == -1:
                    env_rend += 'H'
                else:
                    env_rend += 'G'
            env_rend += '\n'
        print(env_rend)

In [29]:
env = Env()
obs,done,reward = env.step(2)
print(f'observation : {obs} done : {done} reward : {reward}')

# we can print the environment if we want to look at it
# with a current state S
env.render()

observation : (0, 1) done : False reward : 0
FSFF
FHFH
FFFH
HFFG



In [30]:
done = False
env = Env()
actions = [0,1,2,3]
while not done:
    action = rd.choice(actions)
    obs,done,reward = env.step(action)
    print(f'observation : {obs} done : {done} reward : {reward} action : {action_map[action]}')
    env.render()

observation : (1, 0) done : False reward : 0 action : up
FFFF
SHFH
FFFH
HFFG

observation : (0, 0) done : False reward : 0 action : down
SFFF
FHFH
FFFH
HFFG

observation : (1, 0) done : False reward : 0 action : up
FFFF
SHFH
FFFH
HFFG

observation : (1, 0) done : False reward : 0 action : left
FFFF
SHFH
FFFH
HFFG

observation : (2, 0) done : False reward : 0 action : up
FFFF
FHFH
SFFH
HFFG

observation : (3, 0) done : True reward : 0 action : up
FFFF
FHFH
FFFH
SFFG



# Data Collection

In [31]:
import random as rd
from tqdm import tqdm

env = Env()
num_episodes = 50000
life_memory = []
actions = [0,2,3]
total_win = 0

for i in tqdm(range(num_episodes)):
    env = Env()
    done = False
    ep_memory = []
    total_reward = 0
    old_obs = (0,0)
    while not done:
        new_action = rd.choice(actions)
        obs,done,reward = env.step(new_action)
        total_reward += reward
        ep_memory.append({
            "row": old_obs[0],
            "col":old_obs[1],
            "action": new_action,
            "reward": reward,
            "episode": i,
        })
        old_obs = obs

    if total_reward > 0:
        total_win+=1

    # incorporate total reward
    num_steps = len(ep_memory)
    for i, ep_mem in enumerate(ep_memory):
        ep_mem["tot_reward"] = total_reward
        ep_mem["decay_reward"] = i*total_reward/num_steps
    life_memory.extend(ep_memory)

print(f'\nwin : {total_win*100/num_episodes}')

100%|██████████| 50000/50000 [00:02<00:00, 22428.95it/s]


win : 5.02





# Convert to DataFrame

In [32]:
memory_df = pd.DataFrame(life_memory)

In [33]:
memory_df.head()

Unnamed: 0,row,col,action,reward,episode,tot_reward,decay_reward
0,0,0,3,0,0,0,0.0
1,1,0,3,0,0,0,0.0
2,2,0,3,0,0,0,0.0
3,0,0,3,0,1,0,0.0
4,1,0,2,0,1,0,0.0


# Display First 5 episode which has win state

In [35]:
memory_df[memory_df['reward'] == 1]['episode'].head()

19      3
287    56
295    57
332    62
371    65
Name: episode, dtype: int64

In [36]:
memory_df[memory_df['episode'] == 3]

Unnamed: 0,row,col,action,reward,episode,tot_reward,decay_reward
11,0,0,3,0,3,1,0.0
12,1,0,0,0,3,1,0.111111
13,1,0,0,0,3,1,0.222222
14,1,0,3,0,3,1,0.333333
15,2,0,2,0,3,1,0.444444
16,2,1,2,0,3,1,0.555556
17,2,2,3,0,3,1,0.666667
18,3,2,3,0,3,1,0.777778
19,3,2,2,1,3,1,0.888889


In [37]:
memory_df.shape

(263720, 7)

In [38]:
memory_df.groupby("episode").reward.sum().mean()

0.0502

# Model Training

In [39]:
import warnings
warnings.filterwarnings("ignore")

def warn(*args, **kwargs):
    return None

warnings.warn = warn

In [40]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR

model = ExtraTreesRegressor(n_estimators=50)

# model = SVR()
y = 0.5*memory_df.reward + 0.1*memory_df.decay_reward + memory_df.tot_reward
x = memory_df[["row","col", "action"]]
model.fit(x, y)

# Reinforcement Learning Testing 

In [53]:
env = Env()
done = False
test_episode = 500
state = (0,0)
total_reward = 0
env.current_state = state
random_per = 0.5
up_action_only = {0:0,1:2,2:3}

for i in range(test_episode):
    while not done:
        random = rd.random()
        if random < random_per:
            rl_action = rd.choice(actions)
        else:
            input_actions = [[state[0],state[1],i] for i in actions]
            reward_prob = model.predict(input_actions)
            rl_action = np.argmax(reward_prob)
            rl_action = up_action_only[rl_action]

        obs,done,reward = env.step(rl_action)
        total_reward += reward
print(f'score : {total_reward*100/test_episode}')

score : 0.2


# Reinforcement Learning Performing

In [29]:
env = Env()
done = False
state = (0,0)
env.current_state = state
random_per = 0.5
env.render()

up_action_only = {0:0,1:2,2:3}

while not done:
    random = rd.random()
    if random < random_per:
        rl_action = rd.choice(actions)
    else:
        input_actions = [[state[0],state[1],i] for i in actions]
        reward_prob = model.predict(input_actions)
        rl_action = np.argmax(reward_prob)
        rl_action = up_action_only[rl_action]
        
    obs,done,reward = env.step(rl_action)
    print(f'action : {action_map[rl_action]}')
    env.render()

SFFF

FHFH

FFFH

HFFG



action : up

FFFF

SHFH

FFFH

HFFG



action : up

FFFF

FHFH

SFFH

HFFG



action : left

FFFF

FHFH

SFFH

HFFG



action : right

FFFF

FHFH

FSFH

HFFG



action : left

FFFF

FHFH

SFFH

HFFG



action : right

FFFF

FHFH

FSFH

HFFG



action : up

FFFF

FHFH

FFFH

HSFG



action : up

FFFF

FHFH

FFFH

HSFG



action : up

FFFF

FHFH

FFFH

HSFG



action : up

FFFF

FHFH

FFFH

HSFG



action : up

FFFF

FHFH

FFFH

HSFG



action : right

FFFF

FHFH

FFFH

HFSG



action : up

FFFF

FHFH

FFFH

HFSG



action : up

FFFF

FHFH

FFFH

HFSG



action : right

FFFF

FHFH

FFFH

HFFS


