In [None]:
!pip install -e gym-foo

In [None]:
import gym

In [None]:
# To change parameters of the game 

# MaximumRows : In the actual game the total length is limited by time, but in the RL model it is limited by the number of rows
# MaximumAttempts : The maximum number of times a bush can be accessed before the reward collected from it becomes 0 was found to be 13 (MAX_ATTEMPTS = 13)
# ActionTime : Time to move to the bush/square and back to the center, that is taken as ActionTime = 2 sec.
# TimeLag : Time to move from one row to another, that is TimeLag = 3 sec for Round 1 and 10 sec for Round 2
# env.__init__(MaximumRows = 10)

# Round 1

In [None]:
# Create the Round 1 environment from the id 'bushberry-v0', looked up via the
# gym_foo module (presumably the package installed from ./gym-foo above).
env = gym.make('gym_foo:bushberry-v0')

In [None]:
# Re-run the constructor on the existing environment object to set MaximumRows
# (the number of rows that bounds an episode, per the parameter notes above).
# NOTE(review): calling __init__ directly re-initializes whatever object
# gym.make returned; if that is a wrapper rather than the raw environment this
# may not reach the intended class. Passing the keyword arguments to gym.make
# is likely the safer route; confirm against the gym_foo implementation.
env.__init__(MaximumRows = 50)

In [None]:
# Start a fresh episode; the value returned by reset() is shown as the cell output.
env.reset()

##### Rendering


In [None]:
# B represents bush and S represents square. In the game, the movement was from bottom to top, here it's from top to bottom.

In [None]:
env.render() 

In [None]:
from IPython.display import clear_output
from time import sleep
def print_frames(frames, delay=0.5):
    """Replay recorded environment frames as a simple text animation.

    Args:
        frames: Iterable of dicts with the keys 'frame' (rendered text),
            'state', 'action' and 'reward', one dict per environment step.
        delay: Seconds to pause after each frame. Defaults to 0.5, the value
            that used to be hard-coded.
    """
    for i, frame in enumerate(frames):
        # wait=True postpones clearing until the next output is ready,
        # which avoids flicker between frames.
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)

In [None]:
env.ActionTime

In [None]:
env.TimeLag

#### Before applying Q-Learning

In [None]:
import time

In [None]:
# Baseline for Round 1: act uniformly at random for at most four minutes of
# wall-clock time (or until the environment reports the episode is over).
# Each step is followed by a real sleep that mimics the movement time in the game.
env.s = 20  # presumably pins the environment's internal state; confirm in gym_foo
epochs = reward = total_score = 0
done = False
frames = []  # one record per step, consumed later by print_frames

start = time.time()
while time.time() - start < 4 * 60 and not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    if action == 0 or reward == 10:
        # time to move from center to bush or from bush to center
        sleep(env.ActionTime / 2)
    elif action == 1:
        # time to move from square to center of next row
        sleep(env.TimeLag)

    total_score = total_score + reward
    frames.append(dict(
        frame=env.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward,
    ))
    epochs = epochs + 1

In [None]:
done

In [None]:
time.time()-start

In [None]:
# Before QL
print_frames(frames)
print('Total Score:',total_score) 

##### Using MVT

In [None]:
# MVT (presumably the Marginal Value Theorem foraging rule)
# Round 1: t=3 seconds
# optimal time to spend at a bush, T = 4.37429 seconds
import time

env.s = 20  # presumably pins the environment's internal state; confirm in gym_foo
epochs = 0
reward = 0

frames_mvt = []  # for animation

done = False

total_score_mvt = 0

T = 4.37429
start = time.time()
while time.time() - start < 4 * 60 and not done:

    # Phase 1: keep taking action 0 on the current row until more than T
    # seconds have passed on it, or the episode ends.
    start_time = time.time()
    while not done:
        current_time = time.time()
        elapsed = current_time - start_time
        if elapsed > T:
            break
        action = 0
        state, reward, done, info = env.step(action)
        total_score_mvt += reward
        frames_mvt.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward
        })
        sleep(env.ActionTime / 2)  # time to move from center to bush or from bush to center

    # Phase 2: leave the row with action 1. This is skipped when the episode
    # already finished in phase 1, because stepping an environment after
    # done=True is undefined; the previous code did so unconditionally.
    if not done:
        action = 1
        state, reward, done, info = env.step(action)
        total_score_mvt += reward
        frames_mvt.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward
        })

        # A reward of 10 triggers one more action-1 step after half an
        # ActionTime, unless the first step already ended the episode.
        if reward == 10 and not done:
            sleep(env.ActionTime / 2)  # time to move from center to bush or from bush to center
            state, reward, done, info = env.step(action)
            total_score_mvt += reward
            frames_mvt.append({
                'frame': env.render(mode='ansi'),
                'state': state,
                'action': action,
                'reward': reward
            })

        sleep(env.TimeLag)  # time to move from square to center of next row

    epochs += 1

In [None]:
done

In [None]:
time.time()-start

In [None]:
# MVT
print_frames(frames_mvt)
print('Total Score:',total_score_mvt) 

### Q-Learning

In [None]:
import numpy as np
# Q-table with one row per observation and one column per action, all zeros.
q_table = np.zeros((env.observation_space.n, env.action_space.n))

In [None]:
q_table.shape

#### Training with Q-Learning

In [None]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1    # learning rate: weight given to the new estimate
gamma = 0.6    # discount factor applied to the best next-state value
epsilon = 0.1  # probability of taking a random (exploratory) action

# For plotting metrics: one entry per training episode
all_epochs = []
all_total_scores = []

for i in range(1, 10001):
    state = env.reset()

    epochs, reward = 0, 0
    done = False

    total_score = 0
    while not done:

        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore action space
        else:
            action = np.argmax(q_table[state])  # Exploit learned values

        next_state, reward, done, info = env.step(action)

        # Tabular Q-learning update: blend the old estimate with the
        # bootstrapped target reward + gamma * max_a' Q(next_state, a').
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        total_score += reward

        state = next_state
        epochs += 1

    # Store the per-episode metrics; these lists were previously created
    # but never filled, so nothing could be plotted from them.
    all_epochs.append(epochs)
    all_total_scores.append(total_score)

    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

#### Performance after Q-Learning

In [None]:
"""Evaluate agent's performance after Q-learning"""
# Greedy rollout with the trained Q-table, under the same four-minute
# wall-clock budget and per-action sleeps as the random baseline.
state = env.reset()
epochs = reward = 0
Qtotal_score = 0
Qframes = []  # one record per step, consumed later by print_frames
done = False

start = time.time()
while time.time() - start < 4 * 60 and not done:
    # Always pick the action with the highest learned value for this state.
    action = q_table[state].argmax()
    state, reward, done, info = env.step(action)

    if action == 0 or reward == 10:
        # time to move from center to bush or from bush to center
        sleep(env.ActionTime / 2)
    elif action == 1:
        # time to move from square to center of next row
        sleep(env.TimeLag)

    Qtotal_score = Qtotal_score + reward
    Qframes.append(dict(
        frame=env.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward,
    ))
    epochs = epochs + 1

In [None]:
done

In [None]:
time.time()-start

In [None]:
# After QL
print_frames(Qframes)
print('Total Score:', Qtotal_score)

# Round 2

In [None]:
# Create a second, separate environment object for Round 2 from the same id.
env2 = gym.make('gym_foo:bushberry-v0')

In [None]:
# Re-run the constructor with the Round 2 settings; TimeLag=10 matches the
# "10 sec for Round 2" row-to-row travel time described in the parameter notes.
# NOTE(review): calling __init__ directly re-initializes whatever object
# gym.make returned; passing these keyword arguments to gym.make is likely the
# safer route; confirm against the gym_foo implementation.
env2.__init__(MaximumRows = 50, MaximumAttempts = 13, ActionTime=2,TimeLag=10) 

In [None]:
# Start a fresh Round 2 episode; the value returned by reset() is shown as the cell output.
env2.reset()

#### Rendering

In [None]:
env2.render()

#### Before applying Q-Learning

In [None]:
# Baseline for Round 2: act uniformly at random for at most four minutes of
# wall-clock time (or until the environment reports the episode is over).
# Each step is followed by a real sleep that mimics the movement time in the game.
env2.s = 20  # presumably pins the environment's internal state; confirm in gym_foo
epochs = reward = total_score = 0
done = False
frames = []  # one record per step, consumed later by print_frames

start = time.time()
while time.time() - start < 4 * 60 and not done:
    action = env2.action_space.sample()
    state, reward, done, info = env2.step(action)

    if action == 0 or reward == 10:
        # time to move from center to bush or from bush to center
        sleep(env2.ActionTime / 2)
    elif action == 1:
        # time to move from square to center of next row
        sleep(env2.TimeLag)

    total_score = total_score + reward

    # Keep the rendered frame together with the step data for the animation.
    frames.append(dict(
        frame=env2.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward,
    ))
    epochs = epochs + 1

In [None]:
done

In [None]:
time.time()-start

In [None]:
# Before QL

from IPython.display import clear_output
from time import sleep

def print_frames(frames, delay=0.5):
    """Replay recorded environment frames as a simple text animation.

    Args:
        frames: Iterable of dicts with the keys 'frame' (rendered text),
            'state', 'action' and 'reward', one dict per environment step.
        delay: Seconds to pause after each frame. Defaults to 0.5, the value
            that used to be hard-coded.
    """
    for i, frame in enumerate(frames):
        # wait=True postpones clearing until the next output is ready,
        # which avoids flicker between frames.
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)
                       
print_frames(frames)
print('Total Score:',total_score) 

#### MVT

In [None]:
# MVT (presumably the Marginal Value Theorem foraging rule)
# Round 2: t=10 seconds
# optimal time to spend at a bush, T = 7.71135 seconds
import time

env2.s = 20  # presumably pins the environment's internal state; confirm in gym_foo
epochs = 0
reward = 0

frames_mvt = []  # for animation

done = False

total_score_mvt = 0

T = 7.71135
start = time.time()
while time.time() - start < 4 * 60 and not done:

    # Phase 1: keep taking action 0 on the current row until more than T
    # seconds have passed on it, or the episode ends.
    start_time = time.time()
    while not done:
        current_time = time.time()
        elapsed = current_time - start_time
        if elapsed > T:
            break
        action = 0
        state, reward, done, info = env2.step(action)
        total_score_mvt += reward
        frames_mvt.append({
            'frame': env2.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward
        })
        sleep(env2.ActionTime / 2)  # time to move from center to bush or from bush to center

    # Phase 2: leave the row with action 1. This is skipped when the episode
    # already finished in phase 1, because stepping an environment after
    # done=True is undefined; the previous code did so unconditionally.
    if not done:
        action = 1
        state, reward, done, info = env2.step(action)
        total_score_mvt += reward
        frames_mvt.append({
            'frame': env2.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward
        })

        # A reward of 10 triggers one more action-1 step after half an
        # ActionTime, unless the first step already ended the episode.
        if reward == 10 and not done:
            sleep(env2.ActionTime / 2)  # time to move from center to bush or from bush to center
            state, reward, done, info = env2.step(action)
            total_score_mvt += reward
            frames_mvt.append({
                'frame': env2.render(mode='ansi'),
                'state': state,
                'action': action,
                'reward': reward
            })

        sleep(env2.TimeLag)  # time to move from square to center of next row

    epochs += 1

In [None]:
done

In [None]:
time.time()-start

In [None]:
# MVT
print_frames(frames_mvt)
print('Total Score:',total_score_mvt) 

### Q-Learning

In [None]:
import numpy as np
# Fresh Q-table for Round 2: one row per observation, one column per action, all zeros.
q_table = np.zeros((env2.observation_space.n, env2.action_space.n))

In [None]:
q_table.shape

#### Training with Q-Learning

In [None]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1    # learning rate: weight given to the new estimate
gamma = 0.6    # discount factor applied to the best next-state value
epsilon = 0.1  # probability of taking a random (exploratory) action

# For plotting metrics: one entry per training episode
all_epochs = []
all_total_scores = []

for i in range(1, 100001):
    state = env2.reset()

    epochs, reward = 0, 0
    done = False

    total_score = 0
    while not done:

        if random.uniform(0, 1) < epsilon:
            action = env2.action_space.sample()  # Explore action space
        else:
            action = np.argmax(q_table[state])  # Exploit learned values

        next_state, reward, done, info = env2.step(action)

        # Tabular Q-learning update: blend the old estimate with the
        # bootstrapped target reward + gamma * max_a' Q(next_state, a').
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        total_score += reward

        state = next_state
        epochs += 1

    # Store the per-episode metrics; these lists were previously created
    # but never filled, so nothing could be plotted from them.
    all_epochs.append(epochs)
    all_total_scores.append(total_score)

    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

### Performance after Q-Learning

In [None]:
"""Evaluate agent's performance after Q-learning"""

# Greedy rollout with the trained Round 2 Q-table, under the same four-minute
# wall-clock budget and per-action sleeps as the random baseline.
state = env2.reset()
epochs = reward = 0
total_score = 0
frames = []  # one record per step, consumed later by print_frames
done = False

start = time.time()
while time.time() - start < 4 * 60 and not done:
    # Always pick the action with the highest learned value for this state.
    action = q_table[state].argmax()
    state, reward, done, info = env2.step(action)

    if action == 0 or reward == 10:
        # time to move from center to bush or from bush to center
        sleep(env2.ActionTime / 2)
    elif action == 1:
        # time to move from square to center of next row
        sleep(env2.TimeLag)

    total_score = total_score + reward
    frames.append(dict(
        frame=env2.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward,
    ))
    epochs = epochs + 1

In [None]:
done

In [None]:
time.time()-start

In [None]:
# After QL
print_frames(frames)
print('Total Score:', total_score)