In [1]:
import argparse
import os

import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

In [12]:
class SARSA:
    """Tabular SARSA learner for discrete state and action spaces.

    The learner owns ``Q``, a ``(state_dim, action_dim)`` NumPy array of
    action-value estimates that starts at zero and is updated in place by
    ``learn``.
    """

    def __init__(self, state_dim, action_dim, lr=0.01, gamma=0.8, e_greed=0.1):
        """Store the hyper-parameters and allocate a zero-filled Q-table.

        Args:
            state_dim: number of discrete states (rows of ``Q``).
            action_dim: number of discrete actions (columns of ``Q``).
            lr: step size applied to each temporal-difference update.
            gamma: discount applied to the bootstrapped next-step value.
            e_greed: probability that ``_action`` picks a uniformly random
                action instead of the greedy one.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.lr = lr
        self.gamma = gamma
        self.e_greed = e_greed
        self.Q = np.zeros((state_dim, action_dim))

    def _action(self, state):
        """Return an epsilon-greedy action for ``state``.

        With probability ``e_greed`` the action is drawn uniformly from
        ``range(action_dim)``; otherwise it is whatever ``predict`` returns.
        """
        if np.random.uniform() < self.e_greed:
            action = np.random.choice(self.action_dim)
        else:
            action = self.predict(state)
        return action

    def predict(self, state):
        """Return a greedy action for ``state``.

        Ties between equally valued actions are broken uniformly at random,
        so an untrained (all-zero) row does not always yield action 0.
        """
        all_actions = self.Q[state, :]
        max_action = np.max(all_actions)
        max_actions_list = np.where(all_actions == max_action)[0]
        return np.random.choice(max_actions_list)

    def learn(self, state, action, reward, next_state, next_action, done):
        """Apply one SARSA update to ``Q[state, action]``.

        The target is ``reward`` alone when ``done`` is true, otherwise
        ``reward + gamma * Q[next_state, next_action]``. The entry is moved
        towards the target by a fraction ``lr`` of the difference.
        """
        if done:
            # Terminal transition: there is no successor value to bootstrap.
            target = reward
        else:
            target = reward + self.gamma * self.Q[next_state, next_action]

        self.Q[state, action] += self.lr * (target - self.Q[state, action])

    def save(self, npy_file='./model/SARSA_table.npy'):
        """Write the Q-table to ``npy_file`` with ``np.save``.

        The parent directory is created when it does not exist yet, so a
        first run in a fresh working directory does not fail at the very
        end of training.

        Args:
            npy_file: destination path; defaults to the same location that
                ``load`` reads from.
        """
        target_dir = os.path.dirname(npy_file)
        if target_dir:
            # np.save does not create missing directories on its own.
            os.makedirs(target_dir, exist_ok=True)
        np.save(npy_file, self.Q)
        print(npy_file + ' saved.')

    def load(self, npy_file='./model/SARSA_table.npy'):
        """Replace the Q-table with the array stored in ``npy_file``."""
        self.Q = np.load(npy_file)
        print(npy_file + ' loaded.')

In [15]:
class Agent:
    """Drives a tabular ``SARSA`` model against a Gym-style environment.

    The environment is used through ``reset()``, ``step(action)`` and
    ``render()``; ``step`` is unpacked as a 4-tuple
    ``(next_state, reward, done, info)``.
    """

    def __init__(self, env, lr=0.1, gamma=0.8, e_greed=0.1):
        """Keep the environment and build a SARSA model sized to its spaces.

        Args:
            env: environment exposing ``observation_space.n`` and
                ``action_space.n`` (discrete spaces).
            lr: learning rate forwarded to ``SARSA``.
            gamma: discount factor forwarded to ``SARSA``.
            e_greed: exploration probability forwarded to ``SARSA``.
        """
        self.env = env
        self.lr = lr
        self.gamma = gamma
        self.e_greed = e_greed
        self.model = SARSA(
            self.env.observation_space.n,
            self.env.action_space.n,
            lr,
            gamma,
            e_greed,
        )

    def train_eporch(self, render=False):
        """Run one training episode and update the Q-table after every step.

        Args:
            render: when true, ``env.render()`` is called after each step.

        Returns:
            ``(total_reward, total_steps)`` for the episode.
        """
        total_reward = 0
        total_steps = 0
        state = self.env.reset()
        action = self.model._action(state)
        while True:
            next_state, reward, done, _ = self.env.step(action)
            # SARSA is on-policy: the action that will actually be taken next
            # is chosen before the update and then reused on the next step.
            next_action = self.model._action(next_state)
            self.model.learn(state, action, reward, next_state, next_action, done)
            total_reward += reward
            total_steps += 1
            state = next_state
            action = next_action
            if render:
                self.env.render()
            if done:
                break
        return total_reward, total_steps

    def train(self, max_eporch):
        """Train for ``max_eporch`` episodes, then save the Q-table.

        Progress is printed every 20th episode.
        """
        for eporch in range(max_eporch):
            rewards, steps = self.train_eporch()
            if eporch % 20 == 0:
                print("Eporch %03s: steps = %02s , reward = %.1f"%(eporch,steps,rewards))
        self.model.save()

    def test(self):
        """Run a single rendered evaluation episode."""
        # The Q-table currently held in memory is evaluated; call
        # self.model.load() beforehand to evaluate a table saved on disk.
        self.test_episode(render=True)

    def test_episode(self, render=False):
        """Play one episode with the greedy policy and print the outcome.

        Actions come from ``model.predict`` rather than the epsilon-greedy
        ``model._action``, so the printed reward reflects the learned policy
        without random exploration mixed in.

        NOTE(review): the loop has no step cap of its own; it ends only when
        the environment reports ``done``.

        Args:
            render: when true, ``env.render()`` is called after each step.
        """
        total_reward = 0
        actions = []
        state = self.env.reset()
        while True:
            action = self.model.predict(state)
            next_state, reward, done, _ = self.env.step(action)

            state = next_state
            total_reward += reward
            actions.append(action)
            if render:
                self.env.render()
            if done:
                break
        print('test reward = %.1f' % (total_reward))
        print('test action is: ', actions)

In [16]:
# Seed NumPy's global RNG: SARSA draws its exploration and tie-breaking from
# np.random, so without a seed every run logs a different training history.
# The environment is created with is_slippery=False, so presumably these
# draws are the only source of randomness - confirm if the env changes.
np.random.seed(0)

env = gym.make("FrozenLake-v0", is_slippery=False)
# Work on the bare environment underneath any wrappers gym.make added.
env = env.unwrapped
agent = Agent(env, lr=0.01, gamma=0.8, e_greed=0.1)
agent.train(max_eporch=500)

Eporch   0: steps =  8 , reward = 0.0
Eporch  20: steps =  4 , reward = 0.0
Eporch  40: steps =  5 , reward = 0.0
Eporch  60: steps = 23 , reward = 0.0
Eporch  80: steps =  2 , reward = 0.0
Eporch 100: steps =  7 , reward = 1.0
Eporch 120: steps =  7 , reward = 1.0
Eporch 140: steps = 10 , reward = 1.0
Eporch 160: steps =  6 , reward = 1.0
Eporch 180: steps =  6 , reward = 1.0
Eporch 200: steps =  4 , reward = 0.0
Eporch 220: steps =  6 , reward = 1.0
Eporch 240: steps =  6 , reward = 1.0
Eporch 260: steps =  6 , reward = 1.0
Eporch 280: steps =  6 , reward = 1.0
Eporch 300: steps =  7 , reward = 1.0
Eporch 320: steps =  6 , reward = 1.0
Eporch 340: steps =  6 , reward = 1.0
Eporch 360: steps =  6 , reward = 1.0
Eporch 380: steps =  6 , reward = 1.0
Eporch 400: steps = 10 , reward = 1.0
Eporch 420: steps =  6 , reward = 1.0
Eporch 440: steps =  3 , reward = 0.0
Eporch 460: steps =  6 , reward = 1.0
Eporch 480: steps =  6 , reward = 1.0
./model/SARSA_table.npy saved.


In [18]:
# Run one evaluation episode with the agent trained above; test() renders
# the environment after every step and prints the reward and action list.
agent.test()

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
test reward = 1.0
test action is:  [1, 1, 2, 2, 1, 2]
