# Q-Learning Earth version

In [26]:
import gym
import random
import numpy as np
import pandas as pd
from collections import defaultdict 

class QAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The Q-table is a 2-D NumPy array indexed as ``q_table[state][action]``,
    where both indices are integers.

    Args:
        actions: Sequence of available actions. Its length fixes the number
            of columns of the Q-table; actions are addressed by their index.
        n_states: Number of discrete states (rows of the Q-table). When
            omitted, the size is taken from a module-level ``env`` object
            (``env.observation_space.n``), which must already exist.
    """

    def __init__(self, actions, n_states=None):
        self.actions = actions
        self.learning_rate = 0.1
        self.discount_factor = 0.9
        self.epsilon = 1  # start fully exploratory; the caller decays it
        self.epsilon_decay_factor = 0.9
        if n_states is None:
            # Backward-compatible fallback for callers that only pass
            # `actions` and rely on the global environment being set up.
            n_states = env.observation_space.n
        self.q_table = np.zeros((n_states, len(actions)))

    def learn(self, state, action, reward, next_state):
        """Update Q(state, action) from one sample <s, a, r, s'>.

        Applies the Bellman optimality backup:
        Q(s, a) += learning_rate * (r + discount * max_a' Q(s', a') - Q(s, a)).
        """
        current_q = self.q_table[state, action]
        target = reward + self.discount_factor * np.max(self.q_table[next_state])
        self.q_table[state, action] = current_q + self.learning_rate * (target - current_q)

    def get_action(self, state):
        """Return an action index for ``state`` using epsilon-greedy selection.

        With probability ``self.epsilon`` a uniformly random action index is
        returned; otherwise a greedy one. Ties between equally valued greedy
        actions are broken at random, so an unvisited state (all-zero row)
        does not always yield action 0.
        """
        if random.random() < self.epsilon:
            return random.randrange(len(self.actions))
        q_values = self.q_table[state]
        best_actions = np.flatnonzero(q_values == q_values.max())
        return int(random.choice(best_actions.tolist()))
    
          
if __name__ == "__main__":
    # Train a QAgent on the deterministic (non-slippery) FrozenLake
    # environment, rendering every step and printing a line per episode.

    env = gym.make('FrozenLake-v0', is_slippery=False)
    number_of_episodes = 50

    # try/finally guarantees the environment is closed even when training is
    # interrupted or a step raises.
    try:
        agent = QAgent(actions=list(range(env.action_space.n)))

        for i in range(number_of_episodes):
            state = env.reset()
            done = False

            while not done:
                env.render()

                # Take one action and step the environment.
                action = agent.get_action(state)
                next_state, reward, done, _ = env.step(action)

                # Reward shaping: any episode that ends without a reward of 1
                # is penalised with -1 (presumably a hole or the step limit;
                # confirm against the environment's reward scheme).
                if done and reward != 1:
                    reward = -1

                agent.learn(state, action, reward, next_state)

                # Exploration is decayed after every step, not every episode.
                agent.epsilon *= agent.epsilon_decay_factor
                state = next_state

            # Show the final state of the finished episode.
            env.render()
            print('episode {}'.format(i+1))
    finally:
        env.close()


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Down)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Up)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Right)
SFFF
FHFH
FFF[41mH[0m
HFFG
episode 1

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
episode 2

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS

In [27]:
# Inspect the learned Q-table (one row per state, one column per action);
# the greedy policy is the argmax of each row.
agent.q_table

array([[-0.1       ,  0.11234617,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [-0.1       ,  0.22989324, -0.1       ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [-0.1       , -0.1       ,  0.40552859,  0.        ],
       [-0.1       ,  0.61735583,  0.        ,  0.        ],
       [ 0.        ,  0.        , -0.1       ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [-0.1       , -0.1       ,  0.82116862,  0.        ],
       [-0.1       , -0.1       ,  0.9835768 ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ]])

# Save Table

In [41]:
# comments='' stops NumPy from prefixing the header line with '# ', so the
# column names in the CSV are exactly A, B, C, D when read back.
np.savetxt("test.csv", agent.q_table, delimiter=",", header='A,B,C,D', comments='')

In [42]:
# Read test.csv back into a pandas DataFrame.
df = pd.read_csv('test.csv')

In [43]:
# Show the DataFrame's contents as an array, for comparison with agent.q_table.
df.values

array([[-0.1       ,  0.11234617,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [-0.1       ,  0.22989324, -0.1       ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [-0.1       , -0.1       ,  0.40552859,  0.        ],
       [-0.1       ,  0.61735583,  0.        ,  0.        ],
       [ 0.        ,  0.        , -0.1       ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [-0.1       , -0.1       ,  0.82116862,  0.        ],
       [-0.1       , -0.1       ,  0.9835768 ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ]])